In [1]:
import features_nodewise as nw
import features_pairwise as pw
import preprocessing as prep

import importlib
import matplotlib.pyplot as plt
import random
import numpy as np
import igraph
from sklearn import preprocessing
import nltk
import csv
from functools import reduce
from sklearn import metrics
from nltk.tokenize import RegexpTokenizer
import pickle

## Read train / test node pairs
nltk.download('punkt') # for tokenization
nltk.download('stopwords')
stpwds = set(nltk.corpus.stopwords.words("english"))
stemmer = nltk.stem.PorterStemmer()

with open("./data/train_train_set.csv", "r") as f:
    reader = csv.reader(f)
    training_set  = list(reader)

with open("./data/train_test_set.csv", "r") as f:
    reader = csv.reader(f)
    testing_set  = list(reader)

random.seed(0)
training_set = [element[0].split(" ") for element in training_set]
to_keep_train = random.sample( range(len(training_set)),k=int(round(len(training_set)*0.05)) )
training_set_reduced = [training_set[i] for i in to_keep_train]

testing_set = [element[0].split(" ") for element in testing_set]
to_keep_test = random.sample( range(len(testing_set)),k=int(round(len(testing_set)*1)) )
testing_set_reduced = [testing_set[i] for i in to_keep_test]

with open("./data/node_information.csv", "r") as f:
    reader = csv.reader(f)
    node_info  = list(reader)

corpus = [element[5] for element in node_info]
t = prep.tfidf(corpus)
l = nw.LSA(t)
IDs = [element[0] for element in node_info]
node_dict = prep.to_dict( [element[0] for element in node_info],range(len(node_info)) )
index_dict = prep.to_dict(range(len(IDs)),IDs)


[nltk_data] Downloading package punkt to /home/lucas/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/lucas/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
n_samples: 27770, n_features: 10000
Performing dimensionality reduction using LSA
Explained variance of the SVD step: 19%


In [2]:
# create graphs (train vs test), (reduced vs full)

train_IDs = set([training_set[i][0] for i in range(len(training_set))])
train_IDs = train_IDs | set([training_set[i][1] for i in range(len(training_set))])
train_IDs = list(train_IDs) #igraph doesn't like sets...
train_edges = [(element[0],element[1]) for element in training_set]
train_graph = prep.article_graph(train_IDs,train_edges)

train_IDs_reduced = set([training_set_reduced[i][0] for i in range(len(training_set_reduced))])
train_IDs_reduced = train_IDs_reduced | set([training_set_reduced[i][1] for i in range(len(training_set_reduced))])
train_IDs_reduced = list(train_IDs_reduced)
train_edges_reduced = [(element[0],element[1]) for element in training_set_reduced]
train_graph_reduced = prep.article_graph(train_IDs_reduced,train_edges_reduced)

test_IDs = set([testing_set[i][0] for i in range(len(testing_set))])
test_IDs = test_IDs | set([testing_set[i][1] for i in range(len(testing_set))])
test_IDs = list(test_IDs)
test_edges = [(element[0],element[1]) for element in testing_set]
test_graph = prep.article_graph(test_IDs,test_edges)

test_IDs_reduced = set([testing_set_reduced[i][0] for i in range(len(testing_set_reduced))])
test_IDs_reduced = test_IDs_reduced | set([testing_set_reduced[i][1] for i in range(len(testing_set_reduced))])
test_IDs_reduced = list(test_IDs_reduced)
test_edges_reduced = [(element[0],element[1]) for element in testing_set_reduced]
test_graph_reduced = prep.article_graph(test_IDs_reduced,test_edges_reduced)

'9507159'

In [3]:
## Read all the features that we have stored in files
import os.path

def to_feature_shape(feat):
    feat = np.array(feat)
    if len(feat.shape) == 1:#not a real array but just a long list
        feat = np.reshape(feat,(feat.shape[0],1))
    return feat

#This method should throw an error if something goes wrong
def read_feature(file_path):
    f = open(file_path, 'rb')
    feat = to_feature_shape(pickle.load(f))
    f.close()
    return feat
    
features_to_read = ["overlap_title",
                 "comm_auth",
                 "temp_diff",
                "citation_check",
                "max_sim",
                "peer_popularity",
                "edge_check",
                "LSA_distance",
                "node_degree"]

train_features_dict = dict()
train_features_reduced_dict = dict()
test_features_dict = dict()
test_features_reduced_dict = dict()
for name in features_to_read:
    # Train
    file_path = './features_train/'+name
    try:
        this_feat = read_feature(file_path)
        train_features_dict[name] = this_feat
        train_features_reduced_dict[name] = this_feat[to_keep_train,:]
    except:
        pass
    # Train reduced
    file_path = './features_train/'+name+'_reduced'
    try:
        this_feat = read_feature(file_path)
        train_features_reduced_dict[name] = this_feat
    except:
        pass
    # Test
    file_path = './features_test/'+name
    try:
        this_feat = read_feature(file_path)
        test_features_dict[name] = this_feat
        test_features_reduced_dict[name] = this_feat[to_keep_test,:]
    except:
        pass
    # Test reduced
    file_path = './features_test/'+name+'_reduced'
    try:
        this_feat = read_feature(file_path)
        test_features_reduced_dict[name] = this_feat
    except:
        pass

In [69]:
np.reshape(train_true_labels,(train_true_labels.shape[0],))

(554602,)

In [4]:
train_true_labels = read_feature('./features_train/true_labels')
train_true_labels = np.reshape(train_true_labels,(train_true_labels.shape[0],))
train_true_labels_reduced = train_true_labels[to_keep_train]
test_true_labels = read_feature('./features_test/true_labels')
test_true_labels = np.reshape(test_true_labels,(test_true_labels.shape[0],))
test_true_labels_reduced = test_true_labels[to_keep_test]

In [43]:
############################################
###  Construct features on TRAINING_SET  ###
############################################

#Build KDTree on training_set
train_l = [l[node_dict[i]] for i in train_IDs]
train_kdtree = nw.KDTree(train_l)

train_true_labels = []
features_to_create = ['citation_check']
# Where to insert created features
insert_features_dict = train_features_dict
for feat in features_to_create:
    insert_features_dict[feat] = []
set_to_use = training_set

for i,triple in enumerate(set_to_use):
    source = triple[0]
    target = triple[1]
    index_source = node_dict[source]
    index_target = node_dict[target]
    
    source_info = node_info[index_source]
    target_info = node_info[index_target]

    # convert to lowercase and tokenize
    source_title = source_info[2].lower().split(" ")
    # remove stopwords
    source_title = [token for token in source_title if token not in stpwds]
    source_title = [stemmer.stem(token) for token in source_title]

    target_title = target_info[2].lower().split(" ")
    target_title = [token for token in target_title if token not in stpwds]
    target_title = [stemmer.stem(token) for token in target_title]

    source_auth = source_info[3].split(",")
    target_auth = target_info[3].split(",") 
    
    
    # Creating features
    # Baseline #
    #overlap_title = len(set(source_title).intersection(set(target_title)))
    #insert_features_dict["overlap_title"].append(overlap_title)
    #temp_diff = int(source_info[1]) - int(target_info[1])
    #insert_features_dict["temp_diff"].append(temp_diff)
    #comm_auth = len(set(source_auth).intersection(set(target_auth)))
    #insert_features_dict["comm_auth"].append(comm_auth)

    #peer_pop = pw.peer_popularity(train_graph,source,target)
    #insert_features_dict["peer_popularity"].append(peer_pop)

    #max_sim = pw.Max_Sim(source,target,l,train_graph,node_dict)
    #insert_features_dict["max_sim"].append(max_sim)

    #edge_check = pw.edge_check(source,target,train_graph)
    #insert_features_dict["edge_check"].append(edge_check)

    #LSA_dist = pw.LSA_distance(source,target,node_dict,l)
    #insert_features_dict["LSA_distance"].append(LSA_dist)
    
    #citation_check = pw.Citation_Check(source,target,train_kdtree,l,train_graph,node_dict,index_dict,k=500)
    #insert_features_dict["citation_check"].append(citation_check)

    #degree = pw.node_degree(source,target,train_graph)
    #insert_features_dict["node_degree"].append(degree)

    train_true_labels.append(triple[2])

    if i%1000==0:
        print(i,"/",len(set_to_use))

# Reshape features into np column arrays, one row per node pair
for (name,value) in insert_features_dict.items():
    print(name,len(value))
    insert_features_dict[name] = to_feature_shape(value)
        
# Concatenate all features
feats_train = np.concatenate([feat for feat in insert_features_dict.values()])
train_true_labels = np.array(train_true_labels)

0 / 554602
1000 / 554602
2000 / 554602
3000 / 554602
4000 / 554602
5000 / 554602
6000 / 554602
7000 / 554602
8000 / 554602
9000 / 554602
10000 / 554602
11000 / 554602
12000 / 554602
13000 / 554602
14000 / 554602
15000 / 554602
16000 / 554602
17000 / 554602
18000 / 554602
19000 / 554602
20000 / 554602
21000 / 554602
22000 / 554602
23000 / 554602
24000 / 554602
25000 / 554602
26000 / 554602
27000 / 554602
28000 / 554602
29000 / 554602
30000 / 554602
31000 / 554602
32000 / 554602
33000 / 554602
34000 / 554602
35000 / 554602
36000 / 554602
37000 / 554602
38000 / 554602
39000 / 554602
40000 / 554602
41000 / 554602
42000 / 554602
43000 / 554602
44000 / 554602
45000 / 554602
46000 / 554602
47000 / 554602
48000 / 554602
49000 / 554602
50000 / 554602
51000 / 554602
52000 / 554602
53000 / 554602
54000 / 554602
55000 / 554602
56000 / 554602
57000 / 554602
58000 / 554602
59000 / 554602
60000 / 554602
61000 / 554602
62000 / 554602
63000 / 554602
64000 / 554602
65000 / 554602
66000 / 554602
67000 / 

520000 / 554602
521000 / 554602
522000 / 554602
523000 / 554602
524000 / 554602
525000 / 554602
526000 / 554602
527000 / 554602
528000 / 554602
529000 / 554602
530000 / 554602
531000 / 554602
532000 / 554602
533000 / 554602
534000 / 554602
535000 / 554602
536000 / 554602
537000 / 554602
538000 / 554602
539000 / 554602
540000 / 554602
541000 / 554602
542000 / 554602
543000 / 554602
544000 / 554602
545000 / 554602
546000 / 554602
547000 / 554602
548000 / 554602
549000 / 554602
550000 / 554602
551000 / 554602
552000 / 554602
553000 / 554602
554000 / 554602
overlap_title 554602
comm_auth 554602
temp_diff 554602
peer_popularity 554602
edge_check 554602
LSA_distance 554602
citation_check 0


0

In [10]:
############################################
###   Write features to disk - Training  ###
############################################
import os
import pickle

def write_feature_to_disk(feat,file_path):
    with open(file_path, 'wb') as file:
        pickle.dump(feat,file)

print("Current wd: ",os.getcwd())
write_feature_to_disk(train_features_dict['citation_check'],'./features_train/citation_check_reduced')

Current wd:  /home/lucas/Documents/École Polytechnique/Courses/ML1/Challenge/git_repository/ML_project


AttributeError: 'list' object has no attribute 'shape'

In [47]:
###########################################
###  Construct features on TESTING_SET  ###
###########################################

#Build KDTree on training_set
train_l = [l[node_dict[i]] for i in train_IDs]
train_kdtree = nw.KDTree(train_l)

train_true_labels = []
features_to_create = []

# Where to insert created features
insert_features_dict = test_features_dict
for feat in features_to_create:
    insert_features_dict[feat] = []
set_to_use = testing_set

test_true_labels = []
for i,triple in enumerate(set_to_use):
    source = triple[0]
    target = triple[1]
    index_source = node_dict[source]
    index_target = node_dict[target]
    
    source_info = node_info[index_source]
    target_info = node_info[index_target]

    # convert to lowercase and tokenize
    source_title = source_info[2].lower().split(" ")
    # remove stopwords
    source_title = [token for token in source_title if token not in stpwds]
    source_title = [stemmer.stem(token) for token in source_title]

    target_title = target_info[2].lower().split(" ")
    target_title = [token for token in target_title if token not in stpwds]
    target_title = [stemmer.stem(token) for token in target_title]

    source_auth = source_info[3].split(",")
    target_auth = target_info[3].split(",") 
    
    
    # Creating features
    # Baseline #
    #overlap_title = len(set(source_title).intersection(set(target_title)))
    #insert_features_dict["overlap_title"].append(overlap_title)
    #temp_diff = int(source_info[1]) - int(target_info[1])
    #insert_features_dict["temp_diff"].append(temp_diff)
    #comm_auth = len(set(source_auth).intersection(set(target_auth)))
    #insert_features_dict["comm_auth"].append(comm_auth)

    #peer_pop = pw.peer_popularity(train_graph,source,target)
    #insert_features_dict["peer_popularity"].append(peer_pop)

    #max_sim = pw.Max_Sim(source,target,l,train_graph,node_dict)
    #insert_features_dict["max_sim"].append(max_sim)

    #edge_check = pw.edge_check(source,target,train_graph)
    #insert_features_dict["edge_check"].append(edge_check)

    #LSA_dist = pw.LSA_distance(source,target,node_dict,l)
    #insert_features_dict["LSA_distance"].append(LSA_dist)
    
    #citation_check = pw.Citation_Check(source,target,train_kdtree,l,train_graph,node_dict,index_dict,k=500)
    #insert_features_dict["citation_check"].append(citation_check)

    #degree = pw.node_degree(source,target,train_graph)
    #insert_features_dict["node_degree"].append(degree)

    test_true_labels.append(triple[2])

    if i%1000==0:
        print(i,"/",len(set_to_use))

# Reshape features into np column arrays, one row per node pair
for (name,value) in insert_features_dict.items():
    print(name,len(value))
    insert_features_dict[name] = to_feature_shape(value)
        
# Concatenate all features
feats_test = np.concatenate([feat for feat in insert_features_dict.values()])
test_true_labels = np.array(test_true_labels)

0 / 60910
1000 / 60910
2000 / 60910
3000 / 60910
4000 / 60910
5000 / 60910
6000 / 60910
7000 / 60910
8000 / 60910
9000 / 60910
10000 / 60910
11000 / 60910
12000 / 60910
13000 / 60910
14000 / 60910
15000 / 60910
16000 / 60910
17000 / 60910
18000 / 60910
19000 / 60910
20000 / 60910
21000 / 60910
22000 / 60910
23000 / 60910
24000 / 60910
25000 / 60910
26000 / 60910
27000 / 60910
28000 / 60910
29000 / 60910
30000 / 60910
31000 / 60910
32000 / 60910
33000 / 60910
34000 / 60910
35000 / 60910
36000 / 60910
37000 / 60910
38000 / 60910
39000 / 60910
40000 / 60910
41000 / 60910
42000 / 60910
43000 / 60910
44000 / 60910
45000 / 60910
46000 / 60910
47000 / 60910
48000 / 60910
49000 / 60910
50000 / 60910
51000 / 60910
52000 / 60910
53000 / 60910
54000 / 60910
55000 / 60910
56000 / 60910
57000 / 60910
58000 / 60910
59000 / 60910
60000 / 60910
overlap_title 60910
comm_auth 60910
temp_diff 60910
citation_check 60910
max_sim 60910
peer_popularity 60910
edge_check 60910
LSA_distance 60910


ValueError: all the input array dimensions except for the concatenation axis must match exactly

In [16]:
###########################################
###   Write features to disk - Testing  ###
###########################################
import os
import pickle

def write_feature_to_disk(feat,file_path):
    with open(file_path, 'wb') as file:
        pickle.dump(feat,file)

print("Current wd: ",os.getcwd())
write_feature_to_disk(test_features_dict['overlap_title'],'./features_test/overlap_title')
write_feature_to_disk(test_features_dict['temp_diff'],'./features_test/temp_diff')
write_feature_to_disk(test_features_dict['comm_auth'],'./features_test/comm_auth')
write_feature_to_disk(test_features_dict['peer_popularity'],'./features_test/peer_popularity')
write_feature_to_disk(test_features_dict['edge_check'],'./features_test/edge_check')
write_feature_to_disk(test_features_dict['LSA_distance'],'./features_test/LSA_distance')



Current wd:  /home/lucas/Documents/École Polytechnique/Courses/ML1/Challenge/git_repository/ML_project


In [13]:
#Combine all features to one vector
train_features_reduced_dict.keys()
test_features_dict.keys()
for key,feat in train_features_reduced_dict.items():
    print(key,feat.shape)
train_features_reduced = np.concatenate([#train_features_reduced_dict['overlap_title'],
                                         #train_features_reduced_dict['comm_auth'],
                                         train_features_reduced_dict['temp_diff'],
                                         train_features_reduced_dict['citation_check'],
                                         train_features_reduced_dict['max_sim'],
                                         train_features_reduced_dict['peer_popularity'],
                                         #train_features_reduced_dict['edge_check'],
                                         train_features_reduced_dict['LSA_distance']]
                                        ,axis = 1)

test_features_reduced = np.concatenate([#test_features_reduced_dict['overlap_title'],
                                        #test_features_reduced_dict['comm_auth'],
                                        test_features_reduced_dict['temp_diff'],
                                        test_features_reduced_dict['citation_check'],
                                        test_features_reduced_dict['max_sim'],
                                        test_features_reduced_dict['peer_popularity'],
                                        #test_features_reduced_dict['edge_check'],
                                        test_features_reduced_dict['LSA_distance']]
                                        ,axis = 1)

test_features = np.concatenate([#test_features_dict['overlap_title'],
                                        #test_features_dict['comm_auth'],
                                        test_features_dict['temp_diff'],
                                        test_features_dict['citation_check'],
                                        test_features_dict['max_sim'],
                                        test_features_dict['peer_popularity'],
                                        #test_features_dict['edge_check'],
                                        test_features_dict['LSA_distance']]
                                        ,axis = 1)

overlap_title (27730, 1)
comm_auth (27730, 1)
temp_diff (27730, 1)
citation_check (27730, 2)
max_sim (27730, 9)
peer_popularity (27730, 1)
edge_check (27730, 1)
LSA_distance (27730, 1)


In [63]:
from sklearn.preprocessing import normalize

normalized_train_features = normalize(train_features_reduced,axis=0)
normalized_test_features = normalize(test_features_reduced,axis=0)

(27730, 1)

In [14]:
from sklearn import svm

classifier = svm.LinearSVC()

# train
classifier.fit(train_features_reduced, train_true_labels_reduced)
preds= list(classifier.predict(test_features))
acc = metrics.accuracy_score(list(map(int,test_true_labels)), list(map(int,preds)))
f1 = metrics.f1_score(list(map(int,test_true_labels)), list(map(int,preds)))
print('acc:',acc,'f1:',f1)

acc: 0.8573961582662946 f1: 0.8677205165692008


In [16]:
from sklearn.linear_model import LogisticRegression as lr
model = lr().fit(train_features_reduced, train_true_labels_reduced)
preds = list(model.predict(test_features))
acc = metrics.accuracy_score(list(map(int,test_true_labels)), list(map(int,preds)))
f1 = metrics.f1_score(list(map(int,test_true_labels)), list(map(int,preds)))
print('acc:',acc,'f1:',f1)

acc: 0.7751436545723198 f1: 0.7900416973264657


In [None]:
from sklearn.neighbors import KNeighborsClassifier

nNhbr = KNeighborsClassifier()
nNhbr.fit(features,true_labels) # do Ytrain.ravel() for length one Y values
preds = nNhbr.predict(test_features)
acc = metrics.accuracy_score(list(map(int,test_true_labels)), list(map(int,preds)))
f1 = metrics.f1_score(list(map(int,test_true_labels)), list(map(int,preds)))
print('acc:',acc,'f1:',f1)

In [None]:
from sklearn.tree import DecisionTreeClassifier

dTree = DecisionTreeClassifier()
dTree.fit(features,true_labels) # do Ytrain.ravel() for length one Y values
preds = dTree.predict(test_features)
acc = metrics.accuracy_score(list(map(int,test_true_labels)), list(map(int,preds)))
f1 = metrics.f1_score(list(map(int,test_true_labels)), list(map(int,preds)))
print('acc:',acc,'f1:',f1)

In [None]:
import tensorflow as tf
import numpy as np
import time
import math


class Class_Net():

    def __init__(self, learning_rate=0.001, batch_size=60):
        ''' initialize the classifier with default (best) parameters '''
        tf.reset_default_graph()
        self.alpha = learning_rate
        self.beta = batch_size
        self.warm = False

    def fit(self,X,Y,warm_start=True,n_epochs=50):
        ''' train the network, and if warm_start, then do not reinit. the network
            (if it has already been initialized)
        '''
        self.epochs=n_epochs

        self.n_batch = int(len(X)/self.beta)
        
        if warm_start==False or self.warm==False:
            self.x = tf.placeholder(tf.float32,shape=[None,len(X[0])])
            self.y = tf.placeholder(tf.float32,shape=[None,1])
            
            self.wZero = tf.get_variable('wZero',shape=[len(X[0]),50],initializer=tf.glorot_uniform_initializer())
            self.bZero = tf.Variable(tf.zeros([50]))

            self.wOne = tf.get_variable('wOne',shape=[50,1],initializer=tf.glorot_uniform_initializer())
            self.bOne = tf.Variable(tf.zeros([1]))
            self.keep_prob = 0.9
            self.drop_out = tf.nn.dropout(self.x, self.keep_prob)
            self.model = tf.nn.sigmoid(tf.matmul(tf.nn.relu(tf.matmul(self.drop_out, self.wZero) 
                                              + self.bZero),self.wOne)+self.bOne)
            self.cost = tf.losses.log_loss(self.y,self.model)
            
#             self.optimizer = tf.train.GradientDescentOptimizer(self.alpha).minimize(self.cost)
            self.optimizer = tf.train.AdamOptimizer(learning_rate=self.alpha).minimize(self.cost)

            #without dropout
#             self.model = tf.matmul(tf.nn.relu(tf.matmul(self.x, self.wZero) + self.bZero),self.wOne)+self.bOne

    
            self.saver = tf.train.Saver()
            self.init = tf.global_variables_initializer()
            
        with tf.Session() as sess:
            if warm_start==False or self.warm==False:
                sess.run(self.init)
            else:
                self.saver.restore(sess, './tempVariables.ckpt')
            for epoch in range(self.epochs):
                self.avg_cost = 0
                for i in range(self.n_batch):
                    _, self.c = sess.run([self.optimizer,self.cost], feed_dict={self.x: X[i*self.beta:min([(i+1)*
                                    self.beta,len(X)]),:],self.y:Y[i*self.beta:min([(i+1)*self.beta,len(X)])]})
                    
                    self.avg_cost = self.avg_cost+np.mean(self.c)/self.n_batch
                print("Epoch:", '%s' % (epoch+1), "cost=", "%s"% (self.avg_cost))
            self.saver.save(sess,'./tempVariables.ckpt')
            
        self.warm = True
        
        return self
    
    def predict_proba(self,X):
        ''' return a matrix P where P[i,j] = P(Y[i,j]=1), 
        for all instances i, and labels j. '''

        with tf.Session() as sess:
            self.saver.restore(sess, './tempVariables.ckpt')
            self.preds = sess.run(tf.nn.softmax(self.model), feed_dict={self.x: X}) 
        return self.preds
    
    def predict(self,X):
        ''' return a matrix of predictions for X '''
        return (self.predict_proba(X) >= 0.5).astype(int)
    
    


In [None]:
net = Class_Net(learning_rate=0.01,batch_size=250)
net.fit(features,list(map(lambda x: [x],true_labels)),n_epochs=35)
preds=net.predict(test_features)
acc = metrics.accuracy_score(list(map(int,test_true_labels)), list(map(int,preds)))
f1 = metrics.f1_score(list(map(int,test_true_labels)), list(map(int,preds)))
print('acc:',acc,'f1:',f1)