In [43]:
import io
import scipy
import numpy as np
import pandas as pd
import os
import sys
from sklearn.linear_model import LogisticRegression
import re
import random
from sklearn.preprocessing import normalize
import networkx as nx
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

In [2]:
from keras.models import Sequential
from keras.layers import Dense, Activation, Flatten
from keras.optimizers import SGD, Adam
from keras.utils import np_utils
from keras.callbacks import EarlyStopping
from keras import optimizers

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


## NODES INFO INDEX

In [3]:
#NODES INFO INDEX
#1: YEAR
#2: TITLE
#3: AUTHORS
#4: DOMAIN
#5: ABSTRACT

## UTILS

In [3]:
# Directory holding external resources: the `data/` folder under the current working directory.
# In the cells visible here only the word-vector file is read from it; the other inputs use bare file names.
PATH_TO_DATA = os.getcwd()+'/data/'

In [4]:
#get the files path
# bare file names, i.e. they are resolved against the current working directory when opened
train_file = 'training_set.txt'
test_file = 'testing_set.txt'
nodes_info_file = 'node_information.csv'

In [5]:
#load the stopwords file
def loadStopwords(path):
    """Read a stopword list from a text file holding one stopword per line.

    Parameters
    ----------
    path : str
        Path of the stopwords file.

    Returns
    -------
    list of str
        The lines of the file, in file order, with newline characters
        stripped from both ends.
    """
    # the context manager closes the handle even when reading fails
    with open(path, 'r') as stopwords_file:
        return [word.strip('\n') for word in stopwords_file]

In [6]:
# script to clean sentences (lower case, stopwords, short tokens) and authors
def clean_sentence(sentence, stopwords, auth=False):
    """Tokenise a text field and drop stopwords and very short tokens.

    Parameters
    ----------
    sentence : object
        Text to clean. It is converted with ``str`` first, so a missing
        value (NaN) becomes the single token ``'nan'``.
    stopwords : container of str
        Tokens to discard.
    auth : bool, optional
        ``False`` (default): split on whitespace (titles, abstracts).
        Anything else: split on commas (author lists).

    Returns
    -------
    list of str
        Lower-cased tokens that are not stopwords and are longer than
        two characters.
    """
    text = str(sentence).lower()
    if auth is False:
        tokens = text.split()
    else:
        # names are comma separated; strip the blanks around each name so that
        # "a, b" and "b, a" produce the same tokens
        tokens = [name.strip() for name in text.split(',')]

    return [tok for tok in tokens if tok not in stopwords and len(tok) > 2]

In [7]:
#scripts to convert probability predictions to bool predictions
def raw_to_label(raw_preds):
    """Turn per-sample score rows into integer class labels.

    For every position ``k`` of ``raw_preds`` the label is the index of the
    largest value of ``raw_preds[k]``.

    Returns
    -------
    numpy.ndarray
        One integer label per entry of ``raw_preds``.
    """
    labels = [int(np.argmax(raw_preds[k])) for k in range(len(raw_preds))]
    return np.asarray(labels)

In [8]:
#convert a domain to a dummy variable
def map_field(node_file='node_information.csv'):
    """Build a mapping from each distinct domain value to an integer code.

    Parameters
    ----------
    node_file : str, optional
        Header-less CSV of node information whose first column is the node
        id; the domain is read from the column labelled 4. Defaults to the
        file name that was previously hard-coded.

    Returns
    -------
    dict
        ``{domain value: code}`` with codes ``0..n-1``.
    """
    #initialize the domain dictionary
    nodes = pd.read_csv(node_file, header = None, index_col = 0)
    # sort before numbering: set iteration order is not stable between kernel
    # sessions, which made the codes irreproducible. key=str keeps the sort
    # valid when missing values (float NaN) sit next to strings.
    fields = sorted(set(nodes[4]), key=str)
    return {field: code for code, field in enumerate(fields)}

## NLP Analysis

In [9]:
#word2vec class to proceed to NLP analysis
class Word2vec():
    """In-memory table of pretrained word vectors read from a text file.

    The file is expected to start with one header line (skipped), followed
    by one ``word v1 v2 ...`` line per word, space separated.

    Attributes
    ----------
    word2vec : dict
        word -> 1-D numpy vector.
    word2id : dict
        word -> 0-based rank of the line the word was read from.
    id2word : dict
        Inverse of ``word2id``.
    embeddings : numpy.ndarray
        All loaded vectors stacked row-wise, in insertion order of ``word2vec``.
    """
    def __init__(self, fname, nmax=100000):
        self.word2id = {}
        self.load_wordvec(fname, nmax)
        self.id2word = {v: k for k, v in self.word2id.items()}
        # a dict view is not a sequence in Python 3: np.array(view) would give a
        # 0-d object array, so materialise the values first
        self.embeddings = np.array(list(self.word2vec.values()))

    def load_wordvec(self, fname, nmax):
        """Fill ``word2vec`` / ``word2id`` with at most ``nmax`` lines of ``fname``."""
        self.word2vec = {}
        with io.open(fname, encoding='utf-8') as f:
            next(f)  # skip the header line
            for i, line in enumerate(f):
                word, vec = line.split(' ', 1)
                self.word2vec[word] = np.fromstring(vec, sep=' ')
                self.word2id[word] = i
                if i == (nmax - 1):
                    break
        print('Loaded %s pretrained word vectors' % (len(self.word2vec)))

    def most_similar(self, w, K=5):
        """Return the ``K`` vocabulary words closest to ``w`` (cosine similarity).

        The list is ordered from most to least similar and includes ``w``
        itself. When ``w`` is not in the vocabulary the historical message
        string is returned instead of a list.
        """
        if w not in self.word2vec:
            return 'KeyError - Target word is out of vocabulary'
        words = list(self.word2vec.keys())
        scores = np.array([self.score(w, other) for other in words])
        top_scores = np.argsort(scores)[-K:]
        return [words[i] for i in top_scores][::-1]

    def score(self, w1, w2):
        """Cosine similarity of two words; 0 when either word is unknown."""
        try:
            v1, v2 = self.word2vec[w1], self.word2vec[w2]
        except KeyError:
            return 0
        return np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))

In [10]:
#Bag of vectors class to process the nlp analysis of the sentences
class BoV():
    """Bag-of-vectors sentence encoder: a sentence is the mean of its word vectors.

    Parameters
    ----------
    w2v : object
        Must expose a ``word2vec`` dict mapping word -> 1-D numpy vector.
    """
    # value of every coordinate of the placeholder vector used for unknown words
    OOV_VALUE = 0.001
    # dimension used when the vocabulary is empty (the size that used to be hard-coded)
    DEFAULT_DIM = 300

    def __init__(self, w2v):
        self.w2v = w2v

    def _embedding_dim(self):
        # infer the dimension from the loaded vectors instead of assuming 300
        vectors = self.w2v.word2vec
        if vectors:
            return len(next(iter(vectors.values())))
        return self.DEFAULT_DIM

    def encode(self, sentences):
        """Encode an iterable of strings as a ``(n_sentences, dim)`` array.

        Each sentence is split on whitespace; unknown words contribute a
        constant placeholder vector. A sentence without any token is encoded
        as that placeholder (it used to fail the shape assertion).
        """
        dim = self._embedding_dim()
        unknown = np.ones(dim,) * self.OOV_VALUE
        lookup = self.w2v.word2vec
        sentemb = []

        for sent in sentences:
            word_vecs = [lookup.get(w, unknown) for w in sent.split()]
            if not word_vecs:
                word_vecs = [unknown]
            sent_mean = np.mean(word_vecs, axis=0)
            assert sent_mean.shape == (dim,)
            sentemb.append(sent_mean)

        return np.vstack(sentemb)

    def get_simil(self, s, sentences):
        """Cosine similarity between ``s`` and every entry of ``sentences``.

        ``sentences`` must expose ``.values.tolist()`` (e.g. a pandas Series)
        and contain ``s``; a ``ValueError`` is raised otherwise. Returns a
        1-D array with one score per sentence, in the order of ``sentences``.
        """
        sentemb = self.encode(sentences)
        idx = sentences.values.tolist().index(s)
        keys = sentemb / np.linalg.norm(sentemb, 2, 1)[:, None]  # normalize embeddings
        return keys[idx].dot(keys.T)  # dot-product of normalized vector = cosine similarity

    def score(self, s1, s2, idf=False):
        """Cosine similarity between the embeddings of two sentences.

        ``idf`` is accepted for interface compatibility and is not used.
        Returns 0 when the dot product is not a ``float`` instance (guard
        kept from the original implementation).
        """
        src_vec, trg_vec = self.encode([s1, s2])
        src_key = src_vec / np.linalg.norm(src_vec)
        trg_key = trg_vec / np.linalg.norm(trg_vec)
        similarity = src_key.dot(trg_key)  # dot-product of normalized vector = cosine similarity

        if not isinstance(similarity, float):
            return 0
        return similarity

In [11]:
#instantiate the base of the NLP analysis
# nmax is only an upper bound on the number of vectors read from the file
w2v = Word2vec(os.path.join(PATH_TO_DATA, 'crawl-300d-200k.vec'), nmax=300000)
s2v = BoV(w2v)

# node table indexed by its first column; columns 2 and 5 are the title and the abstract
# (see the NODES INFO INDEX cell)
nodes = pd.read_csv('node_information.csv', header = None, index_col = 0)
titles = nodes[2]
abstracts = nodes[5]

Loaded 200000 pretrained word vectors


## GRAPH ANALYSIS

In [12]:
#initialize graph
# undirected graph: an edge (a, b) is the same as (b, a)
G = nx.Graph()

In [13]:
#read the training pairs file (space separated integers; the graph cell reads columns 0 and 1 as the two node ids and column 2 as the label)
train_data = np.genfromtxt(train_file, delimiter = ' ', dtype = int)

In [14]:
#create the graph based on the edge list
# only the pairs labelled 1 become edges; nodes are created implicitly by add_edge
for pair in train_data:
    is_link = pair[2] == 1
    if is_link:
        G.add_edge(pair[0], pair[1])
print('Number of nodes:', len(G.nodes()))
print('Number of edges:', len(G.edges()))

Number of nodes: 27684
Number of edges: 334690


In [15]:
#check that the number of nodes in the node information file is close to the number of nodes in the recently created graph
# node table indexed by its first column (the node id)
nodes_features = pd.read_csv(nodes_info_file, header = None, index_col = 0)
print('Number of nodes in the original file:', len(nodes_features))

Number of nodes in the original file: 27770


In [16]:
## delta of 27770 - 27684 = 86 nodes: 86 nodes of the node information file take part in no positive training pair and are therefore absent from the graph

In [17]:
#compute the Graph degree dictionary (node -> degree), covering only the nodes present in G
degree_dict = dict(G.degree())

## FEATURES LIST

In [18]:
# Temporal distance feature between the two papers
def tmp_dist(tuple_i):
    """Difference between the years of the two papers of a candidate pair.

    Parameters
    ----------
    tuple_i : sequence
        ``tuple_i[0]`` is the source node id, ``tuple_i[1]`` the target node id.

    Returns
    -------
    Year of the source (column 1) minus year of the target.
    """
    source = tuple_i[0]
    target = tuple_i[1]
    # read the notebook-level node table: the previously used `LP` object is
    # not defined anywhere in this notebook and raised NameError on a fresh kernel
    source_features = nodes_features.loc[source]
    target_features = nodes_features.loc[target]
    return source_features[1] - target_features[1]

In [19]:
# BOOL feature if source is older than target
def older_than(tuple_i):
    """1 when the source paper has a strictly smaller year than the target, else 0.

    Parameters
    ----------
    tuple_i : sequence
        ``tuple_i[0]`` is the source node id, ``tuple_i[1]`` the target node id.
    """
    source = tuple_i[0]
    target = tuple_i[1]
    # read the notebook-level node table: the previously used `LP` object is
    # not defined anywhere in this notebook and raised NameError on a fresh kernel
    source_features = nodes_features.loc[source]
    target_features = nodes_features.loc[target]
    # multiplying by 1 turns the boolean into an integer feature
    return (source_features[1] < target_features[1])*1

In [20]:
# NBR of common words in the title after removing stopwords
def common_w_title(tuple_i):
    """Number of distinct cleaned tokens shared by the two titles (column 2).

    Parameters
    ----------
    tuple_i : sequence
        ``tuple_i[0]`` is the source node id, ``tuple_i[1]`` the target node id.
    """
    source = tuple_i[0]
    target = tuple_i[1]
    # read the notebook-level node table: the previously used `LP` object is
    # not defined anywhere in this notebook and raised NameError on a fresh kernel
    source_features = nodes_features.loc[source]
    target_features = nodes_features.loc[target]
    src_w = set(clean_sentence(source_features[2], stopwords))
    trg_w = set(clean_sentence(target_features[2], stopwords))
    return len(src_w & trg_w)

In [21]:
# NBR of common words in the abstract after removing stopwords
def common_w(tuple_i):
    """Number of distinct cleaned tokens shared by the two abstracts (column 5).

    Parameters
    ----------
    tuple_i : sequence
        ``tuple_i[0]`` is the source node id, ``tuple_i[1]`` the target node id.
    """
    source = tuple_i[0]
    target = tuple_i[1]
    # read the notebook-level node table: the previously used `LP` object is
    # not defined anywhere in this notebook and raised NameError on a fresh kernel
    source_features = nodes_features.loc[source]
    target_features = nodes_features.loc[target]
    src_w = set(clean_sentence(source_features[5], stopwords))
    trg_w = set(clean_sentence(target_features[5], stopwords))
    return len(src_w & trg_w)

In [22]:
#NBR common author (CAN BE IMPROVED, 'university' is caught as an author)
def common_auth(tuple_i):
    """Number of distinct cleaned author tokens shared by the two papers (column 3).

    Parameters
    ----------
    tuple_i : sequence
        ``tuple_i[0]`` is the source node id, ``tuple_i[1]`` the target node id.

    Returns
    -------
    int
        Size of the intersection; 0 when the only shared token is ``'nan'``.
    """
    source = tuple_i[0]
    target = tuple_i[1]
    # read the notebook-level node table: the previously used `LP` object is
    # not defined anywhere in this notebook and raised NameError on a fresh kernel
    source_features = nodes_features.loc[source]
    target_features = nodes_features.loc[target]
    src_auth = set(clean_sentence(source_features[3], stopwords, auth=True))
    trg_auth = set(clean_sentence(target_features[3], stopwords, auth=True))
    shared = src_auth & trg_auth
    # a missing author field stringifies to 'nan': two papers without authors
    # must not be counted as sharing one
    if shared == {'nan'}:
        return 0
    return len(shared)

In [23]:
#features to check if the two papers are part of a common domain, ex: physics
def common_dom(tuple_i):
    """1 when both papers have equal values in the domain column (column 4), else 0.

    Parameters
    ----------
    tuple_i : sequence
        ``tuple_i[0]`` is the source node id, ``tuple_i[1]`` the target node id.
    """
    source = tuple_i[0]
    target = tuple_i[1]
    # read the notebook-level node table: the previously used `LP` object is
    # not defined anywhere in this notebook and raised NameError on a fresh kernel
    source_features = nodes_features.loc[source]
    target_features = nodes_features.loc[target]
    src_dom = source_features[4]
    trg_dom = target_features[4]

    # multiplying by 1 turns the boolean into an integer feature
    return (trg_dom == src_dom)*1

In [24]:
#Word2vec similarity of the two titles
def title_score(tuple_i):
    """Doubly exponentiated bag-of-vectors similarity of the two titles (column 2).

    Parameters
    ----------
    tuple_i : sequence
        ``tuple_i[0]`` is the source node id, ``tuple_i[1]`` the target node id.

    Returns
    -------
    ``exp(exp(s))`` where ``s`` is ``s2v.score`` of the two titles.
    """
    source = tuple_i[0]
    target = tuple_i[1]
    # read the notebook-level node table: the previously used `LP` object is
    # not defined anywhere in this notebook and raised NameError on a fresh kernel
    source_features = nodes_features.loc[source]
    target_features = nodes_features.loc[target]
    src_title = source_features[2]
    trg_title = target_features[2]

    return np.exp(np.exp(s2v.score(src_title, trg_title)))

In [25]:
#Word2vec similarity of the two abstracts
def ab_score(tuple_i):
    """Doubly exponentiated bag-of-vectors similarity of the two abstracts (column 5).

    Parameters
    ----------
    tuple_i : sequence
        ``tuple_i[0]`` is the source node id, ``tuple_i[1]`` the target node id.

    Returns
    -------
    ``exp(exp(s))`` where ``s`` is ``s2v.score`` of the two abstracts.
    """
    source = tuple_i[0]
    target = tuple_i[1]
    # read the notebook-level node table: the previously used `LP` object is
    # not defined anywhere in this notebook and raised NameError on a fresh kernel
    source_features = nodes_features.loc[source]
    target_features = nodes_features.loc[target]
    src_ab = source_features[5]
    trg_ab = target_features[5]

    return np.exp(np.exp(s2v.score(src_ab, trg_ab)))

In [26]:
#First node degree centrality
def Adegree(tuple_i):
    """Degree of the first node of the pair in ``degree_dict``; 0 when the node has no entry."""
    return degree_dict.get(tuple_i[0], 0)

In [27]:
#Second node degree centrality
def Bdegree(tuple_i):
    """Degree of the second node of the pair in ``degree_dict``; 0 when the node has no entry."""
    return degree_dict.get(tuple_i[1], 0)

In [28]:
#Jaccard similarity of nodes
def jac_sim(tuple_i):
    """Jaccard coefficient of the pair in graph ``G``; 0 when either node is not in ``G``."""
    first, second = tuple_i[0], tuple_i[1]
    # guard clause: the coefficient is only requested for nodes that exist in the graph
    if first not in G.nodes or second not in G.nodes:
        return 0
    coefficients = nx.jaccard_coefficient(G, [tuple_i])
    # the generator yields (u, v, coefficient) triples; keep the first coefficient
    return next((coef for _, _, coef in coefficients), None)

## PREPROCESSING

In [None]:
#Preprocessing class to simplify the computation of a training/testing dataset
class Preprocessor:
    """Turn edge files into feature matrices.

    Parameters
    ----------
    part : number
        Percentage (0-100) of the training pairs to sample for training.
    seed : int, optional
        When given, the sub-sampling of training pairs uses a dedicated
        generator seeded with this value, so the same pairs are drawn on
        every run. When ``None`` (default) the global ``random`` module
        state is used, as before.

    Attributes
    ----------
    nodes_features : pandas.DataFrame
        Node table loaded by the last ``build_x_list`` call.
    """

    def __init__(self, part, seed=None):
        self.part = part #percentage of the training set on which we want to train the model
        self.seed = seed

    def train_preprocess(self, train_file, nodes_info_file, features_list):
        """Build ``(x_train, y_train)`` from a sample of the training pairs.

        ``x_train`` has one row per sampled pair and one column per function
        of ``features_list``; ``y_train`` is the one-hot encoding (2 classes)
        of the labels of the same pairs.
        """
        print('Building the train edges list...')
        edges_train, y_train = self.get_train_edges(train_file)
        n_keep = int((len(edges_train)*self.part/100))
        sampler = random if self.seed is None else random.Random(self.seed)
        to_keep = sampler.sample(range(len(edges_train)), k=n_keep)
        edges_train_sub = [edges_train[i] for i in to_keep]
        y_train_sub0 = [y_train[i] for i in to_keep]
        y_train_sub = np_utils.to_categorical(y_train_sub0, num_classes=2)
        print('Training on', self.part, '% of the the available data...')
        print('Training size:', len(edges_train_sub))
        print('Building x_train...')
        x_train_sub0 = self.build_x_list(edges_train_sub, nodes_info_file, features_list)
        x_train_sub = np.asarray(x_train_sub0)
        print('Preprocessing finished...')

        return x_train_sub, y_train_sub

    def test_preprocess(self, test_file, node_file=None, features=None):
        """Build the feature matrix of every pair listed in ``test_file``.

        Parameters
        ----------
        test_file : str
            Space separated file with one pair of node ids per line.
        node_file : str, optional
            Node information CSV. Defaults to the notebook-level
            ``nodes_info_file`` (the value this method always used before).
        features : list of callables, optional
            Feature functions. Defaults to the notebook-level ``features_list``.
        """
        if node_file is None:
            node_file = nodes_info_file
        if features is None:
            features = features_list
        print('Building the test edges list...')
        edges_test = self.get_test_edges(test_file)
        print('Building x_test...')
        x_test_sub0 = self.build_x_list(edges_test, node_file, features)
        return np.asarray(x_test_sub0)

    def build_x_list(self, tuples_list, nodes_info_file, features_list): #method for building the features of the tuples
        """Return one feature list per tuple, after (re)loading the node table into ``self.nodes_features``."""
        self.nodes_features = self.load_nodes_info(nodes_info_file)
        return [self.compute_tuple_features(tuple_i, features_list) for tuple_i in tuples_list]

    def compute_tuple_features(self, tuple_i, features_list): #method to get the features of a single tuple based on a list of features to compute
        """Apply every feature function to one tuple."""
        return self.map_funcs(tuple_i, features_list)

    def map_funcs(self, obj, func_list): #map a list of functions to an object and return the list of the results
        """Return ``[f(obj) for f in func_list]``."""
        return [func(obj) for func in func_list]

    def load_nodes_info(self, node_file): #load the features indexed by node (use .loc to get the features of a node)
        """Read the header-less node CSV, indexed by its first column."""
        return pd.read_csv(node_file, header = None, index_col = 0)

    def get_train_edges(self, train_file): #build the links_train and y_train
        """Return ``(pairs, labels)``: every column but the last as tuples, and the last column."""
        train_data = np.genfromtxt(train_file, delimiter = ' ', dtype = int)
        edges_tup = [tuple(row) for row in train_data[:,:-1]] #list of tuple (edges) to be predicted for training
        y_train = train_data[:,-1]

        return edges_tup, y_train

    def get_test_edges(self, test_file): #build the links_test
        """Return the rows of ``test_file`` as a list of tuples."""
        test_data = np.genfromtxt(test_file, delimiter = ' ', dtype = int)
        return [tuple(row) for row in test_data] #list of tuple (edges) to be predicted for testing

### /!\ Long computation time

In [35]:
#load stopwords
# `stopwords` is read as a global by the word/author overlap feature functions
stopwords = loadStopwords('stopwords.csv')

In [36]:
#define the features you want
# the order of this list is the column order of the feature matrices built by the Preprocessor
features_list = [older_than, tmp_dist, title_score, ab_score, common_w, common_w_title, common_auth, common_dom, Adegree, Bdegree, jac_sim]
print(len(features_list), 'features chosen !')

11 features chosen !


In [336]:
#instantiate the preprocessor
# 100 = use 100 % of the training pairs (the argument is a percentage)
P = Preprocessor(100)

In [338]:
#compute the training file
print('Preprocessing training...')
x_train, y_train = P.train_preprocess(train_file, nodes_info_file, features_list)
#save the training files to avoid recomputing all the training
print('Saving training files')
np.savetxt('train.csv', x_train, delimiter=',')
np.savetxt('y_train.csv', y_train, delimiter=',')
#preprocess test file (done once: the test features used to be computed twice in this cell)
print('Preprocessing testing...')
x_test = P.test_preprocess(test_file)
#save preprocessing to csv
print('Saving testing')
np.savetxt('test.csv', x_test, delimiter=',')

Preprocessing training...
Building the train edges list...
Training on 100 % of the the available data...
Training size: 615512
Building x_train...
Preprocessing finished...
Building the test edges list...
Building x_test...
Saving training files
Preprocessing testing...
Building the test edges list...
Building x_test...
Saving testing


## LOAD PREPROCESSED FILES

### Loading the preprocessed files to avoid recomputing all the training/testing file

In [32]:
#get saved training file name
# same file names as the ones written by the preprocessing cell
x_train_f = 'train.csv'
y_train_f = 'y_train.csv'
test_f = 'test.csv'

In [33]:
#load saved files
# these assignments replace any x_train / y_train computed earlier in the session;
# the test features are bound to `test` (not `x_test`)
x_train = np.loadtxt(x_train_f, delimiter=',')
y_train = np.loadtxt(y_train_f, delimiter=',')
test = np.loadtxt(test_f, delimiter =',')

In [34]:
#check loaded files size
# x_train and test must have the same number of columns (one per feature); y_train has one column per class
print('x train shape:', x_train.shape)
print('y _train shape:', y_train.shape)
print('test shape:', test.shape)

x train shape: (615512, 11)
y _train shape: (615512, 2)
test shape: (32648, 11)


## MODELLING

## DEEP LEARNING

In [47]:
# Fully connected network: two hidden layers of `nhid` units, one of 8 units, softmax over the 2 classes
model = Sequential()
nhid = 16
# take the input width from the data: `features_list` is not defined when the
# features are loaded from the saved CSV files without running the preprocessing section
model.add(Dense(nhid, input_dim = x_train.shape[1], activation = 'relu'))
model.add(Dense(nhid, activation = 'relu'))
model.add(Dense(8, activation = 'relu'))
model.add(Dense(2, activation = 'softmax'))
optim = Adam(lr=0.01)
model.compile(loss='categorical_crossentropy', optimizer=optim, metrics=['accuracy'])
# keep 10 % of the rows aside for validation
model.fit(x_train, y_train, validation_split = 0.1, epochs = 5)

Train on 553960 samples, validate on 61552 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7faa8dfdb4a8>

## LOGISTIC REGRESSION

In [36]:
#transform one hot encoded y_train to integer class labels
# vectorized argmax over the class axis instead of one Python-level call per row
y_train_h = np.argmax(y_train, axis=1)

In [60]:
# default-parameter estimator; `lr` is rebound in the next cell before being fitted
lr = LogisticRegression()

In [61]:
# logistic regression baseline; the printed value is the accuracy on the training data itself,
# not a held-out estimate
lr = LogisticRegression(C = 1)
lr.fit(x_train, y_train_h)
print(lr.score(x_train, y_train_h))

0.9573477040252668


## RANDOM FOREST

In [41]:
# 'sqrt' is what 'auto' meant for classifiers; 'auto' is rejected by recent scikit-learn releases
rf = RandomForestClassifier(max_depth=13, random_state=0, max_features = 'sqrt', n_estimators = 20) #15
rf.fit(x_train, y_train_h)
# importances follow the column order of x_train
print('Features importance:', rf.feature_importances_)
# accuracy on the data used for fitting, not a held-out estimate
print('Training set score:', rf.score(x_train, y_train_h))

Features importance: [9.73412712e-02 9.36310089e-02 5.59695782e-03 4.85462452e-03
 4.56978611e-02 3.12778102e-02 7.28961927e-03 3.81447973e-04
 4.00711411e-02 1.22992030e-01 5.50866228e-01]
Training set score: 0.9696577808393663


In [42]:
#estimate final leaderboard score with the cv validation score
# 2 random splits, each holding out 10 % of the rows; random_state fixes the splits
cv = ShuffleSplit(n_splits=2, test_size=0.1, random_state=0)
print('Cross Validation score')
cross_val_score(rf, x_train, y_train_h, cv=cv)

Cross Validation score


array([0.96614245, 0.96820574])

### Random Forest GridSearch

In [45]:
#run grid search to find the best parameters of the random forest
parameters = {'max_depth':[8,10,12,14], 'n_estimators':[10, 15, 20, 25], 'max_features':[11,8,6]}
# dedicated name for the template estimator: rebinding `rf` here replaced the
# fitted forest with an unfitted one, which the prediction cells then called
rf_grid = RandomForestClassifier()
gd_rf = GridSearchCV(rf_grid, parameters)
gd_rf.fit(x_train, y_train_h)

GridSearchCV(cv=None, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'max_features': [11, 8, 6], 'max_depth': [8, 10, 12, 14], 'n_estimators': [10, 15, 20, 25]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [None]:
#print best model found via gridsearch
# best_score_ is the mean cross-validated score of the best parameter combination
print('Best parameters found:', gd_rf.best_params_)
print('Cross Validation score:', gd_rf.best_score_)

In [None]:
#train best grid search model
# NOTE(review): the GridSearchCV repr above shows refit=True, so best_estimator_ should already
# be fitted on the full data; this explicit fit looks redundant - confirm before removing
gd_rf.best_estimator_.fit(x_train, y_train_h)
print('Features importance:', gd_rf.best_estimator_.feature_importances_)
print('Training set score:', gd_rf.best_estimator_.score(x_train, y_train_h))

## TEST PREDICTIONS

In [72]:
#compute prediction based on selected features
# uses whichever estimator is currently bound to `rf`; `test` is the feature matrix loaded from test.csv
y_predicts = rf.predict(test)

In [73]:
#check predictions 
# bare expression: the notebook displays the (abbreviated) array
y_predicts

array([0, 1, 1, ..., 0, 0, 1])

In [51]:
#convert raw predictions to label predictions
# rows of class scores (2-D, e.g. the softmax output of the network) need an argmax;
# predictions that are already labels (1-D) are kept as they are, since taking the
# argmax of each scalar would turn every prediction into 0
if np.ndim(y_predicts) > 1:
    y_preds = raw_to_label(y_predicts)
else:
    y_preds = np.asarray(y_predicts, dtype=int)

In [66]:
#check mean of predictions
# for 0/1 labels this is the fraction of pairs predicted as class 1
np.mean(y_preds)

0.5218390100465572

In [74]:
#convert prediction to dataframe in order to save them
# built from `y_predicts` (the estimator output), not from `y_preds`
df = pd.DataFrame(y_predicts)

In [75]:
#save DF to csv
# the row index is written as the `id` column and the predictions as the `category` column
df.to_csv('predictions.csv', index = True, header = ['category'], index_label = 'id')