In [1]:
import random
import numpy as np
from sklearn import svm
from sklearn.metrics.pairwise import linear_kernel
from sklearn import preprocessing
import nltk
import csv

In [2]:
nltk.download('stopwords')
stpwds = set(nltk.corpus.stopwords.words("english"))

[nltk_data] Error loading stopwords: <urlopen error [Errno -2] Name or
[nltk_data]     service not known>


In [3]:
with open("testing_set.txt", "r") as f:
    reader = csv.reader(f)
    testing_set  = list(reader)

testing_set = [element[0].split(" ") for element in testing_set]

In [4]:
with open("training_set.txt", "r") as f:
    reader = csv.reader(f)
    training_set  = list(reader)

training_set = [element[0].split(" ") for element in training_set]

with open("node_information.csv", "r") as f:
    reader = csv.reader(f)
    node_info  = list(reader)

IDs = [element[0] for element in node_info]

In [5]:
# randomly select 30% of training set
to_keep = random.sample(range(len(training_set)), k=int(round(len(training_set)*1)))
training_set_reduced = [training_set[i] for i in to_keep]

In [6]:
from sklearn.metrics.pairwise import cosine_similarity as cosine
def isselfcite(source_auth, target_auth): 
    selfcite = 0
    for sauth in source_auth: 
        if sauth in target_auth: 
            selfcite = 1
            break 
    return selfcite

def issamejournal(source_journal, target_journal): 
    
    if source_journal == target_journal: 
        same_journal = 1
    else:
        same_journal = 0
    return same_journal
        
        
def cosine_similarity(s_1, s_2): 
    #remove stopwords 
    s_1 = [word for word in s_1.split(" ") if word.lower() not in stpwds]
    s_2 = [word for word in s_2.split(" ") if word.lower() not in stpwds]
    
 
    features = list(set(s_1).union(set(s_2)))
    

    # project the two sentences in the BOW space
    p_1 = []
    p_2 = []
    for feature in features:
        if feature in s_1:
            p_1.append(1)
        else:
            p_1.append(0)
            
        if feature in s_2:
            p_2.append(1)
        else:
            p_2.append(0)
    p_1 = np.reshape(p_1,(1,-1)  )
    p_2 = np.reshape(p_2,(1,-1)  )
    
    return round(cosine(p_1,p_2), 5)

In [7]:
# in this baseline we will use three basic features:
# number of overlapping words in title
overlap_title = []

# temporal distance between the papers
temp_diff = []

# number of common authors
comm_auth = []

#is self citation
self_cite = []

#is published in same journal 
same_journal = []

#cosine  similarity 
cosine_sim = []

In [8]:
%%time
from nltk.stem.porter import *
stemmer = PorterStemmer()
counter = 0
for i in xrange(len(training_set_reduced)):
    source = training_set_reduced[i][0]
    target = training_set_reduced[i][1]
    
    index_source = IDs.index(source)
    index_target = IDs.index(target)
    
    source_info = [element for element in node_info if element[0]==source][0]
    target_info = [element for element in node_info if element[0]==target][0]
    
	# convert to lowercase and tokenize
    source_title = source_info[2].lower().split(" ")
	# remove stopwords
    source_title = [token for token in source_title if token not in stpwds]
    source_title = [stemmer.stem(token) for token in source_title]
    
    target_title = target_info[2].lower().split(" ")
    target_title = [token for token in target_title if token not in stpwds]
    target_title = [stemmer.stem(token) for token in target_title]
    
    source_auth = source_info[3].split(",")
    target_auth = target_info[3].split(",")
    
    source_journal = source_info[4].lower()
    target_journal = target_info[4].lower()
    
    source_abstract = source_info[5].lower()
    target_abstract = target_info[5].lower()
    
    
    overlap_title.append(len(set(source_title).intersection(set(target_title))))
    temp_diff.append(int(source_info[1]) - int(target_info[1]))
    comm_auth.append(len(set(source_auth).intersection(set(target_auth))))
    self_cite.append(isselfcite(source_auth,target_auth))
    same_journal.append(issamejournal(source_journal, target_journal))
    cosine_sim.append(cosine_similarity(source_abstract, target_abstract))
   
    counter += 1
    if counter % 1000 == True:
        print counter, "training examples processsed"

1 training examples processsed
1001 training examples processsed
2001 training examples processsed
3001 training examples processsed
4001 training examples processsed
5001 training examples processsed
6001 training examples processsed
7001 training examples processsed
8001 training examples processsed
9001 training examples processsed
10001 training examples processsed
11001 training examples processsed
12001 training examples processsed
13001 training examples processsed
14001 training examples processsed
15001 training examples processsed
16001 training examples processsed
17001 training examples processsed
18001 training examples processsed
19001 training examples processsed
20001 training examples processsed
21001 training examples processsed
22001 training examples processsed
23001 training examples processsed
24001 training examples processsed
25001 training examples processsed
26001 training examples processsed
27001 training examples processsed
28001 training examples processse

In [9]:
# convert list of lists into array
# documents as rows, unique words as columns (i.e., example as rows, features as columns)
training_features = np.array([overlap_title, temp_diff, comm_auth,cosine_sim,same_journal, self_cite]).T





In [None]:
# scale
training_features = preprocessing.scale(training_features)

In [35]:
# convert labels into integers then into column array
labels = [int(element[2]) for element in training_set_reduced]
labels = list(labels)
labels_array = np.array(labels)

from sklearn.ensemble import (RandomTreesEmbedding, RandomForestClassifier,
                              GradientBoostingClassifier)
# initialize basic SVM
clf = GradientBoostingClassifier(n_estimators = 300)



In [11]:
print training_features.shape
print len(labels)
print len(training_set_reduced )
print len(cosine_sim)

(615512, 6)
615512
615512
615512


In [None]:
%%time 
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(training_features, labels_array , test_size=0.2, random_state=42)

clf.fit(X_train, y_train)
labels_predicted = clf.predict(X_test )
from sklearn.metrics import accuracy_score
print accuracy_score(y_test, labels_predicted)


In [12]:
with open("graph_features_train.csv", "r") as f:
    reader = csv.reader(f)
    graph_features_train  = list(reader)
    
with open("graph_features_test.csv", "r") as f:
    reader = csv.reader(f)
    graph_features_test  = list(reader)

In [13]:
graph_features_train

[['',
  'From',
  'To',
  'Nb_common_neighbors',
  'Jaccard_coef',
  'In_link_diff',
  'Betweeness_diff'],
 ['0', '9510123', '9502114', '1', '0.0588235294118', '5', '2.40649393823e-05'],
 ['1',
  '9707075',
  '9604178',
  '20',
  '0.0970873786408',
  '113',
  '0.000181630337407'],
 ['2', '9312155', '9506142', '0', '0.0', '1', '8.34229951197e-06'],
 ['3', '9911255', '302165', '0', '0.0', '-2', '-1.74913338434e-06'],
 ['4', '9701033', '209076', '0', '0.0', '-5', '-2.68242283131e-06'],
 ['5', '9710020', '9709228', '14', '0.237288135593', '7', '7.93893528878e-05'],
 ['6',
  '9901042',
  '9510135',
  '12',
  '0.0152284263959',
  '690',
  '0.00860253832733'],
 ['7', '209146', '9502077', '0', '0.0', '63', '0.00014522923181'],
 ['8', '9705079', '9702201', '5', '0.0367647058824', '68', '0.0002719434169'],
 ['9', '3016', '9207067', '0', '0.0', '27', '0.000298530676992'],
 ['10', '9402099', '105041', '0', '0.0', '-1', '2.16236817917e-05'],
 ['11', '9705061', '9503216', '0', '0.0', '-5', '-1.58224

In [14]:
graph_features_train = graph_features_train[1:]
graph_features_test = graph_features_test[1:]


In [15]:
graph_features_train

[['0', '9510123', '9502114', '1', '0.0588235294118', '5', '2.40649393823e-05'],
 ['1',
  '9707075',
  '9604178',
  '20',
  '0.0970873786408',
  '113',
  '0.000181630337407'],
 ['2', '9312155', '9506142', '0', '0.0', '1', '8.34229951197e-06'],
 ['3', '9911255', '302165', '0', '0.0', '-2', '-1.74913338434e-06'],
 ['4', '9701033', '209076', '0', '0.0', '-5', '-2.68242283131e-06'],
 ['5', '9710020', '9709228', '14', '0.237288135593', '7', '7.93893528878e-05'],
 ['6',
  '9901042',
  '9510135',
  '12',
  '0.0152284263959',
  '690',
  '0.00860253832733'],
 ['7', '209146', '9502077', '0', '0.0', '63', '0.00014522923181'],
 ['8', '9705079', '9702201', '5', '0.0367647058824', '68', '0.0002719434169'],
 ['9', '3016', '9207067', '0', '0.0', '27', '0.000298530676992'],
 ['10', '9402099', '105041', '0', '0.0', '-1', '2.16236817917e-05'],
 ['11', '9705061', '9503216', '0', '0.0', '-5', '-1.58224478422e-05'],
 ['12', '109090', '9107', '0', '0.0', '2', '0.0001278540782'],
 ['13', '107016', '9304156', '

In [16]:
graph_features_train_reduced = [graph_features_train[i] for i in to_keep]
print len(graph_features_train_reduced)
gftr = np.array(graph_features_train_reduced)
print gftr.shape
gftr = gftr[:,3:]
print gftr.shape

615512
(615512, 7)
(615512, 4)


In [17]:
gftr

array([['0', '0.0', '-3', '-6.37877261858e-05'],
       ['0', '0.0', '-30', '-1.14208676719e-05'],
       ['5', '0.263157894737', '-6', '-1.96434581044e-05'],
       ..., 
       ['4', '0.0645161290323', '24', '-1.49412893527e-05'],
       ['9', '0.0671641791045', '62', '0.000110412999515'],
       ['0', '0.0', '0', '1.04656681059e-06']], 
      dtype='|S18')

In [18]:
# keeping only nb_common_neighbours Jaccard coef in_link_diff 
gftr_3 = gftr[:,:3]

In [19]:
gftr_3.shape

(615512, 3)

In [20]:
# scale
gftr_3 = preprocessing.scale(gftr_3)



In [21]:
gftr_3


array([[-0.55959594, -0.64645644, -0.27048564],
       [-0.55959594, -0.64645644, -0.38642282],
       [-0.11065778,  2.26508371, -0.28336755],
       ..., 
       [-0.20044541,  0.0673405 , -0.15454847],
       [ 0.24849275,  0.09663814,  0.00862237],
       [-0.55959594, -0.64645644, -0.25760374]])

In [22]:
new_training_features = np.concatenate((training_features, gftr_3), axis = 1)
print new_training_features.shape

(615512, 9)


In [36]:
%%time 
# initialize basic SVM

from sklearn.ensemble import (RandomTreesEmbedding, RandomForestClassifier,
                              GradientBoostingClassifier)
# initialize basic SVM
clf = GradientBoostingClassifier(n_estimators = 300)
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(new_training_features, labels_array , test_size=0.1, random_state=42)

clf.fit(X_train, y_train)
labels_predicted = clf.predict(X_test )
from sklearn.metrics import accuracy_score
print accuracy_score(y_test, labels_predicted)

0.967685859111
CPU times: user 4min 45s, sys: 860 ms, total: 4min 46s
Wall time: 4min 46s


In [None]:
from sklearn.grid_search import GridSearchCV
tuned_parameters = [{'n_estimators': [300, 400,500],
                      'max_depth': [3,4,6]}
                    ]
gs= GridSearchCV(GradientBoostingClassifier(), tuned_parameters, cv = 5)
gs.fit(new_training_features, labels_array )

In [24]:
gftr_test = np.array(graph_features_test) 
gftr_test_3 = gftr_test[:,:3]
gftr_test_3  = preprocessing.scale(gftr_test_3 )



In [25]:
gftr_test_3.shape

(32648, 3)

In [27]:
#transforming test features 
# number of overlapping words in title
overlap_title_test = []

# temporal distance between the papers
temp_diff_test = []

# number of common authors
comm_auth_test = []

#is self citation
self_cite_test = []

#is published in same journal 
same_journal_test = []

#cosine  similarity 
cosine_sim_test = []

In [28]:
counter = 0
for i in xrange(len(testing_set)):
    source = testing_set[i][0]
    target = testing_set[i][1]
    
    index_source = IDs.index(source)
    index_target = IDs.index(target)
    
    source_info = [element for element in node_info if element[0]==source][0]
    target_info = [element for element in node_info if element[0]==target][0]
    
	# convert to lowercase and tokenize
    source_title = source_info[2].lower().split(" ")
	# remove stopwords
    source_title = [token for token in source_title if token not in stpwds]
    source_title = [stemmer.stem(token) for token in source_title]
    
    target_title = target_info[2].lower().split(" ")
    target_title = [token for token in target_title if token not in stpwds]
    target_title = [stemmer.stem(token) for token in target_title]
    
    source_auth = source_info[3].split(",")
    target_auth = target_info[3].split(",")
    
    source_journal = source_info[4].lower()
    target_journal = target_info[4].lower()
    
    source_abstract = source_info[5].lower()
    target_abstract = target_info[5].lower()
    
    
    overlap_title_test.append(len(set(source_title).intersection(set(target_title))))
    temp_diff_test.append(int(source_info[1]) - int(target_info[1]))
    comm_auth_test.append(len(set(source_auth).intersection(set(target_auth))))
    self_cite_test.append(isselfcite(source_auth,target_auth))
    same_journal_test.append(issamejournal(source_journal, target_journal))
    cosine_sim_test.append(cosine_similarity(source_abstract, target_abstract))
   
    counter += 1
    if counter % 1000 == True:
        print counter, "test examples processsed"

1 test examples processsed
1001 test examples processsed
2001 test examples processsed
3001 test examples processsed
4001 test examples processsed
5001 test examples processsed
6001 test examples processsed
7001 test examples processsed
8001 test examples processsed
9001 test examples processsed
10001 test examples processsed
11001 test examples processsed
12001 test examples processsed
13001 test examples processsed
14001 test examples processsed
15001 test examples processsed
16001 test examples processsed
17001 test examples processsed
18001 test examples processsed
19001 test examples processsed
20001 test examples processsed
21001 test examples processsed
22001 test examples processsed
23001 test examples processsed
24001 test examples processsed
25001 test examples processsed
26001 test examples processsed
27001 test examples processsed
28001 test examples processsed
29001 test examples processsed
30001 test examples processsed
31001 test examples processsed
32001 test examples p

In [29]:
testing_features = np.array([overlap_title_test, temp_diff_test, comm_auth_test,cosine_sim_test,same_journal_test, self_cite_test]).T

# scale
testing_features = preprocessing.scale(testing_features)


In [30]:
new_testing_features = np.concatenate((testing_features, gftr_test_3), axis = 1)
print new_testing_features.shape

(32648, 9)


In [37]:
%%time
new_labels_predicted = clf.predict(new_testing_features)

CPU times: user 152 ms, sys: 0 ns, total: 152 ms
Wall time: 149 ms


In [38]:
new_labels_predicted .shape 

(32648,)

In [None]:
testing_features.shape

In [39]:
predictions_SVM = list(new_labels_predicted) 
predictions_SVM = zip(range(len(testing_set)), predictions_SVM)
with open("improved_predictions2230.csv","wb") as pred1:
    csv_out = csv.writer(pred1)
    csv_out.writerow(["ID", "category"])
    for row in predictions_SVM:
        csv_out.writerow(row)