In [1]:
import random
import numpy as np
from sklearn import svm
from sklearn.metrics.pairwise import linear_kernel
from sklearn import preprocessing
import nltk
import csv

In [2]:
# Make sure the NLTK stopword corpus is available locally before it is read.
nltk.download('stopwords')
# English stopwords held in a set; the text helpers in this notebook filter
# tokens with `not in stpwds`.
stpwds = set(nltk.corpus.stopwords.words("english"))

[nltk_data] Downloading package stopwords to /home/asus/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
# Read the pairs to classify. Each line is parsed by the csv reader and its
# first field is split on single spaces, giving one list of fields per line.
with open("testing_set.txt", "r") as f:
    testing_set = [row[0].split(" ") for row in csv.reader(f)]

In [4]:
# Training pairs: the first csv field of every line, split on single spaces.
# The third field is later read as the integer class label.
with open("training_set.txt", "r") as f:
    training_set = [row[0].split(" ") for row in csv.reader(f)]

# Node records: one csv row per node, kept as a list of string fields.
with open("node_information.csv", "r") as f:
    node_info = list(csv.reader(f))

# First field of every node record, in file order.
IDs = [record[0] for record in node_info]

In [5]:
# Randomly order the training pairs and keep a fraction of them.
# The fraction is 1, i.e. the whole training set in shuffled order; lower it
# (for example to 0.3) to subsample for faster experiments.
keep_fraction = 1

# A dedicated, seeded generator makes the selection reproducible across kernel
# restarts without altering the state of the global `random` module.
sampler = random.Random(42)
to_keep = sampler.sample(range(len(training_set)),
                         k=int(round(len(training_set) * keep_fraction)))
training_set_reduced = [training_set[i] for i in to_keep]

In [6]:
from sklearn.metrics.pairwise import cosine_similarity as cosine
def isselfcite(source_auth, target_auth):
    """Return 1 if the two author lists share at least one name, else 0.

    Names are compared after stripping surrounding whitespace, so the pieces
    of a comma-split string such as "A. Smith, B. Jones" match regardless of
    the space that follows each comma. Empty names (what splitting a missing
    author field produces) are ignored, so two papers with no author
    information are not reported as a self-citation.

    Parameters
    ----------
    source_auth, target_auth : iterable of str
        Author names of the citing and the cited paper.

    Returns
    -------
    int
        1 when a non-empty name occurs in both lists, 0 otherwise.
    """
    target_names = set(name.strip() for name in target_auth)
    target_names.discard("")  # a missing author is not a shared author
    return int(any(name.strip() in target_names for name in source_auth))

def issamejournal(source_journal, target_journal):
    """Return 1 if both papers name the same journal, else 0.

    The comparison is exact after stripping surrounding whitespace; callers
    are expected to normalise case beforehand. A missing journal (empty or
    whitespace-only string, or None) never counts as a match, so two papers
    without journal information are not reported as sharing a journal.

    Parameters
    ----------
    source_journal, target_journal : str
        Journal names of the citing and the cited paper.

    Returns
    -------
    int
        1 when both names are non-empty and equal, 0 otherwise.
    """
    source_name = (source_journal or "").strip()
    target_name = (target_journal or "").strip()
    return int(bool(source_name) and source_name == target_name)
        
        
def cosine_similarity(s_1, s_2):
    """Cosine similarity of two texts under a binary bag-of-words model.

    Each text is split on single spaces and words whose lowercase form is in
    the module-level `stpwds` set are dropped. Every remaining distinct word
    is a 0/1 feature, so the cosine of the two vectors reduces to

        |A & B| / sqrt(|A| * |B|)

    where A and B are the sets of kept words. Working on the sets directly
    avoids building the vectors with repeated list-membership tests and
    returns a plain float instead of rounding a (1, 1) array.

    Parameters
    ----------
    s_1, s_2 : str
        The two texts to compare.

    Returns
    -------
    float
        Similarity rounded to 5 decimals; 0.0 when either text has no words
        left after stopword removal.
    """
    words_1 = set(word for word in s_1.split(" ") if word.lower() not in stpwds)
    words_2 = set(word for word in s_2.split(" ") if word.lower() not in stpwds)

    # A text with no remaining words is an all-zero vector: similarity is 0.
    if not words_1 or not words_2:
        return 0.0

    shared = len(words_1 & words_2)
    # The denominator is a float, so this is true division on Python 2 too.
    norm_product = (len(words_1) * len(words_2)) ** 0.5
    return round(shared / norm_product, 5)

In [8]:
# Accumulators for the six training features; the feature-extraction cell
# appends one value per training pair to each list.
#   overlap_title - number of overlapping words in title
#   temp_diff     - temporal distance between the papers
#   comm_auth     - number of common authors
overlap_title, temp_diff, comm_auth = [], [], []

#   self_cite     - is self citation
#   same_journal  - is published in same journal
#   cosine_sim    - cosine similarity
self_cite, same_journal, cosine_sim = [], [], []

In [None]:

from nltk.stem.porter import PorterStemmer

stemmer = PorterStemmer()

# Index the node records by ID once so that each lookup is a dictionary access
# instead of a scan over node_info. Iterating in reverse keeps the FIRST record
# for an ID should the file contain duplicates, like a front-to-back scan.
node_by_id = {record[0]: record for record in reversed(node_info)}

# NOTE: this cell appends to the feature lists; re-run the cell that creates
# them before re-running this one, or every example is added twice.
counter = 0
for pair in training_set_reduced:
    # An ID missing from node_info raises KeyError here.
    source_info = node_by_id[pair[0]]
    target_info = node_by_id[pair[1]]

    # Titles (field 2): lowercase, tokenize on whitespace, drop stopwords, stem.
    source_title = [stemmer.stem(token)
                    for token in source_info[2].lower().split()
                    if token not in stpwds]
    target_title = [stemmer.stem(token)
                    for token in target_info[2].lower().split()
                    if token not in stpwds]

    # Authors (field 3) are comma separated. Strip the whitespace around each
    # name and drop empty names so that a missing author field is not counted
    # as an author shared by both papers.
    source_auth = [name.strip() for name in source_info[3].split(",") if name.strip()]
    target_auth = [name.strip() for name in target_info[3].split(",") if name.strip()]

    source_journal = source_info[4].lower()
    target_journal = target_info[4].lower()

    source_abstract = source_info[5].lower()
    target_abstract = target_info[5].lower()

    overlap_title.append(len(set(source_title).intersection(target_title)))
    # Field 1 is parsed as an integer; presumably the publication year.
    temp_diff.append(int(source_info[1]) - int(target_info[1]))
    comm_auth.append(len(set(source_auth).intersection(target_auth)))
    self_cite.append(isselfcite(source_auth, target_auth))
    same_journal.append(issamejournal(source_journal, target_journal))
    cosine_sim.append(cosine_similarity(source_abstract, target_abstract))

    counter += 1
    # Report progress after every 1000 examples.
    if counter % 1000 == 0:
        print("%d training examples processsed" % counter)

In [None]:
# Build the design matrix: one row per training example, one column per
# feature, in the order listed below.
training_features = np.column_stack(
    (overlap_title, temp_diff, comm_auth, cosine_sim, same_journal, self_cite))

# Standardise every column with preprocessing.scale.
training_features = preprocessing.scale(training_features)


In [None]:
# Sanity check: display the distinct values collected in same_journal.
np.unique(same_journal)

In [None]:
# The third field of every training pair is its class label; parse the labels
# as integers and keep them both as a list and as a NumPy array.
labels = list(map(int, (pair[2] for pair in training_set_reduced)))
labels_array = np.array(labels)

# Baseline classifier: SVM with an RBF kernel and C = 0.1.
clf = svm.SVC(kernel='rbf', C=0.1)



In [None]:
# train_test_split lives in sklearn.model_selection; the old
# sklearn.cross_validation module only exists in early scikit-learn releases,
# so fall back to it only when the current location is unavailable.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
from sklearn.metrics import accuracy_score

# Hold out 20% of the training examples for validation; the fixed random_state
# makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    training_features, labels_array, test_size=0.2, random_state=42)

clf.fit(X_train, y_train)
labels_predicted = clf.predict(X_test)

# Accuracy on the held-out split (displayed as the cell output).
accuracy_score(y_test, labels_predicted)


In [None]:
# Grid search on the value of C for the RBF-kernel SVM.
# GridSearchCV lives in sklearn.model_selection; the old sklearn.grid_search
# module only exists in early scikit-learn releases.
try:
    from sklearn.model_selection import GridSearchCV
except ImportError:
    from sklearn.grid_search import GridSearchCV

tuned_parameters = [{'kernel': ['rbf'],
                     'C': [0.01, 0.1, 1, 10, 100, 1000]}]
# 5-fold cross-validation over the grid. Note that `clf` now refers to the
# fitted search object rather than the single SVC.
clf = GridSearchCV(svm.SVC(C=1), tuned_parameters, cv=5)

clf.fit(X_train, y_train)

print("Best parameters set found on development set:")
print(clf.best_params_)
print("Grid scores on training set:")

# Collect (params, mean score, std of the fold scores) for every candidate.
# cv_results_ is the current results attribute; grid_scores_ is only present
# on early releases that lack it.
if hasattr(clf, "cv_results_"):
    score_rows = zip(clf.cv_results_["params"],
                     clf.cv_results_["mean_test_score"],
                     clf.cv_results_["std_test_score"])
else:
    score_rows = [(params, mean_score, scores.std())
                  for params, mean_score, scores in clf.grid_scores_]

for params, mean_score, std_score in score_rows:
    print("%0.3f (+/-%0.03f) for %r"
          % (mean_score, std_score * 2, params))

In [None]:
# Accumulators for the six test features; the test feature-extraction cell
# appends one value per test pair to each list.
#   overlap_title_test - number of overlapping words in title
#   temp_diff_test     - temporal distance between the papers
#   comm_auth_test     - number of common authors
overlap_title_test, temp_diff_test, comm_auth_test = [], [], []

#   self_cite_test     - is self citation
#   same_journal_test  - is published in same journal
#   cosine_sim_test    - cosine similarity
self_cite_test, same_journal_test, cosine_sim_test = [], [], []

In [None]:
# Index the node records by ID once so that each lookup is a dictionary access
# instead of a scan over node_info. Iterating in reverse keeps the FIRST record
# for an ID should the file contain duplicates, like a front-to-back scan.
node_by_id = {record[0]: record for record in reversed(node_info)}

# NOTE: this cell appends to the *_test feature lists; re-run the cell that
# creates them before re-running this one, or every example is added twice.
# It also uses the `stemmer` created in the training feature-extraction cell.
counter = 0
for pair in testing_set:
    # An ID missing from node_info raises KeyError here.
    source_info = node_by_id[pair[0]]
    target_info = node_by_id[pair[1]]

    # Titles (field 2): lowercase, tokenize on whitespace, drop stopwords, stem.
    source_title = [stemmer.stem(token)
                    for token in source_info[2].lower().split()
                    if token not in stpwds]
    target_title = [stemmer.stem(token)
                    for token in target_info[2].lower().split()
                    if token not in stpwds]

    # Authors (field 3) are comma separated. Strip the whitespace around each
    # name and drop empty names so that a missing author field is not counted
    # as an author shared by both papers.
    source_auth = [name.strip() for name in source_info[3].split(",") if name.strip()]
    target_auth = [name.strip() for name in target_info[3].split(",") if name.strip()]

    source_journal = source_info[4].lower()
    target_journal = target_info[4].lower()

    source_abstract = source_info[5].lower()
    target_abstract = target_info[5].lower()

    overlap_title_test.append(len(set(source_title).intersection(target_title)))
    # Field 1 is parsed as an integer; presumably the publication year.
    temp_diff_test.append(int(source_info[1]) - int(target_info[1]))
    comm_auth_test.append(len(set(source_auth).intersection(target_auth)))
    self_cite_test.append(isselfcite(source_auth, target_auth))
    same_journal_test.append(issamejournal(source_journal, target_journal))
    cosine_sim_test.append(cosine_similarity(source_abstract, target_abstract))

    counter += 1
    # Report progress after every 1000 examples.
    if counter % 1000 == 0:
        print("%d test examples processsed" % counter)

In [None]:
# Test design matrix: one row per test pair, columns in the same order as the
# training matrix.
testing_features = np.array([overlap_title_test, temp_diff_test, comm_auth_test,
                             cosine_sim_test, same_journal_test, self_cite_test],
                            dtype=float).T

# Scale with the TRAINING statistics. Standardising the test set with its own
# mean and standard deviation would put it on a different scale from the data
# the classifier was fitted on. The unscaled training matrix is rebuilt from
# the training feature lists because training_features has already been
# overwritten with its scaled version.
training_features_raw = np.array([overlap_title, temp_diff, comm_auth,
                                  cosine_sim, same_journal, self_cite],
                                 dtype=float).T
feature_scaler = preprocessing.StandardScaler().fit(training_features_raw)
testing_features = feature_scaler.transform(testing_features)
