In [1]:
import random
import numpy as np
from sklearn import svm
from sklearn.metrics.pairwise import linear_kernel
from sklearn import preprocessing
import nltk
import csv
from bs4 import BeautifulSoup

In [3]:
with open("testing_set.txt", "r") as f:
    reader = csv.reader(f)
    testing_set  = list(reader)

testing_set = [element[0].split(" ") for element in testing_set]
with open("training_set.txt", "r") as f:
    reader = csv.reader(f)
    training_set  = list(reader)

training_set = [element[0].split(" ") for element in training_set]

with open("node_information.csv", "r") as f:
    reader = csv.reader(f)
    node_info  = list(reader)

IDs = [element[0] for element in node_info]

In [8]:
#training word embeddings on the abstract of the node information 
abstracts = [element[5] for element in node_info ]
print("total nulber of abstracts: %d" %len(abstracts))

total nulber of abstracts: 27770


In [10]:
abstracts_w = [element.lower().split() for element in abstracts]

In [13]:
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',level=logging.INFO)

In [42]:
# Set values for various parameters
num_features = 200    # Word vector dimensionality                      
min_word_count = 40   # Minimum word count                        
num_workers = 4       # Number of threads to run in parallel
context = 10          # Context window size                                                                                    
downsampling = 1e-3   # Downsample setting for frequent words

In [43]:
%%time
from gensim.models import word2vec
print "Training model..."
model = word2vec.Word2Vec(abstracts_w, workers=num_workers, size=num_features, min_count = min_word_count, \
            window = context, sample = downsampling)


Training model...
CPU times: user 1min 3s, sys: 64 ms, total: 1min 3s
Wall time: 16.6 s


In [44]:
model.init_sims(replace=True)

# It can be helpful to create a meaningful model name and 
# save the model for later use. You can load it later using Word2Vec.load()
model_name = "300features_40minwords_10context"
model.save(model_name)

In [45]:
def makeFeatureVec(words, model, num_features):
    # Function to average all of the word vectors in a given
    # paragraph
    #
    # Pre-initialize an empty numpy array (for speed)
    featureVec = np.zeros((num_features,),dtype="float32")
    #
    nwords = 0.
    # 
    # Index2word is a list that contains the names of the words in 
    # the model's vocabulary. Convert it to a set, for speed 
    index2word_set = set(model.index2word)
    #
    # Loop over each word in the review and, if it is in the model's
    # vocaublary, add its feature vector to the total
    for word in words:
        if word in index2word_set: 
            nwords = nwords + 1.
            featureVec = np.add(featureVec,model[word])
    # 
    # Divide the result by the number of words to get the average
    featureVec = np.divide(featureVec,nwords)
    return featureVec


def getAvgFeatureVecs(reviews, model, num_features):
    # Given a set of reviews (each one a list of words), calculate 
    # the average feature vector for each one and return a 2D numpy array 
    # 
    # Initialize a counter
    counter = 0.
    # 
    # Preallocate a 2D numpy array, for speed
    reviewFeatureVecs = np.zeros((len(reviews),num_features),dtype="float32")
    # 
    # Loop through the reviews
    for review in reviews:
        if counter%1000. ==0.:
            print "Review %d of %d" % (counter, len(reviews))
        reviewFeatureVecs[counter] = makeFeatureVec(review, model, num_features)
        counter = counter + 1.
       
        
    return reviewFeatureVecs

In [46]:
#create word list for each abstract without stop words
nltk.download('stopwords')
stpwds = set(nltk.corpus.stopwords.words("english"))
abstracts_stp =  [[word for word in element.split(" ") if word.lower() not in stpwds] for element in abstracts ]

[nltk_data] Downloading package stopwords to /home/asus/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [47]:
DataVecs = getAvgFeatureVecs( abstracts_stp, model, num_features )

Review 0 of 27770
Review 1000 of 27770
Review 2000 of 27770
Review 3000 of 27770
Review 4000 of 27770
Review 5000 of 27770
Review 6000 of 27770
Review 7000 of 27770
Review 8000 of 27770
Review 9000 of 27770
Review 10000 of 27770
Review 11000 of 27770
Review 12000 of 27770
Review 13000 of 27770
Review 14000 of 27770
Review 15000 of 27770
Review 16000 of 27770
Review 17000 of 27770
Review 18000 of 27770
Review 19000 of 27770
Review 20000 of 27770
Review 21000 of 27770
Review 22000 of 27770
Review 23000 of 27770
Review 24000 of 27770
Review 25000 of 27770
Review 26000 of 27770
Review 27000 of 27770




In [48]:
DataVecs.shape

(27770, 200)

In [49]:
from sklearn.metrics.pairwise import cosine_similarity as cosine
s_1 = DataVecs[0]
s_2 = DataVecs[1]
print round(cosine(s_1,s_2), 5)

0.54666




In [50]:
from sklearn.metrics.pairwise import cosine_similarity as cosine
def isselfcite(source_auth, target_auth): 
    selfcite = 0
    for sauth in source_auth: 
        if sauth in target_auth: 
            selfcite = 1
            break 
    return selfcite

def issamejournal(source_journal, target_journal): 
    
    if source_journal == target_journal: 
        same_journal = 1
    else:
        same_journal = 0
    return same_journal
        
        
def cosine_similarity(s_1, s_2): 
    #remove stopwords 
    s_1 = np.reshape(s_1,(1,-1)  )
    s_2 = np.reshape(s_2,(1,-1)  )
    return round(cosine(s_1,s_2), 5)

In [51]:
# in this baseline we will use three basic features:
# number of overlapping words in title
overlap_title = []

# temporal distance between the papers
temp_diff = []

# number of common authors
comm_auth = []

#is self citation
self_cite = []

#is published in same journal 
same_journal = []

#cosine  similarity 
cosine_sim = []

In [52]:
%%time
from nltk.stem.porter import *
stemmer = PorterStemmer()
counter = 0
for i in xrange(len(training_set)):
    source = training_set[i][0]
    target = training_set[i][1]
    
    index_source = IDs.index(source)
    index_target = IDs.index(target)
    
    source_info = [element for element in node_info if element[0]==source][0]
    target_info = [element for element in node_info if element[0]==target][0]
    
	# convert to lowercase and tokenize
    source_title = source_info[2].lower().split(" ")
	# remove stopwords
    source_title = [token for token in source_title if token not in stpwds]
    source_title = [stemmer.stem(token) for token in source_title]
    
    target_title = target_info[2].lower().split(" ")
    target_title = [token for token in target_title if token not in stpwds]
    target_title = [stemmer.stem(token) for token in target_title]
    
    source_auth = source_info[3].split(",")
    target_auth = target_info[3].split(",")
    
    source_journal = source_info[4].lower()
    target_journal = target_info[4].lower()
    
    source_abstract = DataVecs[index_source]
    target_abstract = DataVecs[index_target]
    
    
    overlap_title.append(len(set(source_title).intersection(set(target_title))))
    temp_diff.append(int(source_info[1]) - int(target_info[1]))
    comm_auth.append(len(set(source_auth).intersection(set(target_auth))))
    self_cite.append(isselfcite(source_auth,target_auth))
    same_journal.append(issamejournal(source_journal, target_journal))
    cosine_sim.append(cosine_similarity(source_abstract, target_abstract))
   
    counter += 1
    if counter % 1000 == True:
        print counter, "training examples processsed"

1 training examples processsed
1001 training examples processsed
2001 training examples processsed
3001 training examples processsed
4001 training examples processsed
5001 training examples processsed
6001 training examples processsed
7001 training examples processsed
8001 training examples processsed
9001 training examples processsed
10001 training examples processsed
11001 training examples processsed
12001 training examples processsed
13001 training examples processsed
14001 training examples processsed
15001 training examples processsed
16001 training examples processsed
17001 training examples processsed
18001 training examples processsed
19001 training examples processsed
20001 training examples processsed
21001 training examples processsed
22001 training examples processsed
23001 training examples processsed
24001 training examples processsed
25001 training examples processsed
26001 training examples processsed
27001 training examples processsed
28001 training examples processse

In [53]:
#transforming test features 
# number of overlapping words in title
overlap_title_test = []

# temporal distance between the papers
temp_diff_test = []

# number of common authors
comm_auth_test = []

#is self citation
self_cite_test = []

#is published in same journal 
same_journal_test = []

#cosine  similarity 
cosine_sim_test = []

In [54]:
counter = 0
for i in xrange(len(testing_set)):
    source = testing_set[i][0]
    target = testing_set[i][1]
    
    index_source = IDs.index(source)
    index_target = IDs.index(target)
    
    source_info = [element for element in node_info if element[0]==source][0]
    target_info = [element for element in node_info if element[0]==target][0]
    
	# convert to lowercase and tokenize
    source_title = source_info[2].lower().split(" ")
	# remove stopwords
    source_title = [token for token in source_title if token not in stpwds]
    source_title = [stemmer.stem(token) for token in source_title]
    
    target_title = target_info[2].lower().split(" ")
    target_title = [token for token in target_title if token not in stpwds]
    target_title = [stemmer.stem(token) for token in target_title]
    
    source_auth = source_info[3].split(",")
    target_auth = target_info[3].split(",")
    
    source_journal = source_info[4].lower()
    target_journal = target_info[4].lower()
    
    source_abstract = DataVecs[index_source]
    target_abstract = DataVecs[index_target]
    
    
    overlap_title_test.append(len(set(source_title).intersection(set(target_title))))
    temp_diff_test.append(int(source_info[1]) - int(target_info[1]))
    comm_auth_test.append(len(set(source_auth).intersection(set(target_auth))))
    self_cite_test.append(isselfcite(source_auth,target_auth))
    same_journal_test.append(issamejournal(source_journal, target_journal))
    cosine_sim_test.append(cosine_similarity(source_abstract, target_abstract))
   
    counter += 1
    if counter % 1000 == True:
        print counter, "test examples processsed"

1 test examples processsed
1001 test examples processsed
2001 test examples processsed
3001 test examples processsed
4001 test examples processsed
5001 test examples processsed
6001 test examples processsed
7001 test examples processsed
8001 test examples processsed
9001 test examples processsed
10001 test examples processsed
11001 test examples processsed
12001 test examples processsed
13001 test examples processsed
14001 test examples processsed
15001 test examples processsed
16001 test examples processsed
17001 test examples processsed
18001 test examples processsed
19001 test examples processsed
20001 test examples processsed
21001 test examples processsed
22001 test examples processsed
23001 test examples processsed
24001 test examples processsed
25001 test examples processsed
26001 test examples processsed
27001 test examples processsed
28001 test examples processsed
29001 test examples processsed
30001 test examples processsed
31001 test examples processsed
32001 test examples p

In [55]:
testing_features = np.array([overlap_title_test, temp_diff_test, comm_auth_test,cosine_sim_test,same_journal_test, self_cite_test]).T

# scale
testing_features = preprocessing.scale(testing_features)

In [56]:
training_features = np.array([overlap_title, temp_diff, comm_auth,cosine_sim,same_journal, self_cite]).T
training_features = preprocessing.scale(training_features)

In [57]:
print training_features.shape 
print testing_features.shape 

(615512, 6)
(32648, 6)


In [59]:
np.savetxt('training_features_6.csv', training_features, delimiter=",")
np.savetxt('testing_features_6.csv', testing_features, delimiter=",")