In [1]:
import numpy as np
import pandas as pd
import csv
import matplotlib.pyplot as plt
from sklearn import svm
from sklearn.metrics.pairwise import linear_kernel
from sklearn import preprocessing
from sklearn.decomposition import PCA
from sklearn.metrics import euclidean_distances
from sklearn.metrics.pairwise import cosine_similarity as cosine
from sklearn.datasets import fetch_20newsgroups
from sklearn.metrics import classification_report
from sklearn.cross_validation import train_test_split
import nltk
from scipy import sparse
import time
import pdb


%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [2]:
# Download stop-words, stemmer
nltk.download('stopwords')
stpwds = set(nltk.corpus.stopwords.words("english"))
stemmer = nltk.stem.PorterStemmer()

[nltk_data] Downloading package stopwords to /Users/ariel/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
# Data directory
data_dir = '../../data/'

###First extract training and testing data

In [4]:
with open(data_dir+"testing_set.txt", "r") as f:
    reader = csv.reader(f)
    testing_set  = list(reader)

In [5]:
testing_set = [element[0].split(" ") for element in testing_set]
print "Size fo the testing set:", len(testing_set)

Size fo the testing set: 32648


In [6]:
with open(data_dir+"training_set.txt", "r") as f:
    reader = csv.reader(f)
    training_set  = list(reader)

training_set = [element[0].split(" ") for element in training_set]
labels = [element[-1] for element in training_set]
training_set = [element[:-1] for element in training_set]
print "Size fo the training set:", len(training_set)

Size fo the training set: 615512


###Extract node information
1. unique ID, 
2. publication year (between 1993 and 2003), 
3. title, 
4. authors, 
5. name of journal (not available for all papers), and 
6. abstract. Abstracts are already in lowercase, common English stopwords have been removed, and punctuation marks have been removed except for intra-word dashes.

In [7]:
with open(data_dir+"node_information.csv", "r") as f:
    reader = csv.reader(f)
    node_info  = list(reader)

# [info_dict[el[0] = el[1:] for el in node_info]
# IDs = [element[0] for element in node_info]
info_dict = dict([[el[0], el[1:]] for el in node_info])

###Training and testing with baseline classifier (linear SVM)

In [9]:
def compute_features_baseline(data_set, info_dict):
    
    ## Features for training
    # number of overlapping words in title
    overlap_title = []
    # temporal distance between the papers
    temp_diff = []
    # number of common authors
    comm_auth = []
    
    counter = 0
    for (source, target) in data_set:

        source_info = info_dict[source]
        target_info = info_dict[target]

        # convert to lowercase and tokenize
        source_title = source_info[1].lower().split(" ")
        # remove stopwords
        source_title = [token for token in source_title if token not in stpwds]
        source_title = [stemmer.stem(token) for token in source_title]

        target_title = target_info[1].lower().split(" ")
        target_title = [token for token in target_title if token not in stpwds]
        target_title = [stemmer.stem(token) for token in target_title]

        source_auth = source_info[2].split(",")
        target_auth = target_info[2].split(",")

        overlap_title.append(len(set(source_title).intersection(set(target_title))))
        temp_diff.append(int(source_info[0]) - int(target_info[0]))
        comm_auth.append(len(set(source_auth).intersection(set(target_auth))))

        counter += 1
        if counter % 10000 == 0:
            print counter, "training examples processsed"
    
    features = np.array([overlap_title, temp_diff, comm_auth]).T.astype(float)
    return features

In [8]:
# Divide training into train and test
n = len(training_set)
ntrain = 2*n/3
ntest = n - ntrain
train, test, label_train, label_test = train_test_split(training_set, labels, train_size=ntrain, test_size=ntest)

In [108]:
# training features
training_features = compute_features_baseline(train, info_dict)

10000 training examples processsed
20000 training examples processsed
30000 training examples processsed
40000 training examples processsed
50000 training examples processsed
60000 training examples processsed
70000 training examples processsed
80000 training examples processsed
90000 training examples processsed
100000 training examples processsed
110000 training examples processsed
120000 training examples processsed
130000 training examples processsed
140000 training examples processsed
150000 training examples processsed
160000 training examples processsed
170000 training examples processsed
180000 training examples processsed
190000 training examples processsed
200000 training examples processsed
210000 training examples processsed
220000 training examples processsed
230000 training examples processsed
240000 training examples processsed
250000 training examples processsed
260000 training examples processsed
270000 training examples processsed
280000 training examples processsed
2

In [109]:
# initialize basic SVM
classifier = svm.LinearSVC()
# convert list of lists into array
# documents as rows, unique words as columns (i.e., example as rows, features as columns)
#training_features = np.array([overlap_title, temp_diff, comm_auth]).T.astype(float)
# scale
training_features = preprocessing.scale(training_features)
# convert labels into integers then into column array
labels_array = np.array(label_train)
# initialize basic SVM
classifier = svm.LinearSVC()
# train
classifier.fit(training_features, labels_array)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [111]:
# training features
testing_features = compute_features_baseline(test, info_dict)

10000 training examples processsed
20000 training examples processsed
30000 training examples processsed
40000 training examples processsed
50000 training examples processsed
60000 training examples processsed
70000 training examples processsed
80000 training examples processsed
90000 training examples processsed
100000 training examples processsed
110000 training examples processsed
120000 training examples processsed
130000 training examples processsed
140000 training examples processsed
150000 training examples processsed
160000 training examples processsed
170000 training examples processsed
180000 training examples processsed
190000 training examples processsed
200000 training examples processsed


In [112]:
testing_features = preprocessing.scale(testing_features)
predictions_SVM = list(classifier.predict(testing_features))

In [115]:
acc = np.sum((np.array(predictions_SVM)==np.array(label_test)).astype(float))/len(test_labels)
print "Baseline accuracy:", acc

Baseline accuracy: 0.663012803954


### Adding features on abstract (words in common)

In [8]:
# Divide training into train and test
n = len(training_set)
ntrain = 2*n/3
ntest = n - ntrain
train, test, label_train, label_test = train_test_split(training_set, labels, train_size=ntrain, test_size=ntest)

In [9]:
def train_word_doc_mat(data_set, info_dict):
    
    
    unique_doc = np.unique(np.array(data_set).ravel())
    word_dict = {}
    doc_dict = {}
    row = []
    col = []
    data = []
    prev_len_col = 0
    
    counter = 0
    for i, doc in enumerate(unique_doc):

        doc_dict[doc] = i 
        abst = info_dict[doc][-1]
        
        # convert to lowercase and tokenize
        abst = abst.lower().split(" ")
        # remove stopwords
        abst = [token for token in abst if token not in stpwds]
        abst = [stemmer.stem(token) for token in abst]
        tmplist = []
        for w in abst:
            if w not in word_dict:
                word_dict[w] = len(word_dict)
            idx = word_dict[w]
            tmplist.append(idx)
        data += [1./sum(tmplist)]*len(abst)
        col += tmplist
        row += [i]*len(abst)

        counter += 1
        if counter % 5000 == 0:
            print counter, "training doc processsed"

    mat = sparse.csr_matrix( (data, (row, col)) , shape=(len(unique_doc), len(word_dict)))
    
    return mat, word_dict, doc_dict

In [10]:
tic = time.time()
mat, word_dict, doc_dict = train_word_doc_mat(train, info_dict)
print time.time() - tic

5000 training doc processsed
10000 training doc processsed
15000 training doc processsed
20000 training doc processsed
25000 training doc processsed
26.2572929859


In [11]:
def build_features_abstract(data_set, doc_word_mat, info_dict, doc_dict):
    
    ## Features for training
    # number of overlapping words in title
    overlap_title = []
    # temporal distance between the papers
    temp_diff = []
    # number of common authors
    comm_auth = []
    # abstract similarities
    abstract_feat = []
        
    counter = 0
    for (source, target) in data_set:

        source_info = info_dict[source]
        target_info = info_dict[target]

        # convert to lowercase and tokenize
        source_title = source_info[1].lower().split(" ")
        # remove stopwords
        source_title = [token for token in source_title if token not in stpwds]
        source_title = [stemmer.stem(token) for token in source_title]

        target_title = target_info[1].lower().split(" ")
        target_title = [token for token in target_title if token not in stpwds]
        target_title = [stemmer.stem(token) for token in target_title]

        source_auth = source_info[2].split(",")
        target_auth = target_info[2].split(",")

        overlap_title.append(len(set(source_title).intersection(set(target_title))))
        temp_diff.append(int(source_info[0]) - int(target_info[0]))
        comm_auth.append(len(set(source_auth).intersection(set(target_auth))))
        
        # abstract
        ids = doc_dict[source]
        idt = doc_dict[target]
        dist = float(doc_word_mat[ids,:].toarray().dot(doc_word_mat[idt,:].toarray().T))
        abstract_feat.append(dist)
        
        counter += 1
        if counter % 10000 == 0:
            print counter, "training examples processsed"
    
    features = np.array([overlap_title, temp_diff, comm_auth, abstract_feat]).T.astype(float)
    return features

In [12]:
# training features
training_features = build_features_abstract(train, mat, info_dict, doc_dict)

10000 training examples processsed
20000 training examples processsed
30000 training examples processsed
40000 training examples processsed
50000 training examples processsed
60000 training examples processsed
70000 training examples processsed
80000 training examples processsed
90000 training examples processsed
100000 training examples processsed
110000 training examples processsed
120000 training examples processsed
130000 training examples processsed
140000 training examples processsed
150000 training examples processsed
160000 training examples processsed
170000 training examples processsed
180000 training examples processsed
190000 training examples processsed
200000 training examples processsed
210000 training examples processsed
220000 training examples processsed
230000 training examples processsed
240000 training examples processsed
250000 training examples processsed
260000 training examples processsed
270000 training examples processsed
280000 training examples processsed
2

In [26]:
# scale
training_features = preprocessing.scale(training_features)
# convert labels into integers then into column array
labels_array = np.array(label_train)
# initialize basic SVM
classifier = svm.LinearSVC()
# train
classifier.fit(training_features, labels_array)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [21]:
def test_word_doc_mat(data_set, info_dict, word_dict):
    
    unique_doc = np.unique(np.array(data_set).ravel())
    mat = sparse.csr_matrix((len(unique_doc), len(word_dict)), dtype=float)
    doc_dict = {}
    row = []
    col = []
    data = []
    prev_len_col = 0
    
    counter = 0
    for i, doc in enumerate(unique_doc):

        doc_dict[doc] = i 
        abst = info_dict[doc][-1]
        
        # convert to lowercase and tokenize
        abst = abst.lower().split(" ")
        # remove stopwords
        abst = [token for token in abst if token not in stpwds]
        abst = [stemmer.stem(token) for token in abst]
        tmplist = []
        for w in abst:
            if w in word_dict:
                idx = word_dict[w]
                tmplist.append(idx)
        data += [1./sum(tmplist)]*len(tmplist)
        col += tmplist
        row += [i]*len(tmplist)

        counter += 1
        if counter % 100 == 0:
            #print counter, "training examples processsed"
            mat += sparse.csr_matrix( (data, (row, col)) , shape=(len(unique_doc), len(word_dict)))
            row = []
            col = []
            data = []
    
    return mat, doc_dict

In [22]:
# testing features
mat_test, doc_dict_test = test_word_doc_mat(test, info_dict, word_dict)

In [23]:
testing_features = build_features_abstract(test, mat_test, info_dict, doc_dict_test)

10000 training examples processsed
20000 training examples processsed
30000 training examples processsed
40000 training examples processsed
50000 training examples processsed
60000 training examples processsed
70000 training examples processsed
80000 training examples processsed
90000 training examples processsed
100000 training examples processsed
110000 training examples processsed
120000 training examples processsed
130000 training examples processsed
140000 training examples processsed
150000 training examples processsed
160000 training examples processsed
170000 training examples processsed
180000 training examples processsed
190000 training examples processsed
200000 training examples processsed


In [27]:
testing_features = preprocessing.scale(testing_features)
predictions_SVM = list(classifier.predict(testing_features))

In [28]:
acc = np.sum((np.array(predictions_SVM)==np.array(label_test)).astype(float))/len(label_test)
print "Baseline accuracy:", acc

Baseline accuracy: 0.726730385873


In [29]:
# Vary parameters
Ctry = np.arange(1,10,2)
for C in Ctry:
    # initialize basic SVM
    classifier = svm.LinearSVC(C=C)
    # train
    classifier.fit(training_features, labels_array)
    predictions_SVM = list(classifier.predict(testing_features))
    acc = np.sum((np.array(predictions_SVM)==np.array(label_test)).astype(float))/len(label_test)
    print C, acc

1 0.726749881806
3 0.727368877668
5 0.728012243446
7 0.728314430402
9 0.728553255577


TF-IDF

In [27]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [33]:
# retrieve abstracts and process
unique_doc = np.unique(np.array(training_set).ravel())
abst_corpus = []
doc_dict = {}
for i, doc in enumerate(unique_doc):
    doc_dict[doc] = i
    abst = info_dict[doc][-1]
    # convert to lowercase and tokenize
    abst = abst.lower().split(" ")
    # remove stopwords
    abst = [token for token in abst if token not in stpwds]
    abst = [stemmer.stem(token) for token in abst]
    abst_corpus.append(abst)

In [None]:
# vocabulary 
corpus = [str(" ".join(abstract)) for abstract in abst_corpus]
u_words = np.unique(corpus)

### BOW approach

In [33]:
i = 1
s, t = train[i]
l = label_train[i]
print s, t, l

9612192 9512053 1


In [34]:
abst_s = info_dict[s][-1]
abst_t = info_dict[t][-1]

In [77]:
print abst_s
print abst_t

in this lecture we review some of the recent developments in string theory on an introductory and qualitative level in particular we focus on s-t-u dualities of toroidally compactified ten-dimensional string theories and outline the connection to m-theory dualities among string vacua with less supersymmetries in six and four space-time dimensions is discussed and the concept of f-theory is briefly presented lecture given by j louis at the workshop on gauge theories applied supersymmetry and quantum gravity imperial college london uk july 5 10 1996
the conditions for the cancellation of all gauge gravitational and mixed anomalies of n 1 supersymmetric models in six dimensions are reviewed and illustrated by a number of examples of particular interest are models that cannot be realized perturbatively in string theory an example of this type which we verify satisfies the anomaly cancellation conditions is the k3 compactification of the so 32 theory with small instantons recently proposed 

In [38]:
abst_pair = []
for abst in [abst_s, abst_t]:    
    abst = abst.lower().split(" ")
    # remove stopwords
    abst = [token for token in abst if token not in stpwds]
    abst = [stemmer.stem(token) for token in abst]
    abst_pair.append(abst)

In [41]:
ids = doc_dict[s]
idt = doc_dict[t]

In [72]:
inner_prod = (mat[ids,:].toarray()*mat[idt,:].toarray()).squeeze()

In [78]:
common_w = np.argsort(inner_prod)[::-1][:5]

In [79]:
inner_prod[common_w]

array([  1.48327788e-09,   1.48327788e-09,   7.41638938e-10,
         2.47212979e-10,   2.47212979e-10])

In [81]:
word_dict.values()[0]

20886

In [83]:
print [word_dict.keys()[np.where(word_dict.values()==idx)[0].squeeze()] for idx in common_w]

[u'theori', u'string', u'gaug', u'review', u'recent']


First step: keywords extraction

Keywords are chosen among words shared between linked abstracts

In [None]:
def extract_keywords(train, labels, mat, doc_dict, n=3):
    
    for ((s, t), l) in zip(train, labels):
        
        if l==1:
            ids = doc_dict[s]
            idt = doc_dict[t]
            inner_prod = (mat[ids,:].toarray()*mat[idt,:].toarray()).squeeze()
            common_w = np.argsort(inner_prod)[::-1][:n]
            prod_val = inner_prod[common_w]
            common_w = common_w[prod_val>0]
        