In [1]:
import pickle
import re
from collections import Counter

import networkx as nx
import nltk
import numpy as np
import pandas as pd
import sklearn
from gensim import corpora, models, similarities
from networkx.algorithms.shortest_paths.generic import shortest_path
from nltk.corpus import stopwords
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer



** Data import **

In [2]:
# Stop-word set used by the word-overlap features: NLTK's English list plus the ten digit tokens.
stop = set(stopwords.words('english'))
stop.update(['1', '2', '3', '4', '5', '6', '7', '8', '9', '0'])

In [3]:
# Column names are passed explicitly through `names=`, so pandas reads every line of the
# CSV as a data row.
columes = [
    "id",
    "year",
    "title",
    "authors",
    "journal",
    "abstract",
]
df = pd.read_csv("dataset/node_information.csv", names=columes)

In [4]:
# Replace the title and abstract columns with the 'text' column of the matching pickle.
# NOTE(review): assumes each pickle's rows are in the same order as `df` — confirm.
# pd.read_pickle can execute arbitrary code, so only load pickles produced by this project.
for column, pickle_path in (('title', 'clean_title.pkl'), ('abstract', 'clean_abstract.pkl')):
    df[column] = pd.read_pickle(pickle_path)['text'].values.tolist()

In [5]:
# Training rows are cast to int64; the first two columns are the paper-id pair and the
# last column is the label. The testing file is loaded the same way.
data_set = np.loadtxt("dataset/training_set.txt").astype(np.int64)
data, label = data_set[:, :2], data_set[:, -1]
test = np.loadtxt("dataset/testing_set.txt").astype(np.int64)

In [6]:
# Lookup table: paper id -> (year, title, authors, journal, abstract).
data_dic = {
    row['id']: tuple(row[col] for col in ('year', 'title', 'authors', 'journal', 'abstract'))
    for _, row in df.iterrows()
}

** creating empty feature dataframe **

In [7]:
# Two independent, initially empty frames: one collects training features, the other test features.
features, features_test = pd.DataFrame(), pd.DataFrame()

** Feature: years of source and target, difference of their years **

In [8]:
# Publication year of each paper in the pair, plus (first year - second year),
# written to the training frame and then to the test frame.
for frame, pairs in ((features, data), (features_test, test)):
    frame['year1'] = [data_dic[first][0] for first in pairs[:, 0]]
    frame['year2'] = [data_dic[second][0] for second in pairs[:, 1]]
    frame['year_differ'] = [
        y1 - y2
        for y1, y2 in zip(frame['year1'].values.tolist(), frame['year2'].values.tolist())
    ]

** creating author list for each paper **

In [9]:
# Parse the raw 'authors' string of every paper into a list of author tokens.
#   id_aut_dic  : paper id -> list of authors
#   id_aut_list : the same lists in df row order (stored as df['authors_list'])
id_aut_dic = {}
id_aut_list = []
for _, row in df.iterrows():
    raw_authors = row['authors']
    # A float here means the field is not a string (e.g. NaN): record no authors.
    if isinstance(raw_authors, float):
        id_aut_dic[row['id']] = []
        id_aut_list.append([])
        continue

    # Drop complete "(...)" groups before splitting on commas.
    without_parens = re.sub(r" ?\([^)]*\)", "", raw_authors.strip())
    temp = []
    for piece in without_parens.split(','):
        # Drop an unclosed "(..." tail, then delete every space in the name.
        piece = re.sub(r" ?\([^)]*", "", piece)
        piece = re.sub(r" ?", "", piece)
        if piece != '':
            temp.append(piece)
    id_aut_dic[row['id']] = temp
    id_aut_list.append(temp)

df['authors_list'] = id_aut_list

** creating a set for all the authors **

In [10]:
# Every distinct author token found in any paper's author list.
aut_set = {aut for authors in df['authors_list'] for aut in authors}

** Feature: same author count between each pair of papers **

In [11]:
# For each training pair: how many authors of the first paper also appear in the
# second paper's author list.
features['sameAut'] = [
    sum(1 for aut in id_aut_dic[id1] if aut in id_aut_dic[id2])
    for id1, id2 in data
]

In [12]:
# For each test pair: how many authors of the first paper also appear in the
# second paper's author list.
features_test['sameAut'] = [
    sum(1 for aut in id_aut_dic[id1] if aut in id_aut_dic[id2])
    for id1, id2 in test
]

** Generating author graphs from the citation network, both undirected and directed **

In [13]:
# Undirected author graph. Nodes are all known authors; the edge (a1, a2) carries a weight
# 'w' that counts the positive rows (lab == 1) linking a paper by a1 to a paper by a2.
aut_G = nx.Graph()
aut_G.add_nodes_from(aut_set)
count = 0
for id1, id2, lab in data_set:
    if lab == 1:
        for aut1 in id_aut_dic[id1]:
            for aut2 in id_aut_dic[id2]:
                if aut_G.has_edge(aut1, aut2):
                    aut_G[aut1][aut2]['w'] += 1
                else:
                    # Attribute passed by keyword: networkx 2.x no longer accepts a
                    # positional attribute dict, while the keyword form also works on 1.x.
                    aut_G.add_edge(aut1, aut2, w=1)
    count += 1
    # Progress readout; 6155.12 is presumably 1% of the training-set size — TODO confirm.
    if count % 2000 == 0:
        print(count/6155.12)

0.3249327389230429
0.6498654778460858
0.9747982167691288
1.2997309556921717
1.6246636946152146
1.9495964335382576
2.2745291724613006
2.5994619113843433
2.9243946503073865
3.2493273892304293
3.5742601281534725
3.8991928670765152
4.224125605999558
4.549058344922601
4.873991083845644
5.198923822768687
5.52385656169173
5.848789300614773
6.173722039537815
6.498654778460859
6.823587517383902
7.148520256306945
7.473452995229987
7.7983857341530305
8.123318473076074
8.448251211999116
8.77318395092216
9.098116689845202
9.423049428768245
9.747982167691289
10.072914906614331
10.397847645537373
10.722780384460417
11.04771312338346
11.372645862306502
11.697578601229546
12.022511340152588
12.34744407907563
12.672376817998675
12.997309556921717
13.32224229584476
13.647175034767804
13.972107773690846
14.29704051261389
14.621973251536932
14.946905990459975
15.271838729383019
15.596771468306061
15.921704207229103
16.246636946152147
16.57156968507519
16.896502423998232
17.221435162921274
17.54636790184432

In [14]:
# Directed author graph. Same construction as the undirected one, but the edge goes from an
# author of the first paper of the row to an author of the second paper; 'w' counts the
# positive rows (lab == 1) that produce that ordered author pair.
aut_G_di = nx.DiGraph()
aut_G_di.add_nodes_from(aut_set)
count = 0
for id1, id2, lab in data_set:
    if lab == 1:
        for aut1 in id_aut_dic[id1]:
            for aut2 in id_aut_dic[id2]:
                if aut_G_di.has_edge(aut1, aut2):
                    aut_G_di[aut1][aut2]['w'] += 1
                else:
                    # Attribute passed by keyword: networkx 2.x no longer accepts a
                    # positional attribute dict, while the keyword form also works on 1.x.
                    aut_G_di.add_edge(aut1, aut2, w=1)
    count += 1
    # Progress readout; 6155.12 is presumably 1% of the training-set size — TODO confirm.
    if count % 2000 == 0:
        print(count/6155.12)

0.3249327389230429
0.6498654778460858
0.9747982167691288
1.2997309556921717
1.6246636946152146
1.9495964335382576
2.2745291724613006
2.5994619113843433
2.9243946503073865
3.2493273892304293
3.5742601281534725
3.8991928670765152
4.224125605999558
4.549058344922601
4.873991083845644
5.198923822768687
5.52385656169173
5.848789300614773
6.173722039537815
6.498654778460859
6.823587517383902
7.148520256306945
7.473452995229987
7.7983857341530305
8.123318473076074
8.448251211999116
8.77318395092216
9.098116689845202
9.423049428768245
9.747982167691289
10.072914906614331
10.397847645537373
10.722780384460417
11.04771312338346
11.372645862306502
11.697578601229546
12.022511340152588
12.34744407907563
12.672376817998675
12.997309556921717
13.32224229584476
13.647175034767804
13.972107773690846
14.29704051261389
14.621973251536932
14.946905990459975
15.271838729383019
15.596771468306061
15.921704207229103
16.246636946152147
16.57156968507519
16.896502423998232
17.221435162921274
17.54636790184432

** Create a dictionary recording how many times each author appears in the database **

In [15]:
# author -> number of times the author occurs across all papers' author lists.
dic_aut = {}
for authors in df['authors_list']:
    for name in authors:
        dic_aut[name] = dic_aut.get(name, 0) + 1


** Feature: authors presence sum **

In [16]:
# For each training row: the appearance counts (dic_aut) summed over the authors of the
# first paper and, separately, of the second paper. A paper with no authors gives 0.
features['aut1_sum'] = [
    sum(dic_aut[a] for a in id_aut_dic[id1]) for id1, _, _ in data_set
]
features['aut2_sum'] = [
    sum(dic_aut[a] for a in id_aut_dic[id2]) for _, id2, _ in data_set
]

In [17]:
# For each test pair: the appearance counts (dic_aut) summed over the authors of the
# first paper and, separately, of the second paper. A paper with no authors gives 0.
features_test['aut1_sum'] = [
    sum(dic_aut[a] for a in id_aut_dic[id1]) for id1, _ in test
]
features_test['aut2_sum'] = [
    sum(dic_aut[a] for a in id_aut_dic[id2]) for _, id2 in test
]

** Feature: author citation count between two papers based on the Graph of authors **

In [18]:
# For each training row: over every (author of paper 1, author of paper 2) combination that
# has an edge in aut_G, add the edge weight minus the row's own label.
# NOTE(review): the "- lab" presumably discounts this row's own contribution to the graph,
# which was built from the same rows — confirm.
features['=A-A='] = [
    sum(
        aut_G[a1][a2]['w'] - lab
        for a1 in id_aut_dic[id1]
        for a2 in id_aut_dic[id2]
        if aut_G.has_edge(a1, a2)
    )
    for id1, id2, lab in data_set
]

In [19]:
# For each test pair: sum of aut_G edge weights over every (author of paper 1,
# author of paper 2) combination that has an edge.
features_test['=A-A='] = [
    sum(
        aut_G[a1][a2]['w']
        for a1 in id_aut_dic[id1]
        for a2 in id_aut_dic[id2]
        if aut_G.has_edge(a1, a2)
    )
    for id1, id2 in test
]

** dictionary for paper node degree in the citation graph **

In [20]:
# Per-paper counters over the positive training rows (lab == 1):
#   dic_doc       : appearances at either end of a positive row
#   doc_src_count : appearances as the first paper of a positive row
#   doc_tgt_count : appearances as the second paper of a positive row
dic_doc = dict.fromkeys(df['id'].values.tolist(), 0)
doc_src_count = dict.fromkeys(df['id'].values.tolist(), 0)
doc_tgt_count = dict.fromkeys(df['id'].values.tolist(), 0)
for id1, id2, lab in data_set:
    if lab != 1:
        continue
    doc_src_count[id1] += 1
    doc_tgt_count[id2] += 1
    for endpoint in (id1, id2):
        dic_doc[endpoint] += 1

** Features: paper node degree in the citation graph **

In [21]:
# For each training row: degree of each paper (dic_doc) minus the row's own label.
# Written as a plain for-loop so id1 / id2 / lab stay bound at notebook scope afterwards,
# exactly as before.
degree_pairs = []
for id1, id2, lab in data_set:
    degree_pairs.append((dic_doc[id1] - lab, dic_doc[id2] - lab))
features['connection1'] = [first for first, _ in degree_pairs]
features['connection2'] = [second for _, second in degree_pairs]

In [22]:
# For each test pair: degree of each paper in the positive training graph (dic_doc).
# Test rows have no label and contributed nothing to dic_doc, so nothing is subtracted.
# The earlier "- lab" read a stale `lab` left over from the last training row and shifted
# every test value by that row's label.
features_test['connection1'] = [dic_doc[id1] for id1, _ in test]
features_test['connection2'] = [dic_doc[id2] for _, id2 in test]

** Feature: paper length (token count of title + abstract) **

In [25]:
# One document per row of df: title and abstract joined by a single space, then tokenized.
# The two columns are converted to lists once; indexing `.values.tolist()[i]` inside the
# comprehension rebuilt the full list for every row.
corpus = [
    title + ' ' + abstract
    for title, abstract in zip(df["title"].values.tolist(), df["abstract"].values.tolist())
]
tok_corp = [nltk.word_tokenize(sent) for sent in corpus]

In [26]:
# paper id -> number of tokens in its title + abstract document (tok_corp is in df row order).
# The id column is converted to a list once instead of once per row.
dic_doclen = {
    paper_id: len(tokens)
    for paper_id, tokens in zip(df['id'].values.tolist(), tok_corp)
}

In [27]:
# Token count (dic_doclen) of the first and of the second paper of each training pair.
features['len1'] = [dic_doclen[id1] for id1, _ in data]
features['len2'] = [dic_doclen[id2] for _, id2 in data]

In [28]:
# Token count (dic_doclen) of the first and of the second paper of each test pair.
features_test['len1'] = [dic_doclen[id1] for id1, _ in test]
features_test['len2'] = [dic_doclen[id2] for _, id2 in test]

** dictionary of paper graph based on citation **

In [29]:
# Symmetric adjacency of the positive training rows (lab == 1), kept both as a list
# (for iteration) and as a set (for membership tests), keyed by paper id.
dic_edge_list = {paper_id: [] for paper_id in df['id'].values.tolist()}
dic_edge_set = {paper_id: set() for paper_id in df['id'].values.tolist()}
for id1, id2, lab in data_set:
    if lab != 1:
        continue
    for here, there in ((id1, id2), (id2, id1)):
        dic_edge_list[here].append(there)
        dic_edge_set[here].add(there)

** Feature: path count of 2 edge long between each pair of papers **

In [30]:
# For each training pair: number of neighbours id3 of the first paper (other than the
# second paper itself) that are also adjacent to the second paper.
features['edge2'] = [
    sum(1 for id3 in dic_edge_list[id1] if id3 != id2 and id2 in dic_edge_set[id3])
    for id1, id2 in data
]

In [31]:
# For each test pair: number of neighbours id3 of the first paper (other than the
# second paper itself) that are also adjacent to the second paper.
features_test['edge2'] = [
    sum(1 for id3 in dic_edge_list[id1] if id3 != id2 and id2 in dic_edge_set[id3])
    for id1, id2 in test
]

** Feature: path count of 2 edge long between each pair of papers (weighted: degree of the middle node) **

In [32]:
# Same 2-edge paths as 'edge2', but each path adds the degree (dic_doc) of its middle
# paper instead of 1.
features['edge2_weighted'] = [
    sum(dic_doc[id3] for id3 in dic_edge_list[id1] if id3 != id2 and id2 in dic_edge_set[id3])
    for id1, id2 in data
]

In [None]:
# Same 2-edge paths as 'edge2', but each path adds the degree (dic_doc) of its middle
# paper instead of 1.
features_test['edge2_weighted'] = [
    sum(dic_doc[id3] for id3 in dic_edge_list[id1] if id3 != id2 and id2 in dic_edge_set[id3])
    for id1, id2 in test
]

** Feature: path count of 3 edge long between each pair of papers **

In [188]:
# For each training pair: number of walks id1 - id3 - id4 - id2 over positive edges where
# id3 is not id2 and id4 is neither endpoint.
# NOTE(review): the saved output of this cell ends in KeyboardInterrupt, so in the recorded
# run features['edge3'] may never have been assigned.
temp = []
for count, (id1, id2) in enumerate(data, start=1):
    temp.append(sum(
        1
        for id3 in dic_edge_list[id1] if id3 != id2
        for id4 in dic_edge_list[id3]
        if id4 != id2 and id4 != id1 and id2 in dic_edge_set[id4]
    ))
    # Progress readout; 6155.12 is presumably 1% of the training-set size — TODO confirm.
    if count % 1000 == 0:
        print(count/6155.12)
features['edge3'] = temp

0.16246636946152146


KeyboardInterrupt: 

In [None]:
# For each test pair: number of walks id1 - id3 - id4 - id2 over positive edges where
# id3 is not id2 and id4 is neither endpoint.
temp = []
for count, (id1, id2) in enumerate(test, start=1):
    temp.append(sum(
        1
        for id3 in dic_edge_list[id1] if id3 != id2
        for id4 in dic_edge_list[id3]
        if id4 != id2 and id4 != id1 and id2 in dic_edge_set[id4]
    ))
    # Progress readout; the divisor 6155.12 is the same constant the training cells use.
    if count % 1000 == 0:
        print(count/6155.12)
features_test['edge3'] = temp

** Feature: path count of 3 edge long between each pair of papers (weighted: degree of the middle node) **

In [192]:
# Same 3-edge walks id1 - id3 - id4 - id2 as 'edge3', but each walk adds the product of
# the degrees (dic_doc) of its two middle papers instead of 1.
temp = []
for count, (id1, id2) in enumerate(data, start=1):
    temp.append(sum(
        dic_doc[id3]*dic_doc[id4]
        for id3 in dic_edge_list[id1] if id3 != id2
        for id4 in dic_edge_list[id3]
        if id4 != id2 and id4 != id1 and id2 in dic_edge_set[id4]
    ))
    # Progress readout; 6155.12 is presumably 1% of the training-set size — TODO confirm.
    if count % 1000 == 0:
        print(count/6155.12)
features['edge3_weighted'] = temp

0.16246636946152146
0.3249327389230429
0.4873991083845644
0.6498654778460858
0.8123318473076073
0.9747982167691288
1.1372645862306503
1.2997309556921717
1.4621973251536933
1.6246636946152146
1.7871300640767362
1.9495964335382576
2.112062802999779
2.2745291724613006
2.436995541922822
2.5994619113843433
2.761928280845865
2.9243946503073865
3.0868610197689077
3.2493273892304293
3.411793758691951
3.5742601281534725
3.7367264976149936
3.8991928670765152
4.061659236538037
4.224125605999558
4.38659197546108
4.549058344922601
4.711524714384122
4.873991083845644
5.0364574533071655
5.198923822768687
5.361390192230209
5.52385656169173
5.686322931153251
5.848789300614773
6.011255670076294
6.173722039537815
6.336188408999337
6.498654778460859
6.66112114792238
6.823587517383902
6.986053886845423
7.148520256306945
7.310986625768466
7.473452995229987
7.635919364691509
7.7983857341530305
7.960852103614552
8.123318473076074
8.285784842537595
8.448251211999116
8.610717581460637
8.77318395092216
8.9356503

72.78493351876162
72.94739988822315
73.10986625768466
73.27233262714618
73.4347989966077
73.59726536606922
73.75973173553075
73.92219810499226
74.08466447445379
74.24713084391531
74.40959721337683
74.57206358283835
74.73452995229988
74.89699632176139
75.05946269122292
75.22192906068445
75.38439543014596
75.54686179960748
75.70932816906901
75.87179453853052
76.03426090799205
76.19672727745356
76.35919364691509
76.52166001637661
76.68412638583813
76.84659275529965
77.00905912476118
77.17152549422269
77.33399186368422
77.49645823314574
77.65892460260726
77.82139097206878
77.98385734153031
78.14632371099182
78.30879008045335
78.47125644991488
78.63372281937639
78.79618918883791
78.95865555829943
79.12112192776095
79.28358829722248
79.44605466668399
79.60852103614552
79.77098740560704
79.93345377506856
80.09592014453008
80.25838651399161
80.42085288345312
80.58331925291465
80.74578562237618
80.90825199183769
81.07071836129921
81.23318473076074
81.39565110022225
81.55811746968378
81.72058383

In [None]:
# Same 3-edge walks id1 - id3 - id4 - id2 as 'edge3', but each walk adds the product of
# the degrees (dic_doc) of its two middle papers instead of 1.
temp = []
for count, (id1, id2) in enumerate(test, start=1):
    temp.append(sum(
        dic_doc[id3]*dic_doc[id4]
        for id3 in dic_edge_list[id1] if id3 != id2
        for id4 in dic_edge_list[id3]
        if id4 != id2 and id4 != id1 and id2 in dic_edge_set[id4]
    ))
    # Progress readout; the divisor 6155.12 is the same constant the training cells use.
    if count % 1000 == 0:
        print(count/6155.12)
features_test['edge3_weighted'] = temp

** dictionary for negative citation network **

In [119]:
# Symmetric adjacency of the negative training rows (lab == 0), kept both as a list
# (for iteration) and as a set (for membership tests), keyed by paper id.
dic_edge_neg_list = {paper_id: [] for paper_id in df['id'].values.tolist()}
dic_edge_neg_set = {paper_id: set() for paper_id in df['id'].values.tolist()}
for id1, id2, lab in data_set:
    if lab != 0:
        continue
    for here, there in ((id1, id2), (id2, id1)):
        dic_edge_neg_list[here].append(there)
        dic_edge_neg_set[here].add(there)

** Feature: path count of 2 edge (one positive and one negative) long between each pair of papers **

In [100]:
# For each training pair: 2-edge paths made of one positive and one negative edge.
# First term: positive neighbours of paper 1 that have a negative edge to paper 2.
# Second term: positive neighbours of paper 2 that have a negative edge to paper 1.
features['edge2_neg'] = [
    sum(1 for id3 in dic_edge_list[id1] if id3 != id2 and id2 in dic_edge_neg_set[id3])
    + sum(1 for id3 in dic_edge_list[id2] if id3 != id1 and id1 in dic_edge_neg_set[id3])
    for id1, id2 in data
]

In [None]:
# For each test pair: 2-edge paths made of one positive and one negative edge.
# First term: positive neighbours of paper 1 that have a negative edge to paper 2.
# Second term: positive neighbours of paper 2 that have a negative edge to paper 1.
features_test['edge2_neg'] = [
    sum(1 for id3 in dic_edge_list[id1] if id3 != id2 and id2 in dic_edge_neg_set[id3])
    + sum(1 for id3 in dic_edge_list[id2] if id3 != id1 and id1 in dic_edge_neg_set[id3])
    for id1, id2 in test
]

** Feature: path count of 3 edge (two positive and one negative) long between each pair of papers **

In [123]:
# 3-edge walks between the two papers of each training pair that use exactly one negative
# edge. Each pass is (adjacency for hop 1, adjacency for hop 2, walk from the second paper?);
# the final hop is always checked against the positive sets in dic_edge_set.
three_hop_passes = (
    (dic_edge_neg_list, dic_edge_list, False),   # "left": negative edge leaves the first paper
    (dic_edge_list, dic_edge_neg_list, False),   # "middle": negative edge is the second hop
    (dic_edge_neg_list, dic_edge_list, True),    # "right": negative edge leaves the second paper
)
pass_results = []
for first_hop, second_hop, from_second in three_hop_passes:
    temp = []
    for count, (id1, id2) in enumerate(data, start=1):
        start, end = (id2, id1) if from_second else (id1, id2)
        temp.append(sum(
            1
            for id3 in first_hop[start] if id3 != end
            for id4 in second_hop[id3]
            if id4 != end and id4 != start and end in dic_edge_set[id4]
        ))
        # Progress readout, repeated for each of the three passes; 6155.12 is presumably
        # 1% of the training-set size — TODO confirm.
        if count % 2000 == 0:
            print(count/6155.12)
    pass_results.append(temp)

edge3_neg_left, edge3_neg_middle, edge3_neg_right = pass_results

features['edge3_neg'] = [
    left + middle + right
    for left, middle, right in zip(edge3_neg_left, edge3_neg_middle, edge3_neg_right)
]
features['edge3_neg_middle'] = edge3_neg_middle
features['edge3_neg_sid'] = [
    left + right for left, right in zip(edge3_neg_left, edge3_neg_right)
]

0.3249327389230429
0.6498654778460858
0.9747982167691288
1.2997309556921717
1.6246636946152146
1.9495964335382576
2.2745291724613006
2.5994619113843433
2.9243946503073865
3.2493273892304293
3.5742601281534725
3.8991928670765152
4.224125605999558
4.549058344922601
4.873991083845644
5.198923822768687
5.52385656169173
5.848789300614773
6.173722039537815
6.498654778460859
6.823587517383902
7.148520256306945
7.473452995229987
7.7983857341530305
8.123318473076074
8.448251211999116
8.77318395092216
9.098116689845202
9.423049428768245
9.747982167691289
10.072914906614331
10.397847645537373
10.722780384460417
11.04771312338346
11.372645862306502
11.697578601229546
12.022511340152588
12.34744407907563
12.672376817998675
12.997309556921717
13.32224229584476
13.647175034767804
13.972107773690846
14.29704051261389
14.621973251536932
14.946905990459975
15.271838729383019
15.596771468306061
15.921704207229103
16.246636946152147
16.57156968507519
16.896502423998232
17.221435162921274
17.54636790184432

In [None]:
# 3-edge walks between the two papers of each test pair that use exactly one negative
# edge. Each pass is (adjacency for hop 1, adjacency for hop 2, walk from the second paper?);
# the final hop is always checked against the positive sets in dic_edge_set.
three_hop_passes = (
    (dic_edge_neg_list, dic_edge_list, False),   # "left": negative edge leaves the first paper
    (dic_edge_list, dic_edge_neg_list, False),   # "middle": negative edge is the second hop
    (dic_edge_neg_list, dic_edge_list, True),    # "right": negative edge leaves the second paper
)
pass_results = []
for first_hop, second_hop, from_second in three_hop_passes:
    temp = []
    for count, (id1, id2) in enumerate(test, start=1):
        start, end = (id2, id1) if from_second else (id1, id2)
        temp.append(sum(
            1
            for id3 in first_hop[start] if id3 != end
            for id4 in second_hop[id3]
            if id4 != end and id4 != start and end in dic_edge_set[id4]
        ))
        # Progress readout, repeated for each of the three passes; the divisor 6155.12 is
        # the same constant the training cells use.
        if count % 2000 == 0:
            print(count/6155.12)
    pass_results.append(temp)

edge3_neg_left, edge3_neg_middle, edge3_neg_right = pass_results

features_test['edge3_neg'] = [
    left + middle + right
    for left, middle, right in zip(edge3_neg_left, edge3_neg_middle, edge3_neg_right)
]
features_test['edge3_neg_middle'] = edge3_neg_middle
features_test['edge3_neg_sid'] = [
    left + right for left, right in zip(edge3_neg_left, edge3_neg_right)
]

** Feature: Title + Abstract same words count between papers **

In [47]:
# Word-overlap features for the training pairs. Each value is the number of (token of text 1,
# token of text 2) combinations that are equal and not in `stop`, so repeated words count
# once per combination.
#
# Each spec is (feature column, data_dic field of paper 1, data_dic field of paper 2, tokenizer);
# in data_dic field 1 is the title and field 4 the abstract. Title-vs-title keeps its
# whitespace split, while the other three features use nltk.word_tokenize.
overlap_specs = (
    ('T1T2_c', 1, 1, lambda text: text.split()),
    ('T1Ab2_c', 1, 4, nltk.word_tokenize),
    ('T2Ab1_c', 4, 1, nltk.word_tokenize),
    ('Ab1Ab2_c', 4, 4, nltk.word_tokenize),
)
for column, field1, field2, tokenize in overlap_specs:
    # Each text is tokenized once per paper rather than once per pair.
    kept_tokens = {}    # paper id -> tokens of its `field1` text that are not stop words
    token_counts = {}   # paper id -> Counter over the tokens of its `field2` text
    temp = []
    for id1, id2 in data:
        if id1 not in kept_tokens:
            kept_tokens[id1] = [w for w in tokenize(data_dic[id1][field1]) if w not in stop]
        if id2 not in token_counts:
            token_counts[id2] = Counter(tokenize(data_dic[id2][field2]))
        counts2 = token_counts[id2]
        # Summing the second text's count of each kept token equals the nested-loop pair
        # count; a Counter returns 0 for absent tokens.
        temp.append(sum(counts2[w] for w in kept_tokens[id1]))
    features[column] = temp

In [None]:
# Word-overlap features for the test pairs. Each value is the number of (token of text 1,
# token of text 2) combinations that are equal and not in `stop`, so repeated words count
# once per combination.
#
# Each spec is (feature column, data_dic field of paper 1, data_dic field of paper 2, tokenizer);
# in data_dic field 1 is the title and field 4 the abstract. Title-vs-title keeps its
# whitespace split, while the other three features use nltk.word_tokenize.
overlap_specs = (
    ('T1T2_c', 1, 1, lambda text: text.split()),
    ('T1Ab2_c', 1, 4, nltk.word_tokenize),
    ('T2Ab1_c', 4, 1, nltk.word_tokenize),
    ('Ab1Ab2_c', 4, 4, nltk.word_tokenize),
)
for column, field1, field2, tokenize in overlap_specs:
    # Each text is tokenized once per paper rather than once per pair.
    kept_tokens = {}    # paper id -> tokens of its `field1` text that are not stop words
    token_counts = {}   # paper id -> Counter over the tokens of its `field2` text
    temp = []
    for id1, id2 in test:
        if id1 not in kept_tokens:
            kept_tokens[id1] = [w for w in tokenize(data_dic[id1][field1]) if w not in stop]
        if id2 not in token_counts:
            token_counts[id2] = Counter(tokenize(data_dic[id2][field2]))
        counts2 = token_counts[id2]
        # Summing the second text's count of each kept token equals the nested-loop pair
        # count; a Counter returns 0 for absent tokens.
        temp.append(sum(counts2[w] for w in kept_tokens[id1]))
    features_test[column] = temp

** Save features to file **

In [124]:
# Persist both feature frames as pickles in the working directory.
for frame, pickle_path in ((features, 'features.pkl'), (features_test, 'features_test.pkl')):
    frame.to_pickle(pickle_path)

** Features: Tf-idf Matrix **

In [None]:
# TF-IDF matrix with one row per document.
# NOTE(review): this cell previously read `corpus2`, which is not defined anywhere in this
# notebook. `corpus` (title + ' ' + abstract, one entry per row of df) is used instead because
# the LSA lookup built afterwards pairs row i of the result with the i-th id of df —
# confirm this is the intended text.
# min_df=1 keeps every term, exactly as min_df=0 did; recent scikit-learn releases reject
# an integer min_df below 1.
tf = TfidfVectorizer(input='content', analyzer='word', min_df=1, stop_words='english', sublinear_tf=False)
tfidf_matrix = tf.fit_transform(corpus)

In [None]:
# Reduce the TF-IDF matrix to 50 components per document; random_state is fixed so the
# result is repeatable.
svd = TruncatedSVD(
    n_components=50,
    n_iter=7,
    random_state=42,
)
lsa = svd.fit_transform(tfidf_matrix)

In [None]:
# paper id -> its row of the LSA matrix (row i of `lsa` is paired with the i-th id of df).
list_id = df['id'].values.tolist()
dic_lsa = {paper_id: lsa[row] for row, paper_id in enumerate(list_id)}

In [None]:
# LSA vectors of the first and second paper of every training pair, saved as two frames.
first_rows, second_rows = [], []
for count, (id1, id2) in enumerate(data):
    # Progress readout (first printed at row 0); 6155.12 is presumably 1% of the
    # training-set size — TODO confirm.
    if count % 3000 == 0:
        print(count/6155.12)
    first_rows.append(dic_lsa[id1].tolist())
    second_rows.append(dic_lsa[id2].tolist())
lsa_features1 = pd.DataFrame(np.array(first_rows))
lsa_features2 = pd.DataFrame(np.array(second_rows))
lsa_features1.to_pickle('lsa1.pkl')
lsa_features2.to_pickle('lsa2.pkl')

In [None]:
# LSA vectors of the first and second paper of every test pair, saved as two frames.
first_rows = [dic_lsa[id1].tolist() for id1, _ in test]
second_rows = [dic_lsa[id2].tolist() for _, id2 in test]
lsa_features_test1 = pd.DataFrame(np.array(first_rows))
lsa_features_test2 = pd.DataFrame(np.array(second_rows))
lsa_features_test1.to_pickle('lsa_test1.pkl')
lsa_features_test2.to_pickle('lsa_test2.pkl')