In [2]:
import pickle
import re
from collections import Counter
from collections import defaultdict

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from langdetect import detect
from mlxtend.frequent_patterns import association_rules
from mlxtend.frequent_patterns import fpgrowth
from mlxtend.preprocessing import TransactionEncoder
from tqdm.notebook import tqdm as tqdm_notebook
%matplotlib inline

In [3]:
# Input locations, relative to the notebook's working directory.
# NOTE(review): FILE_ORIGIN is not read by any cell visible here; presumably
# it is the raw dump that FILE_PREPROCESSED was derived from - confirm.
FILE_ORIGIN = '../dblpv13.json'
# Read by the loading cell as line-delimited JSON records.
FILE_PREPROCESSED = '../data.json'

In [4]:
# Load the preprocessed records in fixed-size chunks so progress can be shown
# and memory stays bounded while parsing.
CHUNKSIZE = 100000
MAX_OBSERVATIONS = 1000000  # Set to 55000000 to read all rows

# Collect the chunks and concatenate once at the end: calling pd.concat inside
# the loop re-copies every previously loaded row on each iteration (quadratic).
chunks = []
with pd.read_json(FILE_PREPROCESSED, orient='records', lines=True, chunksize=CHUNKSIZE, nrows=MAX_OBSERVATIONS) as reader:
    # Ceiling division keeps the progress-bar total correct when
    # MAX_OBSERVATIONS is not an exact multiple of CHUNKSIZE.
    for chunk_i, chunk in tqdm_notebook(enumerate(reader), total=-(-MAX_OBSERVATIONS // CHUNKSIZE)):
        # Tag every row with the index of the chunk it was read from.
        chunks.append(chunk.assign(chunk_idx=chunk_i))

# pd.concat raises on an empty list, so fall back to an empty frame when the
# reader yielded nothing.
df = pd.concat(chunks, ignore_index=True) if chunks else pd.DataFrame()
del chunks  # drop the per-chunk copies now that df owns the data

  0%|          | 0/10 [00:00<?, ?it/s]

In [5]:
# Preview the first rows to check which columns were loaded.
df.head()

Unnamed: 0,_id,title,venue,year,keywords,n_citation,lang,authors,fos,page_start,...,volume,issue,issn,isbn,doi,pdf,url,abstract,references,chunk_idx
0,53e99784b7602d9701f3e3f5,3GIO.,{'type': 0},2011.0,[],0.0,en,,,,...,,,,,,,,,,0
1,53e99784b7602d9701f3e133,The relationship between canopy parameters and...,"{'_id': '53a7297d20f7420be8bd4ae7', 'name_d': ...",2011.0,"[canopy parameters, canopy spectrum, different...",0.0,en,"[{'_id': '53f45728dabfaec09f209538', 'name': '...","[Agronomy, Moisture, Hydrology, Environmental ...",1930.0,...,,,,,10.1109/IGARSS.2011.6049503,,[http://dx.doi.org/10.1109/IGARSS.2011.6049503],Drought is the first place in all the natural ...,,0
2,53e99784b7602d9701f3e151,A solution to the problem of touching and brok...,"{'_id': '53a72a4920f7420be8bfa51b', 'name_d': ...",1993.0,"[handwriting recognition, prototypes, image se...",17.0,en,"[{'_id': '53f46797dabfaeb22f542630', 'name': '...","[Intelligent character recognition, Pattern re...",602.0,...,,,,,10.1109/ICDAR.1993.395663,,[http://dx.doi.org/10.1109/ICDAR.1993.395663],,"[53e99cf5b7602d97025ace63, 557e8a7a6fee0fe990c...",0
3,53e99784b7602d9701f3e15d,Timing yield estimation using statistical stat...,"{'_id': '53a72e2020f7420be8c80142', 'name_d': ...",2005.0,"[sequential circuits, statistical distribution...",28.0,en,"[{'_id': '53f43b03dabfaedce555bf2a', 'name': '...","[Delay calculation, Timing failure, Monte Carl...",2461.0,...,,,,0-7803-8834-8,10.1109/ISCAS.2005.1465124,//static.aminer.org/pdf/PDF/000/423/329/timing...,"[http://dx.doi.org/10.1109/ISCAS.2005.1465124,...",As process variations become a significant pro...,"[53e9a8a9b7602d97031f6bb9, 599c7b6b601a182cd27...",0
4,53e99784b7602d9701f3e161,360°,"{'_id': '5390a74a20f70186a0e8b40b', 'raw': 'AC...",2009.0,"[global high technology, daily short-distance ...",,en,"[{'_id': '53f46946dabfaec09f24b4ed', 'name': '...",,39.0,...,,,,,10.1145/1665137.1665166,,,360° represents the concerns that are addresse...,,0


In [6]:
# Count how many papers each author name appears on.
# Rows whose 'authors' value is not a list (e.g. missing) are skipped, as are
# entries without a 'name' key or whose name contains characters other than
# ASCII letters, whitespace, '-', '.' and ','.
authors = Counter(
    entry['name']
    for paper_authors in df['authors']
    if isinstance(paper_authors, list)
    for entry in paper_authors
    if 'name' in entry and re.match(r'^[a-zA-Z\s\-\.\,]+$', entry['name'])
)

In [7]:
# Keep only the 30000 most frequent author names, as a name -> paper-count dict.
# NOTE(review): 30000 is a magic number; consider a named constant next to
# CHUNKSIZE / MAX_OBSERVATIONS.
top_n_authors = dict(authors.most_common(30000))

In [8]:
# NOTE(review): this renders the whole mapping (up to 30000 entries) into the
# cell output; a short slice would be easier to read.
top_n_authors

{'Wei Wang': 602,
 'Lei Zhang': 415,
 'Wei Zhang': 405,
 'Wei Li': 363,
 'Yang Liu': 336,
 'Lei Wang': 330,
 'Jun Wang': 318,
 'Li Zhang': 313,
 'Yan Zhang': 302,
 'Wen Gao': 292,
 'Jun Zhang': 291,
 'Wei Liu': 273,
 'Ming Li': 262,
 'Xin Li': 262,
 'Jun Li': 255,
 'Jing Li': 254,
 'Jiawei Han': 242,
 'Li Li': 241,
 'Wei Chen': 236,
 'Yu Wang': 233,
 'Elisa Bertino': 232,
 'Jing Wang': 232,
 'Bo Li': 231,
 'Philip S. Yu': 228,
 'Thomas S. Huang': 226,
 'Tao Li': 225,
 'Yu Zhang': 224,
 'Hai Jin': 222,
 'Hui Li': 222,
 'Hui Wang': 217,
 'Witold Pedrycz': 217,
 'Xin Wang': 213,
 'Bin Li': 210,
 'Jie Wu': 206,
 'Chin-Chen Chang': 206,
 'Jie Zhang': 205,
 'Mario Piattini': 202,
 'Yong Wang': 201,
 'Ajith Abraham': 200,
 'Jian Li': 198,
 'Yi Zhang': 197,
 'Edwin R. Hancock': 197,
 'H. Vincent Poor': 197,
 'Yang Yang': 197,
 'Jing Zhang': 197,
 'Ying Zhang': 196,
 'Bin Wang': 195,
 'Qing Li': 195,
 'Ping Zhang': 195,
 'Rui Zhang': 194,
 'Jie Yang': 194,
 'Xiang Li': 193,
 'Yi Wang': 193,
 'X

In [9]:
# Sanity check that the most_common() pairs were converted into a dict.
type(top_n_authors)

dict

In [10]:
def get_authors_list(x):
    """Return the names of the authors in ``x`` that belong to ``top_n_authors``.

    Parameters
    ----------
    x : list of dict, or anything else
        One cell of the ``authors`` column. Each dict may carry a ``'name'``
        key. Any value that is not a list (e.g. a missing value) is treated
        as "no authors".

    Returns
    -------
    list
        Names, in their original order, whose entry has a ``'name'`` key and
        whose name is a key of the notebook-level ``top_n_authors`` mapping.
        Empty when ``x`` is not a list.
    """
    if isinstance(x, list):
        return [
            entry['name']
            for entry in x
            if 'name' in entry and entry['name'] in top_n_authors
        ]
    return []

# Add a per-paper list of retained author names; these lists are the
# "transactions" passed to the encoder further down.
df['authors_list'] = df.authors.apply(get_authors_list)

In [11]:
# Confirm that the authors_list column was added.
df.head()

Unnamed: 0,_id,title,venue,year,keywords,n_citation,lang,authors,fos,page_start,...,issue,issn,isbn,doi,pdf,url,abstract,references,chunk_idx,authors_list
0,53e99784b7602d9701f3e3f5,3GIO.,{'type': 0},2011.0,[],0.0,en,,,,...,,,,,,,,,0,[]
1,53e99784b7602d9701f3e133,The relationship between canopy parameters and...,"{'_id': '53a7297d20f7420be8bd4ae7', 'name_d': ...",2011.0,"[canopy parameters, canopy spectrum, different...",0.0,en,"[{'_id': '53f45728dabfaec09f209538', 'name': '...","[Agronomy, Moisture, Hydrology, Environmental ...",1930.0,...,,,,10.1109/IGARSS.2011.6049503,,[http://dx.doi.org/10.1109/IGARSS.2011.6049503],Drought is the first place in all the natural ...,,0,[Yun Xu]
2,53e99784b7602d9701f3e151,A solution to the problem of touching and brok...,"{'_id': '53a72a4920f7420be8bfa51b', 'name_d': ...",1993.0,"[handwriting recognition, prototypes, image se...",17.0,en,"[{'_id': '53f46797dabfaeb22f542630', 'name': '...","[Intelligent character recognition, Pattern re...",602.0,...,,,,10.1109/ICDAR.1993.395663,,[http://dx.doi.org/10.1109/ICDAR.1993.395663],,"[53e99cf5b7602d97025ace63, 557e8a7a6fee0fe990c...",0,[]
3,53e99784b7602d9701f3e15d,Timing yield estimation using statistical stat...,"{'_id': '53a72e2020f7420be8c80142', 'name_d': ...",2005.0,"[sequential circuits, statistical distribution...",28.0,en,"[{'_id': '53f43b03dabfaedce555bf2a', 'name': '...","[Delay calculation, Timing failure, Monte Carl...",2461.0,...,,,0-7803-8834-8,10.1109/ISCAS.2005.1465124,//static.aminer.org/pdf/PDF/000/423/329/timing...,"[http://dx.doi.org/10.1109/ISCAS.2005.1465124,...",As process variations become a significant pro...,"[53e9a8a9b7602d97031f6bb9, 599c7b6b601a182cd27...",0,"[Chris C. N. Chu, Hai Zhou]"
4,53e99784b7602d9701f3e161,360°,"{'_id': '5390a74a20f70186a0e8b40b', 'raw': 'AC...",2009.0,"[global high technology, daily short-distance ...",,en,"[{'_id': '53f46946dabfaec09f24b4ed', 'name': '...",,39.0,...,,,,10.1145/1665137.1665166,,,360° represents the concerns that are addresse...,,0,[]


In [12]:
%%time
# One-hot encode the per-paper author lists: learn the set of distinct names
# and build the boolean paper-by-author array in a single call.
te = TransactionEncoder()
te_ary = te.fit_transform(df['authors_list'])

CPU times: user 1.2 s, sys: 770 ms, total: 1.97 s
Wall time: 2.07 s


In [13]:
# Inspect the encoded array produced by the TransactionEncoder.
te_ary

array([[False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       ...,
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False]])

In [14]:
# Wrap the encoded array in a DataFrame, labelling the columns with the item
# names the encoder learned (te.columns_).
df_te = pd.DataFrame(te_ary, columns=te.columns_)

In [15]:
# Preview the encoded frame (one boolean column per encoded item).
df_te.head()

Unnamed: 0,Arvind,CACM Staff,Computer Staff,IEEE Expert staff,IEEE Software Staff,Kinshuk,Luqi,Staff,Ubiquity staff,A Min Tjoa,...,wang,wei,wu,xu,yang,yu,zhang,zhao,zhou,zhu
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [17]:
%%time
# Mine frequent itemsets of at most two items (max_len=2), i.e. single authors
# and author pairs, reported by author name (use_colnames=True).
# NOTE(review): min_support=1e-30 is far below 1/len(df_te) for any realistic
# row count, so presumably the intent is "keep everything that occurs at
# least once" - confirm.
res = fpgrowth(df_te, min_support=1e-30, use_colnames=True, max_len=2)

CPU times: user 35.7 s, sys: 16.3 s, total: 52 s
Wall time: 1min 2s


In [20]:
# Derive association rules from the mined itemsets, using lift as the
# selection metric with a minimum threshold of 1.
res_rules = association_rules(res, metric="lift", min_threshold=1)
res_rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(Guoliang Chen),(Yun Xu),0.000035,0.000019,0.000003,0.085714,4511.278195,2.999335e-06,1.093729
1,(Yun Xu),(Guoliang Chen),0.000019,0.000035,0.000003,0.157895,4511.278195,2.999335e-06,1.187458
2,(Di Wu),(Yun Xu),0.000097,0.000019,0.000001,0.010309,542.593597,9.981570e-07,1.010397
3,(Yun Xu),(Di Wu),0.000019,0.000097,0.000001,0.052632,542.593597,9.981570e-07,1.055453
4,(Yi Shang),(Yun Xu),0.000032,0.000019,0.000002,0.062500,3289.473684,1.999392e-06,1.066646
...,...,...,...,...,...,...,...,...,...
359879,(Kai Zhang),(Weijun Liu),0.000061,0.000013,0.000001,0.016393,1261.034048,9.992070e-07,1.016653
359880,(Jinhua Wang),(Weijun Liu),0.000013,0.000013,0.000001,0.076923,5917.159763,9.998310e-07,1.083319
359881,(Weijun Liu),(Jinhua Wang),0.000013,0.000013,0.000001,0.076923,5917.159763,9.998310e-07,1.083319
359882,(Wei Jin),(Weijun Liu),0.000029,0.000013,0.000001,0.034483,2652.519894,9.996230e-07,1.035701


In [27]:
# Show the rules whose antecedent is exactly the one-author set {'Guoliang Chen'}.
res_rules[res_rules.antecedents.eq({'Guoliang Chen'})]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(Guoliang Chen),(Yun Xu),3.5e-05,1.9e-05,3e-06,0.085714,4511.278195,2.999335e-06,1.093729
20764,(Guoliang Chen),(Yunji Chen),3.5e-05,2.1e-05,1e-06,0.028571,1360.544218,9.99265e-07,1.02939
73285,(Guoliang Chen),(Huaping Chen),3.5e-05,2e-05,1e-06,0.028571,1428.571429,9.993e-07,1.029391
149971,(Guoliang Chen),(Darrell Whitley),3.5e-05,3.1e-05,1e-06,0.028571,921.658986,9.98915e-07,1.02938
158652,(Guoliang Chen),(Qiang Lu),3.5e-05,1.9e-05,2e-06,0.057143,3007.518797,1.999335e-06,1.060586
213300,(Guoliang Chen),(Tianshi Chen),3.5e-05,2.7e-05,2e-06,0.057143,2116.402116,1.999055e-06,1.060577
213439,(Guoliang Chen),(Yixin Chen),3.5e-05,5.7e-05,2e-06,0.057143,1002.506266,1.998005e-06,1.060546
213441,(Guoliang Chen),(Yongsheng Ding),3.5e-05,3.7e-05,1e-06,0.028571,772.200772,9.98705e-07,1.029374
213443,(Guoliang Chen),(Jing Wang),3.5e-05,0.000232,1e-06,0.028571,123.152709,9.9188e-07,1.029173
213444,(Guoliang Chen),(Rui Wang),3.5e-05,0.000131,1e-06,0.028571,218.102508,9.95415e-07,1.029277


In [61]:
class CoAuthors:
    """Suggest co-authors for an author from a table of association rules."""

    def __init__(self, association_rules, top_n):
        """Store the rule table and the number of rules to use per prediction.

        Parameters
        ----------
        association_rules : pandas.DataFrame
            Rule table; ``predict`` reads its ``antecedents``, ``consequents``
            and ``lift`` columns.
        top_n : int
            Maximum number of rules (highest lift first) used per prediction.
        """
        self.association_rules = association_rules
        self.top_n = top_n

    def predict(self, author):
        """Return the consequent items of the strongest rules for ``author``.

        Only rules whose antecedent equals the one-element set ``{author}``
        are considered. The ``top_n`` rules with the largest lift are taken
        and their consequents are flattened into a single list, in that
        order. An author with no matching rule yields an empty list.
        """
        rules = self.association_rules
        matching = rules[rules.antecedents.eq({author})]
        strongest = matching.nlargest(self.top_n, 'lift')

        predicted = []
        for consequent in strongest.consequents:
            predicted.extend(consequent)
        return predicted

In [62]:
# Build the recommender on the mined rules, using up to 10 rules per query.
co_authors = CoAuthors(res_rules, 10)

In [63]:
# Example query: suggested co-authors for one author.
co_authors.predict('Yunquan Zhang')

['Wang Lei',
 'Huiyang Zhou',
 'Xiaodi Huang',
 'Guoliang Chen',
 'Tao Luo',
 'Chen Ding',
 'Chao Li',
 'Chao Yang',
 'Ke Wang',
 'Ting Wang']

In [64]:
# Persist the recommender (rule table plus top_n) to disk; `pickle` is
# imported with the other modules in the notebook's import cell.
# NOTE: pickle stores instances by reference to their class (module + name),
# so whatever loads this file must have a compatible CoAuthors class
# available under the same module path it had when the object was dumped.
with open('coauthors_model', 'wb') as picklefile:
    pickle.dump(co_authors, picklefile)