# AcknowとAuthorが共著してるかどうか

In [2]:
import ast
import json
import pandas as pd
import pickle
import os 
import sys
from tqdm import tqdm
import vaex
import glob
import networkx as nx
from itertools import combinations
from ast import literal_eval

# for reading acknow
sys.path.append(os.path.join(os.path.dirname(os.path.abspath('')), '.'))
from utils.read_acknow import ReadAcknow


FILES = ['compbiology', 'biology', 'medicine', 'genetics', 'ntds', 'pathogenes', 'plosone', 'srep']

# edgelist acknow.
PATH_ROW_ACKNOW = '../../data/treatedEdgeList-ack_MAG'

In [3]:
%load_ext autoreload
%autoreload

In [4]:
def see(var):
    print(type(var))
    if type(var) == dict:
        for i, (k, v) in enumerate(var.items()):
            print(k, ":", v)
            if i == 4:
                break
    if type(var) == set:
        print(list(var)[:5])

# Read PLOS and SciRep

In [5]:
%%time
# concat all dfs
dfs = {}
dfs_all = pd.DataFrame()
for file in FILES:
    print(f"read file: {file}")
    dfs[file] = pd.read_csv(f'{PATH_ROW_ACKNOW}/{file}.csv')[['paperId', 'author', 'acknow']]
    dfs_all = pd.concat([dfs_all, dfs[file]])

read file: compbiology
read file: biology
read file: medicine
read file: genetics
read file: ntds


  exec(code, glob, local_ns)


read file: pathogenes
read file: plosone
read file: srep
CPU times: user 3.34 s, sys: 301 ms, total: 3.64 s
Wall time: 3.65 s


In [6]:
dfs_all.head()

Unnamed: 0,paperId,author,acknow
0,10.1371/journal.pcbi.0010001,Hanah_Margalit,Aviv_Regev
1,10.1371/journal.pcbi.0010001,Hanah_Margalit,Benjamin_Gordon
2,10.1371/journal.pcbi.0010001,Hanah_Margalit,Dalit_May
3,10.1371/journal.pcbi.0010001,Hanah_Margalit,Ernest_Fraenkel
4,10.1371/journal.pcbi.0010001,Hanah_Margalit,Lena_Nekludova


# Read MAG data (Paper & PaperAuthorAffil) and put pid from MAG

### read MAG paper

In [7]:
# read paper id data
df_magpaper_id_doi_plos = pd.read_table('../../data-MAG/papers_plos.txt', sep=' ', names=['pid', 'doi'])
df_magpaper_id_doi_srep = pd.read_table('../../data-MAG/papers_srep.txt', sep=' ', names=['pid', 'doi'])
df_magpaper_id_doi = pd.concat([df_magpaper_id_doi_plos, df_magpaper_id_doi_srep])
# change doi to lower case
df_magpaper_id_doi["doi"] = df_magpaper_id_doi["doi"].str.lower()

# clear error row
# うまく処理されてないデータを削除
s_vs = df_magpaper_id_doi['doi'].value_counts()
mag_available_dois = set(s_vs[s_vs.values == 1].index)
df_magpaper_id_doi2 = df_magpaper_id_doi[df_magpaper_id_doi['doi'].isin(mag_available_dois)]

df_magpaper_id_doi2.head()

Unnamed: 0,pid,doi
0,1966827022,10.1371/journal.pcbi.1002887
1,1970867439,10.1371/journal.pone.0040555
2,1974609334,10.1371/journal.pmed.1000202
3,1985768873,10.1371/journal.pcbi.1000731
4,1991290371,10.1371/journal.pone.0090052


### replace paper's doi to papaerID of MAG

In [8]:
dfs_all_w_pid = pd.merge(dfs_all, df_magpaper_id_doi2,  
          how='inner',left_on ='paperId',right_on='doi')
dfs_all_w_pid.head()

Unnamed: 0,paperId,author,acknow,pid,doi
0,10.1371/journal.pcbi.0010001,Hanah_Margalit,Aviv_Regev,2145218186,10.1371/journal.pcbi.0010001
1,10.1371/journal.pcbi.0010001,Hanah_Margalit,Benjamin_Gordon,2145218186,10.1371/journal.pcbi.0010001
2,10.1371/journal.pcbi.0010001,Hanah_Margalit,Dalit_May,2145218186,10.1371/journal.pcbi.0010001
3,10.1371/journal.pcbi.0010001,Hanah_Margalit,Ernest_Fraenkel,2145218186,10.1371/journal.pcbi.0010001
4,10.1371/journal.pcbi.0010001,Hanah_Margalit,Lena_Nekludova,2145218186,10.1371/journal.pcbi.0010001


### replace authorName of row data into authorId of MAG

In [9]:
# %%time
# vaex_mag_pauthorAffil = vaex.from_csv('../../data-MAG/PaperAuthorAffiliations/PaperAuthorAffiliations.csv', convert=True, chunk_size=10_000_000) # 10_000_000~200MB

# read hdf5 ? from 2nd time
vaex_mag_pauthorAffil = vaex.open('../../data-MAG/PaperAuthorAffiliations/PaperAuthorAffiliations.csv.hdf5')

In [10]:
# pid of row data
row_pids = dfs_all_w_pid.pid.tolist()

# authorIds of paperId from AuthorAffil.csv
vaex_row_paperinfo = vaex_mag_pauthorAffil[vaex_mag_pauthorAffil.PaperId.isin(row_pids)]
# vaex_row_paperinfo.head()

# group authors by paperID before merging to dfs_all
df_row_paperinfo = vaex_row_paperinfo.to_pandas_df().rename({' AuthorId': 'AuthorId'}, axis=1)
df_row_paperinfo_groupby_paper = df_row_paperinfo.groupby('PaperId')['AuthorId'].apply(list).reset_index()
df_row_paperinfo_groupby_paper.head()

Unnamed: 0,PaperId,AuthorId
0,11386834,"[1399976251, 2010425313, 2129111168]"
1,12394107,"[2137887536, 2423373870, 2423373870, 2529899695]"
2,23277314,"[1777294175, 1777294175, 2127422046, 215886216..."
3,27879997,"[1975038829, 1975038829, 1975038829, 205481105..."
4,28776219,"[2030885370, 2409662761]"


### merge to dfs_all authorId data and recreate edgelist using AuthorIds

In [11]:
# use only pid and acknow columns
dfs_all_pid_acknow = dfs_all_w_pid[['pid', 'acknow']].drop_duplicates()
# merge authorIds and explode them
dfs_all_w_pid_tmp = pd.merge(dfs_all_pid_acknow, df_row_paperinfo_groupby_paper,  
          how='inner',left_on ='pid',right_on='PaperId')
replaced_dfs_all = dfs_all_w_pid_tmp[['pid', 'acknow', 'AuthorId']].explode('AuthorId')
replaced_dfs_all.head()

Unnamed: 0,pid,acknow,AuthorId
0,2145218186,Aviv_Regev,1219376531
0,2145218186,Aviv_Regev,2079978182
0,2145218186,Aviv_Regev,2125555973
1,2145218186,Benjamin_Gordon,1219376531
1,2145218186,Benjamin_Gordon,2079978182


# Acknowの名前を使って，彼らの名前がとりあえず入ってる論文setを作成

In [26]:
# %%time
# vaex_mag_authors = vaex.from_csv('../../data-MAG/Authors/Authors.csv', convert=True, chunk_size=10_000_000) # 10_000_000~200MB

# read hdf5 ? from 2nd time
vaex_mag_authors = vaex.open('../../data-MAG/Authors/Authors.csv.hdf5')

In [13]:
# read hdf5 ? from 2nd time
vaex_mag_pauthorAffil = vaex.open('../../data-MAG/PaperAuthorAffiliations/PaperAuthorAffiliations.txt.hdf5')
vaex_mag_pauthorAffil.head()

#,PaperId,AuthorId,AffiliationId,AuthorSequenceNumber,OriginalAuthor,OriginalAffiliation
0,9,2632942543,,1,Victoriano Perruca Albadalejo,--
1,15,199142497,,1,Robert Münscher,"Heidelberg, Deutschland"
2,15,680395887,,2,Julia Hormuth,"Reutlingen, Deutschland"
3,23,1243978490,79576900.0,1,Eric T Stoopler,"'Department of Oral Medicine, University of Penn..."
4,23,2582258949,,3,Arthur S Kuperstein,--
5,23,2582736345,,2,Ying Wai Sia,--
6,58,2641488431,,1,I. Uray,--
7,58,2693978806,,2,P. Kovacs,--
8,79,2662843304,,1,K. Suresh,--
9,79,2683440697,,2,T. S. J. Norman,--


In [14]:
list(set(replaced_dfs_all.acknow.tolist()))

['F._Totanes',
 'Nasser_Ibrahim',
 'Ayame_Enomoto',
 'Denis_Valleix',
 'M._Boissan',
 'W._Mason',
 'J._Silaa',
 'Arlene_Rosen',
 'Petra_Hitschke',
 'Alyson_Warr',
 'Laura_Liggins',
 'T._Spintzenberger',
 'Linda_Wordeman',
 'Tsu-An_Hsu',
 'Cassandra_Carrivales',
 'Joseph_Mevan-Atkins',
 'David_N_Brindley',
 'Lindsay_Dean',
 'Linda_Moran',
 'dr.jose_Antonio_Encinar',
 'Marisa_Hernández',
 'I._Splawski',
 'c._Cantalapiedra',
 'Paul_Garthwaite',
 'Suelee_Robbe-Austerman',
 'Chiang_CY',
 'Mary_Mirvis',
 'P._drinker',
 'Mirjam_Debon',
 'Abbigail_Tumpey',
 'S._Fauve',
 'Randall_Downer',
 'Bjørn_Lau',
 'A._Shah',
 'Arnaud_Borderie',
 'D._Živorová',
 'Maud_Mazaniello-Chezol',
 'Stephanie_London',
 'Caroline_Petersen',
 'Eric_Jackson',
 'Tim_Lewis',
 'M._Aguet',
 'M._Takami',
 'John_E._Cadle',
 'Suzie_Lavoie',
 'R.J._Youle',
 'Enrique_Torres',
 'Erzsébet_Kertész',
 'J._Garcia-Ulloa',
 'Marcel_Visser',
 'Dominique_Poncelin',
 'a._Tcherenkov',
 'Alice_L.T._Yu',
 'Mandy_Tumlin',
 'Camden_Lo',
 'Koj

In [15]:
# repalce acknow name without '_'
replaced_dfs_all['acknow'] = replaced_dfs_all['acknow'].apply(lambda x: " ".join(x.split('_')))

In [17]:
row_acknows = list(set(replaced_dfs_all.acknow.tolist()))

In [27]:
%%time
# list of acknows
row_acknows = list(set(replaced_dfs_all.acknow.tolist()))

# extract possible acknow ids from Authors.csv
possible_ackonw_ids = vaex_mag_authors[vaex_mag_authors['DisplayName'].isin(row_acknows)].AuthorId.tolist()

# extract possible paperId include acknow name 
vaex_row_paperinfo_acknow = vaex_mag_pauthorAffil[(vaex_mag_pauthorAffil['OriginalAuthor'].isin(row_acknows)) | vaex_mag_pauthorAffil['AuthorId'].isin(possible_ackonw_ids)][['PaperId', 'AuthorId']]

""" replace before"""
# # Acknow name list
# row_acknows = list(set(replaced_dfs_all.acknow.tolist()))

# # extract possible acknow ids
# possible_ackonw_ids = vaex_mag_authors[vaex_mag_authors['DisplayName'].isin(row_acknows)].AuthorId.tolist()

# paperIds which is possible of acknows
# とりあえずacknowの名前のidの人が含まれてる論文
# vaex_row_paperinfo_acknow = vaex_mag_pauthorAffil[vaex_mag_pauthorAffil[' AuthorId'].isin(possible_ackonw_ids)]

vaex_row_paperinfo_acknow.head()

CPU times: user 3min 7s, sys: 27.9 s, total: 3min 35s
Wall time: 1min 4s


#,PaperId,AuthorId
0,125,2250382311
1,285,2970207820
2,348,3075130288
3,504,2419810751
4,524,3148567105
5,587,2145458040
6,1193,2628638280
7,1273,2250460279
8,1273,2690401570
9,1314,2627395902


In [28]:
# acknowの名前が含まれてる論文ids
possible_acknow_paperid_set = set(vaex_row_paperinfo_acknow.PaperId.tolist())

In [29]:
print(f'length of possible acknow ids = {len(set(possible_ackonw_ids))}')

length of possible acknow ids = 15505739


# AuthorIdを使って，Author's Published Paper Idsを作る

In [30]:
# author's ids
row_authors = list(set(replaced_dfs_all.AuthorId.tolist()))

# paperIds which is published by authors
vaex_author_related_paperinfo = vaex_mag_pauthorAffil[vaex_mag_pauthorAffil['AuthorId'].isin(row_authors)]

# extract pids
authors_published_paper_ids = set(vaex_author_related_paperinfo.PaperId.tolist())

# 各authorがpublishした論文idを必要な論文idだけから作る

### authorが書いた論文の中で，acknowの名前が入ってるものに限定（必要な論文の共著情報をとってくる）

In [31]:
d1_possible_collaborated_paper_ids = list(authors_published_paper_ids & possible_acknow_paperid_set) # length == 11,866,992

### この論文idに対して，1.誰がどの論文を書いたのか`{authors: [pids]}`,と 2.論文とその著者関係`{pid: [authors]}`を作る．

### 1.誰がどの論文を書いたのか`{authors: [pids]}`→df_author_related_paperinfo_grouped

In [32]:
%%time
# extract paper and author data necessary
vaex_author_related_paperinfo2 = vaex_author_related_paperinfo[vaex_author_related_paperinfo['PaperId'].isin(d1_possible_collaborated_paper_ids)]

# group papers by authorId
df_author_related_paperinfo = vaex_author_related_paperinfo2.to_pandas_df() #.rename({' AuthorId': 'AuthorId'}, axis=1)
df_author_related_paperinfo_grouped = df_author_related_paperinfo.groupby('AuthorId')['PaperId'].apply(list).reset_index()

# rename 
df_author_related_paperinfo_grouped = df_author_related_paperinfo_grouped.rename({'PaperId': 'PublishedPaperId'}, axis=1)
df_author_related_paperinfo_grouped.head()

CPU times: user 1min 2s, sys: 1min 8s, total: 2min 11s
Wall time: 2min 56s


Unnamed: 0,AuthorId,PublishedPaperId
0,15251,"[289883683, 1571665470, 1576209224, 1814515265..."
1,18161,"[231729728, 838129493, 1502222176, 1566592723,..."
2,27238,"[574348173, 643728258, 657244501, 754093823, 1..."
3,30645,"[1006201, 68834758, 127261126, 134483773, 1422..."
4,41399,"[5786333, 44086190, 50835861, 137984443, 16627..."


### 2. 論文とその著者関係`{pid: [authors]}`→ df_candidate_paperinfo_grouped

In [34]:
%%time
# extract paper and author data necessary
vaex_candidated_paperinfo = vaex_mag_pauthorAffil[vaex_mag_pauthorAffil['PaperId'].isin(d1_possible_collaborated_paper_ids)]

# filter by acknow's authorId: もし共著情報にはacknow以外の共著情報はいらないので，そこは捨てる
vaex_candidated_paperinfo2 = vaex_candidated_paperinfo[vaex_candidated_paperinfo['AuthorId'].isin(list(set(possible_ackonw_ids)))]

# group authorId by paperId
df_candidate_paperinfo = vaex_candidated_paperinfo2.to_pandas_df() #.rename({' AuthorId': 'AuthorId'}, axis=1)
df_candidate_paperinfo_grouped = df_candidate_paperinfo.groupby('PaperId')['AuthorId'].apply(list).reset_index()

# reanme
df_candidate_paperinfo_grouped = df_candidate_paperinfo_grouped.rename({'AuthorId': 'CollaboratedAuthorId'}, axis=1)
df_candidate_paperinfo_grouped.head() # length = 11,866,992

CPU times: user 3min 19s, sys: 1min 42s, total: 5min 1s
Wall time: 5min 8s


Unnamed: 0,PaperId,CollaboratedAuthorId
0,504,[2419810751]
1,587,[2145458040]
2,1688,"[2126056503, 2736971842]"
3,2208,[2099564275]
4,3066,[2222372278]


# Save1

In [35]:
df_author_related_paperinfo_grouped.to_csv('../../data-computed-MAG/Author_publishedPaperId.csv', index=False)
df_candidate_paperinfo_grouped.to_csv('../../data-computed-MAG/Paper_CollaboratedAuthorId.csv', index=False)

"""
if you start from here, run the following commands
"""
# df_author_related_paperinfo_grouped = pd.read_csv('../../data-computed-MAG/Author_publishedPaperId.csv')
# df_author_related_paperinfo_grouped = pd.read_csv('../../data-computed-MAG/Paper_CollaboratedAuthorId.csv')


'\nif you start from here, run the following commands\n'

# 共著関係`{author: [CollaboratedAuthorIds]}`を作る

### まずはauthorsの共著情報を作る

### 手順
1. `authors: [published_pids]`をexplode (w/ df_author_related_paperinfo_grouped)
2. published_pidに対して，`{pid: [collaboraterId]}`の情報をmerge (w/ df_candidate_paperinfo_grouped)
3. groupby authorIdsで`{authorId: [collaboraterIds]}`を作る

### 1. `authors: [published_pids]`をexplode (w/ df_author_related_paperinfo_grouped)

In [36]:
df_author_related_paperinfo_exploded = df_author_related_paperinfo_grouped.explode('PublishedPaperId')
df_author_related_paperinfo_exploded.head()

Unnamed: 0,AuthorId,PublishedPaperId
0,15251,289883683
0,15251,1571665470
0,15251,1576209224
0,15251,1814515265
0,15251,1844237543


### 2. published_pidに対して，`{pid: [collaboraterId]}`の情報をmerge (w/ df_candidate_paperinfo_grouped)

In [37]:
%%time
df_author_related_paperinfo_merged = pd.merge(df_author_related_paperinfo_exploded, df_candidate_paperinfo_grouped,  
          how='inner', left_on='PublishedPaperId', right_on='PaperId')
df_author_related_paperinfo_merged.head() # length = 32,143,233

CPU times: user 23.6 s, sys: 30.4 s, total: 54.1 s
Wall time: 1min 12s


Unnamed: 0,AuthorId,PublishedPaperId,PaperId,CollaboratedAuthorId
0,15251,289883683,289883683,[2218595941]
1,346729876,289883683,289883683,[2218595941]
2,2218595941,289883683,289883683,[2218595941]
3,15251,1571665470,1571665470,[2218595941]
4,346729876,1571665470,1571665470,[2218595941]


### 3. groupby authorIdsで`{authorId: [collaboraterIds]}`を作る

In [38]:
%%time
# change columns of `CollaboratedAuthorId` for drop_duplicates
df_author_related_paperinfo_merged['str_CollaboratedAuthorId'] = df_author_related_paperinfo_merged['CollaboratedAuthorId'].apply(str)

# drop same collaboration row
df_author_related_paperinfo_merged2 = df_author_related_paperinfo_merged.drop_duplicates(subset=['AuthorId', 'str_CollaboratedAuthorId'])

CPU times: user 27.2 s, sys: 19.3 s, total: 46.5 s
Wall time: 58 s


In [39]:
%%time
df_author_collaborater = df_author_related_paperinfo_merged2[['AuthorId', 'CollaboratedAuthorId']].groupby('AuthorId').agg({'CollaboratedAuthorId': 'sum'}).reset_index()

CPU times: user 1min 26s, sys: 7.43 s, total: 1min 33s
Wall time: 1min 37s


In [40]:
df_author_collaborater.head()

Unnamed: 0,AuthorId,CollaboratedAuthorId
0,15251,"[2218595941, 3123245046, 1061427020, 221859594..."
1,18161,"[228256569, 1995611453, 2776889896, 251390627,..."
2,27238,"[27238, 2157213807, 27238, 2166733471, 27238, ..."
3,30645,"[30645, 2779806783, 30645, 30645, 2333061619, ..."
4,41399,"[2119885576, 2102033253, 2102033253, 211386442..."


# Save2

In [41]:
%%time
# df_author_collaborater.to_csv('../../data-computed-MAG/AuthorId_CollaborartedAuthorId.csv', index=False)
"""
if you start from here, run the following commands
"""
# df_author_collaborater = pd.read_csv('../../data-computed-MAG/AuthorId_CollaborartedAuthorId.csv', converters={'CollaboratedAuthorId': literal_eval})

CPU times: user 19.9 s, sys: 789 ms, total: 20.7 s
Wall time: 21 s


'\nif you start from here, run the following commands\n'

# Acknowのそれぞれの名前に対して，考えうるauthorIdの情報を`{acknowName: [authorId(candidates)]}`で持つ

In [42]:
# from Authors.csv
vaex_acknow_possibleIds = vaex_mag_authors[vaex_mag_authors['DisplayName'].isin(row_acknows)][['AuthorId', 'DisplayName']]
vaex_acknow_possibleIds.head()

# from paperAuthorAffil
vaex_acknow_possibleIds2 = vaex_mag_pauthorAffil[(vaex_mag_pauthorAffil['OriginalAuthor'].isin(row_acknows))][['AuthorId', 'OriginalAuthor']]

In [43]:
%%time
# group authorId by paperId
df_acknow_possibleIds = vaex_acknow_possibleIds.to_pandas_df()
df_acknow_possibleIds2 = vaex_acknow_possibleIds2.to_pandas_df().rename({'OriginalAuthor': 'DisplayName'}, axis=1)

# merge them
df_acknow_possibleIds = pd.concat([df_acknow_possibleIds, df_acknow_possibleIds2])

df_acknow_possibleIds_grouped = df_acknow_possibleIds.groupby('DisplayName')['AuthorId'].apply(list).reset_index()

# rename
df_acknow_possibleIds_grouped = df_acknow_possibleIds_grouped.rename({'AuthorId': 'PossibleAcknowId'}, axis=1)
df_acknow_possibleIds_grouped.head()

CPU times: user 1min 54s, sys: 40.2 s, total: 2min 35s
Wall time: 1min 49s


Unnamed: 0,DisplayName,PossibleAcknowId
0,A Desmedt,"[2498740362, 2596806000, 2604083863, 267282799..."
1,A Fernando,"[2205519779, 2233627207, 2308779187, 242638003..."
2,A Giaccia,"[3183792638, 23160563, 23160563, 23160563, 318..."
3,A Joyner,"[2567358014, 2177468446, 2177468446, 256735801..."
4,A K Gupta,"[2108172652, 2126845727, 2132210745, 219326991..."


# `replaced_dfs_all`に，authorのcollaboration情報と，acknowの考えうるauthorId情報をmergeして,共通してるauthorIdからacknowのauthorIdを決める

In [44]:
replaced_dfs_all.head()

Unnamed: 0,pid,acknow,AuthorId
0,2145218186,Aviv Regev,1219376531
0,2145218186,Aviv Regev,2079978182
0,2145218186,Aviv Regev,2125555973
1,2145218186,Benjamin Gordon,1219376531
1,2145218186,Benjamin Gordon,2079978182


### merge with `author_collaboration`

In [45]:
%%time
replaced_dfs_all_mergedCollaboration = pd.merge(replaced_dfs_all, df_author_collaborater,  
          how='inner', on='AuthorId')
replaced_dfs_all_mergedCollaboration.head()

CPU times: user 1.29 s, sys: 623 ms, total: 1.92 s
Wall time: 2.2 s


Unnamed: 0,pid,acknow,AuthorId,CollaboratedAuthorId
0,2145218186,Aviv Regev,1219376531,"[3384926, 1219376531, 3384926, 1219376531, 208..."
1,2145218186,Benjamin Gordon,1219376531,"[3384926, 1219376531, 3384926, 1219376531, 208..."
2,2145218186,Dalit May,1219376531,"[3384926, 1219376531, 3384926, 1219376531, 208..."
3,2145218186,Ernest Fraenkel,1219376531,"[3384926, 1219376531, 3384926, 1219376531, 208..."
4,2145218186,Lena Nekludova,1219376531,"[3384926, 1219376531, 3384926, 1219376531, 208..."


### merge with `acknow_possibleAcknowId`

In [46]:
%%time
replaced_dfs_all_mergedCollaboration_mergedAcknowId = pd.merge(replaced_dfs_all_mergedCollaboration, df_acknow_possibleIds_grouped,  
          how='inner', left_on='acknow', right_on='DisplayName')
replaced_dfs_all_mergedCollaboration_mergedAcknowId.head() # length = 4,802,501

CPU times: user 1.42 s, sys: 392 ms, total: 1.81 s
Wall time: 1.99 s


Unnamed: 0,pid,acknow,AuthorId,CollaboratedAuthorId,DisplayName,PossibleAcknowId
0,2145218186,Aviv Regev,1219376531,"[3384926, 1219376531, 3384926, 1219376531, 208...",Aviv Regev,"[1893730172, 2808908451, 2925260654, 294674651..."
1,2105003357,Aviv Regev,1219376531,"[3384926, 1219376531, 3384926, 1219376531, 208...",Aviv Regev,"[1893730172, 2808908451, 2925260654, 294674651..."
2,2145218186,Aviv Regev,2079978182,"[3577622, 1893730172, 1893730172, 1979319809, ...",Aviv Regev,"[1893730172, 2808908451, 2925260654, 294674651..."
3,2105003357,Aviv Regev,2079978182,"[3577622, 1893730172, 1893730172, 1979319809, ...",Aviv Regev,"[1893730172, 2808908451, 2925260654, 294674651..."
4,2145218186,Aviv Regev,2125555973,"[7269306, 187710123, 2099193067, 2125555973, 2...",Aviv Regev,"[1893730172, 2808908451, 2925260654, 294674651..."


# Save3

In [47]:
%%time
# replaced_dfs_all_mergedCollaboration_mergedAcknowId[['pid', 'acknow', 'AuthorId', 'CollaboratedAuthorId', 'PossibleAcknowId']].to_csv('../../data-computed-MAG/d1Collaboration_fileter_allinfo.csv', index=False)
"""
if you start from here, run the following commands.
RECOMMMENDATION: start from 'SAVE2' due to the loading speed
"""
# replaced_dfs_all_mergedCollaboration_mergedAcknowId = pd.read_csv('../../data-computed-MAG/d1Collaboration_fileter_allinfo.csv', converters={'CollaboratedAuthorId': literal_eval, 'PossibleAcknowId': literal_eval})

CPU times: user 15min 34s, sys: 35.2 s, total: 16min 9s
Wall time: 16min 31s


"\nif you start from here, run the following commands.\nRECOMMMENDATION: start from 'SAVE2' due to the loading speed\n"

# acknowのidを決める

### 共著してるかどうかをみる

In [48]:
%%time
# PossibleAcknowIdとCollaboratedAuthorIdの共通部分を見る
replaced_dfs_all_mergedCollaboration_mergedAcknowId['IntersectedAcknowIds'] = replaced_dfs_all_mergedCollaboration_mergedAcknowId.apply(lambda row : set(row['PossibleAcknowId']) & set(row['CollaboratedAuthorId']), axis=1)

CPU times: user 2min 59s, sys: 26.9 s, total: 3min 26s
Wall time: 3min 45s


In [49]:
replaced_dfs_all_mergedCollaboration_mergedAcknowId.head()

Unnamed: 0,pid,acknow,AuthorId,CollaboratedAuthorId,DisplayName,PossibleAcknowId,IntersectedAcknowIds
0,2145218186,Aviv Regev,1219376531,"[3384926, 1219376531, 3384926, 1219376531, 208...",Aviv Regev,"[1893730172, 2808908451, 2925260654, 294674651...",{1893730172}
1,2105003357,Aviv Regev,1219376531,"[3384926, 1219376531, 3384926, 1219376531, 208...",Aviv Regev,"[1893730172, 2808908451, 2925260654, 294674651...",{1893730172}
2,2145218186,Aviv Regev,2079978182,"[3577622, 1893730172, 1893730172, 1979319809, ...",Aviv Regev,"[1893730172, 2808908451, 2925260654, 294674651...","{3147624449, 1893730172, 3207811733}"
3,2105003357,Aviv Regev,2079978182,"[3577622, 1893730172, 1893730172, 1979319809, ...",Aviv Regev,"[1893730172, 2808908451, 2925260654, 294674651...","{3147624449, 1893730172, 3207811733}"
4,2145218186,Aviv Regev,2125555973,"[7269306, 187710123, 2099193067, 2125555973, 2...",Aviv Regev,"[1893730172, 2808908451, 2925260654, 294674651...",{1893730172}


### 少なくとも一つ以上のidのかぶりがあったAcknowの人たち

In [50]:
df_determinedAcknow = replaced_dfs_all_mergedCollaboration_mergedAcknowId[replaced_dfs_all_mergedCollaboration_mergedAcknowId['IntersectedAcknowIds'] != set()][['pid', 'acknow', 'IntersectedAcknowIds']]
df_determinedAcknow.head()

Unnamed: 0,pid,acknow,IntersectedAcknowIds
0,2145218186,Aviv Regev,{1893730172}
1,2105003357,Aviv Regev,{1893730172}
2,2145218186,Aviv Regev,"{3147624449, 1893730172, 3207811733}"
3,2105003357,Aviv Regev,"{3147624449, 1893730172, 3207811733}"
4,2145218186,Aviv Regev,{1893730172}


In [22]:
len(df_determinedAcknow)

867006

In [51]:
len(df_determinedAcknow) # after one

1081208

### 一つだけにAcknowIdが決まった人たち

In [52]:
df_determinedAcknow_oneId = df_determinedAcknow[df_determinedAcknow['IntersectedAcknowIds'].str.len() == 1][['pid', 'acknow', 'IntersectedAcknowIds']]
df_determinedAcknow_oneId.head()

Unnamed: 0,pid,acknow,IntersectedAcknowIds
0,2145218186,Aviv Regev,{1893730172}
1,2105003357,Aviv Regev,{1893730172}
4,2145218186,Aviv Regev,{1893730172}
5,2105003357,Aviv Regev,{1893730172}
11,2168608639,Aviv Regev,{1893730172}


In [53]:
# change extract acknow id as str 
df_determinedAcknow_oneId['DetermindedAcknowId'] = df_determinedAcknow_oneId['IntersectedAcknowIds'].apply(list).str[0]

df_determinedAcknow_oneId.head()

Unnamed: 0,pid,acknow,IntersectedAcknowIds,DetermindedAcknowId
0,2145218186,Aviv Regev,{1893730172},1893730172
1,2105003357,Aviv Regev,{1893730172},1893730172
4,2145218186,Aviv Regev,{1893730172},1893730172
5,2105003357,Aviv Regev,{1893730172},1893730172
11,2168608639,Aviv Regev,{1893730172},1893730172


In [24]:
len(df_determinedAcknow_oneId)

797588

In [54]:
len(df_determinedAcknow_oneId) # agter one

987058

# Save

In [55]:
# save rsults
# df_determinedAcknow_oneId[['pid', 'acknow', 'DetermindedAcknowId']].to_csv('../../data-computed-MAG/d1Collaboration_result.csv', index=False)

df_determinedAcknow_oneId = pd.read_csv('../../data-computed-MAG/d1Collaboration_result.csv')

In [56]:
df_determinedAcknow_oneId.head()

Unnamed: 0,pid,acknow,DetermindedAcknowId
0,2145218186,Aviv Regev,1893730172
1,2105003357,Aviv Regev,1893730172
2,2145218186,Aviv Regev,1893730172
3,2105003357,Aviv Regev,1893730172
4,2168608639,Aviv Regev,1893730172


# Cited indentifier

In [14]:
df_result_cited = pd.read_csv('../../data-computed-MAG/references_result.csv')

In [15]:
df_result_cited

Unnamed: 0,pid,acknow,ReferencedAuthorIds
0,2145218186,Lena Nekludova,1991417625
1,2145218186,Yoseph Barash,2103482019
2,2145218186,Yoseph Barash,2103482019
3,2088922607,Bruce Stillman,2160703791
4,2088922607,M. K. Raghuraman,2063095141
...,...,...,...
224842,3127584525,Hans Keppler,2099200267
224843,3127584525,Hans Keppler,2099200267
224844,3127584525,Hans Keppler,2099200267
224845,3127584525,Federica Schiavi,2301209342


# merge 

In [16]:
len(df_determinedAcknow_oneId), len(df_result_cited)

(797588, 224847)

In [23]:
%%time
df_acknowId_d1collab_ref_merged = pd.merge(df_determinedAcknow_oneId, df_result_cited,  
          how='outer', on=['pid', 'acknow'])
df_acknowId_d1collab_ref_merged = df_acknowId_d1collab_ref_merged.drop_duplicates(subset=['pid', 'acknow'])
df_acknowId_d1collab_ref_merged.head()

CPU times: user 234 ms, sys: 58.2 ms, total: 293 ms
Wall time: 289 ms


Unnamed: 0,pid,acknow,DetermindedAcknowId,ReferencedAuthorIds
0,2145218186,Aviv Regev,1893730000.0,
2,2105003357,Aviv Regev,1893730000.0,
4,2168608639,Aviv Regev,1893730000.0,
12,2033343333,Aviv Regev,1893730000.0,3147624000.0
15,2105898083,Aviv Regev,1893730000.0,


In [24]:
len(df_acknowId_d1collab_ref_merged)

231168

In [19]:
df_acknowId_d1collab_ref_merged[df_acknowId_d1collab_ref_merged['DetermindedAcknowId'] == df_acknowId_d1collab_ref_merged['ReferencedAuthorIds']]

Unnamed: 0,pid,acknow,DetermindedAcknowId,ReferencedAuthorIds
16,2046231013,Aviv Regev,1.893730e+09,1.893730e+09
38,2028088265,Yael Altuvia,2.212855e+08,2.212855e+08
40,2026342583,Yael Altuvia,2.212855e+08,2.212855e+08
41,2145218186,Yoseph Barash,2.103482e+09,2.103482e+09
45,2105003357,Yoseph Barash,2.103482e+09,2.103482e+09
...,...,...,...,...
1212858,3123697845,Rupert Langer,2.034815e+09,2.034815e+09
1212870,3127263314,Andrea Berton,1.974124e+09,1.974124e+09
1212887,3126167156,Alpan Bek,1.967853e+09,1.967853e+09
1212889,3127584525,Hans Keppler,2.099200e+09,2.099200e+09


In [62]:
# 二つの方法（References, d1 collaboration）でずれて判定された人たち
df_acknowId_d1collab_ref_merged[(df_acknowId_d1collab_ref_merged['DetermindedAcknowId'] != df_acknowId_d1collab_ref_merged['ReferencedAuthorIds']) & ~(df_acknowId_d1collab_ref_merged['DetermindedAcknowId'].isnull()) & ~(df_acknowId_d1collab_ref_merged['ReferencedAuthorIds'].isnull())]

Unnamed: 0,pid,acknow,DetermindedAcknowId,ReferencedAuthorIds
12,2033343333,Aviv Regev,1.893730e+09,3.147624e+09
71,1971564731,Nir Friedman,2.079978e+09,3.175696e+09
355,2127164464,Gustavo Stolovitzky,3.204995e+09,2.029563e+09
528,1980375476,Peter Forster,2.181181e+09,2.729617e+09
652,2264241394,Zhifang Chai,2.267736e+09,2.931652e+09
...,...,...,...,...
1212198,3013066072,Jingzhen Shao,2.962232e+09,2.800880e+09
1212454,3021669255,Carlo Gatti,3.168202e+09,2.609771e+09
1212604,2999205633,Yuji Nakamura,2.106848e+09,2.560203e+09
1212668,3033722825,Ruthaiwan Bunkrongcheap,2.754391e+09,3.283531e+08


In [56]:
# 全てのacknowの人数
len(dfs_all.drop_duplicates(subset=['paperId', 'acknow']))

798525

In [20]:
231168/798525

0.28949375410913875

In [25]:
print(f"全てのacknowの内，{231168/len(dfs_all.drop_duplicates(subset=['paperId', 'acknow']))}の人のidを付与した")

全てのacknowの内，0.28949375410913875の人のidを付与した
