# Create an edgelist of identified acknowledged scholar from citation approach
We wiil see that an acknowledged individual name is registered in MAG and on the list of references in the acknowledged papers.

In [19]:
import ast
import pandas as pd
import vaex
import os.path
from ast import literal_eval
from pathlib import Path

FILES = ['compbiology', 'biology', 'medicine', 'genetics', 'ntds', 'pathogenes', 'plosone', 'srep']

# Path to data
PATH_ROW_ACKNOW = '../../data/TreatedEdgeList21'
PATH_MAG = '../../data-MAG/'

# Path to data to save
PATH_SAVE = '../../data-computed-MAG'
Path(PATH_SAVE).mkdir(parents=True, exist_ok=True) # create save dir if not exists

In [20]:
def see(var):
    """funtion to check the contents of variable (dict or set)"""
    print(type(var))
    if type(var) == dict:
        for i, (k, v) in enumerate(var.items()):
            print(k, ":", v)
            if i == 4:
                break
    if type(var) == set:
        print(list(var)[:5])
        
        
def read_vaex(file_path):
    """
    It may take times to read files for the first time.
    From second time, the loading time will be shorter.
    """
    if os.path.exists(file_path):
        print('file exists')
        return vaex.open(file_path+'.hdf5')
    return vaex.from_csv(file_path, convert=True, chunk_size=10_000_000)

# Read Acknow data (PLOS and SciRep)

In [21]:
%%time
# concat all dfs
dfs = {}
dfs_all = pd.DataFrame()
for file in FILES:
#     print(f"read file: {file}")
    dfs[file] = pd.read_csv(f'{PATH_ROW_ACKNOW}/{file}.csv')[['paperId', 'author', 'acknow']]
    dfs_all = pd.concat([dfs_all, dfs[file]])
dfs_all.head()

  exec(code, glob, local_ns)


CPU times: user 3.39 s, sys: 449 ms, total: 3.84 s
Wall time: 4.06 s


Unnamed: 0,paperId,author,acknow
0,10.1371/journal.pcbi.0010001,Hanah_Margalit,Aviv_Regev
1,10.1371/journal.pcbi.0010001,Hanah_Margalit,Benjamin_Gordon
2,10.1371/journal.pcbi.0010001,Hanah_Margalit,Dalit_May
3,10.1371/journal.pcbi.0010001,Hanah_Margalit,Ernest_Fraenkel
4,10.1371/journal.pcbi.0010001,Hanah_Margalit,Lena_Nekludova


# Read MAG data (Paper & PaperAuthorAffil) and put pid from MAG

### read MAG Paper with the doi of collected data

In [22]:
# read paper id data
df_magpaper_id_doi_plos = pd.read_table(f'{PATH_MAG}/papers_plos.txt', sep=' ', names=['pid', 'doi'])
df_magpaper_id_doi_srep = pd.read_table(f'{PATH_MAG}/papers_srep.txt', sep=' ', names=['pid', 'doi'])
df_magpaper_id_doi = pd.concat([df_magpaper_id_doi_plos, df_magpaper_id_doi_srep])

# change doi to lower case
df_magpaper_id_doi["doi"] = df_magpaper_id_doi["doi"].str.lower()

# clear error row: Eliminate cases where multiple paper ids are assigned to a single doi.
s_vs = df_magpaper_id_doi['doi'].value_counts()
mag_available_dois = set(s_vs[s_vs.values == 1].index)
df_magpaper_id_doi2 = df_magpaper_id_doi[df_magpaper_id_doi['doi'].isin(mag_available_dois)]

df_magpaper_id_doi2.head()

Unnamed: 0,pid,doi
0,1966827022,10.1371/journal.pcbi.1002887
1,1970867439,10.1371/journal.pone.0040555
2,1974609334,10.1371/journal.pmed.1000202
3,1985768873,10.1371/journal.pcbi.1000731
4,1991290371,10.1371/journal.pone.0090052


### Rate of DOI mathcing with MAG

In [8]:
row_dois = set(dfs_all.paperId.tolist())
mag_dois = set(df_magpaper_id_doi2.doi.tolist())

common_dois = row_dois & mag_dois
print('Intersected dois = ', len(common_dois)/len(row_dois))

Intersected dois =  0.8262019771987638


### Collect author information of MAG via doi

In [14]:
df_magpaper_commondois = df_magpaper_id_doi2[df_magpaper_id_doi2['doi'].isin(common_dois)]
common_authors_pid = df_magpaper_commondois.pid.tolist()

# Extract necessary data with PaperReferences and PaperAuthorAffiliations of MAG

In [15]:
# read PaperAuthorAffiliations.csv
vaex_mag_pauthorAffil = read_vaex(f'{PATH_MAG}/PaperAuthorAffiliations/PaperAuthorAffiliations.txt')

# read Preferences.csv
vaex_mag_prefs = read_vaex(f'{PATH_MAG}/PaperReferences/PaperReferences.csv')

# extract necessary row with ```common_authors_pid```
paper_references_extracted = vaex_mag_prefs[vaex_mag_prefs.pid.isin(common_authors_pid)]
rfpids = paper_references_extracted[' rfpid'].tolist()

# extract necessary row with ```rfpids```
vaex_mag_pauthorAffil_extracted = vaex_mag_pauthorAffil[vaex_mag_pauthorAffil.PaperId.isin(rfpids)]

file exists
file exists


## Convert extracted data of `PaperReferences` & `PaperAuthorAffi.` into DataFrame
Using `PaperReferences = {paperId: referencedPaperId}` and `PaperAuthorAffi = {paperId: authorIds}`,
create `{paperId: [authorIds]}`

In [15]:
# dict of paper:　referenced paper id
df_paper_references_extracted = paper_references_extracted.to_pandas_df()
# dict of paper: author id
df_mag_pauthorAffil_extracted = vaex_mag_pauthorAffil_extracted.to_pandas_df()

In [16]:
%%time
# groupby to a dict of {paperid: [authorIds]}
df_mag_pauthorAffil_extracted_grouped = df_mag_pauthorAffil_extracted.groupby('PaperId',)[' AuthorId'].apply(list)
df_mag_pauthorAffil_extracted_grouped = df_mag_pauthorAffil_extracted_grouped.reset_index()

CPU times: user 31 s, sys: 801 ms, total: 31.8 s
Wall time: 32.1 s


### merge with `df_paper_references_extracted`

In [20]:
%%time
df_merged = pd.merge(df_paper_references_extracted, df_mag_pauthorAffil_extracted_grouped,  
          how='inner',left_on =' rfpid',right_on='PaperId')
df_pid_authors = df_merged[['pid', ' AuthorId']].rename({' AuthorId': 'ReferencedAuthorIds'}, axis=1)

CPU times: user 2.06 s, sys: 221 ms, total: 2.28 s
Wall time: 2.28 s


In [21]:
%%time
df_merged = pd.merge(df_paper_references_extracted, df_mag_pauthorAffil_extracted_grouped,  
          how='inner',left_on =' rfpid',right_on='PaperId')
df_pid_authors = df_merged[['pid', ' AuthorId']].rename({' AuthorId': 'ReferencedAuthorIds'}, axis=1)
df_pid_authors.head()

CPU times: user 2.1 s, sys: 253 ms, total: 2.35 s
Wall time: 2.35 s


Unnamed: 0,pid,ReferencedAuthorIds
0,11386834,"[2139784259, 2141443236, 2224777168, 256426612..."
1,11386834,"[2105759627, 2137706283, 2191198583, 226611927..."
2,1978622015,"[2105759627, 2137706283, 2191198583, 226611927..."
3,2397410999,"[2105759627, 2137706283, 2191198583, 226611927..."
4,2588037944,"[2105759627, 2137706283, 2191198583, 226611927..."


### save authors informations as csv

In [22]:
# # save as authors informations
# df_pid_authors.to_csv(f'{PATH_SAVE}/pid_authors.csv', index=False)

df_pid_authors = pd.read_csv(f'{PATH_SAVE}/pid_authors.csv', converters={'ReferencedAuthorIds': literal_eval})

In [23]:
%%time
# format as DataFrame
# df_pid_authors['ReferencedAuthorIds'] = df_pid_authors['ReferencedAuthorIds'].apply(literal_eval)
df_pid_author = df_pid_authors.explode('ReferencedAuthorIds')
l_ReferencedAuthorIds = df_pid_author['ReferencedAuthorIds'].tolist()
df_pid_author.head()

CPU times: user 5.96 s, sys: 1.14 s, total: 7.1 s
Wall time: 7.3 s


Unnamed: 0,pid,ReferencedAuthorIds
0,11386834,2139784259
0,11386834,2141443236
0,11386834,2224777168
0,11386834,2564266129
0,11386834,2610359911


# Check acknoweldged names are included in ReferencedAuthorIds
With `Authors.csv`，convert `{paperId: authorIds}`into `{paperId: authorNames}`

### Read Authors.csv

In [17]:
# read Authors.csv
vaex_mag_authors = read_vaex(f'{PATH_MAG}/Authors/Authors.csv')

file exists


In [25]:
# extract necessary rows and change to pandas
vaex_mag_authors_extracted = vaex_mag_authors[vaex_mag_authors.AuthorId.isin(l_ReferencedAuthorIds)]
# vaex_mag_authors_extracted.head()
df_mag_authors_extracted = vaex_mag_authors_extracted[['AuthorId', 'DisplayName']].to_pandas_df()

### Convert `{paperId: authorIds}` into `{paperId: authorNames}`

In [26]:
%%time
df_merged_auth_refauth = pd.merge(df_pid_author, df_mag_authors_extracted,  
          how='inner',left_on ='ReferencedAuthorIds',right_on='AuthorId')
df_merged_auth_refauth.head()

CPU times: user 18.5 s, sys: 3.33 s, total: 21.8 s
Wall time: 23 s


Unnamed: 0,pid,ReferencedAuthorIds,AuthorId,DisplayName
0,11386834,2139784259,2139784259,Yulong Yin
1,11386834,2139784259,2139784259,Yulong Yin
2,2124872669,2139784259,2139784259,Yulong Yin
3,11386834,2139784259,2139784259,Yulong Yin
4,11386834,2139784259,2139784259,Yulong Yin


In [27]:
%%time
# groupby to paperid: [authorIds]
df_pid_refAuthIds = df_merged_auth_refauth[['pid', 'DisplayName']].groupby('pid')['DisplayName'].apply(set)

df_pid_refAuthIds = df_pid_refAuthIds.reset_index()

CPU times: user 21.9 s, sys: 3.64 s, total: 25.5 s
Wall time: 28.2 s


### save df_pid_refAuthIds informations as csv

In [28]:
# # save as authors informations
# df_pid_refAuthIds.to_csv(f'{PATH_SAVE}/pid_refrencedAuthorIds.csv', index=False)

# df_pid_refAuthIds = pd.read_csv(f'{PATH_SAVE}/pid_refrencedAuthorIds.csv')

### check acknowledged individual paper are cited in the paper

In [29]:
%%time
df_pid_refAuthIds = pd.read_csv(f'{PATH_SAVE}/pid_refrencedAuthorIds.csv', converters={'DisplayName': literal_eval})
df_pid_refAuthIds.head()

CPU times: user 48 µs, sys: 7 µs, total: 55 µs
Wall time: 62.9 µs


Unnamed: 0,pid,DisplayName
0,11386834,"{Hitoshi Shimano, Insuk Sohn, Geoffrey J. Huds..."
1,12394107,"{Aurél Gábris, Jonathan C. F. Matthews, E. Kni..."
2,23277314,"{Ken A. Dill, Claudio Castellano, Leo Egghe, G..."
3,27879997,"{Andre K. Geim, Vladimir E. Kravtsov, Hung Q. ..."
4,28776219,"{Alvaro Avezum, Adelina Riarte, Susana B. Ocam..."


In [30]:
df_magpaper_id_doi_w_regAuthIds = pd.merge(df_magpaper_id_doi, df_pid_refAuthIds,  
          how='inner', on='pid')
df_magpaper_id_doi_w_regAuthIds.head()

Unnamed: 0,pid,doi,DisplayName
0,1966827022,10.1371/journal.pcbi.1002887,"{Joel M. Reid, Robert S. Fulton, Abhijit Guha,..."
1,1970867439,10.1371/journal.pone.0040555,"{Gernot Riedel, David N. Furness, Malgorzata K..."
2,1985768873,10.1371/journal.pcbi.1000731,"{Peter E. Leopold, John D. Westbrook, Thomas B..."
3,2003219556,10.1371/journal.pone.0009165,"{Chong-Shan Shi, Jyoti Idnani, Hiroko Omori, R..."
4,2071989361,10.1371/journal.pone.0097711,"{Giovanni Bonanno, Marc Potters, F. Grümmer, J..."


In [31]:
%%time
dfs_all_w_pid = pd.merge(dfs_all, df_magpaper_id_doi_w_regAuthIds,  
          how='inner',left_on ='paperId',right_on='doi')

CPU times: user 552 ms, sys: 161 ms, total: 713 ms
Wall time: 755 ms


In [32]:
# change acknow's name
dfs_all_w_pid['acknow'] = dfs_all_w_pid['acknow'].apply(lambda x: " ".join(x.split('_')))
dfs_all_w_pid[['pid', 'acknow', 'DisplayName']].head()

Unnamed: 0,pid,acknow,DisplayName
0,2145218186,Aviv Regev,"{B. Marshall, Yosef Hochberg, Jeremy M Berg, G..."
1,2145218186,Benjamin Gordon,"{B. Marshall, Yosef Hochberg, Jeremy M Berg, G..."
2,2145218186,Dalit May,"{B. Marshall, Yosef Hochberg, Jeremy M Berg, G..."
3,2145218186,Ernest Fraenkel,"{B. Marshall, Yosef Hochberg, Jeremy M Berg, G..."
4,2145218186,Lena Nekludova,"{B. Marshall, Yosef Hochberg, Jeremy M Berg, G..."


In [33]:
# 入ってるかどうかをバイナリで判断
dfs_all_w_pid['cited_as_same_name'] = dfs_all_w_pid[['acknow', 'DisplayName']].apply(lambda row: True if (row['acknow'] in row['DisplayName']) else False, axis=1)

In [34]:
# acknowの名前がはいってるrowだけを抽出
a = dfs_all_w_pid[dfs_all_w_pid['cited_as_same_name']][['acknow', 'pid', 'DisplayName']]
df_referenced_acknow = a.drop_duplicates(subset=['acknow', 'pid'])
df_referenced_acknow.head()

Unnamed: 0,acknow,pid,DisplayName
4,Lena Nekludova,2145218186,"{B. Marshall, Yosef Hochberg, Jeremy M Berg, G..."
8,Yoseph Barash,2145218186,"{B. Marshall, Yosef Hochberg, Jeremy M Berg, G..."
27,Bruce Stillman,2088922607,"{Luda S. Shlyakhtenko, Ting Chen, John J. Wyri..."
30,M. K. Raghuraman,2088922607,"{Luda S. Shlyakhtenko, Ting Chen, John J. Wyri..."
40,Gustavo Stolovitzky,2034469541,"{David Maxwell, J. P. M. Postma, Shaun B. Grim..."


# Identify the acknowledged scholarID

### make sure a name with different person are listed in the references
If the case exists, we don't consider it. That is, we only consider the matched acknowledged scholar ID which appear in the references list once.

In [35]:
df_merged_auth_refauth_tmp = df_merged_auth_refauth[df_merged_auth_refauth.pid.isin(set(df_referenced_acknow['pid'].tolist()))]
df_merged_auth_refauth_tmp.head()

Unnamed: 0,pid,ReferencedAuthorIds,AuthorId,DisplayName
14,2052636440,2139784259,2139784259,Yulong Yin
24,2058112588,2139784259,2139784259,Yulong Yin
29,2966614822,2139784259,2139784259,Yulong Yin
31,3089897809,2139784259,2139784259,Yulong Yin
33,2730479429,2139784259,2139784259,Yulong Yin


In [36]:
# Identify
df_ref_author_ids = df_merged_auth_refauth_tmp.groupby(['pid', 'DisplayName'])['AuthorId']
df_ref_author_ids_grouped = df_ref_author_ids.nunique().reset_index()

# Those who have unique ID
df_merged_auth_refauth_tmp2 = df_ref_author_ids_grouped[df_ref_author_ids_grouped['AuthorId'] == 1]
df_merged_auth_refauth_tmp2.head()

Unnamed: 0,pid,DisplayName,AuthorId
0,27879997,A. Johansson,1
1,27879997,A. L. Efros,1
2,27879997,A. Satta,1
3,27879997,A. V. Shtyk,1
4,27879997,A. Yu. Mironov,1


In [37]:
df_merged_auth_refauth_tmp3 = pd.merge(df_merged_auth_refauth_tmp, df_merged_auth_refauth_tmp2,  
          how='inner', on =['pid', 'DisplayName'])[['pid', 'ReferencedAuthorIds', 'DisplayName']]

In [38]:
df_merged_auth_refauth_tmp3.head()

Unnamed: 0,pid,ReferencedAuthorIds,DisplayName
0,2052636440,2139784259,Yulong Yin
1,2058112588,2139784259,Yulong Yin
2,2058112588,2139784259,Yulong Yin
3,2966614822,2139784259,Yulong Yin
4,3089897809,2139784259,Yulong Yin


### Finally identify the ID

In [39]:
df_referenced_acknow.head()

Unnamed: 0,acknow,pid,DisplayName
4,Lena Nekludova,2145218186,"{B. Marshall, Yosef Hochberg, Jeremy M Berg, G..."
8,Yoseph Barash,2145218186,"{B. Marshall, Yosef Hochberg, Jeremy M Berg, G..."
27,Bruce Stillman,2088922607,"{Luda S. Shlyakhtenko, Ting Chen, John J. Wyri..."
30,M. K. Raghuraman,2088922607,"{Luda S. Shlyakhtenko, Ting Chen, John J. Wyri..."
40,Gustavo Stolovitzky,2034469541,"{David Maxwell, J. P. M. Postma, Shaun B. Grim..."


In [40]:
%%time
df_result1 = pd.merge(df_referenced_acknow, df_merged_auth_refauth_tmp3,  
          how='inner',left_on =['pid', 'acknow'],right_on=['pid', 'DisplayName'])

CPU times: user 4.78 s, sys: 1.13 s, total: 5.91 s
Wall time: 6.26 s


In [41]:
df_result1.head()

Unnamed: 0,acknow,pid,DisplayName_x,ReferencedAuthorIds,DisplayName_y
0,Lena Nekludova,2145218186,"{B. Marshall, Yosef Hochberg, Jeremy M Berg, G...",1991417625,Lena Nekludova
1,Yoseph Barash,2145218186,"{B. Marshall, Yosef Hochberg, Jeremy M Berg, G...",2103482019,Yoseph Barash
2,Yoseph Barash,2145218186,"{B. Marshall, Yosef Hochberg, Jeremy M Berg, G...",2103482019,Yoseph Barash
3,Bruce Stillman,2088922607,"{Luda S. Shlyakhtenko, Ting Chen, John J. Wyri...",2160703791,Bruce Stillman
4,M. K. Raghuraman,2088922607,"{Luda S. Shlyakhtenko, Ting Chen, John J. Wyri...",2063095141,M. K. Raghuraman


# Save

In [44]:
df_result1[['pid', 'acknow', 'ReferencedAuthorIds']].to_csv('../../data-computed-MAG/references_result.csv', index=False)