# Format the resulting dataset for public release

In [5]:
import pandas as pd
import numpy as np
import os.path
from pathlib import Path

# Journal keys; each key is also the stem of that journal's raw acknowledgement CSV.
FILES = [
    'compbiology',
    'biology',
    'medicine',
    'genetics',
    'ntds',
    'pathogenes',
    'plosone',
    'srep',
]

# Input locations
PATH_SAVED_CSV = '../../data-computed-MAG'
PATH_MAG = '../../data-MAG/'
PATH_ROW_ACKNOW = '../../data/TreatedEdgeList21'

# Output location for the public CSVs; created up front (with parents) if missing.
PATH_SAVE = '../../data-computed/final_csvs'
Path(PATH_SAVE).mkdir(exist_ok=True, parents=True)

# Result dataset

In [6]:
# Acknowledged-scholar IDs determined by the collaboration approach
df_determinedAcknow_oneId = pd.read_csv(f'{PATH_SAVED_CSV}/d1Collaboration_result.csv')

# Acknowledged-scholar IDs determined by the citation (reference) approach
df_result_cited = pd.read_csv(f'{PATH_SAVED_CSV}/references_result.csv')

# Outer-merge so that a (pid, acknow) pair found by either approach is kept,
# then keep only the first row of each (pid, acknow) pair.
df_acknowId_d1collab_ref_merged = pd.merge(
    df_determinedAcknow_oneId,
    df_result_cited,
    how='outer',
    on=['pid', 'acknow'],
).drop_duplicates(subset=['pid', 'acknow'])

s_collab_id = df_acknowId_d1collab_ref_merged['DetermindedAcknowId']
s_cited_id = df_acknowId_d1collab_ref_merged['ReferencedAuthorIds']

# Acknowledged scholars identified only by the collaboration approach
df_acknowId_from_d1Collab = df_acknowId_d1collab_ref_merged[s_cited_id.isnull()]
# Acknowledged scholars identified only by the citation approach
df_acknowId_from_ref = df_acknowId_d1collab_ref_merged[s_collab_id.isnull()]
# Acknowledged scholars identified by both approaches with the same ID.
# Pairs for which both approaches returned an ID but the IDs differ match
# none of the three filters and are therefore left out of the result.
df_acknowId_from_both = df_acknowId_d1collab_ref_merged[s_collab_id == s_cited_id]

# IDs of acknowledged scholars
df_acknowId = pd.concat([df_acknowId_from_d1Collab, df_acknowId_from_ref, df_acknowId_from_both])

# New acknowId column: the citation ID when present, otherwise the collaboration ID.
# Vectorised instead of iterating rows; int64 is explicit because the IDs exceed
# the 32-bit range and plain `int` is platform-dependent.
df_acknowId['acknowId'] = (
    df_acknowId['ReferencedAuthorIds']
    .fillna(df_acknowId['DetermindedAcknowId'])
    .astype('int64')
)
df_acknowId.head()

Unnamed: 0,pid,acknow,DetermindedAcknowId,ReferencedAuthorIds,acknowId
0,2145218186,Aviv Regev,1893730000.0,,1893730172
2,2105003357,Aviv Regev,1893730000.0,,1893730172
4,2168608639,Aviv Regev,1893730000.0,,1893730172
15,2105898083,Aviv Regev,1893730000.0,,1893730172
28,2145218186,Dalit May,2800119000.0,,2800119218


# Divide by journals

# Read DOI data so that the results can be broken down by journal

In [140]:
# Load the space-separated MAG tables, naming their two columns pid and doi
df_magpaper_id_doi_plos = pd.read_table(f'{PATH_MAG}/papers_plos.txt', sep=' ', names=['pid', 'doi'])
df_magpaper_id_doi_srep = pd.read_table(f'{PATH_MAG}/papers_srep.txt', sep=' ', names=['pid', 'doi'])

# Stack both sources and lower-case every DOI
df_magpaper_id_doi = pd.concat([df_magpaper_id_doi_plos, df_magpaper_id_doi_srep])
df_magpaper_id_doi = df_magpaper_id_doi.assign(doi=df_magpaper_id_doi["doi"].str.lower())

# Discard ambiguous rows: keep only DOIs that occur exactly once, i.e. drop
# every DOI to which more than one paper id is assigned.
s_vs = df_magpaper_id_doi['doi'].value_counts()
mag_available_dois = set(s_vs.loc[s_vs.eq(1)].index)
df_magpaper_id_doi2 = df_magpaper_id_doi.loc[df_magpaper_id_doi['doi'].isin(mag_available_dois)]

df_magpaper_id_doi2.head()

Unnamed: 0,pid,doi
0,1966827022,10.1371/journal.pcbi.1002887
1,1970867439,10.1371/journal.pone.0040555
2,1974609334,10.1371/journal.pmed.1000202
3,1985768873,10.1371/journal.pcbi.1000731
4,1991290371,10.1371/journal.pone.0090052


In [12]:
# Read the acknowledgement file of every journal, keeping only the three
# columns used downstream; `dfs` maps journal key -> DataFrame.
dfs = {}
for file in FILES:
    dfs[file] = pd.read_csv(f'{PATH_ROW_ACKNOW}/{file}.csv')[['paperId', 'author', 'acknow']]

# Concatenate once after the loop: growing a frame with pd.concat on every
# iteration re-copies all previously read rows each time.
dfs_all = pd.concat(list(dfs.values())) if dfs else pd.DataFrame()
dfs_all.head()

  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0,paperId,author,acknow
0,10.1371/journal.pcbi.0010001,Hanah_Margalit,Aviv_Regev
1,10.1371/journal.pcbi.0010001,Hanah_Margalit,Benjamin_Gordon
2,10.1371/journal.pcbi.0010001,Hanah_Margalit,Dalit_May
3,10.1371/journal.pcbi.0010001,Hanah_Margalit,Ernest_Fraenkel
4,10.1371/journal.pcbi.0010001,Hanah_Margalit,Lena_Nekludova


# read original collected files

# Add DOI 

In [30]:
# Attach the DOI of each paper. The join is inner, so rows whose pid is absent
# from df_magpaper_id_doi2 are dropped.
df_acknowId_wDOI = pd.merge(df_acknowId, df_magpaper_id_doi2, how='inner', on='pid')

# Rename columns to their public names
df_acknowId_wDOI = df_acknowId_wDOI.rename(columns={
    "pid": "PaperId",
    "acknow": "AcknowledgedName",
    "DetermindedAcknowId": "CollaborationApproach",
    "ReferencedAuthorIds": "CitationApproach",
    "acknowId": "AcknowledgedId",
    "doi": "Doi",
})

# Convert the two per-approach ID columns into boolean flags: True when that
# approach produced an ID, False when the value is missing.
# (A former bare `fillna(False)` call discarded its result and had no effect,
# so it is omitted here.)
df_acknowId_wDOI['CollaborationApproach'] = df_acknowId_wDOI['CollaborationApproach'].notna()
df_acknowId_wDOI['CitationApproach'] = df_acknowId_wDOI['CitationApproach'].notna()

In [31]:
# Keep only the public columns, in their published order
final_df = df_acknowId_wDOI.loc[
    :,
    ['Doi', 'PaperId', 'AcknowledgedName', 'AcknowledgedId', 'CollaborationApproach', 'CitationApproach'],
]
final_df

Unnamed: 0,Doi,PaperId,AcknowledgedName,AcknowledgedId,CollaborationApproach,CitationApproach
0,10.1371/journal.pcbi.0010001,2145218186,Aviv Regev,1893730172,True,False
1,10.1371/journal.pcbi.0010001,2145218186,Dalit May,2800119218,True,False
2,10.1371/journal.pcbi.0010001,2145218186,Ruth Hershberg,2155149074,True,False
3,10.1371/journal.pcbi.0010001,2145218186,Yael Altuvia,221285518,True,False
4,10.1371/journal.pcbi.0010001,2145218186,Lena Nekludova,1991417625,False,True
...,...,...,...,...,...,...
235561,10.1038/s41598-020-80660-z,3118328364,Zhaoyang Wang,2596801132,True,True
235562,10.1038/s41598-020-79271-5,3118331071,Masayoshi Nishiyama,2187044629,True,True
235563,10.1038/s41598-021-81652-3,3127263314,Andrea Berton,1974123948,True,True
235564,10.1038/s41598-021-82174-8,3127584525,Hans Keppler,2099200267,True,True


In [35]:
# Number of rows in the final dataset
final_df.shape[0]

235566

In [36]:
# Split the final dataset into one DataFrame per journal key.
final_dfs = {}
for file in FILES:
    if file == 'srep':
        # srep rows are selected by paper id, using the ids read from papers_srep.txt
        in_journal = final_df['PaperId'].isin(df_magpaper_id_doi_srep['pid'])
    else:
        # All other journals are selected by DOI. 'Doi' was lower-cased when the
        # MAG table was loaded, so paperId is lower-cased too; otherwise the
        # match would be case-sensitive on one side only.
        in_journal = final_df['Doi'].isin(dfs[file]['paperId'].str.lower())
    final_dfs[file] = final_df[in_journal]

In [37]:
# Sanity check: the per-journal frames together must hold exactly as many rows
# as final_df.
s = sum(len(final_dfs[file]) for file in FILES)

s == len(final_df)

True

# Save one CSV per journal

In [203]:
# NOTE(review): the export below is commented out, so running this cell writes nothing.
# Uncomment it to write one CSV per journal key into PATH_SAVE.
# for file in FILES:
#     final_dfs[file].to_csv(f"{PATH_SAVE}/{file}.csv", index=False)