In [4]:
# Automatically reload edited .py modules so their changes are reflected in this notebook
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [6]:
import vaex 
import os
import time
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

# You need to copy 'preprocessing.py' and 'detector.py' into the same directory as this notebook.
from preprocessing import Preprocessing
from detector import Detector

# Read data & set necessary data path

In [7]:
# ============ PATH ============
# Paths should be absolute. Both locations are read from environment variables.
# NOTE(review): os.environ.get() returns None when a variable is not set, in which
# case the f-string paths built from these values start with 'None/'. Confirm that
# the variables you need are exported before starting the kernel.
PATH_ROW_ACKNOW = os.environ.get('DATA_ROWACK')  # directory holding the '<name>.csv' acknowledgement files
PATH_MAG = os.environ.get('DATA_MAG')  # root directory of the MAG database files
# Base names (without the '.csv' extension) of the acknowledgement files to read.
FILES = ['compbiology', 'biology', 'medicine', 'genetics', 'ntds', 'pathogenes', 'plosone', 'srep']


In [8]:
# ============ DB (MAG) ============
"""To check the necessary data structure of DB data, 
example csv data of DB are in the same directory 'exampleDB(mag)'. 

.exampleDB(mag)
├── example_db_author.csv
├── example_db_paper_author.csv
├── example_db_paper_refPaper.csv
└── example_df_doi_paperId.csv
"""
# DB data files (each must be a .csv or .hdf5 file). These paths are passed to
# Preprocessing(...) later in the notebook; adjust them to your own layout.
FILE_DB_AUTHOR = f'{PATH_MAG}/Authors/Authors.csv' # please set your path
FILE_DB_PAPER_AUTHOR = f'{PATH_MAG}/PaperAuthorAffiliations/PaperAuthorAffiliations.csv' # please set your path
FILE_DB_PAPER_REFPAPER = f'{PATH_MAG}/PaperReferences/PaperReferences.csv.hdf5' # please set your path
# DOI -> PaperId lookup tables: space-separated text files with no header row,
# read into the two columns 'PaperId' and 'Doi'.
df_magpaper_id_doi_plos = pd.read_table(f'{PATH_MAG}/papers_plos.txt', sep=' ', names=['PaperId', 'Doi']) # please set your path
df_magpaper_id_doi_srep = pd.read_table(f'{PATH_MAG}/papers_srep.txt', sep=' ', names=['PaperId', 'Doi']) # please set your path
# Stack both lookup tables. ignore_index is not set, so each table keeps its own
# row labels and index values repeat in the combined frame.
df_doi_paperId = pd.concat([df_magpaper_id_doi_plos, df_magpaper_id_doi_srep])


In [9]:
# ============ Acknowledgment data ============
"""To check the necessary data structure of acknowledgement data, 
example csv data of input df_acknow is in the same directory of 'exampleAcknow'. 

.exampleAcknow
└── example_df_acknow.csv
"""

# Acknow data (Please read your data. The following script to read acknow data is just my case.)
# You may just need to run ``` acknowdata_used = pd.read_csv('your_path') ```
def _split_dataframe(df, chunk_size = 10000): 
    chunks = list()
    num_chunks = len(df) // chunk_size + 1
    for i in range(num_chunks):
        chunks.append(df[i*chunk_size:(i+1)*chunk_size])
    return chunks

# Read each acknowledgement file, keeping only the three columns used below.
# `dfs` keeps the per-file frames addressable by file name.
dfs = {}
frames = []
for file in FILES:
    print(f"read file: {file}")
    dfs[file] = pd.read_csv(f'{PATH_ROW_ACKNOW}/{file}.csv', low_memory=False)[['paperId', 'author', 'acknow']]
    frames.append(dfs[file])
# Concatenate once after the loop: calling pd.concat inside the loop re-copies the
# accumulated frame on every iteration. Fall back to an empty frame when FILES is
# empty, because pd.concat raises on an empty list.
dfs_all = pd.concat(frames) if frames else pd.DataFrame()
# rename columns
dfs_all_renamed = dfs_all.rename({'paperId': 'Doi', 'acknow':'AcknowName'}, axis=1)

# data to use: only the first chunk is processed, because the full data set takes long
df_split = _split_dataframe(dfs_all_renamed) # split into small chunks
acknowdata_used = df_split[0]


read file: compbiology
read file: biology
read file: medicine
read file: genetics
read file: ntds
read file: pathogenes
read file: plosone
read file: srep


Processing the full data set can take a long time, so the data is split into small chunks and only the first chunk is used in the following.

# Whole process at once

In [10]:
def run_preprocess(pre):
    """Run the preprocessing step and hand back everything the detector needs.

    Parameters
    ----------
    pre : object
        Must provide ``adjust_df_acknow()`` (returning a pandas DataFrame) and
        ``read_db_as_vaex()`` (returning three DB objects).

    Returns
    -------
    tuple
        ``(df_acknow, v_db_author, v_db_paper_author, v_db_paper_refPaper)``.
        When the adjusted acknowledgement frame is empty, a message is printed
        and ``df_acknow`` is replaced by an empty frame with the columns
        'PaperId' and 'AcknowName'.
    """
    print('\n Start Preprocessing ...')
    acknow_frame = pre.adjust_df_acknow()
    author_db, paper_author_db, paper_ref_db = pre.read_db_as_vaex()

    if acknow_frame.empty:
        # Nothing matched: substitute an empty frame with the expected columns.
        print('=== Acknow data(Doi or/and AcknowName) does not exist in DB data ===')
        acknow_frame = pd.DataFrame.from_dict({'PaperId': [], 'AcknowName': []})

    return acknow_frame, author_db, paper_author_db, paper_ref_db
    
def run_detecor(detector, df_doi_paperId, stats=False):
    """Run every detector step in sequence and return the formatted result.

    The steps are: candidate-ID search, collaboration approach, citation
    approach, merge of both approaches, and formatting.

    Parameters
    ----------
    detector : object
        Must provide ``possible_acknoweldged_candidates_id``,
        ``collaboration_approach``, ``citation_approach``,
        ``merge_two_approach``, ``format_result`` and ``stats``, plus the
        attributes ``save_file`` and ``df_acknow``.
    df_doi_paperId : pandas.DataFrame
        DOI / PaperId lookup table, passed through to ``detector.format_result``.
    stats : bool, default False
        When true, print the dictionary returned by ``detector.stats``.

    Returns
    -------
    The object returned by ``detector.format_result``, or ``None`` when no
    candidate scholar IDs were found (a message is printed in that case).
    """
    print('\n Start Detector')
    # get possible acknow Ids and dataframe
    df_acknow_possibleIds_grouped, possible_acknow_ids = detector.possible_acknoweldged_candidates_id()

    # Nothing to identify: stop early instead of running the expensive approaches.
    if len(possible_acknow_ids) == 0:
        print('Possible scholar ID containing ANY AcknowName in the input data could not be found in DB')
        return None

    # Collaboration approach
    df_acknow_with_collab_identified = detector.collaboration_approach(possible_acknow_ids, df_acknow_possibleIds_grouped)

    # Citation approach
    df_authorIds_citedAuthorIds_identified = detector.citation_approach(df_acknow_possibleIds_grouped)

    # Combine both approaches
    df_acknowId = detector.merge_two_approach(df_acknow_with_collab_identified, df_authorIds_citedAuthorIds_identified)

    # Formatting
    df_acknowId = detector.format_result(df_acknowId, df_doi_paperId)

    # Save only when a target path is configured. A truthiness check also skips
    # None, for which to_csv would render the whole frame to a discarded string.
    if detector.save_file:
        df_acknowId.to_csv(detector.save_file, index=False)

    # stats
    if stats:
        print(detector.stats(detector.df_acknow, df_acknowId))

    return df_acknowId

In [11]:
# ============ PREPROCESSING ============
# Preprocess the acknowledgement chunk and the DB data.
# Preprocessing receives the acknowledgement frame, the DOI -> PaperId lookup
# table, and the three DB file paths; run_preprocess returns the adjusted
# acknowledgement frame plus the three DB objects consumed by Detector.
pre = Preprocessing(acknowdata_used, df_doi_paperId, FILE_DB_AUTHOR, FILE_DB_PAPER_AUTHOR, FILE_DB_PAPER_REFPAPER)
df_acknow, v_db_author, v_db_paper_author, v_db_paper_refPaper = run_preprocess(pre)

OK!! DB data(/Users/keigokusumegi/research/data/MAG/Authors/Authors.csv.hdf5) exits as vaex.
OK!! DB data(/Users/keigokusumegi/research/data/MAG/PaperAuthorAffiliations/PaperAuthorAffiliations.csv.hdf5) exits as vaex.
OK!! DB data(/Users/keigokusumegi/research/data/MAG/PaperReferences/PaperReferences.csv.hdf5) exits as vaex.

 Start Preprocessing ...


In [12]:
# ============ DETECTOR ============
# Initialize with the necessary data
detector = Detector(df_acknow, v_db_author, v_db_paper_author, v_db_paper_refPaper, save_file='example_results.csv') # results will be saved in 'example_results.csv'
# Run all detector steps; stats=True also prints the summary statistics.
df_acknowId = run_detecor(detector, df_doi_paperId, stats=True) 


 Start Detector
Start possible scholar Id search
End possible_acknoweldged_candidates_id (Time in this part: 9.94s)
Start collaboration_approach
    - sub step1 : _find_target_authorIds. (Time: 7.82s)
    - sub step2: _published_paperIds_of_target_authors. (Time: 18.67s)
      -- get all collaborators of target_authors
      -- confine collaborators by using possible acknowledged scholars Ids(=acknow_candidate_ids) 
    - sub step3: _collaborators_of_paperId. (Time: 34.14s)
    - sub step4: _merge_to_create_author_collabIds. (Time: 34.20s)
  - Done: Step1.
  - Done: Step2. (Time: 34.25s)
  - Done: Step3. (Time: 34.61s)
Start citation_approach
  - Done: Step1. (Time: 28.50s)
  - Done: Step2. (Time: 28.51s)
Merging collab. and citation approach results ...
Format the resutls
{'Num of input scholars names': 2316, 'Num of input papers': 659, 'Num of identified acknowledged scholars': 855, 'Num of papers with identified acknowledged scholars': 431, 'Proportion of identified scholars per in

In [10]:
df_acknowId

Unnamed: 0,PaperId,AcknowName,CommonScholarId_by_colla,CommonScholarId_by_ref,AcknowId,CollaborationApproach,CitationApproach
0,2145218186,Aviv Regev,1.893730e+09,,1893730172,True,False
6,2105003357,Aviv Regev,1.893730e+09,,1893730172,True,False
14,2145218186,Dalit May,2.800119e+09,,2800119218,True,False
17,2145218186,Ruth Hershberg,2.155149e+09,,2155149074,True,False
20,2145218186,Yael Altuvia,2.212855e+08,,221285518,True,False
...,...,...,...,...,...,...,...
39811,2046302208,Yves Pommier,7.199088e+08,7.199088e+08,719908761,True,True
39843,2006465310,Frank Oliver Glöckner,1.536379e+09,1.536379e+09,1536378997,True,True
39861,2006465310,Renzo Kottmann,2.093521e+09,2.093521e+09,2093521308,True,True
39887,1966390421,Heidi Hofer,2.005038e+09,2.005038e+09,2005038157,True,True


# One by One
This section runs the same steps as above, one cell at a time. The cell records are kept for debugging.

## Preprocessing data

In [215]:
# ============ PREPROCESSING ============
# Same preprocessing call as in the whole-process section, repeated here so that
# each later step can be run and inspected on its own.
pre = Preprocessing(acknowdata_used, df_doi_paperId, FILE_DB_AUTHOR, FILE_DB_PAPER_AUTHOR, FILE_DB_PAPER_REFPAPER)
df_acknow, v_db_author, v_db_paper_author, v_db_paper_refPaper = run_preprocess(pre)

Start Preprocessing.
OK!! DB data(/Users/keigokusumegi/research/data/MAG/Authors/Authors.csv.hdf5) exits as vaex.
OK!! DB data(/Users/keigokusumegi/research/data/MAG/PaperAuthorAffiliations/PaperAuthorAffiliations.csv.hdf5) exits as vaex.
OK!! DB data(/Users/keigokusumegi/research/data/MAG/PaperReferences/PaperReferences.csv.hdf5) exits as vaex.


## Identify acknowledged scholars

In [216]:
# Initialize with the necessary data.
# No save_file is passed here; the result is written explicitly in a later cell.
detector = Detector(df_acknow, v_db_author, v_db_paper_author, v_db_paper_refPaper)

In [217]:
# Get the grouped acknowledgement frame and the candidate scholar IDs.
df_acknow_possibleIds_grouped, possible_acknow_ids = detector.possible_acknoweldged_candidates_id()

# Check that at least one candidate ID was found.
# NOTE(review): unlike run_detecor, this check only prints a message; it does not
# stop the following cells from running on an empty candidate list.
if len(possible_acknow_ids) == 0:
    print('Possible scholar ID containing ANY AcknowName in the input data could not be found in DB')

Start possible scholar Id search
End possible_acknoweldged_candidates_id (Time in this part: 11.60s)


In [218]:
# Collaboration approach: uses the candidate IDs and the grouped frame obtained
# from possible_acknoweldged_candidates_id().
df_acknow_with_collab_identified = detector.collaboration_approach(possible_acknow_ids, df_acknow_possibleIds_grouped)

Start collaboration_approach
    - sub step1 : _find_target_authorIds. (Time: 8.03s)
    - sub step2: _published_paperIds_of_target_authors. (Time: 20.99s)
      -- get all collaborators of target_authors
      -- confine collaborators by using possible acknowledged scholars Ids(=acknow_candidate_ids) 
    - sub step3: _collaborators_of_paperId. (Time: 38.46s)
    - sub step4: _merge_to_create_author_collabIds. (Time: 38.52s)
  - Done: Step1.
  - Done: Step2. (Time: 38.86s)
  - Done: Step3. (Time: 39.14s)


In [219]:
# Citation approach: takes only the grouped frame, not the candidate ID list.
df_authorIds_citedAuthorIds_identified = detector.citation_approach(df_acknow_possibleIds_grouped)

Start citation_approach
  - Done: Step1. (Time: 27.13s)
  - Done: Step2. (Time: 27.14s)


In [220]:
# Combine the results of the collaboration and citation approaches.
df_acknowId = detector.merge_two_approach(df_acknow_with_collab_identified, df_authorIds_citedAuthorIds_identified)

Merging collab. and citation approach results ...


In [221]:
# Formatting
# NOTE(review): df_acknowId is overwritten with its own formatted version, so
# re-running this cell passes the already formatted frame back into format_result.
# Re-run the merge cell first if this cell has to be executed again.
df_acknowId = detector.format_result(df_acknowId, df_doi_paperId)

# Save as CSV without the index column.
# NOTE(review): the target is a file literally named 'result' (no extension) in the
# current working directory; the whole-process run writes 'example_results.csv' instead.
df_acknowId.to_csv('result', index=False)

Formating the resutls ...


In [223]:
# NOTE(review): this import sits outside the notebook's import cell; consider moving it there.
from pprint import pprint 
# Pretty-print the summary statistics of the identification result.
pprint(detector.stats(detector.df_acknow, df_acknowId))

{'Num identified scholars by Both approach': 329,
 'Num identified scholars by Collab. approach': 431,
 'Num identified scholars by Reference approach': 147,
 'Num of identified acknowledged scholars': 855,
 'Num of input papers': 659,
 'Num of input scholars names': 2316,
 'Num of papers with identified acknowledged scholars': 431,
 'Overall computational time': 1,
 'Proportion of identified scholars per input names': '0.3692'}
