In [1]:
import json
import pandas as pd
pd.options.mode.chained_assignment = None
from tqdm import tqdm
import numpy as np
import regex as re
import matplotlib.pyplot as plt

In [2]:
data_dir = '2022-06-02'

In [3]:
all_metadata = pd.read_csv(f"../{data_dir}/metadata.csv",low_memory=False)

In [4]:
all_metadata.shape

(1056660, 19)

In [1]:
1056660-797794

258866

In [5]:
from collections import Counter
len(Counter(all_metadata['journal']))

54994

In [6]:
# de-duplicated cord_uid
all_metadata = all_metadata.drop_duplicates('cord_uid',keep='first')

In [7]:
all_metadata['title'] = all_metadata['title'].str.replace(r'[\t\r"“.．#]',' ', regex=True).str.strip()
all_metadata['cord_uid'] = all_metadata['cord_uid'].str.replace(r'[\t\r"“.．#]',' ', regex=True).str.strip()
all_metadata['source_x'] = all_metadata['source_x'].str.replace(r'[\t\r"“.．#]',' ', regex=True).str.strip()
all_metadata['publish_time'] = all_metadata['publish_time'].str.replace(r'[\t\r"“.．#]',' ', regex=True).str.strip()

In [8]:
# de-duplicated title
all_metadata = all_metadata.drop_duplicates('title',keep='first')

In [9]:
all_metadata.shape

(797794, 19)

In [10]:
# metadata_title_time_dict = dict(all_metadata[['title','publish_time']].values)
metadata_title_uid_dict = dict(all_metadata[['title','cord_uid']].values)

In [11]:
# paper has pmc json file
pmc_metadata = all_metadata.loc[~(all_metadata['pmc_json_files'].isna())]

In [12]:
pmc_metadata['publish_time'] = pd.to_datetime(pmc_metadata['publish_time'],format="%Y-%m-%d")

In [13]:
f"the number of papers: {pmc_metadata.shape[0]}"

'the number of papers: 234768'

In [14]:
# papers since 2020
pmc_metadata = pmc_metadata.loc[pmc_metadata['publish_time']>='2020-01-01'].reset_index(drop=True)

In [15]:
f"since 2020{pmc_metadata.shape[0]}"

'since 2020199057'

In [16]:
# papers in Elsevier Medline
pmc_metadata = pmc_metadata.loc[(pmc_metadata['source_x'].str.contains('Elsevier')) | (pmc_metadata['source_x'].str.contains('Medline'))]

In [17]:
f"Elsevier Medline: {pmc_metadata.shape[0]}"

'Elsevier Medline: 152164'

In [18]:
pmc_metadata.to_csv(f'valid_metadata_{data_dir}.csv',sep='\t',index=False)

In [19]:
pd.read_csv(f'valid_metadata_{data_dir}.csv',sep='\t').shape

(152164, 19)

### citation network

In [22]:
cite_edge_tuple = []
for _,row in tqdm(pmc_metadata[['cord_uid','pmc_json_files','publish_time']].iterrows()):
    cord_uid,pmc_json_files = row['cord_uid'],row['pmc_json_files']
    paper_fulltext = json.load(open(f"../{data_dir}/{pmc_json_files}"))
    paper_title = re.sub(r'[\t\r"“.．#]',' ', paper_fulltext['metadata']['title']).strip()
    citations = paper_fulltext["bib_entries"]
    for _,c in citations.items():
        if str(c['year']) in  {'2020','2021','2022'}:
            cited_title = re.sub(r'[\t\r"“.．#]',' ', c['title']).strip()
            try:
                cited_uid = metadata_title_uid_dict[cited_title]
            except:
                cited_uid = None
            cite_edge_tuple.append((paper_title,cited_title,row['publish_time'],cord_uid,cited_uid))

152164it [08:05, 313.31it/s]


In [23]:
citation_net = pd.DataFrame(data=cite_edge_tuple,columns=['citing_paper','cited_paper','time','citing_uid','cited_uid'])

In [24]:
citation_net

Unnamed: 0,citing_paper,cited_paper,time,citing_uid,cited_uid
0,Deservingness: migration and health in social ...,Overselling globalization: the misleading Conf...,2021-04-07,kby4wprm,
1,Deservingness: migration and health in social ...,Asians and Asian Americans’ experiences of rac...,2021-04-07,kby4wprm,
2,Deservingness: migration and health in social ...,Structural competency and global health education,2021-04-07,kby4wprm,
3,Deservingness: migration and health in social ...,Structural Competency: Interprofessional Medic...,2021-04-07,kby4wprm,
4,Deservingness: migration and health in social ...,Us food workers are in danger that threatens ...,2021-04-07,kby4wprm,
...,...,...,...,...,...
2316509,A Patient With Bilateral Conjunctivitis Positi...,Clinical features of patients infected with 20...,2020-07-02,05dxn54t,m2lq1qnp
2316510,A Patient With Bilateral Conjunctivitis Positi...,Conjunctivitis can be the only presenting sign...,2020-07-02,05dxn54t,fzrl5h9l
2316511,A Patient With Bilateral Conjunctivitis Positi...,Characteristics of ocular findings of patients...,2020-07-02,05dxn54t,
2316512,Hospital variation in admissions to neonatal i...,Utility of birth certificate data for evaluati...,2020-08-14,pq54lvya,


In [30]:
citation_net['time'].max()

Timestamp('2022-10-15 00:00:00')

In [25]:
all_paper = set(citation_net['cited_paper']) | set(citation_net['citing_paper'])

In [26]:
print(f"citing paper and cited paper add up to {len(all_paper)}")

citing paper and cited paper add up to 1137781


In [27]:
all_paper_uid = set(citation_net['citing_uid']) | set(citation_net['cited_uid'])

In [28]:
print(f"citing paper and cited paper uid add up to {len(all_paper_uid)}")

citing paper and cited paper uid add up to 260832


In [26]:
# citation_net[['citing_year','citing_month','citing_date']] = citation_net['time'].str.split('-',expand=True)
# citation_net.drop(columns=['time'],inplace=True)

In [29]:
citation_net.to_csv(f'citation_net_{data_dir}.tsv',sep='\t',index=False)