In [1]:
import json
import pandas as pd
pd.options.mode.chained_assignment = None
from tqdm import tqdm
import numpy as np
import regex as re
import matplotlib.pyplot as plt

In [2]:
data_dir = '2021-07-19'

In [3]:
all_metadata = pd.read_csv(f"../{data_dir}/metadata.csv",low_memory=False)

In [4]:
all_metadata.shape

(709156, 19)

In [6]:
from collections import Counter
len(Counter(all_metadata['journal']))

41099

In [5]:
# 根据cord_uid去重
all_metadata = all_metadata.drop_duplicates('cord_uid',keep='first')

In [6]:
all_metadata['title'] = all_metadata['title'].str.replace(r'[\t\r"“.．#]',' ', regex=True).str.strip()
all_metadata['cord_uid'] = all_metadata['cord_uid'].str.replace(r'[\t\r"“.．#]',' ', regex=True).str.strip()
all_metadata['source_x'] = all_metadata['source_x'].str.replace(r'[\t\r"“.．#]',' ', regex=True).str.strip()
all_metadata['publish_time'] = all_metadata['publish_time'].str.replace(r'[\t\r"“.．#]',' ', regex=True).str.strip()

In [7]:
# 根据title 去重
all_metadata = all_metadata.drop_duplicates('title',keep='first')

In [8]:
all_metadata.shape

(476559, 19)

In [9]:
# metadata_title_time_dict = dict(all_metadata[['title','publish_time']].values)
metadata_title_uid_dict = dict(all_metadata[['title','cord_uid']].values)

In [10]:
# 仅保留有pmc json文件的
pmc_metadata = all_metadata.loc[~(all_metadata['pmc_json_files'].isna())]

In [11]:
pmc_metadata['publish_time'] = pd.to_datetime(pmc_metadata['publish_time'],format="%Y-%m-%d")

In [12]:
f"全时间段论文数量{pmc_metadata.shape[0]}"

'全时间段论文数量107082'

In [13]:
# 仅保留2020年后的
pmc_metadata = pmc_metadata.loc[pmc_metadata['publish_time']>='2020-01-01'].reset_index(drop=True)

In [14]:
f"2020年后论文数量{pmc_metadata.shape[0]}"

'2020年后论文数量71539'

In [15]:
# 仅保留Elsevier Medline收录的论文
pmc_metadata = pmc_metadata.loc[(pmc_metadata['source_x'].str.contains('Elsevier')) | (pmc_metadata['source_x'].str.contains('Medline'))]

In [16]:
f"Elsevier Medline 的论文数量{pmc_metadata.shape[0]}"

'Elsevier Medline 的论文数量47869'

In [17]:
pmc_metadata.to_csv(f'valid_metadata_{data_dir}.csv',sep='\t',index=False)

In [18]:
pd.read_csv(f'valid_metadata_{data_dir}.csv',sep='\t').shape

(47869, 19)

### 引文网络

In [19]:
cite_edge_tuple = []
for _,row in tqdm(pmc_metadata[['cord_uid','pmc_json_files','publish_time']].iterrows()):
    cord_uid,pmc_json_files = row['cord_uid'],row['pmc_json_files']
    paper_fulltext = json.load(open(f"../{data_dir}/{pmc_json_files}"))
    paper_title = re.sub(r'[\t\r"“.．#]',' ', paper_fulltext['metadata']['title']).strip()
    citations = paper_fulltext["bib_entries"]
    for _,c in citations.items():
        if str(c['year']) in  {'2020','2021'}:
            cited_title = re.sub(r'[\t\r"“.．#]',' ', c['title']).strip()
            try:
                cited_uid = metadata_title_uid_dict[cited_title]
            except:
                cited_uid = None
            cite_edge_tuple.append((paper_title,cited_title,row['publish_time'],cord_uid,cited_uid))

47869it [02:06, 377.80it/s]


In [20]:
citation_net = pd.DataFrame(data=cite_edge_tuple,columns=['citing_paper','cited_paper','time','citing_uid','cited_uid'])

In [21]:
citation_net

Unnamed: 0,citing_paper,cited_paper,time,citing_uid,cited_uid
0,Hypoxia-induced amniotic fluid stem cell secre...,Endothelial cell infection and endotheliitis i...,2021-01-08,6jbjwl8j,43gqjlca
1,Hypoxia-induced amniotic fluid stem cell secre...,Human mesenchymal stem cells-conditioned mediu...,2021-01-08,6jbjwl8j,
2,Hypoxia-induced amniotic fluid stem cell secre...,LRP6 downregulation promotes cardiomyocyte pro...,2021-01-08,6jbjwl8j,
3,Hypoxia-induced amniotic fluid stem cell secre...,Adult cardiomyocyte proliferation: a new insig...,2021-01-08,6jbjwl8j,
4,Hypoxia-induced amniotic fluid stem cell secre...,Emergence of the stem cell secretome in regene...,2021-01-08,6jbjwl8j,
...,...,...,...,...,...
478929,The impact of policy mixes on new energy vehic...,Effects of multi policies on electric vehicle ...,2021-02-22,kg1daxuu,
478930,An initial report from the French SOT COVID Re...,Early Description of Coronavirus 2019 Disease ...,2020-08-24,f0t6n2j1,0akknq1f
478931,An initial report from the French SOT COVID Re...,Kidney transplant recipients with SARS Cov2 in...,2020-08-24,f0t6n2j1,
478932,An initial report from the French SOT COVID Re...,COVID-19 and Calcineurin Inhibitors: Should Th...,2020-08-24,f0t6n2j1,yeqqp6fr


In [22]:
all_paper = set(citation_net['cited_paper']) | set(citation_net['citing_paper'])

In [23]:
print(f"citing paper and cited paper add up to {len(all_paper)}")

citing paper and cited paper add up to 286302


In [24]:
all_paper_uid = set(citation_net['citing_uid']) | set(citation_net['cited_uid'])

In [25]:
print(f"citing paper and cited paper uid add up to {len(all_paper_uid)}")

citing paper and cited paper uid add up to 79484


In [26]:
# citation_net[['citing_year','citing_month','citing_date']] = citation_net['time'].str.split('-',expand=True)
# citation_net.drop(columns=['time'],inplace=True)

In [27]:
citation_net.to_csv(f'citation_net_{data_dir}.tsv',sep='\t',index=False)