In [15]:
import pandas as pd
import numpy as np
import glob
import json
import time 
from tqdm import tqdm
from time import sleep

# Downloading the CORD19 dataset
###### https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/historical_releases.html
###### https://github.com/allenai/cord19

In [16]:
metadata_path = r"D:\BUE\GP\testing\metadata_sample.pickle"
df_metadata = pd.DataFrame(pd.read_pickle(metadata_path))

In [17]:
pdf_path = r'D:\BUE\GP\testing\json_pdf_sample.pickle'
df_pdf = pd.DataFrame(pd.read_pickle(pdf_path))

In [18]:
pmc_path = r'D:\BUE\GP\testing\json_pmc_sample.pickle'
df_pmc = pd.DataFrame(pd.read_pickle(pmc_path))

In [19]:
df_pdf.head(1)

Unnamed: 0,paper_id,abstract,body_text
4,e0777fb5df224525ee1b06008582b084c1b6b13b,"The burden of hypertension in South Africa, as...",Raised blood pressure (BP) and hypertension ar...


In [20]:
df_pmc.head(1)

Unnamed: 0,paper_id,body_text
66,PMC8742156,The COVID-19 pandemic had a major impact on th...


In [21]:
df_metadata.head(1)

Unnamed: 0,cord_uid,sha,source_x,title,doi,pmcid,pubmed_id,license,abstract,publish_time,authors,journal,mag_id,who_covidence_id,arxiv_id,pdf_json_files,pmc_json_files,url,s2_id
900672,lepnladl,bb9e6ed5c0e9de74bfd98b8311b446cb3d9384bf,Medline; PMC; WHO,SARS-CoV-2 Infection in Kidney Transplant Reci...,10.1155/2021/2243095,PMC8570907,34745662,cc-by,INTRODUCTION: The second wave of COVID-19 has ...,2021-11-05,"Tatapudi, Ravi Raju; Kopparti, Venkateswara Ra...",Int J Nephrol,,,,document_parses/pdf_json/bb9e6ed5c0e9de74bfd98...,document_parses/pmc_json/PMC8570907.xml.json,https://www.ncbi.nlm.nih.gov/pubmed/34745662/;...,243800529.0


# Data merging (metadata,pdf,pmc)

In [22]:
df_merged = pd.merge(df_metadata, df_pdf,left_on ='sha',right_on ='paper_id',how = 'left')

In [23]:
df_merged.head(2)

Unnamed: 0,cord_uid,sha,source_x,title,doi,pmcid,pubmed_id,license,abstract_x,publish_time,...,mag_id,who_covidence_id,arxiv_id,pdf_json_files,pmc_json_files,url,s2_id,paper_id,abstract_y,body_text
0,lepnladl,bb9e6ed5c0e9de74bfd98b8311b446cb3d9384bf,Medline; PMC; WHO,SARS-CoV-2 Infection in Kidney Transplant Reci...,10.1155/2021/2243095,PMC8570907,34745662,cc-by,INTRODUCTION: The second wave of COVID-19 has ...,2021-11-05,...,,,,document_parses/pdf_json/bb9e6ed5c0e9de74bfd98...,document_parses/pmc_json/PMC8570907.xml.json,https://www.ncbi.nlm.nih.gov/pubmed/34745662/;...,243800529.0,bb9e6ed5c0e9de74bfd98b8311b446cb3d9384bf,Introduction. e second wave of COVID-19 has sp...,Severe acute respiratory syndrome corona virus...
1,ubuyzmok,758f8b07a1fdd2360cfe66f478150eedc23f602e,Medline; PMC; WHO,Genomic and Ancestral Variation Underlies the ...,10.3390/life11090921,PMC8470085,34575070,cc-by,The coronavirus disease (COVID-19) caused by t...,2021-09-05,...,,,,document_parses/pdf_json/758f8b07a1fdd2360cfe6...,document_parses/pmc_json/PMC8470085.xml.json,https://doi.org/10.3390/life11090921; https://...,237934658.0,758f8b07a1fdd2360cfe66f478150eedc23f602e,,"Since its outbreak in December 2019 in Wuhan, ..."


In [9]:
df_metadata.columns

Index(['cord_uid', 'sha', 'source_x', 'title', 'doi', 'pmcid', 'pubmed_id',
       'license', 'abstract', 'publish_time', 'authors', 'journal', 'mag_id',
       'who_covidence_id', 'arxiv_id', 'pdf_json_files', 'pmc_json_files',
       'url', 's2_id'],
      dtype='object')

In [24]:
df_pdf

Unnamed: 0,paper_id,abstract,body_text
4,e0777fb5df224525ee1b06008582b084c1b6b13b,"The burden of hypertension in South Africa, as...",Raised blood pressure (BP) and hypertension ar...
5,ee5af71875f2e77135974c75980ce22fff03e4f8,,"Particularly in these pandemic times, appeals ..."
42,be5efc90a7e5e4cb3ba27f554d2af1f6269fce46,| Seasonal influenza vaccines lack efficacy ag...,Vaccination represents an efficient and cost-e...
63,99633786b3f427e5789443a44fc5ac9081599dfd,,Severe acute respiratory syndrome coronavirus ...
91,ddd596a5c31a48e0efa95b814e4fe7726abfb25c,"Citation: Kekäläinen, T.; Hietavala, E.-M.; Ha...",The novel coronavirus disease was discovered a...
...,...,...,...
401100,e69368a663f0be4bb215c4c7b5a04e7bd522013d,,among Latinx LEP communities can be additional...
401129,7b0a5626403739c7a453b68d1adfb2484b2562a7,This is an open access article under the terms...,According to the Sepsis-3 definition proposed ...
401130,d103ef3cbfc2be251e111b63fa72d499947ce9d6,,Aus Sicht der befragten Stadtvertreter*innen i...
401161,cf8d3c98bcc60425ead55f1586523e5bbba308fe,Ingestion of magnetic foreign bodies in pediat...,Ingestion of magnetic foreign bodies in pediat...


In [25]:
df_merged = df_merged.drop('paper_id',axis=1)

In [26]:
df_merged.columns 

Index(['cord_uid', 'sha', 'source_x', 'title', 'doi', 'pmcid', 'pubmed_id',
       'license', 'abstract_x', 'publish_time', 'authors', 'journal', 'mag_id',
       'who_covidence_id', 'arxiv_id', 'pdf_json_files', 'pmc_json_files',
       'url', 's2_id', 'abstract_y', 'body_text'],
      dtype='object')

In [27]:
df_merged = pd.merge(df_merged, df_pmc,left_on='pmcid',right_on='paper_id',how = 'left')
df_merged

Unnamed: 0,cord_uid,sha,source_x,title,doi,pmcid,pubmed_id,license,abstract_x,publish_time,...,who_covidence_id,arxiv_id,pdf_json_files,pmc_json_files,url,s2_id,abstract_y,body_text_x,paper_id,body_text_y
0,lepnladl,bb9e6ed5c0e9de74bfd98b8311b446cb3d9384bf,Medline; PMC; WHO,SARS-CoV-2 Infection in Kidney Transplant Reci...,10.1155/2021/2243095,PMC8570907,34745662,cc-by,INTRODUCTION: The second wave of COVID-19 has ...,2021-11-05,...,,,document_parses/pdf_json/bb9e6ed5c0e9de74bfd98...,document_parses/pmc_json/PMC8570907.xml.json,https://www.ncbi.nlm.nih.gov/pubmed/34745662/;...,243800529.0,Introduction. e second wave of COVID-19 has sp...,Severe acute respiratory syndrome corona virus...,PMC8570907,Severe acute respiratory syndrome corona virus...
1,ubuyzmok,758f8b07a1fdd2360cfe66f478150eedc23f602e,Medline; PMC; WHO,Genomic and Ancestral Variation Underlies the ...,10.3390/life11090921,PMC8470085,34575070,cc-by,The coronavirus disease (COVID-19) caused by t...,2021-09-05,...,,,document_parses/pdf_json/758f8b07a1fdd2360cfe6...,document_parses/pmc_json/PMC8470085.xml.json,https://doi.org/10.3390/life11090921; https://...,237934658.0,,"Since its outbreak in December 2019 in Wuhan, ...",PMC8470085,"Since its outbreak in December 2019 in Wuhan, ..."
2,48b1o0gi,a15ab5fcb5810f915d264d821083e15b4a85da8a,ArXiv,Stochastic subgradient for composite convex op...,,,,arxiv,In this paper we consider optimization problem...,2022-04-18,...,,2204.08204,document_parses/pdf_json/a15ab5fcb5810f915d264...,,https://arxiv.org/pdf/2204.08204v1.pdf,248227747.0,In this paper we consider optimization problem...,The large sum of functions in the objective fu...,,
3,iglmkoyr,40ae528a88ff4b8b5566194209730b1c10471a14,Medline; PMC,Ultrasound-Guided Minimally Invasive Autopsy o...,10.1159/000514222,PMC8018195,33690234,no-cc,,2021-03-09,...,,,document_parses/pdf_json/40ae528a88ff4b8b55661...,document_parses/pmc_json/PMC8018195.xml.json,https://www.ncbi.nlm.nih.gov/pubmed/33690234/;...,232196342.0,,An outbreak of coronavirus disease in 2019 (CO...,PMC8018195,The authors have no conflicts of interest to d...
4,fucnn8lr,8dfddaea9bf1b9cdba23904c60907e1b1a1559a2,Medline; PMC,Ascending Aortic Calcification as a Potential ...,10.1155/2021/5526359,PMC8177974,34136118,cc-by,BACKGROUND: Identifying the factors related to...,2021-05-26,...,,,document_parses/pdf_json/8dfddaea9bf1b9cdba239...,document_parses/pmc_json/PMC8177974.xml.json,https://doi.org/10.1155/2021/5526359; https://...,235447645.0,Background. Identifying the factors related to...,Hip fractures are a severe health problem in p...,PMC8177974,Hip fractures are a severe health problem in p...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24995,tmh41n8s,2508c5de64605f470a35b6f3a0f98cbe765fe999,PMC,Pregnant Women and Endocrine Disruptors: Role ...,10.3390/cells11030495,PMC8834275,35159304,cc-by,"In pregnant women, the lungs, skin and placent...",2022-01-31,...,,,document_parses/pdf_json/2508c5de64605f470a35b...,document_parses/pmc_json/PMC8834275.xml.json,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC8...,,"Citation: Fouyet, S.; Olivier, E.; Leproux, P....",Endocrine-disrupting chemicals (EDCs) are defi...,PMC8834275,Endocrine-disrupting chemicals (EDCs) are defi...
24996,x3fbaugj,4a39d3fab4bfb8d0258c00d081f339f4d4411433,Elsevier; Medline; PMC; WHO,Sustaining progress towards malaria eliminatio...,10.1016/j.lanwpc.2022.100429,PMC9013518,35466325,no-cc,"In Asia Pacific, several nations that were par...",2022-04-18,...,,,document_parses/pdf_json/4a39d3fab4bfb8d0258c0...,document_parses/pmc_json/PMC9013518.xml.json,https://doi.org/10.1016/j.lanwpc.2022.100429; ...,248219147.0,"In Asia Pacific, several nations that were par...","At the ninth East Asia Summit (EAS) in 2014, h...",PMC9013518,"At the ninth East Asia Summit (EAS) in 2014, h..."
24997,wh41wczp,0b5e8c6fa299029beca996b95d1ea5382bc777e6,PMC,Federal Public Health Law,10.1016/b978-1-85617-547-0.00010-5,PMC7152008,,no-cc,This chapter explains the origins and current ...,2010-03-17,...,,,document_parses/pdf_json/0b5e8c6fa299029beca99...,,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7...,,,"Together, this chapter and the next one will e...",,
24998,oye7272z,b3c51caa98b76bd97d62e3c000d54d21b52c342e,Elsevier; PMC,Maternal Attitudes and Intentions about the CO...,10.1016/j.pedhc.2022.05.015,PMC9130683,,els-covid,The current study assessed maternal attitudes ...,2022-05-25,...,,,document_parses/pdf_json/b3c51caa98b76bd97d62e...,,https://www.sciencedirect.com/science/article/...,249047210.0,The current study assessed maternal attitudes ...,conducted one of the most comprehensive studie...,,


In [28]:
df_merged.shape

(25000, 23)

## Data cleaning and preprocessing

In [29]:
df_merged[df_merged.abstract_x != df_merged.abstract_y].shape

(22773, 23)

In [30]:
df_merged.abstract_x.isnull().sum()

3202

In [31]:
df_merged.abstract_y.isnull().sum()

1645

In [18]:
3235+1629

4864

In [32]:
df_merged['abstract_y'] = df_merged['abstract_y'].astype(str)
df_merged['abstract_y'] = np.where(df_merged['abstract_y'].map(len)>50,df_merged['abstract_y'],'na')

In [33]:
df_merged.loc[df_merged.abstract_x.isnull() & (df_merged.abstract_y != 'na'),'abstract_x'] = df_merged[df_merged.abstract_x.isnull() & (df_merged.abstract_y != 'na')].abstract_y

In [34]:
df_merged.abstract_x.isnull().sum()

2622

In [35]:
df_merged.abstract_y.isnull().sum()

0

In [36]:
df_merged.rename(columns={'abstract_x':'abstract'},inplace=True)

In [37]:
df_merged.columns

Index(['cord_uid', 'sha', 'source_x', 'title', 'doi', 'pmcid', 'pubmed_id',
       'license', 'abstract', 'publish_time', 'authors', 'journal', 'mag_id',
       'who_covidence_id', 'arxiv_id', 'pdf_json_files', 'pmc_json_files',
       'url', 's2_id', 'abstract_y', 'body_text_x', 'paper_id', 'body_text_y'],
      dtype='object')

In [38]:
df_merged = df_merged.drop(['abstract_y'],axis = 1)

In [39]:
df_merged.columns

Index(['cord_uid', 'sha', 'source_x', 'title', 'doi', 'pmcid', 'pubmed_id',
       'license', 'abstract', 'publish_time', 'authors', 'journal', 'mag_id',
       'who_covidence_id', 'arxiv_id', 'pdf_json_files', 'pmc_json_files',
       'url', 's2_id', 'body_text_x', 'paper_id', 'body_text_y'],
      dtype='object')

In [40]:
(df_merged.body_text_x != df_merged.body_text_y).sum()

25000

In [41]:
df_merged.body_text_x.isnull().sum()

1645

In [42]:
df_merged.body_text_y.isnull().sum()

4921

In [43]:
df_merged[df_merged.body_text_x.isnull() & df_merged.body_text_y.notnull()].shape

(1553, 22)

In [44]:
df_merged.loc[df_merged.body_text_y.notnull(),'body_text_x'] = df_merged.loc[df_merged.body_text_y.notnull(),'body_text_y']

In [45]:

df_merged.body_text_x.isnull().sum()

92

In [46]:
df_merged.columns

Index(['cord_uid', 'sha', 'source_x', 'title', 'doi', 'pmcid', 'pubmed_id',
       'license', 'abstract', 'publish_time', 'authors', 'journal', 'mag_id',
       'who_covidence_id', 'arxiv_id', 'pdf_json_files', 'pmc_json_files',
       'url', 's2_id', 'body_text_x', 'paper_id', 'body_text_y'],
      dtype='object')

In [47]:
df_merged.rename(columns = {'body_text_x':'body_text'},inplace=True)

df_merged = df_merged.drop(['body_text_y'],axis=1)

In [48]:
df_merged.columns

Index(['cord_uid', 'sha', 'source_x', 'title', 'doi', 'pmcid', 'pubmed_id',
       'license', 'abstract', 'publish_time', 'authors', 'journal', 'mag_id',
       'who_covidence_id', 'arxiv_id', 'pdf_json_files', 'pmc_json_files',
       'url', 's2_id', 'body_text', 'paper_id'],
      dtype='object')

In [49]:
df_final = df_merged[['sha','title','abstract','publish_time','authors','url','body_text']]

In [50]:
df_final.head()

Unnamed: 0,sha,title,abstract,publish_time,authors,url,body_text
0,bb9e6ed5c0e9de74bfd98b8311b446cb3d9384bf,SARS-CoV-2 Infection in Kidney Transplant Reci...,INTRODUCTION: The second wave of COVID-19 has ...,2021-11-05,"Tatapudi, Ravi Raju; Kopparti, Venkateswara Ra...",https://www.ncbi.nlm.nih.gov/pubmed/34745662/;...,Severe acute respiratory syndrome corona virus...
1,758f8b07a1fdd2360cfe66f478150eedc23f602e,Genomic and Ancestral Variation Underlies the ...,The coronavirus disease (COVID-19) caused by t...,2021-09-05,"Upadhyai, Priyanka; Suresh, Gokul; Parit, Rahu...",https://doi.org/10.3390/life11090921; https://...,"Since its outbreak in December 2019 in Wuhan, ..."
2,a15ab5fcb5810f915d264d821083e15b4a85da8a,Stochastic subgradient for composite convex op...,In this paper we consider optimization problem...,2022-04-18,"Necoara, Ion; Singh, Nitesh Kumar",https://arxiv.org/pdf/2204.08204v1.pdf,The large sum of functions in the objective fu...
3,40ae528a88ff4b8b5566194209730b1c10471a14,Ultrasound-Guided Minimally Invasive Autopsy o...,,2021-03-09,"Nucci, Ricardo Aparecido Baptista; Dolhnikoff,...",https://www.ncbi.nlm.nih.gov/pubmed/33690234/;...,The authors have no conflicts of interest to d...
4,8dfddaea9bf1b9cdba23904c60907e1b1a1559a2,Ascending Aortic Calcification as a Potential ...,BACKGROUND: Identifying the factors related to...,2021-05-26,"Bekki, Hirofumi; Arizono, Takeshi; Suzuki, Yuk...",https://doi.org/10.1155/2021/5526359; https://...,Hip fractures are a severe health problem in p...


In [51]:
df_final = df_final.dropna(axis=0,subset=['abstract','body_text'])
df_final.isnull().sum()

sha               0
title             0
abstract          0
publish_time      0
authors         114
url               0
body_text         0
dtype: int64

In [52]:
df_final.shape

(22301, 7)

In [53]:
df_final.to_csv('Final_CORD_19_data.csv',index=False)

In [54]:
df_final.head()

Unnamed: 0,sha,title,abstract,publish_time,authors,url,body_text
0,bb9e6ed5c0e9de74bfd98b8311b446cb3d9384bf,SARS-CoV-2 Infection in Kidney Transplant Reci...,INTRODUCTION: The second wave of COVID-19 has ...,2021-11-05,"Tatapudi, Ravi Raju; Kopparti, Venkateswara Ra...",https://www.ncbi.nlm.nih.gov/pubmed/34745662/;...,Severe acute respiratory syndrome corona virus...
1,758f8b07a1fdd2360cfe66f478150eedc23f602e,Genomic and Ancestral Variation Underlies the ...,The coronavirus disease (COVID-19) caused by t...,2021-09-05,"Upadhyai, Priyanka; Suresh, Gokul; Parit, Rahu...",https://doi.org/10.3390/life11090921; https://...,"Since its outbreak in December 2019 in Wuhan, ..."
2,a15ab5fcb5810f915d264d821083e15b4a85da8a,Stochastic subgradient for composite convex op...,In this paper we consider optimization problem...,2022-04-18,"Necoara, Ion; Singh, Nitesh Kumar",https://arxiv.org/pdf/2204.08204v1.pdf,The large sum of functions in the objective fu...
4,8dfddaea9bf1b9cdba23904c60907e1b1a1559a2,Ascending Aortic Calcification as a Potential ...,BACKGROUND: Identifying the factors related to...,2021-05-26,"Bekki, Hirofumi; Arizono, Takeshi; Suzuki, Yuk...",https://doi.org/10.1155/2021/5526359; https://...,Hip fractures are a severe health problem in p...
5,e6ab92529dce158c7046d4d4adba4b998eca9179,Universal Early Coarsening of Quenched Bose Gases,We investigate the early coarsening dynamics o...,2021-12-10,"Goo, Junhong; Lee, Yangheon; Lim, Younghoon; B...",https://www.ncbi.nlm.nih.gov/pubmed/35426709/;...,When a system crosses a symmetry-breaking phas...
