In [1]:
import pandas as pd
import numpy as np
import pickle

In [2]:

# Accessing the research papers and data

with open('data/metadata_sample.pickle', 'rb') as f:
  df_metadata = pickle.load(f)
with open('data/json_pdf_sample.pickle', 'rb') as f:
  df_pdf = pickle.load(f)
with open('data/json_pmc_sample.pickle', 'rb') as f:
  df_pmc = pickle.load(f)

In [3]:
df_metadata.head()

Unnamed: 0,cord_uid,sha,source_x,title,doi,pmcid,pubmed_id,license,abstract,publish_time,authors,journal,mag_id,who_covidence_id,arxiv_id,pdf_json_files,pmc_json_files,url,s2_id
819514,iyyq4idj,82df15cd2620c224affd4ce100e7fac7fcff7b10; 67f7...,Medline; PMC,Deconvoluting Lipid Nanoparticle Structure for...,10.1021/acs.nanolett.0c01386,PMC7228479,32375002.0,no-cc,[Image: see text] Lipid nanoparticle (LNP) pac...,2020-05-06,"Eygeris, Yulia; Patel, Siddharth; Jozic, Anton...",Nano Lett,,,,document_parses/pdf_json/82df15cd2620c224affd4...,document_parses/pmc_json/PMC7228479.xml.json,https://doi.org/10.1021/acs.nanolett.0c01386; ...,218535340.0
611388,h1juos01,1e7e71936cb8aa74a8c9e8ba90c5d729dec7f405,Medline; PMC; WHO,Predictability in Contemporary Medicine,10.3389/fmed.2021.510421,PMC8242575,34222267.0,cc-by,Medical practice is increasingly coming under ...,2021-06-16,"Ciulla, Michele M.",Front Med (Lausanne),,,,document_parses/pdf_json/1e7e71936cb8aa74a8c9e...,document_parses/pmc_json/PMC8242575.xml.json,https://www.ncbi.nlm.nih.gov/pubmed/34222267/;...,235441458.0
19784,awql4vol,cc4e338a21138f18ef26af7b26c5ecc28270d959,PMC,Experimental Models of Hepatocellular Carcinom...,10.3390/cancers13153651,PMC8344976,34359553.0,cc-by,SIMPLE SUMMARY: Hepatocellular carcinoma (HCC)...,2021-07-21,"Blidisel, Alexandru; Marcovici, Iasmina; Coric...",Cancers (Basel),,,,document_parses/pdf_json/cc4e338a21138f18ef26a...,document_parses/pmc_json/PMC8344976.xml.json,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC8...,
789340,tynn8700,3114420bfc07e18049a9e7045b42c4de7a30bb81,Medline; PMC; WHO,Cancer and COVID-19: analysis of patient outcomes,10.2217/fon-2021-0121,PMC8284249,34263660.0,cc-by,Background: We sought to investigate the outco...,2021-07-15,"Aboueshia, Mohamed; Hussein, Mohammad Hosny; A...",Future oncology,,,,document_parses/pdf_json/3114420bfc07e18049a9e...,document_parses/pmc_json/PMC8284249.xml.json,https://www.ncbi.nlm.nih.gov/pubmed/34263660/;...,235907629.0
779704,z6bkoheb,1ef3ec3dfb9bea22c1060c1c469fc1de5c41f15a,Medline; PMC; WHO,"Haematuria, a widespread petechial rash, and h...",10.1136/bcr-2021-245440,PMC8499345,34620638.0,no-cc,With increasing presentations of headaches fol...,2021-10-07,"Waraich, Ammar; Williams, George",BMJ Case Rep,,,,document_parses/pdf_json/1ef3ec3dfb9bea22c1060...,document_parses/pmc_json/PMC8499345.xml.json,https://www.ncbi.nlm.nih.gov/pubmed/34620638/;...,238475067.0


#### Data Merging (merging the data from metadata, json pdf and json pmc files for research papers)

In [4]:
#Merging the metadata and json pdf data
#sha column of metadata and paper_id of json_pdf data are same identifier for each record
df_merged = pd.merge(df_metadata,df_pdf,left_on='sha',right_on='paper_id',how='left').drop('paper_id',axis=1)

In [5]:
df_merged.head(2)
#abstract_x is the abstract from metadata and abstract_y is the abstract from json_pdf

Unnamed: 0,cord_uid,sha,source_x,title,doi,pmcid,pubmed_id,license,abstract_x,publish_time,...,journal,mag_id,who_covidence_id,arxiv_id,pdf_json_files,pmc_json_files,url,s2_id,abstract_y,body_text
0,iyyq4idj,82df15cd2620c224affd4ce100e7fac7fcff7b10; 67f7...,Medline; PMC,Deconvoluting Lipid Nanoparticle Structure for...,10.1021/acs.nanolett.0c01386,PMC7228479,32375002.0,no-cc,[Image: see text] Lipid nanoparticle (LNP) pac...,2020-05-06,...,Nano Lett,,,,document_parses/pdf_json/82df15cd2620c224affd4...,document_parses/pmc_json/PMC7228479.xml.json,https://doi.org/10.1021/acs.nanolett.0c01386; ...,218535340.0,,
1,h1juos01,1e7e71936cb8aa74a8c9e8ba90c5d729dec7f405,Medline; PMC; WHO,Predictability in Contemporary Medicine,10.3389/fmed.2021.510421,PMC8242575,34222267.0,cc-by,Medical practice is increasingly coming under ...,2021-06-16,...,Front Med (Lausanne),,,,document_parses/pdf_json/1e7e71936cb8aa74a8c9e...,document_parses/pmc_json/PMC8242575.xml.json,https://www.ncbi.nlm.nih.gov/pubmed/34222267/;...,235441458.0,Medical practice is increasingly coming under ...,The aim of this paper is to present in a histo...


In [6]:
#Lets merge the json_pmc data to the merged data too
df_merged = pd.merge(df_merged,df_pmc,left_on='pmcid',right_on='paper_id',how='left').drop('paper_id',axis=1)
df_merged.shape

(30000, 22)

##### Data Cleaning and Preprocessing

In [7]:
df_merged[df_merged.abstract_x != df_merged.abstract_y].shape

(27221, 22)

In [8]:
# check metadata abstract column to see if null values exist
df_merged.abstract_x.isnull().sum(),(df_merged.abstract_x == '').sum()

(4269, 0)

In [9]:
# Check pdf_json abstract to see if null values exist
df_merged.abstract_y.isnull().sum(),(df_merged.abstract_y == '').sum()

(1836, 8166)

Since the abstract_x from metadata is more reliable , we will use it but only fill by abstract_y text when abstract_x value is null

In [10]:
# Convert all columns to string and then replace abstract_y values
#df = df.astype(str)
df_merged["abstract_y"] = df_merged["abstract_y"].astype(str) 
df_merged['abstract_y'] = np.where(df_merged['abstract_y'].map(len) > 50, df_merged['abstract_y'], "na")

In [11]:
df_merged[df_merged['abstract_y'].apply(lambda x: len(str(x)) <= 10)]

Unnamed: 0,cord_uid,sha,source_x,title,doi,pmcid,pubmed_id,license,abstract_x,publish_time,...,mag_id,who_covidence_id,arxiv_id,pdf_json_files,pmc_json_files,url,s2_id,abstract_y,body_text_x,body_text_y
0,iyyq4idj,82df15cd2620c224affd4ce100e7fac7fcff7b10; 67f7...,Medline; PMC,Deconvoluting Lipid Nanoparticle Structure for...,10.1021/acs.nanolett.0c01386,PMC7228479,32375002.0,no-cc,[Image: see text] Lipid nanoparticle (LNP) pac...,2020-05-06,...,,,,document_parses/pdf_json/82df15cd2620c224affd4...,document_parses/pmc_json/PMC7228479.xml.json,https://doi.org/10.1021/acs.nanolett.0c01386; ...,218535340.0,na,,mRNA-based therapies and vaccines hold tremend...
8,pov7ipwy,085c356124bb58ba18b5255cd884c82fb4888cc6,Medline; PMC; WHO,Attitude Towards COVID-19 Vaccination Among He...,10.2147/idr.s332792,PMC8464326,34584432.0,cc-by-nc,INTRODUCTION: Availability and accessibility o...,2021-09-21,...,,,,document_parses/pdf_json/085c356124bb58ba18b52...,document_parses/pmc_json/PMC8464326.xml.json,https://www.ncbi.nlm.nih.gov/pubmed/34584432/;...,238199445.0,na,The COVID-19 pandemic has caused a huge number...,The COVID-19 pandemic has caused a huge number...
11,sne2o5y3,bc32a5a720d9721dcabf8f69081a63a3efa5fdcb,Elsevier; Medline; PMC,The proper use of masks,10.1016/j.banm.2020.09.028,PMC7501773,32981936.0,els-covid,,2020-09-19,...,,,,document_parses/pdf_json/bc32a5a720d9721dcabf8...,,https://doi.org/10.1016/j.banm.2020.09.028; ht...,221796386.0,na,Press release from the French National Academy...,
12,mivv87nd,598a4b91b51070095c7c25e79b7f198d604f09e7,Elsevier; PMC,Musculoskeletal and Neurologic Diseases,10.1016/b978-0-323-48435-0.00010-1,PMC7258713,,no-cc,Ferrets may exhibit neurologic signs as a resu...,2020-05-29,...,,,,document_parses/pdf_json/598a4b91b51070095c7c2...,document_parses/pmc_json/PMC7258713.xml.json,https://api.elsevier.com/content/article/pii/B...,218973161.0,na,Ferrets can manifest with neurologic signs as ...,Neurologic examination of ferrets follows the ...
18,jmdxjpyq,1928a032224b22aa192550c7e58dc71e190901df; b6cf...,Medline; PMC,Clinical prediction rule for SARS-CoV-2 infect...,10.1371/journal.pone.0248438,PMC7946184,33690722.0,cc-by,OBJECTIVES: Accurate and reliable criteria to ...,2021-03-10,...,,,,document_parses/pdf_json/1928a032224b22aa19255...,document_parses/pmc_json/PMC7946184.xml.json,https://www.ncbi.nlm.nih.gov/pubmed/33690722/;...,232193176.0,na,,The ability to rapidly estimate the probabilit...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29986,ig6ff0kd,bcf0f876cf7f8fefc104c078b22b6aafa2f04d9b,Medline; PMC,"Technology, Population Health, and Human Wellness",10.1007/s41745-020-00207-z,PMC7643103,33169052.0,no-cc,,2020-11-05,...,,,,document_parses/pdf_json/bcf0f876cf7f8fefc104c...,document_parses/pmc_json/PMC7643103.xml.json,https://doi.org/10.1007/s41745-020-00207-z; ht...,226248297.0,na,• I: Population health is central to addressin...,It has been more than a half-century since Dr....
29988,7pxvzrq8,b800e9051e17e929973706bc55e4ea6b7abba667,MedRxiv,Non-invasive Vagus Nerve Stimulation for Respi...,10.1101/2021.09.24.21264045,,,medrxiv,Background: Severe coronavirus disease 2019 (C...,2021-09-27,...,,,,document_parses/pdf_json/b800e9051e17e92997370...,,http://medrxiv.org/cgi/content/short/2021.09.2...,237941228.0,na,. CC-BY-ND 4.0 International license It is mad...,
29990,jasgmk4h,28c766fb31fbb27728121efc65c884b52831f4ea,Medline; PMC; WHO,Consequences of the COVID-19 Lockdown in Germa...,10.3390/ijerph181910463,PMC8507817,34639763.0,cc-by,The current study investigated how music has b...,2021-10-05,...,,,,document_parses/pdf_json/28c766fb31fbb27728121...,document_parses/pmc_json/PMC8507817.xml.json,https://www.ncbi.nlm.nih.gov/pubmed/34639763/;...,238741717.0,na,The coronavirus disease 2019 (COVID-19) has br...,When lockdown in Germany started on 13 March 2...
29997,fn8fuzvo,df3052bd12a69ef76897506b25bc4e5f7b008467,Medline; PMC; WHO,Regionales Monitoring von Infektionen mittels ...,10.1007/s00103-021-03397-8,PMC8358915,34383083.0,cc-by,BACKGROUND: Maps of the temporal evolution of ...,2021-08-12,...,,,,document_parses/pdf_json/df3052bd12a69ef768975...,document_parses/pmc_json/PMC8358915.xml.json,https://www.ncbi.nlm.nih.gov/pubmed/34383083/;...,236989742.0,na,Indikatoren zum Infektionsgeschehen wie zum in...,Indikatoren zum Infektionsgeschehen wie zum in...


In [12]:
# replace abstract_x (metadata column) with abstract_y (pdf_json) value where abstract_x is null
df_merged.loc[df_merged.abstract_x.isnull() & (df_merged.abstract_y != 'na'),'abstract_x'] = df_merged[df_merged.abstract_x.isnull() & (df_merged.abstract_y != 'na')].abstract_y

In [13]:
# Do we have any remaining null abstract values. Not anymore. This is good.
# The null values have reduced which is what we had expected.
df_merged.abstract_x.isnull().sum()

3550

In [14]:
# Lets get rid of the pdf_json abstract column and rename the metadata abstract column
df_merged.rename(columns = {'abstract_x' : 'abstract'}, inplace = True)
df_merged.drop('abstract_y',axis=1,inplace = True)
df_merged.columns

Index(['cord_uid', 'sha', 'source_x', 'title', 'doi', 'pmcid', 'pubmed_id',
       'license', 'abstract', 'publish_time', 'authors', 'journal', 'mag_id',
       'who_covidence_id', 'arxiv_id', 'pdf_json_files', 'pmc_json_files',
       'url', 's2_id', 'body_text_x', 'body_text_y'],
      dtype='object')

In [15]:
# This is expected because body text comes from pdf and pmc folders
(df_merged.body_text_x != df_merged.body_text_y).sum()

30000

In [16]:
df_merged.body_text_x.isnull().sum(),(df_merged.body_text_y == '').sum()

(1836, 0)

In [17]:
# This is expected because there are less papers in json_pmc
df_merged.body_text_y.isnull().sum()

6402

In [18]:
# body_text_x is pdf_json. body_text_y comes from pmc_json
# Where available we use the text from pmc file trusting the statement quality
df_merged.body_text_x.isnull().sum(),(df_merged.body_text_y.isnull()).sum()

(1836, 6402)

In [19]:
# There are ~13k rows where body_text_x is null but body_text_y is not null
df_merged.loc[df_merged.body_text_x.isnull() & df_merged.body_text_y.notnull()]

Unnamed: 0,cord_uid,sha,source_x,title,doi,pmcid,pubmed_id,license,abstract,publish_time,...,journal,mag_id,who_covidence_id,arxiv_id,pdf_json_files,pmc_json_files,url,s2_id,body_text_x,body_text_y
0,iyyq4idj,82df15cd2620c224affd4ce100e7fac7fcff7b10; 67f7...,Medline; PMC,Deconvoluting Lipid Nanoparticle Structure for...,10.1021/acs.nanolett.0c01386,PMC7228479,32375002.0,no-cc,[Image: see text] Lipid nanoparticle (LNP) pac...,2020-05-06,...,Nano Lett,,,,document_parses/pdf_json/82df15cd2620c224affd4...,document_parses/pmc_json/PMC7228479.xml.json,https://doi.org/10.1021/acs.nanolett.0c01386; ...,218535340.0,,mRNA-based therapies and vaccines hold tremend...
18,jmdxjpyq,1928a032224b22aa192550c7e58dc71e190901df; b6cf...,Medline; PMC,Clinical prediction rule for SARS-CoV-2 infect...,10.1371/journal.pone.0248438,PMC7946184,33690722.0,cc-by,OBJECTIVES: Accurate and reliable criteria to ...,2021-03-10,...,PLoS One,,,,document_parses/pdf_json/1928a032224b22aa19255...,document_parses/pmc_json/PMC7946184.xml.json,https://www.ncbi.nlm.nih.gov/pubmed/33690722/;...,232193176.0,,The ability to rapidly estimate the probabilit...
21,y0d9l0s9,9f2609078b7d4c36600263ab90e928d300b76e76; abbe...,Medline; PMC,Inclusion of cGAMP within virus‐like particle ...,10.15252/embr.202152447,PMC8339669,34142428.0,cc-by,Cyclic GMP‐AMP (cGAMP) is an immunostimulatory...,2021-06-18,...,EMBO Rep,,,,document_parses/pdf_json/9f2609078b7d4c3660026...,document_parses/pmc_json/PMC8339669.xml.json,https://www.ncbi.nlm.nih.gov/pubmed/34142428/;...,235473166.0,,Vaccination is a powerful strategy in the figh...
28,s7rtmf6i,2911aa443294c9d9564ce05b8e486463ef8ed85b; 23d3...,Elsevier; Medline; PMC; WHO,Real Masks and Spoof Faces: On the Masked Face...,10.1016/j.patcog.2021.108398,PMC8547786,34720199.0,no-cc,Face masks have become one of the main methods...,2021-10-26,...,Pattern Recognit,,,,document_parses/pdf_json/2911aa443294c9d9564ce...,document_parses/pmc_json/PMC8547786.xml.json,https://www.sciencedirect.com/science/article/...,239890411.0,,Since the SARS-CoV-2 coronavirus outbreak and ...
52,ea33sn1n,200a4d03ea5fc5e4593c34facb38986a77c7b6d2; cc1c...,Elsevier; Medline; PMC; WHO,Understanding the Effects of the COVID-19 Pand...,10.1016/j.bpsgos.2021.07.004,PMC8415869,34514460.0,no-cc,BACKGROUND: Adversity has consistently been fo...,2021-07-23,...,Biol Psychiatry Glob Open Sci,,,,document_parses/pdf_json/200a4d03ea5fc5e4593c3...,document_parses/pmc_json/PMC8415869.xml.json,https://api.elsevier.com/content/article/pii/S...,237403375.0,,Past work has indicated that adversity has a p...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29932,f9uu0o1h,bc1935141cf358405681f33f16bb8b612f46d7a9; d39e...,Elsevier; Medline; PMC,False negative rate of COVID-19 is eliminated ...,10.1016/j.tmaid.2020.101668,PMC7151360,32283215.0,no-cc,,2020-04-11,...,Travel Med Infect Dis,,,,document_parses/pdf_json/bc1935141cf358405681f...,document_parses/pmc_json/PMC7151360.xml.json,https://www.ncbi.nlm.nih.gov/pubmed/32283215/;...,215724665.0,,This study was approved by the Ethics Committe...
29944,eeeohqjn,6717b875243cf87b395f70f3ec23d8e9f0e8a2c7; 6a20...,Elsevier; Medline; PMC,"Coronaviruses, cholesterol and statins: Involv...",10.1016/j.biochi.2021.06.005,PMC8213520,34153377.0,no-cc,The infectious power of coronaviruses is depen...,2021-06-18,...,Biochimie,,,,document_parses/pdf_json/6717b875243cf87b395f7...,document_parses/pmc_json/PMC8213520.xml.json,https://api.elsevier.com/content/article/pii/S...,235473780.0,,The present Covid-19 pandemic has revealed to ...
29965,35k4065e,1c60d93cd686bb14fac65510ad3de47cf8b9332c; 8ee7...,Medline; PMC,Characterization of the Anti-Hepatitis C Virus...,10.1128/aac.00126-18,PMC6021681,29760125.0,cc-by,Although members of the Flaviviridae display h...,2018-06-26,...,Antimicrob Agents Chemother,,,,document_parses/pdf_json/1c60d93cd686bb14fac65...,document_parses/pmc_json/PMC6021681.xml.json,https://doi.org/10.1128/aac.00126-18; https://...,46890746.0,,"Urbanization, human migrations, and climate ch..."
29984,fw6pij9h,3a319ff1900a812347b31b899f24694a3ae2bf7a; 231e...,Medline; PMC,Risk factors for poor outcomes in hospitalised...,10.7189/jogh.11.10001,PMC7980087,33767855.0,cc-by,BACKGROUND: Understanding the risk factors for...,2021-03-01,...,Journal of global health,,,,document_parses/pdf_json/3a319ff1900a812347b31...,document_parses/pmc_json/PMC7980087.xml.json,https://doi.org/10.7189/jogh.11.10001; https:/...,232336581.0,,The search strategy was designed to identify s...


In [20]:
# We are trusting the text from pmc folder to be of higher quality as it contains full text. 
# Hence we will replace with body_text_x with body_text_y where body_text_y exists
df_merged.loc[df_merged.body_text_y.notnull(),'body_text_x'] = df_merged.loc[df_merged.body_text_y.notnull(), 'body_text_y']

In [21]:
# Lets get rid of the pdf_pmc body text column and rename the body text column
df_merged.rename(columns = {'body_text_x' : 'body_text'}, inplace = True)
df_merged.drop('body_text_y',axis=1,inplace = True)
df_merged.columns

Index(['cord_uid', 'sha', 'source_x', 'title', 'doi', 'pmcid', 'pubmed_id',
       'license', 'abstract', 'publish_time', 'authors', 'journal', 'mag_id',
       'who_covidence_id', 'arxiv_id', 'pdf_json_files', 'pmc_json_files',
       'url', 's2_id', 'body_text'],
      dtype='object')

In [22]:
# Body text null values have now decreased.
df_merged.body_text.isnull().sum()

97

In [23]:
df_merged.columns     

Index(['cord_uid', 'sha', 'source_x', 'title', 'doi', 'pmcid', 'pubmed_id',
       'license', 'abstract', 'publish_time', 'authors', 'journal', 'mag_id',
       'who_covidence_id', 'arxiv_id', 'pdf_json_files', 'pmc_json_files',
       'url', 's2_id', 'body_text'],
      dtype='object')

In [24]:
df_final = df_merged[['sha', 'title', 'abstract', 'publish_time', 'authors', 'url', 'body_text']]

In [25]:
df_final.head()

Unnamed: 0,sha,title,abstract,publish_time,authors,url,body_text
0,82df15cd2620c224affd4ce100e7fac7fcff7b10; 67f7...,Deconvoluting Lipid Nanoparticle Structure for...,[Image: see text] Lipid nanoparticle (LNP) pac...,2020-05-06,"Eygeris, Yulia; Patel, Siddharth; Jozic, Anton...",https://doi.org/10.1021/acs.nanolett.0c01386; ...,mRNA-based therapies and vaccines hold tremend...
1,1e7e71936cb8aa74a8c9e8ba90c5d729dec7f405,Predictability in Contemporary Medicine,Medical practice is increasingly coming under ...,2021-06-16,"Ciulla, Michele M.",https://www.ncbi.nlm.nih.gov/pubmed/34222267/;...,The aim of this paper is to present in a histo...
2,cc4e338a21138f18ef26af7b26c5ecc28270d959,Experimental Models of Hepatocellular Carcinom...,SIMPLE SUMMARY: Hepatocellular carcinoma (HCC)...,2021-07-21,"Blidisel, Alexandru; Marcovici, Iasmina; Coric...",https://www.ncbi.nlm.nih.gov/pmc/articles/PMC8...,"Hepatocellular carcinoma (HCC), the most commo..."
3,3114420bfc07e18049a9e7045b42c4de7a30bb81,Cancer and COVID-19: analysis of patient outcomes,Background: We sought to investigate the outco...,2021-07-15,"Aboueshia, Mohamed; Hussein, Mohammad Hosny; A...",https://www.ncbi.nlm.nih.gov/pubmed/34263660/;...,Study data were collected and managed using RE...
4,1ef3ec3dfb9bea22c1060c1c469fc1de5c41f15a,"Haematuria, a widespread petechial rash, and h...",With increasing presentations of headaches fol...,2021-10-07,"Waraich, Ammar; Williams, George",https://www.ncbi.nlm.nih.gov/pubmed/34620638/;...,With increasing presentations of patients with...


In [26]:
df_final = df_final.dropna(axis=0,subset=['body_text', 'title'])
df_final.isnull().sum()

sha                0
title              0
abstract        3535
publish_time       0
authors          274
url                0
body_text          0
dtype: int64

In [27]:
df_final.shape

(29902, 7)

In [28]:
df_final.to_csv('data/FINAL_CORD_DATA.csv', index=False)