In [9]:
import glob
import json
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

## Load data

The loading code in this section follows the notebook by Ivan Ega Pratama.

Citation: https://www.kaggle.com/ivanegapratama/covid-eda-initial-exploration-tool

### Read metadata.csv

In [15]:
# Dataset root. Set the CORD19_ROOT environment variable to run this notebook on
# another machine; when it is unset the original local path is used unchanged.
root_path = os.environ.get(
    "CORD19_ROOT",
    "/Users/lmeng/Documents/Kaggle/CORD-19-research-challenge",
)
metadata_path = f"{root_path}/metadata.csv"
# Force the identifier columns to str so pandas does not infer a numeric dtype
# for them.
meta_df = pd.read_csv(metadata_path, dtype={
    'pubmed_id': str, 
    'Microsoft Academic Paper ID': str, 
    'doi': str
})
print(meta_df.shape)
meta_df.head()

(44220, 15)


Unnamed: 0,sha,source_x,title,doi,pmcid,pubmed_id,license,abstract,publish_time,authors,journal,Microsoft Academic Paper ID,WHO #Covidence,has_full_text,full_text_file
0,,Elsevier,Intrauterine virus infections and congenital h...,10.1016/0002-8703(72)90077-4,,4361535,els-covid,Abstract The etiologic basis for the vast majo...,1972-12-31,"Overall, James C.",American Heart Journal,,,False,custom_license
1,,Elsevier,Coronaviruses in Balkan nephritis,10.1016/0002-8703(80)90355-5,,6243850,els-covid,,1980-03-31,"Georgescu, Leonida; Diosi, Peter; Buţiu, Ioan;...",American Heart Journal,,,False,custom_license
2,,Elsevier,Cigarette smoking and coronary heart disease: ...,10.1016/0002-8703(80)90356-7,,7355701,els-covid,,1980-03-31,"Friedman, Gary D",American Heart Journal,,,False,custom_license
3,aecbc613ebdab36753235197ffb4f35734b5ca63,Elsevier,Clinical and immunologic studies in identical ...,10.1016/0002-9343(73)90176-9,,4579077,els-covid,"Abstract Middle-aged female identical twins, o...",1973-08-31,"Brunner, Carolyn M.; Horwitz, David A.; Shann,...",The American Journal of Medicine,,,True,custom_license
4,,Elsevier,Epidemiology of community-acquired respiratory...,10.1016/0002-9343(85)90361-4,,4014285,els-covid,Abstract Upper respiratory tract infections ar...,1985-06-28,"Garibaldi, Richard A.",The American Journal of Medicine,,,False,custom_license


### Read JSON files

In [3]:
# Recursively collect every *.json file under the dataset root.
# The glob module returns matches in arbitrary (unsorted) order.
all_json = list(glob.iglob(root_path + "/**/*.json", recursive=True))
print(len(all_json))
all_json[:3]

29315


['/Users/lmeng/Documents/Kaggle/CORD-19-research-challenge/custom_license/custom_license/ab680d5dbc4f51252da3473109a7885dd6b5eb6f.json',
 '/Users/lmeng/Documents/Kaggle/CORD-19-research-challenge/custom_license/custom_license/6599ebbef3d868afac9daa4f80fa075675cf03bc.json',
 '/Users/lmeng/Documents/Kaggle/CORD-19-research-challenge/custom_license/custom_license/eb5c7f3ff921ad6469b79cc8a3c122648204ece4.json']

In [11]:
# Load one sample file and pretty-print it to inspect the JSON structure.
with open(all_json[4]) as file:
    first_entry = json.load(file)
# Dump after the file is closed; the parsed dict is all that is needed here.
print(json.dumps(first_entry, indent=4))

{
    "paper_id": "68c0bb1989b6ca2b38da32a0d992027db39f80bc",
    "metadata": {
        "title": "Spring 2020 | 1 Beijing's Hard and Soft Repression in Hong Kong",
        "authors": [
            {
                "first": "Victoria",
                "middle": [],
                "last": "Tin-Bor Hui",
                "suffix": "",
                "affiliation": {},
                "email": ""
            }
        ]
    },
    "abstract": [
        {
            "text": "Hong Kong's new Police Commissioner Chris Tang announced in Beijing on December 7, 2019, that he would use \"both hard and soft approaches\" to end the anti-government protests. This article argues that such \"approaches\" amount to physical and non-physical repression-hard power, but employed by Hong Kong, rather than mainland, forces, combined with sharp power exercised by both Beijing and the local authorities. These measures are responses to the limits on what Beijing can do under the \"one country, two systems\"

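The JSON schema provided with the dataset does not match the actual files. The structure observed in the files is as follows (fields used in this notebook are in bold):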
- **paper_id**
- metadata
    - title
    - authors
- **abstract**
- **body_text**
- bib_entries
- ref_entries
- back_matter

In [36]:
# helper class to parse JSON
class FileReader:
    """Parse one full-text JSON file into flat text fields.

    Only the fields used downstream are extracted; title and authors are not
    read from the JSON (the notebook takes them from metadata.csv instead).

    Attributes:
        paper_id: Value of the top-level ``paper_id`` key.
        abstract: ``text`` of every ``abstract`` entry joined with newlines;
            empty string if the section is missing or empty.
        body_text: ``text`` of every ``body_text`` entry joined with newlines;
            empty string if the section is missing or empty.

    Raises:
        KeyError: If ``paper_id`` is absent, or a section entry has no ``text``.
        OSError / json.JSONDecodeError: If the file cannot be read or parsed.
    """

    def __init__(self, file_path):
        # Decode explicitly as UTF-8 instead of relying on the platform's
        # default encoding, which open() would otherwise use.
        with open(file_path, encoding="utf-8") as file:
            content = json.load(file)
        self.paper_id = content['paper_id']
        # .get() so a file without an abstract/body section yields '' rather
        # than aborting a whole batch load with KeyError.
        self.abstract = self._join_text(content.get('abstract'))
        self.body_text = self._join_text(content.get('body_text'))

    @staticmethod
    def _join_text(entries):
        """Join the ``text`` of each entry with newlines ('' for None/empty)."""
        return '\n'.join(entry['text'] for entry in entries or [])

    def __repr__(self):
        """Return the paper id plus the first 200 chars of abstract and body."""
        return f'{self.paper_id}: {self.abstract[:200]}... {self.body_text[:200]}...'

In [37]:
# Sanity-check the parser on the sample file at index 4.
first_row = FileReader(all_json[4])
print(repr(first_row))

68c0bb1989b6ca2b38da32a0d992027db39f80bc: Hong Kong's new Police Commissioner Chris Tang announced in Beijing on December 7, 2019, that he would use "both hard and soft approaches" to end the anti-government protests. This article argues that... It is also noteworthy that Tang, who was once the district commander in Yuen Long, is reputed to be linked to the mob attacks on commuters, residents, and passers-by in Yuen Long station on July 21. 1...


### Transform into dataframe

In [14]:
# Parse every JSON file and collect the extracted fields column-wise.
dict_ = {'paper_id': [], 'abstract': [], 'body_text': []}
# Report progress about five times over the run. max(1, ...) keeps the modulo
# below from dividing by zero when there are fewer than 5 files.
progress_step = max(1, len(all_json) // 5)
for idx, entry in enumerate(all_json):
    if idx % progress_step == 0:
        print(f'Processing index: {idx} of {len(all_json)}')
    content = FileReader(entry)
    dict_['paper_id'].append(content.paper_id)
    dict_['abstract'].append(content.abstract)
    dict_['body_text'].append(content.body_text)
# Build the frame once from the finished lists (one row per JSON file).
covid_df = pd.DataFrame(dict_, columns=['paper_id', 'abstract', 'body_text'])
covid_df.head()

Processing index: 0 of 29315
Processing index: 5863 of 29315
Processing index: 11726 of 29315
Processing index: 17589 of 29315
Processing index: 23452 of 29315


Unnamed: 0,paper_id,abstract,body_text
0,ab680d5dbc4f51252da3473109a7885dd6b5eb6f,,The evolutionary history of humans is characte...
1,6599ebbef3d868afac9daa4f80fa075675cf03bc,"International aviation is growing rapidly, res...","Sixty years ago, civil aviation was an infant ..."
2,eb5c7f3ff921ad6469b79cc8a3c122648204ece4,,Acute infections of the gastrointestinal tract...
3,b87b790c96c75faa22a085cb560f7b3d8e018b24,,"There are three domains of life-Bacteria, Arch..."
4,68c0bb1989b6ca2b38da32a0d992027db39f80bc,Hong Kong's new Police Commissioner Chris Tang...,"It is also noteworthy that Tang, who was once ..."


### Add word-count columns

In [22]:
# Word count = number of whitespace-delimited tokens. str.split() with no
# separator already ignores leading/trailing whitespace, so no strip() is
# needed; the vectorised .str accessor replaces the per-row lambda.
covid_df["abstract_count"] = covid_df["abstract"].str.split().str.len()
covid_df["body_count"] = covid_df["body_text"].str.split().str.len()
print(covid_df.shape)
covid_df.head()

(29315, 5)


Unnamed: 0,paper_id,abstract,body_text,abstract_count,body_count
0,ab680d5dbc4f51252da3473109a7885dd6b5eb6f,,The evolutionary history of humans is characte...,0,2884
1,6599ebbef3d868afac9daa4f80fa075675cf03bc,"International aviation is growing rapidly, res...","Sixty years ago, civil aviation was an infant ...",140,5838
2,eb5c7f3ff921ad6469b79cc8a3c122648204ece4,,Acute infections of the gastrointestinal tract...,0,6972
3,b87b790c96c75faa22a085cb560f7b3d8e018b24,,"There are three domains of life-Bacteria, Arch...",0,7309
4,68c0bb1989b6ca2b38da32a0d992027db39f80bc,Hong Kong's new Police Commissioner Chris Tang...,"It is also noteworthy that Tang, who was once ...",154,5593


### Join metadata and JSON data

In [16]:
# Look up the metadata row whose sha equals the sample paper's paper_id.
meta_df[meta_df["sha"] == "68c0bb1989b6ca2b38da32a0d992027db39f80bc"]

Unnamed: 0,sha,source_x,title,doi,pmcid,pubmed_id,license,abstract,publish_time,authors,journal,Microsoft Academic Paper ID,WHO #Covidence,has_full_text,full_text_file
6113,68c0bb1989b6ca2b38da32a0d992027db39f80bc,Elsevier,Beijing's Hard and Soft Repression in Hong Kong,10.1016/j.orbis.2020.02.010,,,els-covid,Abstract Hong Kong's new Police Commissioner C...,2020-03-04,"Hui, Victoria Tin-bor",Orbis,,,True,custom_license


In [26]:
# List the metadata columns to decide which ones to carry into the merge.
meta_df.columns

Index(['sha', 'source_x', 'title', 'doi', 'pmcid', 'pubmed_id', 'license',
       'abstract', 'publish_time', 'authors', 'journal',
       'Microsoft Academic Paper ID', 'WHO #Covidence', 'has_full_text',
       'full_text_file'],
      dtype='object')

In [43]:
# Keep only the join key (sha) and the descriptive columns wanted downstream.
keep_cols = ['sha', 'title', 'publish_time', 'authors', 'journal']
meta_thin = meta_df.loc[:, keep_cols]

In [44]:
# Inner-join parsed full text with metadata on paper_id == sha, then drop the
# now-redundant sha column. Rows without a match on either side are discarded.
merged = (
    covid_df
    .merge(meta_thin, how="inner", left_on="paper_id", right_on="sha")
    .drop(columns="sha")
)
print(merged.shape)
merged.head()

(27690, 9)


Unnamed: 0,paper_id,abstract,body_text,abstract_count,body_count,title,publish_time,authors,journal
0,ab680d5dbc4f51252da3473109a7885dd6b5eb6f,,The evolutionary history of humans is characte...,0,2884,Evolutionary Medicine IV. Evolution and Emerge...,2016-12-31,"Scarpino, S.V.",Encyclopedia of Evolutionary Biology
1,6599ebbef3d868afac9daa4f80fa075675cf03bc,"International aviation is growing rapidly, res...","Sixty years ago, civil aviation was an infant ...",140,5838,International aviation emissions to 2025: Can ...,2009-01-31,"Macintosh, Andrew; Wallace, Lailey",Energy Policy
2,eb5c7f3ff921ad6469b79cc8a3c122648204ece4,,Acute infections of the gastrointestinal tract...,0,6972,Mechanisms of diarrhoea,1993-06-30,"Booth, I.W.; McNeish, A.S.",Baillière's Clinical Gastroenterology
3,b87b790c96c75faa22a085cb560f7b3d8e018b24,,"There are three domains of life-Bacteria, Arch...",0,7309,Chapter 3 Features of Host Cells Cellular and ...,2016-12-31,"Louten, Jennifer",Essential Human Virology
4,68c0bb1989b6ca2b38da32a0d992027db39f80bc,Hong Kong's new Police Commissioner Chris Tang...,"It is also noteworthy that Tang, who was once ...",154,5593,Beijing's Hard and Soft Repression in Hong Kong,2020-03-04,"Hui, Victoria Tin-bor",Orbis


In [50]:
# to_csv does not create missing parent directories, so make sure the output
# folder exists before writing (no-op when it is already there).
os.makedirs(f"{root_path}/Data", exist_ok=True)
merged.to_csv(f"{root_path}/Data/merged_v1.csv", index=False)

In [51]:
# Summarise dtypes and non-null counts of the merged frame.
merged.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 27690 entries, 0 to 27689
Data columns (total 9 columns):
paper_id          27690 non-null object
abstract          27690 non-null object
body_text         27690 non-null object
abstract_count    27690 non-null int64
body_count        27690 non-null int64
title             27646 non-null object
publish_time      27618 non-null object
authors           26940 non-null object
journal           26796 non-null object
dtypes: int64(2), object(7)
memory usage: 2.1+ MB
