## Basic Dataset Stats

Things to look at:
- number of articles without abstracts or full text
- histogram of article lengths

In [None]:
import glob
import json
import os

import numpy as np
import pandas as pd

In [None]:
import matplotlib.pyplot as plt
plt.style.use('ggplot')

## Load Dataset

Number of articles without abstracts and/or full text in metadata

In [None]:
# Location of the downloaded dataset; the metadata CSV sits directly under it.
root_path = '~/Documents/CORD-19-research-challenge'
metadata_path = '/'.join([root_path, 'metadata.csv'])

In [None]:
# Read the identifier columns as plain strings instead of letting pandas
# infer a dtype for them.
meta_df = pd.read_csv(
    metadata_path,
    dtype=dict.fromkeys(('pubmed_id', 'Microsoft Academic Paper ID', 'doi'), str),
)
meta_df.head()

In [None]:
# Recode the flag as 1 / NaN so that isnull()/groupby can count it below.
# NaN is truthy in Python, so it has to be tested explicitly: otherwise a
# missing flag would be recoded as 1, and re-running this cell would turn
# every NaN produced by the first run into 1.
meta_df['has_full_text'] = meta_df['has_full_text'].map(
    lambda flag: 1 if pd.notna(flag) and flag else np.nan
)

In [None]:
# Column dtypes and non-null counts for the metadata table.
meta_df.info()

In [None]:
# Number of metadata rows per distinct `full_text_file` value (NaN rows are excluded by value_counts).
meta_df['full_text_file'].value_counts()

In [None]:
# Cross-tabulate missingness: True means the value is missing in that column.
(
    meta_df
    .isna()
    .groupby(by=['abstract', 'has_full_text'])
    .size()
)

In [None]:
# Row counts per (full_text_file, has_full_text) pair; groupby drops rows
# where either key is NaN.
(
    meta_df
    .groupby(by=['full_text_file', 'has_full_text'])
    .size()
)

In [None]:
# Flag rows that actually carry an abstract (1) versus none (NaN).
# Missing abstracts are NaN, and NaN is truthy in Python, so a plain
# `1 if x else nan` would mark them as present. Require a non-blank string.
meta_df['has_abstract'] = meta_df['abstract'].map(
    lambda text: 1 if isinstance(text, str) and text.strip() else np.nan
)
# Rows with an abstract per full-text source; NaN keys are dropped by groupby.
meta_df.groupby(['full_text_file', 'has_abstract']).size()

## Load all JSON files

Note: the number of JSON files does not appear to exactly match the number of `has_full_text` entries in the metadata file.

Edited code courtesy of: https://www.kaggle.com/maksimeren/covid-19-literature-clustering

In [None]:
# Recursively collect every JSON file under the dataset directory.
# glob returns matches in arbitrary, filesystem-dependent order; sorting makes
# `all_json[0]` and the row order of anything built from this list reproducible.
all_json = sorted(glob.glob('../../CORD-19-research-challenge/**/*.json', recursive=True))

In [None]:
# Number of JSON files found by the glob above.
len(all_json)

In [None]:
class FileReader:
    """Parsed view of one full-text JSON file.

    Attributes set by the constructor:
        source: name of the directory that directly contains the file.
        paper_id: value of the top-level ``paper_id`` key.
        abstract: ``text`` of every entry under ``abstract``, joined with
            newlines; empty string when the key is absent.
        body_text: ``text`` of every entry under ``body_text``, joined with
            newlines; empty string when the key is absent.

    Raises:
        KeyError: if the file has no ``paper_id`` key, or a section entry has
            no ``text`` key.
    """

    def __init__(self, file_path):
        # Parent directory name, resolved with os.path so that paths using the
        # platform's native separator are handled as well as '/'.
        self.source = os.path.basename(os.path.dirname(file_path))
        # JSON is UTF-8 by specification; do not depend on the locale default.
        with open(file_path, encoding='utf-8') as file:
            content = json.load(file)
        self.paper_id = content['paper_id']
        # .get(..., []) tolerates files that omit a section instead of
        # aborting the whole load with a KeyError.
        self.abstract = '\n'.join(
            entry['text'] for entry in content.get('abstract', [])
        )
        self.body_text = '\n'.join(
            entry['text'] for entry in content.get('body_text', [])
        )

    def __repr__(self):
        """Short preview: source, paper id and the first 200 characters of each text."""
        return f'{self.source}...{self.paper_id}: {self.abstract[:200]}... {self.body_text[:200]}...'
# Sanity check: parse the first discovered file and show its preview.
first_json_path = all_json[0]
first_row = FileReader(first_json_path)
print(repr(first_row))

In [None]:
# Build one row per JSON file that has a matching metadata record.
dict_ = {'source': [], 'paper_id': [], 'abstract': [], 'body_text': [], 'authors': [], 'title': [], 'journal': []}

# Index the metadata by sha once, so each paper is a hash lookup instead of a
# full boolean scan of meta_df. keep='first' mirrors the previous behaviour of
# taking the first matching row; rows without a sha can never match a paper_id.
meta_by_sha = (
    meta_df
    .dropna(subset=['sha'])
    .drop_duplicates(subset='sha', keep='first')
    .set_index('sha')
)

# Report progress roughly every 10%; the floor of 1 avoids a modulo-by-zero
# when fewer than 10 files were found.
progress_step = max(len(all_json) // 10, 1)

for idx, entry in enumerate(all_json):
    if idx % progress_step == 0:
        print(f'Processing index: {idx} of {len(all_json)}')
    content = FileReader(entry)

    # no metadata, skip this paper
    if content.paper_id not in meta_by_sha.index:
        continue
    meta_row = meta_by_sha.loc[content.paper_id]

    dict_['source'].append(content.source)
    dict_['paper_id'].append(content.paper_id)
    dict_['abstract'].append(content.abstract)
    dict_['body_text'].append(content.body_text)

    dict_['authors'].append(meta_row['authors'])
    dict_['title'].append(meta_row['title'])
    dict_['journal'].append(meta_row['journal'])

# Keep every collected field: title and journal were gathered above but
# previously left out of the column list.
df_covid = pd.DataFrame(
    dict_,
    columns=['source', 'paper_id', 'abstract', 'body_text', 'authors', 'title', 'journal'],
)
df_covid.head()

In [None]:
# Whitespace-delimited word counts for each text column.
for text_col, count_col in (
    ('abstract', 'abstract_word_count'),
    ('body_text', 'body_word_count'),
):
    df_covid[count_col] = df_covid[text_col].map(lambda text: len(text.split()))

# Presence flags: 1 when the text is non-empty, NaN otherwise, so that
# isnull() can be used to tabulate the gaps.
for text_col, flag_col in (
    ('body_text', 'has_body_text'),
    ('abstract', 'has_abstract'),
):
    df_covid[flag_col] = df_covid[text_col].map(lambda text: np.nan if not text else 1)

In [None]:
# Cross-tabulate which papers lack an abstract and/or body text
# (True means the flag is NaN, i.e. the text was empty).
(
    df_covid
    .isna()
    .groupby(by=['has_abstract', 'has_body_text'])
    .size()
)

There do appear to be duplicate entries that share the same abstract and body text.

In [None]:
# Every row whose (abstract, body_text) pair occurs more than once, sorted so
# that identical texts appear next to each other.
text_cols = ['abstract', 'body_text']
duplicate_mask = df_covid.duplicated(subset=text_cols, keep=False)
df_covid.loc[duplicate_mask].sort_values(by=text_cols)

In [None]:
# Distribution of body-text lengths. Draw on an explicit Axes so the figure
# can be labelled and read on its own when the notebook is skimmed.
fig, ax = plt.subplots()
df_covid.hist(column='body_word_count', bins=100, ax=ax)
ax.set(
    title='Body text length per article',
    xlabel='Words in body text',
    ylabel='Number of articles',
);

In [None]:
# The 100 largest body word counts, in ascending order.
df_covid['body_word_count'].sort_values().tail(100).tolist()