In [1]:
# Imports
import os
from os.path import join as join_path
import numpy as np
rng_seed = 368
np.random.seed(rng_seed) # Random seed for reproducibility
import pandas as pd
import glob
import json
from tqdm.notebook import tqdm
tqdm.pandas()

# Spacy (for language detection)
# -----
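# To install (versions pinned: the 0.2.4 model and the add_pipe(component, name=...) call
# used in this notebook target the spaCy 2.x API -- confirm against your environment):
# %pip install "spacy>=2.2.1,<3"
# %pip install scispacy==0.2.4
# %pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.2.4/en_core_sci_lg-0.2.4.tar.gz
# %pip install spacy-langdetect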
import scispacy
import spacy
import en_core_sci_lg # Biomedical word embeddings
from spacy_langdetect import LanguageDetector

  from pandas import Panel


In [2]:
# Show what is inside the data directory before processing it
!ls data

COVID.DATA.LIC.AGMT.pdf cord-19-data.csv        metadata.csv
[1m[36mbiorxiv_medrxiv[m[m         [1m[36mcustom_license[m[m          metadata.readme
[1m[36mcomm_use_subset[m[m         json_schema.txt         [1m[36mnoncomm_use_subset[m[m


In [3]:
class CORD19Data():
    '''Builds a single DataFrame out of a CORD-19 data directory.

    The pipeline (see `process_data`) reads `metadata.csv`, parses every JSON
    article found below the data directory, joins the two on the article hash,
    drops articles without metadata and exact duplicates, and finally tags each
    article with a detected language.

    Attributes:
        data_dir: Directory containing `metadata.csv` and the JSON articles.
        nlp: spaCy pipeline (en_core_sci_lg) with a language detector appended.
        nlp_words_to_check: Number of leading words handed to the language detector.
    '''
    def __init__(self, data_dir: str):
        '''Stores the data directory and sets up the spaCy pipeline.

        Args:
            data_dir: Directory containing `metadata.csv` and the JSON articles.
        '''
        self.data_dir = data_dir

        # Initialize NLP model; the tagger and NER components are switched off at load time
        self.nlp = en_core_sci_lg.load(disable=["tagger", "ner"])
        self.nlp.max_length = 2000000
        self.nlp.add_pipe(LanguageDetector(), name='language_detector', last=True)

        # Only this many leading words of a text are used for language detection
        self.nlp_words_to_check = 100

    def _load_metadata(self) -> pd.DataFrame:
        '''Reads `metadata.csv` from the data directory.

        Returns:
            The metadata as a DataFrame. The `pubmed_id`,
            `Microsoft Academic Paper ID` and `doi` columns are read as strings.
        '''
        print('Loading metadata...')
        cord_metadata_df = pd.read_csv(join_path(self.data_dir, 'metadata.csv'), dtype={
            'pubmed_id': str,
            'Microsoft Academic Paper ID': str,
            'doi': str
        })
        print('Done!')
        return cord_metadata_df

    def _parse_json_article(self, article_path: str) -> tuple:
        '''Parses a CORD-19 JSON article

        Args:
            article_path: JSON article path to parse

        Returns:
            A `(paper_id, abstract, body_text)` tuple of strings. The paragraphs
            of each section are joined with newlines; a section that is missing
            from the file (or null) yields an empty string.

        Raises:
            KeyError: If the article has no `paper_id`.
        '''
        # JSON is UTF-8 by specification; do not depend on the platform default encoding
        with open(article_path, 'r', encoding='utf-8') as file:
            content = json.load(file)

        paper_id = content['paper_id']

        # Not every article carries every section, so fall back to "no paragraphs"
        abstract = '\n'.join(item['text'] for item in content.get('abstract') or [])
        body_text = '\n'.join(item['text'] for item in content.get('body_text') or [])

        return paper_id, abstract, body_text

    def _parse_articles(self) -> pd.DataFrame:
        '''Parses every `*.json` file found (recursively) below the data directory.

        Returns:
            A DataFrame with the columns `paper_id`, `abstract` and `body_text`,
            one row per JSON file.
        '''
        print('Parsing JSON articles...')

        # glob order is OS-dependent; sort so the row order (and therefore which
        # copy of a duplicate survives de-duplication later) is reproducible
        all_cord_article_paths = sorted(glob.glob(f'{self.data_dir}/**/*.json', recursive=True))

        records = [
            self._parse_json_article(article_path)
            for article_path in tqdm(all_cord_article_paths, unit='article')
        ]

        # Explicit columns keep the schema intact even when no article was found
        df = pd.DataFrame(records, columns=['paper_id', 'abstract', 'body_text'])
        print('Done!')
        return df

    def _merge_metadata_articles(self, metadata_df: pd.DataFrame, articles_df: pd.DataFrame) -> pd.DataFrame:
        '''Left-joins the metadata onto the parsed articles.

        Articles are matched via `paper_id` == metadata `sha`. The abstract
        parsed from the JSON file is kept, the metadata abstract is dropped and
        the metadata column `source_x` is renamed to `source`.

        Args:
            metadata_df: Metadata as returned by `_load_metadata`; must contain
                the columns `sha` and `abstract`.
            articles_df: Articles as returned by `_parse_articles`.

        Returns:
            The merged DataFrame; articles without a metadata match keep NaN in
            the metadata columns.
        '''
        print('Merging DataFrames...')
        df = pd.merge(articles_df, metadata_df, left_on='paper_id', right_on='sha', how='left')

        # Both inputs have an `abstract` column, so pandas suffixes them with _x (articles) / _y (metadata)
        df = df.drop(['sha', 'abstract_y'], axis=1)
        df = df.rename(columns = {'abstract_x': 'abstract', 'source_x': 'source'})

        print('Done!')
        return df

    def _exlude_non_metadata_articles(self, df: pd.DataFrame) -> pd.DataFrame:
        '''Keeps only the rows whose `full_text_file` value is present.

        Args:
            df: Merged DataFrame containing a `full_text_file` column.

        Returns:
            A new DataFrame with the rows lacking `full_text_file` removed.
        '''
        print('Excluding articles without metadata...')

        # Copy so later steps work on an independent frame, not on a slice of `df`
        df = df[df.full_text_file.notna()].copy()

        print('Done!')
        return df

    def _remove_duplicates(self, df: pd.DataFrame) -> pd.DataFrame:
        '''Drops rows that repeat an earlier (`abstract`, `body_text`) pair.

        Args:
            df: DataFrame containing `abstract` and `body_text` columns.

        Returns:
            A new DataFrame with the first occurrence of each pair kept; the
            input frame is left untouched.
        '''
        print('Removing duplicates...')
        df = df.drop_duplicates(['abstract', 'body_text'])

        print('Done!')
        return df

    def _extract_language(self, text: str) -> str:
        '''Detects the language of a text from its first words.

        Args:
            text: Text to inspect; only the first `nlp_words_to_check`
                whitespace-separated words are passed to the NLP pipeline.

        Returns:
            The `language` entry reported by the language detector.
        '''
        # Extract language using spaCy
        text_first_words = ' '.join(text.split(maxsplit=self.nlp_words_to_check)[:self.nlp_words_to_check])
        lang = self.nlp(text_first_words)._.language['language']

        return lang

    def _perform_lang_detection(self, df: pd.DataFrame) -> pd.DataFrame:
        '''Adds a `language` column derived from each row's `body_text`.

        Args:
            df: DataFrame containing a `body_text` column.

        Returns:
            A new DataFrame equal to `df` plus the `language` column; the input
            frame is left untouched.
        '''
        print('Performing language detection...')

        # Extract language (progress_apply is provided by tqdm's pandas integration)
        df = df.assign(language=df.body_text.progress_apply(self._extract_language))

        print('Done!')
        return df

    def _save_to_file(self, df: pd.DataFrame, filename: str):
        '''Writes the DataFrame to a CSV file without the index.

        Args:
            df: DataFrame to save.
            filename: Destination path of the CSV file.
        '''
        print('Saving to file...')
        df.to_csv(filename, index=False)
        print('Done!')

    def process_data(self, save_to_filename: str = None) -> pd.DataFrame:
        '''Processes the CORD-19 data.

        Loads and pre-processes CORD-19 data in specified data directory.
        We take inspiration from/follow Daniel Wolffram's "CORD-19: Create Dataframe" Notebook
        - https://www.kaggle.com/danielwolffram/cord-19-create-dataframe

        Args:
            save_to_filename: Where to save the data after processing.
                Default: None (the result is not written to disk).

        Returns:
            The processed DataFrame.
        '''
        # Perform pre-processing
        metadata_df = self._load_metadata()
        articles_df = self._parse_articles()
        df = self._merge_metadata_articles(metadata_df, articles_df)
        df = self._exlude_non_metadata_articles(df)
        df = self._remove_duplicates(df)
        df = self._perform_lang_detection(df)

        if save_to_filename is not None:
            self._save_to_file(df, save_to_filename)

        return df

In [4]:
# Run the whole pre-processing pipeline on ./data and store the result as a
# CSV file inside that same directory.
cord_data_dir = 'data'
cord_data_path = join_path(cord_data_dir, 'cord-19-data.csv')

# The trailing semicolon keeps the cell from echoing whatever the call returns
CORD19Data(data_dir=cord_data_dir).process_data(save_to_filename=cord_data_path);

Loading metadata...
Done!
Parsing JSON articles...


HBox(children=(FloatProgress(value=0.0, max=33375.0), HTML(value='')))


Done!
Merging DataFrames...
Done!
Excluding articles without metadata...
Done!
Removing duplicates...
Done!
Performing language detection...


HBox(children=(FloatProgress(value=0.0, max=30184.0), HTML(value='')))


Done!
Saving to file...
Done!
