In [87]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import string
import spacy
import nltk
from sklearn.feature_extraction import stop_words

# nltk.download('wordnet')

## Load merged data

In [2]:
# NOTE(review): absolute, machine-specific path — this cell only runs where
# this exact directory exists; consider making the root configurable.
root_path = "/Users/lmeng/Documents/Kaggle/CORD-19-research-challenge"
merged_path = f"{root_path}/Data/merged_v1.csv"
# Read the merged CSV into a DataFrame, then show its dimensions and first rows.
covid = pd.read_csv(merged_path)
print(covid.shape)
covid.head()

(27690, 9)


Unnamed: 0,paper_id,abstract,body_text,abstract_count,body_count,title,publish_time,authors,journal
0,ab680d5dbc4f51252da3473109a7885dd6b5eb6f,,The evolutionary history of humans is characte...,0,2884,Evolutionary Medicine IV. Evolution and Emerge...,2016-12-31,"Scarpino, S.V.",Encyclopedia of Evolutionary Biology
1,6599ebbef3d868afac9daa4f80fa075675cf03bc,"International aviation is growing rapidly, res...","Sixty years ago, civil aviation was an infant ...",140,5838,International aviation emissions to 2025: Can ...,2009-01-31,"Macintosh, Andrew; Wallace, Lailey",Energy Policy
2,eb5c7f3ff921ad6469b79cc8a3c122648204ece4,,Acute infections of the gastrointestinal tract...,0,6972,Mechanisms of diarrhoea,1993-06-30,"Booth, I.W.; McNeish, A.S.",Baillière's Clinical Gastroenterology
3,b87b790c96c75faa22a085cb560f7b3d8e018b24,,"There are three domains of life-Bacteria, Arch...",0,7309,Chapter 3 Features of Host Cells Cellular and ...,2016-12-31,"Louten, Jennifer",Essential Human Virology
4,68c0bb1989b6ca2b38da32a0d992027db39f80bc,Hong Kong's new Police Commissioner Chris Tang...,"It is also noteworthy that Tang, who was once ...",154,5593,Beijing's Hard and Soft Repression in Hong Kong,2020-03-04,"Hui, Victoria Tin-bor",Orbis


In [3]:
# Summarize per-column dtypes and non-null counts to see which fields have missing values.
covid.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27690 entries, 0 to 27689
Data columns (total 9 columns):
paper_id          27690 non-null object
abstract          20228 non-null object
body_text         27690 non-null object
abstract_count    27690 non-null int64
body_count        27690 non-null int64
title             27646 non-null object
publish_time      27618 non-null object
authors           26940 non-null object
journal           26796 non-null object
dtypes: int64(2), object(7)
memory usage: 1.9+ MB


### Drop duplicate articles

In [8]:
# Report how many distinct article bodies exist, then keep only the first row
# for each distinct body (the frame is modified in place).
print(covid["body_text"].nunique())
covid.drop_duplicates(subset="body_text", inplace=True)

27662


## Processing pipeline
- Tokenize
- Lemmatize / (Stem)
- Remove stop words

In [90]:
# Take the first 10,000 rows of the article bodies.
# The explicit .copy() makes `body` an independent frame: without it, `body` is
# a slice of a temporary DataFrame and assigning to one of its columns later
# triggers pandas' SettingWithCopyWarning.
body = covid[["body_text"]].iloc[:10000].copy()
body.head(5)

Unnamed: 0,body_text
0,The evolutionary history of humans is characte...
1,"Sixty years ago, civil aviation was an infant ..."
2,Acute infections of the gastrointestinal tract...
3,"There are three domains of life-Bacteria, Arch..."
4,"It is also noteworthy that Tang, who was once ..."


### Wrap the pipeline in a single function

In [91]:
# Characters replaced by a space before tokenizing: ASCII punctuation, digits,
# carriage return, tab and newline. re.escape is required here: spliced in raw,
# the `\]` pair inside string.punctuation is parsed as an escaped `]`, so a
# literal backslash would never be matched and would survive the cleaning step.
_NON_WORD_CHARS = re.compile("[" + re.escape(string.punctuation) + "0-9\\r\\t\\n]")


def tokenize(text):
    """Turn raw text into a list of lower-cased, lemmatized tokens.

    Steps, in order:
      1. Lower-case the text.
      2. Replace punctuation, digits, CR, tab and newline with a space.
      3. Split into tokens with ``nltk.word_tokenize``.
      4. Drop tokens found in ``stop_words.ENGLISH_STOP_WORDS`` and tokens of
         two characters or fewer.
      5. Lemmatize the remaining tokens with ``nltk.stem.WordNetLemmatizer``.

    Stop words are removed *before* lemmatization, so a token that only becomes
    a stop word after lemmatizing is kept.

    Args:
        text: The document to process. Anything that is not a ``str`` (for
            example the NaN pandas uses for a missing value) is treated as an
            empty document.

    Returns:
        list[str]: The surviving lemmas, in their original order.
    """
    # Missing values reach .apply() as float NaN, which has no .lower().
    if not isinstance(text, str):
        return []

    cleaned = _NON_WORD_CHARS.sub(" ", text.lower())

    # NOTE(review): `stop_words` comes from `sklearn.feature_extraction`, which
    # newer scikit-learn releases no longer expose as a public module —
    # confirm against the installed version.
    lemmatizer = nltk.stem.WordNetLemmatizer()
    return [
        lemmatizer.lemmatize(token)
        for token in nltk.word_tokenize(cleaned)
        if len(token) > 2 and token not in stop_words.ENGLISH_STOP_WORDS
    ]

In [92]:
# Replace every raw article body with the token list produced by `tokenize`.
# NOTE(review): this overwrites the column it reads from, so running the cell a
# second time would feed token lists (not strings) back into `tokenize`.
body["body_text"] = body.body_text.apply(tokenize)
body.head()

Unnamed: 0,body_text
0,"[evolutionary, history, human, characterized, ..."
1,"[year, ago, civil, aviation, infant, industry,..."
2,"[acute, infection, gastrointestinal, tract, fa..."
3,"[domain, life, bacteria, archaea, eukarya, org..."
4,"[noteworthy, tang, district, commander, yuen, ..."


In [30]:
# Load the spaCy model named "en_core_web_sm".
# NOTE(review): `spacy_nlp` is not used by any other cell visible here.
spacy_nlp = spacy.load("en_core_web_sm")

In [67]:
# Scratch check of the punctuation-stripping regex on a small sample sentence.
text = "Harry he's is 12going \n to marry , her. anyway!?"
# re.escape keeps every punctuation character literal inside the class; spliced
# in raw, the `\]` pair in string.punctuation is read as an escaped `]`, so a
# literal backslash would not be replaced.
noise_pattern = re.compile("[" + re.escape(string.punctuation) + "\\r\\t\\n]")
text = noise_pattern.sub(" ", text)
text


'Harry he s is 12going   to marry   her  anyway  '

In [86]:
# Spot-check how the WordNet lemmatizer handles the string "cant"; the recorded
# output shows it is returned unchanged.
lemmatizer = nltk.stem.WordNetLemmatizer()
lemmatizer.lemmatize("cant")

'cant'