# Text processing using the spaCy library

## Setup

In [1]:
import json
from typing import List
import spacy
# Load the spaCy pipeline once at module level; tokenize_string() looks it up
# as the global name `nlp`.
nlp = spacy.load("en_core_web_sm")

## Open data

In [2]:
# JSON text is UTF-8 by specification; pin the encoding so that loading does
# not depend on the platform's locale default (the articles contain non-ASCII
# characters).
with open("data.json", "r", encoding="utf-8") as read_file:
    CDC_data = json.load(read_file)

## Text lemmatization functions

In [3]:
def tokenize_string(text: str) -> List[str]:
    """Lowercase a string and reduce it to a list of informative lemmas.

    The lowercased text is processed by the spaCy pipeline bound to the
    module-level name ``nlp``. Tokens tagged as punctuation, whitespace,
    symbols or ``X`` (unclassified) are discarded, as are stop words; the
    lemma of every remaining token is kept.

    Args:
        text: Input text to tokenize.

    Returns:
        Lemmas of the retained tokens, in document order.
    """
    excluded_pos = ['PUNCT', 'SPACE', 'SYM', 'X']
    lemmas = []
    for tok in nlp(text.lower()):
        # Skip uninformative tokens: excluded part-of-speech tags or stop words.
        if tok.pos_ in excluded_pos or tok.is_stop:
            continue
        lemmas.append(tok.lemma_)
    return lemmas

In [4]:
def add_tokens(article: dict) -> dict:
    """Attach a ``tokenized_text`` entry to an article dictionary.

    Args:
        article: Article dictionary; its ``text`` entry is tokenized with
            ``tokenize_string``.

    Returns:
        The same dictionary object, modified in place, now holding the token
        list under ``tokenized_text``.
    """
    tokens = tokenize_string(article['text'])
    article['tokenized_text'] = tokens
    return article

In [5]:
def tokenize(corpus: List[dict]) -> List[dict]:
    """Add a ``tokenized_text`` field to every article dictionary in a corpus.

    Args:
        corpus: List of article dictionaries, each containing a ``text`` field.

    Returns:
        The same list object that was passed in. Each dictionary is modified
        in place by ``add_tokens``, so the caller's data is updated as well.
    """
    for article in corpus:
        # add_tokens mutates the dictionary in place; its return value is the
        # same object, so there is nothing to rebind here.
        add_tokens(article)
    return corpus

In [6]:
# tokenize() adds the tokens to each article in place and returns the list it
# was given, so tokenized_CDC and CDC_data refer to the same object.
tokenized_CDC = tokenize(CDC_data)

In [7]:
# Example of a tokenized article
tokenized_CDC[2]

{'title': 'Antonine Plague',
 'text': 'The Antonine Plague of 165 to 180 AD, also known as the Plague of Galen (after Galen, the physician who described it), was an ancient pandemic brought to the Roman Empire by troops who were returning from campaigns in the Near East. Scholars have suspected it to have been either smallpox or measles. The plague may have claimed the life of a Roman emperor, Lucius Verus, who died in 169 and was the co-regent of Marcus Aurelius Antoninus, whose family name, Antoninus, has become associated with the pandemic. \nAncient sources agree that the plague appeared first during the Roman siege of the Mesopotamian city Seleucia in the winter of 165–166. Ammianus Marcellinus reported that the plague spread to Gaul and to the legions along the Rhine. Eutropius stated that a large population died throughout the empire. According to the contemporary Roman historian Cassius Dio, the disease broke out again nine years later in 189 AD and caused up to 2,000 deaths a 

## Save tokenized data to json

In [8]:
# Pin the encoding so the written file does not depend on the platform's
# locale default.
with open('tokenized_data.json', 'w', encoding='utf-8') as out:
    json.dump(tokenized_CDC, out)