In [1]:
# Import dependencies
import json
import spacy

In [2]:
# Load the spaCy language pipeline used by every later cell.
model_name = "en_core_web_sm"
sp = spacy.load(model_name)

In [3]:
# Load the article summaries (a sequence of dicts) from disk.
# The handle is opened for reading, hence `infile`. The encoding is pinned to
# UTF-8 so that non-ASCII text (e.g. the Greek words in the first summary)
# decodes the same way on every platform instead of depending on the locale.
with open('data/data.json', 'r', encoding='utf-8') as infile:
    summaries = json.load(infile)
print(summaries[0].keys())

dict_keys(['title', 'text', 'url'])


### Lowercase and explore the data

In [4]:
# Use the second summary as the sample document for the exploration below.
sample_summary = summaries[1]
text = sample_summary["text"]

In [5]:
# Lowercase the sample text and run it through the spaCy pipeline, then
# inspect a few attributes of the first five tokens it returns:
# the token type, its text, its part-of-speech tag and its dependency label.
lowercased_text = text.lower()
text_tokenized = sp(lowercased_text)
for tok in text_tokenized[:5]:
    print(type(tok), tok.text, tok.pos_, tok.dep_)

<class 'spacy.tokens.token.Token'> hiv PROPN nmod
<class 'spacy.tokens.token.Token'> / SYM punct
<class 'spacy.tokens.token.Token'> aids PROPN nsubjpass
<class 'spacy.tokens.token.Token'> , PUNCT punct
<class 'spacy.tokens.token.Token'> or CCONJ cc


In [6]:
# The model was unable to assign a dependency label to some tokens; let's
# check what these tokens look like.
# NOTE: compare with `==`, not `is`. `is ""` tests object identity, only
# works by accident of string interning, and triggers
# "SyntaxWarning: "is" with a literal" on Python 3.8+.
unclassified_tokens = [(token.lemma_, token.dep_)
                       for token
                       in text_tokenized
                       if token.dep_ == ""]

# Tokens like these are not useful for search; we will remove them in a later step:
unclassified_tokens[:10]

  if token.dep_ is ""]


[(' ', ''), ('\n', ''), ('\n', '')]

### Remove stop words and punctuation

In [7]:
# Keep only the tokens that are neither stop words nor punctuation.
tokens_without_sw = [
    tok
    for tok in text_tokenized
    if not (tok.is_stop or tok.is_punct)
]
tokens_without_sw[:10]

[hiv,
 aids,
 human,
 immunodeficiency,
 virus,
 considered,
 authors,
 global,
 pandemic,
 currently]

### Lemmatize the remaining tokens

In [8]:
# Reduce every remaining token to its lemma, skipping tokens whose
# dependency label is empty (the whitespace/newline tokens seen earlier).
token_lemmas = [
    tok.lemma_ for tok in tokens_without_sw if tok.dep_
]
token_lemmas[:10]

['hiv',
 'aids',
 'human',
 'immunodeficiency',
 'virus',
 'consider',
 'author',
 'global',
 'pandemic',
 'currently']

In [9]:
def tokenizer(document):
    """Turn a raw document string into a list of lemmas.

    The document is lowercased and processed with the module-level spaCy
    pipeline ``sp``. Stop words, punctuation and tokens with an empty
    dependency label are dropped; the lemma of every remaining token is
    returned, in document order.

    Args:
        document: The text to process (must support ``.lower()``).

    Returns:
        A list with the ``lemma_`` of each retained token.
    """
    parsed = sp(document.lower())

    lemmas = []
    for tok in parsed:
        # Guard clause: stop words and punctuation carry no search value.
        if tok.is_stop or tok.is_punct:
            continue
        # An empty dependency label marks tokens the parser could not classify.
        if tok.dep_:
            lemmas.append(tok.lemma_)

    return lemmas

In [10]:
# Attach the lemma list to every summary under 'tokenized_text'
# (this mutates the dicts in `summaries` in place).
for summary in summaries:
    summary["tokenized_text"] = tokenizer(summary["text"])

In [11]:
# Let's take a look at what the tokenized text of the first summary looks like:
first_summary = summaries[0]
first_summary["tokenized_text"]

['pandemic',
 'greek',
 'πᾶν',
 'pan',
 'δῆμος',
 'demos',
 'people',
 'epidemic',
 'infectious',
 'disease',
 'spread',
 'large',
 'region',
 'instance',
 'multiple',
 'continent',
 'worldwide',
 'affect',
 'substantial',
 'number',
 'people',
 'widespread',
 'endemic',
 'disease',
 'stable',
 'number',
 'infected',
 'people',
 'pandemic',
 'widespread',
 'endemic',
 'disease',
 'stable',
 'number',
 'infected',
 'people',
 'recurrence',
 'seasonal',
 'influenza',
 'generally',
 'exclude',
 'occur',
 'simultaneously',
 'large',
 'region',
 'globe',
 'spread',
 'worldwide',
 'human',
 'history',
 'number',
 'pandemic',
 'disease',
 'smallpox',
 'tuberculosis',
 'fatal',
 'pandemic',
 'record',
 'history',
 'black',
 'death',
 'know',
 'plague',
 'kill',
 'estimate',
 '75–200',
 'million',
 'people',
 '14th',
 'century',
 'term',
 'later',
 'pandemic',
 'include',
 '1918',
 'influenza',
 'pandemic',
 'spanish',
 'flu',
 'current',
 'pandemic',
 'include',
 'covid-19',
 'sars',
 'cov-2',

In [12]:
# Persist the summaries, now including their 'tokenized_text', as JSON:
with open('data/summaries.json', 'w') as summaries_file:
    json.dump(summaries, summaries_file)