In [11]:
# Import dependencies
import json
import spacy

In [12]:
# Load the spaCy pipeline named "en_core_web_sm"; later cells call `sp(...)`
# to parse text into tokens.
sp = spacy.load("en_core_web_sm")

In [13]:
# Load the summaries from disk. The encoding is given explicitly so the
# non-ASCII text in the data decodes the same way on every platform
# (without it, open() falls back to the locale's default encoding).
with open('data/data.json', 'r', encoding='utf-8') as infile:
    summaries = json.load(infile)
# Print the keys of the first record as a quick look at the schema.
print(summaries[0].keys())

dict_keys(['title', 'text', 'url'])


### Lowercase and explore the data

In [14]:
# Lowercase a sample sentence, parse it with spaCy, and print the type,
# text, part-of-speech tag and dependency label of up to the first ten tokens.
text = "since I upgrade my laptop is started behaving abnormally"
text_tokenized = sp(text.lower())
for tok in text_tokenized[:10]:
    attributes = (type(tok), tok.text, tok.pos_, tok.dep_)
    print(*attributes)

<class 'spacy.tokens.token.Token'> since SCONJ mark
<class 'spacy.tokens.token.Token'> i PRON nsubj
<class 'spacy.tokens.token.Token'> upgrade VERB advcl
<class 'spacy.tokens.token.Token'> my DET poss
<class 'spacy.tokens.token.Token'> laptop NOUN dobj
<class 'spacy.tokens.token.Token'> is AUX auxpass
<class 'spacy.tokens.token.Token'> started VERB ROOT
<class 'spacy.tokens.token.Token'> behaving VERB xcomp
<class 'spacy.tokens.token.Token'> abnormally ADV advmod


In [15]:
# Same exploration on a second sample sentence: lowercase it, parse it with
# spaCy, and print type, text, part-of-speech tag and dependency label for
# up to the first ten tokens.
text = "I want to add a new column to the forecast report"
text_tokenized = sp(text.lower())
for tok in text_tokenized[:10]:
    attributes = (type(tok), tok.text, tok.pos_, tok.dep_)
    print(*attributes)

<class 'spacy.tokens.token.Token'> i PRON nsubj
<class 'spacy.tokens.token.Token'> want VERB ROOT
<class 'spacy.tokens.token.Token'> to PART aux
<class 'spacy.tokens.token.Token'> add VERB xcomp
<class 'spacy.tokens.token.Token'> a DET det
<class 'spacy.tokens.token.Token'> new ADJ amod
<class 'spacy.tokens.token.Token'> column NOUN dobj
<class 'spacy.tokens.token.Token'> to ADP prep
<class 'spacy.tokens.token.Token'> the DET det
<class 'spacy.tokens.token.Token'> forecast NOUN compound


In [16]:
# Take the text of the second summary as the working example for the
# following cells, and display it.
text = summaries[1]["text"]
text

'HIV/AIDS, or Human Immunodeficiency Virus, is considered by some authors a global pandemic. However, the WHO currently uses the term \'global epidemic\' to describe HIV. As of 2018, approximately 37.9 million people are infected with HIV globally.There were about 770,000 deaths from AIDS in 2018.The 2015 Global Burden of Disease Study, in a report published in The Lancet, estimated that the global incidence of HIV infection peaked in 1997 at 3.3 million per year. Global incidence fell rapidly from 1997 to 2005, to about 2.6 million per year, but remained stable from 2005 to 2015.Sub-Saharan Africa is the region most affected. In 2018, an estimated 61% of new HIV infections occurred in this region. Prevalence ratios are "In western and central Europe and North America, low and declining incidence of HIV and mortality among people infected with HIV over the last 17 years has seen the incidence:prevalence ratio fall from 0.06 in 2000 to 0.03 in 2017. Strong and steady reductions in new H

In [17]:
# Lowercase the current `text`, parse it with spaCy, and print the type,
# text, part-of-speech tag and dependency label of up to the first ten tokens.
# `text_tokenized` is kept because the following cells work on it.
text_tokenized = sp(text.lower())
for tok in text_tokenized[:10]:
    attributes = (type(tok), tok.text, tok.pos_, tok.dep_)
    print(*attributes)

<class 'spacy.tokens.token.Token'> hiv PROPN nmod
<class 'spacy.tokens.token.Token'> / SYM punct
<class 'spacy.tokens.token.Token'> aids PROPN nsubjpass
<class 'spacy.tokens.token.Token'> , PUNCT punct
<class 'spacy.tokens.token.Token'> or CCONJ cc
<class 'spacy.tokens.token.Token'> human ADJ amod
<class 'spacy.tokens.token.Token'> immunodeficiency NOUN compound
<class 'spacy.tokens.token.Token'> virus NOUN conj
<class 'spacy.tokens.token.Token'> , PUNCT punct
<class 'spacy.tokens.token.Token'> is AUX auxpass


In [18]:
# The model was unable to assign a dependency label to some tokens; let's
# check what these tokens look like. Compare with `==`, not `is`: identity
# comparison against a string literal depends on interning and triggers a
# SyntaxWarning on Python 3.8+.
unclassified_tokens = [
    (token.lemma_, token.dep_)
    for token in text_tokenized
    if token.dep_ == ""
]

# Tokens like these are not useful for search; we will remove them in a later step:
unclassified_tokens[:10]

  if token.dep_ is ""]


[(' ', ''), ('\n', ''), ('\n', '')]

### Remove stop words and punctuation

In [19]:
# Keep only the tokens that spaCy flags neither as a stop word nor as
# punctuation, then preview the first ten.
tokens_without_sw = []
for word in text_tokenized:
    if word.is_stop or word.is_punct:
        continue
    tokens_without_sw.append(word)
tokens_without_sw[:10]

[hiv,
 aids,
 human,
 immunodeficiency,
 virus,
 considered,
 authors,
 global,
 pandemic,
 currently]

### Lemmatize (tokenize) the texts

In [20]:
# Replace every remaining token by its lemma, skipping tokens whose
# dependency label is empty, then preview the first ten lemmas.
token_lemmas = []
for tok in tokens_without_sw:
    if tok.dep_:
        token_lemmas.append(tok.lemma_)
token_lemmas[:10]

['hiv',
 'aids',
 'human',
 'immunodeficiency',
 'virus',
 'consider',
 'author',
 'global',
 'pandemic',
 'currently']

In [21]:
def tokenizer(document):
    """Convert raw text into a list of lemmas.

    The text is lowercased and parsed with the module-level spaCy pipeline
    ``sp``. A token contributes its lemma to the result only if it is not a
    stop word, is not punctuation, and has a non-empty dependency label.

    Args:
        document: The text to process (must support ``.lower()``).

    Returns:
        A list of lemma strings, in the order the tokens appear.
    """
    parsed = sp(document.lower())
    lemmas = []
    for token in parsed:
        # Guard clause: stop words and punctuation are dropped first.
        if token.is_stop or token.is_punct:
            continue
        # Tokens without a dependency label are skipped as well.
        if token.dep_:
            lemmas.append(token.lemma_)
    return lemmas

In [27]:
# Sanity-check the tokenizer on a short sentence: print each token's
# attributes (this time including the lemma), then show what `tokenizer`
# returns for the same sentence.
text = "I want to add a new column to the forecast report"
text_tokenized = sp(text.lower())
for tok in text_tokenized[:10]:
    attributes = (type(tok), tok.text, tok.pos_, tok.dep_, tok.lemma_)
    print(*attributes)

tokenizer(text)

<class 'spacy.tokens.token.Token'> i PRON nsubj i
<class 'spacy.tokens.token.Token'> want VERB ROOT want
<class 'spacy.tokens.token.Token'> to PART aux to
<class 'spacy.tokens.token.Token'> add VERB xcomp add
<class 'spacy.tokens.token.Token'> a DET det a
<class 'spacy.tokens.token.Token'> new ADJ amod new
<class 'spacy.tokens.token.Token'> column NOUN dobj column
<class 'spacy.tokens.token.Token'> to ADP prep to
<class 'spacy.tokens.token.Token'> the DET det the
<class 'spacy.tokens.token.Token'> forecast NOUN compound forecast


['want', 'add', 'new', 'column', 'forecast', 'report']

In [22]:
# Store the tokenized form of each summary's text on the summary itself.
for summary in summaries:
    summary["tokenized_text"] = tokenizer(summary["text"])

In [23]:
# Let's take a look at what our tokenized summaries look like
# (the token list of the first summary):
summaries[0]['tokenized_text']

['pandemic',
 'greek',
 'πᾶν',
 'pan',
 'δῆμος',
 'demos',
 'people',
 'epidemic',
 'infectious',
 'disease',
 'spread',
 'large',
 'region',
 'instance',
 'multiple',
 'continent',
 'worldwide',
 'affect',
 'substantial',
 'number',
 'people',
 'widespread',
 'endemic',
 'disease',
 'stable',
 'number',
 'infected',
 'people',
 'pandemic',
 'widespread',
 'endemic',
 'disease',
 'stable',
 'number',
 'infected',
 'people',
 'recurrence',
 'seasonal',
 'influenza',
 'generally',
 'exclude',
 'occur',
 'simultaneously',
 'large',
 'region',
 'globe',
 'spread',
 'worldwide',
 'human',
 'history',
 'number',
 'pandemic',
 'disease',
 'smallpox',
 'tuberculosis',
 'fatal',
 'pandemic',
 'record',
 'history',
 'black',
 'death',
 'know',
 'plague',
 'kill',
 'estimate',
 '75–200',
 'million',
 'people',
 '14th',
 'century',
 'term',
 'later',
 'pandemic',
 'include',
 '1918',
 'influenza',
 'pandemic',
 'spanish',
 'flu',
 'current',
 'pandemic',
 'include',
 'covid-19',
 'sars',
 'cov-2',

In [24]:
# Write the summaries, including their tokenized text, to a JSON file.
with open('data/summaries.json', 'w') as summaries_file:
    json.dump(summaries, summaries_file)