In [3]:
# import dependencies
import json
import itertools
from collections import Counter

import spacy
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

In [4]:
# Load the spaCy language model "en_core_web_sm";
# `sp` is used further down by `tokenizer` to tokenize search queries
sp = spacy.load("en_core_web_sm")

In [5]:
# Load the article summaries.
# NOTE(review): the cells below read the keys "title", "text" and
# "tokenized_text" from each entry — presumably produced by an earlier
# milestone; confirm against the file.
with open('data/summaries.json', 'r', encoding='utf-8') as infile:
    summaries = json.load(infile)

### Build a corpus vocabulary

In [23]:
# collect the token lists of all documents
tokenized_texts = [i["tokenized_text"] for i in summaries]

# flatten the list of lists and remove duplicates.
# The result is sorted because the iteration order of a set of strings can
# differ between interpreter sessions; sorting keeps the vocabulary order
# (and therefore the layout of every Tf-Idf vector) identical on every run.
vocab = sorted(set(itertools.chain.from_iterable(tokenized_texts)))
print(len(vocab))

1494


In [24]:
# Save the vocabulary as a JSON list; the order of this list is the order of
# the components of the Tf-Idf vectors computed below
with open('data/vocab.json', 'w') as outfile:
    json.dump(vocab, outfile)

In [25]:
# one Counter per document: token -> number of occurrences in that document
docs_token_counter = [Counter(doc["tokenized_text"]) for doc in summaries]

### Count the number of documents containing each token

In [26]:
# Document frequency: map every vocabulary token to the number of documents
# whose token counter contains it
number_docs_with_token = {
    token: sum(token in doc_counts for doc_counts in docs_token_counter)
    for token in vocab
}

In [27]:
# spot-check: number of documents that contain the token 'ebola'
number_docs_with_token['ebola']

2

In [28]:
# spot-check: number of documents that contain the token 'disease'
number_docs_with_token['disease']

16

### Compute the Tf-Idf vectors of the documents

In [29]:
# The inverse document frequency (idf) depends only on the token, not on the
# document, so compute its log once per vocabulary token instead of once per
# (document, token) pair.
num_docs = len(summaries)
idf_by_token = [np.log(num_docs / number_docs_with_token[token]) for token in vocab]

for i, doc in enumerate(docs_token_counter):
    # total number of tokens in the document (repetitions included)
    doc_num_tokens = len(summaries[i]["tokenized_text"])

    tfidf_vec = []
    for token, idf in zip(vocab, idf_by_token):
        # term frequency (tf): share of the document's tokens equal to `token`;
        # a document without tokens gets 0.0 instead of a ZeroDivisionError
        tf = doc[token] / doc_num_tokens if doc_num_tokens else 0.0
        tfidf_vec.append(tf * idf)

    # add the Tf-Idf vector (ordered like `vocab`) to the document's dictionary
    summaries[i]['tf_idf'] = tfidf_vec

In [30]:
# Save the summaries, which now carry a 'tf_idf' vector per document.
# NOTE: this overwrites the same file the summaries were loaded from.
with open('data/summaries.json', 'w') as json_file:
    json.dump(summaries, json_file)

### Vectorize query

In [31]:
# example search query; it is passed to search_tfidf further down
query = "highest pandemic casualties"

In [32]:
# Reuse the tokenizer from Milestone 1 to tokenize search queries
# (this is the reason spaCy is imported in this milestone)

def tokenizer(document):
    """Turn a text into a list of lemmas.

    The text is lowercased and processed with the spaCy pipeline `sp`.
    Tokens flagged as stop words or punctuation are dropped, as are tokens
    whose dependency label is empty after stripping whitespace.

    Parameters
    ----------
    document : str
        Text to tokenize.

    Returns
    -------
    list
        The `lemma_` of every kept token, in document order.
    """
    lemmas = []
    for tok in sp(document.lower()):
        if tok.is_stop or tok.is_punct:
            continue
        # NOTE(review): presumably filters out tokens the parser could not
        # label (e.g. whitespace) — confirm against Milestone 1
        if len(tok.dep_.strip()) == 0:
            continue
        lemmas.append(tok.lemma_)
    return lemmas

In [33]:
# Reuse the workflow for article Tf-Idf calculation
# to build a vectorizer function for search queries

def vectorize(query, vocab = vocab):
    """Compute the Tf-Idf vector of a search query.

    The query is tokenized with `tokenizer`; for every vocabulary token the
    term frequency within the query is multiplied by the log inverse document
    frequency taken from the corpus statistics (`summaries` and
    `number_docs_with_token`).

    Parameters
    ----------
    query : str
        Search query text.
    vocab : list
        Vocabulary tokens; defines the order of the returned components.
        Defaults to the corpus vocabulary.

    Returns
    -------
    list
        One weight per vocabulary token. If no token of the query survives
        tokenization (e.g. the query consists only of stop words), an
        all-zero vector is returned instead of raising ZeroDivisionError.

    Raises
    ------
    KeyError
        If a vocabulary token is missing from `number_docs_with_token`
        (only reached when the tokenized query is non-empty).
    """
    query_tokenized = tokenizer(query)

    # nothing to weigh: avoid dividing by a query length of zero
    if not query_tokenized:
        return [0.0] * len(vocab)

    query_token_counter = Counter(query_tokenized)
    num_query_tokens = len(query_tokenized)
    num_docs = len(summaries)

    query_vec = []
    for token in vocab:
        tf = query_token_counter[token] / num_query_tokens
        idf = np.log(num_docs / number_docs_with_token[token])
        query_vec.append(tf * idf)

    return query_vec

In [34]:
# sanity check on a sample query: vocabulary tokens that do not occur in the
# tokenized query get a weight of 0.0
vectorize('does ebola affect the heart')

[0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.8549831191538455,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 

### Search documents with Sklearn

In [35]:
# Build a search function
def search_tfidf(query, docs):
    """Rank documents against a query by cosine similarity of Tf-Idf vectors.

    Parameters
    ----------
    query : str
        Search query; vectorized with `vectorize`.
    docs : iterable of dict
        Documents providing a 'tf_idf' vector and a 'title'.

    Returns
    -------
    list of dict
        One ``{'title': ..., 'rank': ...}`` entry per document whose
        similarity to the query is greater than zero, ordered from the
        highest to the lowest similarity.
    """
    # the query is the same for every document, so shape it into a
    # single-row matrix once
    query_row = np.array(vectorize(query)).reshape(1, -1)

    rankings = []
    for doc in docs:
        doc_row = np.array(doc['tf_idf']).reshape(1, -1)
        similarity = cosine_similarity(query_row, doc_row)[0][0]
        # keep only documents with a positive similarity to the query
        if similarity > 0:
            rankings.append({'title': doc['title'], 'rank': similarity})

    # best matches first
    rankings.sort(key=lambda entry: entry['rank'], reverse=True)
    return rankings

# run the example query; the sorted matches are stored in `ranking`
# (nothing is displayed by this cell)
ranking = search_tfidf(query, summaries)

In [36]:
# rank the summaries for the single-term query "ebola"
search_tfidf("ebola", summaries)

[{'title': 'Plague of Cyprian', 'rank': 0.11754261855142299},
 {'title': 'Science diplomacy and pandemics', 'rank': 0.07113700480918281}]

In [37]:
# Let's check whether the article 'Plague of Cyprian' has the word "ebola" in it
for s in summaries:
    if s["title"] != 'Plague of Cyprian':
        continue
    print(s["text"])

The Plague of Cyprian was a pandemic that afflicted the Roman Empire about from AD 249 to 262. The plague is thought to have caused widespread manpower shortages for food production and the Roman army, severely weakening the empire during the Crisis of the Third Century. Its modern name commemorates St. Cyprian, bishop of Carthage, an early Christian writer who witnessed and described the plague. The agent of the plague is highly speculative because of sparse sourcing, but suspects have included smallpox, pandemic influenza and viral hemorrhagic fever (filoviruses) like the Ebola virus.


In [38]:
# Let's check whether the article 'Science diplomacy and pandemics' has the word "ebola" in it
for s in summaries:
    if s["title"] != 'Science diplomacy and pandemics':
        continue
    print(s["text"])

Science diplomacy is the collaborative efforts by local and global entities to solve global issues using science and technology as a base. In science diplomacy, collaboration takes place to advance science but science can also be used to facilitate diplomatic relations. This allows even conflicting nations to come together through science to find solutions to global issues. Global organizations, researchers, public health officials, countries, government officials, and clinicians have previously worked together to create effective measures of infection control and subsequent treatment. They continue to do so through sharing of resources, research data, ideas, and by putting into effect laws and regulations that can further advance scientific research. Without the collaborative efforts of such entities, the world would not have the vaccines and treatments we now possess for diseases that were once considered deadly such as tuberculosis, tetanus, polio, influenza, etc. Historically, scie

In [43]:
# rank the summaries for the two-term query "ebola symptoms"
search_tfidf("ebola symptoms", summaries)

[{'title': 'Plague of Cyprian', 'rank': 0.09494841123126888},
 {'title': 'HIV/AIDS', 'rank': 0.07226910347800031},
 {'title': 'Cholera', 'rank': 0.06283658382482012},
 {'title': 'Science diplomacy and pandemics', 'rank': 0.05746294977619649},
 {'title': 'COVID-19 pandemic', 'rank': 0.03011852409071728},
 {'title': 'Swine influenza', 'rank': 0.02212609108939102}]

In [41]:
# Let's check whether the article 'Cholera' has the words "ebola" or "symptoms"
for s in summaries:
    if s["title"] != 'Cholera':
        continue
    print(s["text"])

Cholera is an infection of the small intestine by some strains of the bacterium Vibrio cholerae. Symptoms may range from none, to mild, to severe. The classic symptom is large amounts of watery diarrhea that lasts a few days. Vomiting and muscle cramps may also occur. Diarrhea can be so severe that it leads within hours to severe dehydration and electrolyte imbalance. This may result in sunken eyes, cold skin, decreased skin elasticity, and wrinkling of the hands and feet. Dehydration can cause the skin to turn bluish. Symptoms start two hours to five days after exposure.Cholera is caused by a number of types of Vibrio cholerae, with some types producing more severe disease than others. It is spread mostly by unsafe water and unsafe food that has been contaminated with human feces containing the bacteria. Undercooked seafood is a common source. Humans are the only animal affected. Risk factors for the disease include poor sanitation, not enough clean drinking water, and poverty. There 