In [42]:
# import dependencies
import json
from collections import defaultdict

import spacy

In [43]:
# Load the small English spaCy pipeline. `sp` is the callable that the
# tokenizer cell below uses to parse (lower-cased) text into tokens.
# NOTE: the "en_core_web_sm" model package must already be installed in the
# kernel's environment for this call to succeed.
sp = spacy.load("en_core_web_sm")

In [44]:
# Load the article summaries and the corpus vocabulary from JSON.
# The files are opened explicitly as UTF-8: without `encoding`, open() falls
# back to the platform's locale encoding, which can garble the non-ASCII
# characters present in article titles and text on some systems.
with open('data/summaries.json', 'r', encoding='utf-8') as summaries_file:
    summaries = json.load(summaries_file)

with open('data/vocab.json', 'r', encoding='utf-8') as vocab_file:
    vocab = json.load(vocab_file)

# Sanity check: (number of articles, number of vocabulary terms).
len(summaries), len(vocab)

(26, 1494)

### Build an inverted index

In [45]:
# Inverted index: vocabulary term -> list of (article title, TfIdf score)
# pairs, with one pair for every article whose score for that term is
# non-zero. A term's position in `vocab` is also the position of its score
# inside each article's 'tf_idf' vector, hence the shared `term_pos`.
inverted_index = {
    term: [
        (article['title'], article['tf_idf'][term_pos])
        for article in summaries
        if article['tf_idf'][term_pos] != 0
    ]
    for term_pos, term in enumerate(vocab)
}

In [46]:
# The index is now a lookup table from a keyword to every article containing it.
# Let's request the list of articles with the word "coronavirus" in them;
# each entry is an (article title, TfIdf score) pair.
inverted_index["coronavirus"]

[('COVID-19 pandemic', 0.05749582125920263)]

In [47]:
# Print the text of the article returned above to confirm that "coronavirus"
# really occurs in it (swap in other keywords/titles to spot-check more).
wanted_title = 'COVID-19 pandemic'
for summary in summaries:
    if summary["title"] != wanted_title:
        continue
    print(summary["text"])

The COVID-19 pandemic, also known as the coronavirus pandemic, is an ongoing pandemic of coronavirus disease 2019 (COVID-19) caused by severe acute respiratory syndrome coronavirus 2 (SARS-CoV-2). The disease was first identified in December 2019 in Wuhan, China. The outbreak was declared a Public Health Emergency of International Concern in January 2020, and a pandemic in March 2020. As of 17 October 2020, more than 39.5 million cases have been confirmed, with more than 1.1 million deaths attributed to COVID-19.

Common symptoms include fever, cough, fatigue, breathing difficulties, and loss of smell. Complications may include pneumonia and acute respiratory distress syndrome. The incubation period is typically around five days but may range from one to 14 days. There are several vaccine candidates in development, although none have proven their safety and efficacy. There is no known specific antiviral medication, so primary treatment is currently symptomatic.
Recommended preventive m

In [48]:
    
# Try a more common keyword: list every article containing the word
# "disease", together with that word's TfIdf score for each article.
inverted_index["disease"]

[('Pandemic', 0.022322198426744867),
 ('Epidemiology of HIV/AIDS', 0.0016857910270197945),
 ('Antonine Plague', 0.007469351012026167),
 ('Basic reproduction number', 0.0056020132590196255),
 ('Cholera', 0.008194224738931659),
 ('COVID-19 pandemic', 0.005711856656255304),
 ('Disease X', 0.029604135108640295),
 ('HIV/AIDS', 0.013486328216158356),
 ('HIV/AIDS in Yunnan', 0.008593058686401785),
 ('Pandemic prevention', 0.013871651879477165),
 ('Pandemic Severity Assessment Framework', 0.004537456222258886),
 ('1929–1930 psittacosis pandemic', 0.004646007806523453),
 ('Science diplomacy and pandemics', 0.01032995352727023),
 ('Superspreader', 0.0066507919970096),
 ('Targeted immunization strategies', 0.0105545177343848),
 ('Virus', 0.0015866268489598066)]

### Search inverted index

In [49]:
# Reuse the tokenizer from Milestone 1 to tokenize search queries

def tokenizer(document):
    """Turn raw text into a list of lemmas suitable for index lookups.

    The text is lower-cased and parsed with the spaCy pipeline `sp`. Stop
    words, punctuation and tokens whose dependency label is blank are
    discarded; the lemma of every remaining token is returned, in the
    order the tokens appear in the text.
    """
    parsed = sp(document.lower())

    def is_kept(tok):
        # Same three filters as always: no stop words, no punctuation,
        # and a non-blank dependency label.
        if tok.is_stop or tok.is_punct:
            return False
        return tok.dep_.strip() != ""

    return [tok.lemma_ for tok in parsed if is_kept(tok)]

In [50]:
# Create a search function to search the inverted index

def search(query, index=None):
    """Rank articles against a free-text query by summed TfIdf score.

    Parameters
    ----------
    query : str
        Free-text query. It is passed through `tokenizer`, so it gets the
        same lower-casing / stop-word removal / lemmatization as the query
        tokens are expected to need for index lookups.
    index : dict, optional
        Mapping of token -> list of ``(article title, TfIdf score)`` pairs.
        When omitted, the notebook-level `inverted_index` is used; it is
        looked up at call time, so rebuilding the index in another cell is
        picked up without re-defining this function.

    Returns
    -------
    list of (str, float)
        ``(article title, combined score)`` pairs sorted by descending
        score. An article matching several query tokens gets the sum of
        its per-token scores. The list is empty when no query token is
        present in the index.
    """
    if index is None:
        index = inverted_index

    # Accumulate a compound score per article so that an article containing
    # several of the query's keywords ranks above single-keyword matches.
    combined_scores = defaultdict(float)
    for token in tokenizer(query):
        # Tokens that are not in the index simply contribute nothing;
        # a plain `index[token]` would abort the whole search with KeyError.
        for article_title, tfidf in index.get(token, ()):
            combined_scores[article_title] += tfidf

    # Highest combined score first.
    return sorted(combined_scores.items(), key=lambda hit: hit[1], reverse=True)

In [51]:
# Multi-word query check: take the best-ranked hit and print that article's
# text to judge by eye how relevant the top result is.
ranked_hits = search(query="world health organization")
title, score = ranked_hits[0]
for summary in summaries:
    if summary["title"] != title:
        continue
    print(summary["text"])

The Johns Hopkins Center for Health Security (abbreviated CHS; previously the UPMC Center for Health Security, the Center for Biosecurity of UPMC, and the Johns Hopkins Center for Civilian Biodefense Strategies) is an independent, nonprofit organization of the Johns Hopkins Bloomberg School of Public Health, and part of the Environmental Health and Engineering department. It is concerned with the areas of health consequences from epidemics and disasters as well as averting biological weapons development, and implications of biosecurity for the bioeconomy. It is a think tank that does policy research and gives policy recommendations to the United States government as well as the World Health Organization and the UN Biological Weapons Convention.


In [52]:
# Let's try another multi-word query; the result is the full ranked list of
# (article title, combined TfIdf score) pairs.
search(query = "Ebola virus")

[('Virus', 0.06746676589985189),
 ('Plague of Cyprian', 0.0634287152349009),
 ('Crimson Contagion', 0.0339553131009123),
 ('Viral load', 0.03386619154421699),
 ('Disease X', 0.031470777995967494),
 ('Swine influenza', 0.028050041257275376),
 ('Science diplomacy and pandemics', 0.027286695292144007),
 ('HIV/AIDS in Yunnan', 0.022837201731587032),
 ('HIV/AIDS', 0.013653988336874786),
 ('Spanish flu', 0.012903018978346673),
 ('Epidemiology of HIV/AIDS', 0.005973619897382719),
 ('COVID-19 pandemic', 0.005060007442488892)]

In [53]:
# Print the text of the top-ranked article from the previous query.
wanted_title = 'Virus'
for summary in summaries:
    if summary["title"] != wanted_title:
        continue
    print(summary["text"])

A virus is a submicroscopic infectious agent that replicates only inside the living cells of an organism. Viruses infect all types of life forms, from animals and plants to microorganisms, including bacteria and archaea.
Since Dmitri Ivanovsky's 1892 article describing a non-bacterial pathogen infecting tobacco plants and the discovery of the tobacco mosaic virus by Martinus Beijerinck in 1898, more than 6,000  virus species have been described in detail of the millions of types of viruses in the environment. Viruses are found in almost every ecosystem on Earth and are the most numerous type of biological entity. The study of viruses is known as virology, a subspeciality of microbiology.
When infected, a host cell is forced to rapidly produce thousands of identical copies of the original virus. When not inside an infected cell or in the process of infecting a cell, viruses exist in the form of independent particles, or virions, consisting of: (i) the genetic material, i.e., long molecu

In [54]:
# Let's try another multi-word query.
search(query = "vaccine effect")

[('Pandemic prevention', 0.061699549981524916),
 ('Science diplomacy and pandemics', 0.036932666365101494),
 ('Targeted immunization strategies', 0.02852579106448193),
 ('Disease X', 0.016002273036172788),
 ('Spanish flu', 0.01079742124676686),
 ('COVID-19 pandemic', 0.007718743464506874),
 ('Cholera', 0.005536651430236998),
 ('HIV/AIDS', 0.005207088845103844),
 ('Virus', 0.0042881908136149305)]

In [55]:
# Show that the ranking is insensitive to word order: search() only sums
# per-token scores for each article, so swapping the query words yields the
# same list as the "vaccine effect" query above.
search(query = "effect vaccine")

[('Pandemic prevention', 0.061699549981524916),
 ('Science diplomacy and pandemics', 0.036932666365101494),
 ('Targeted immunization strategies', 0.02852579106448193),
 ('Disease X', 0.016002273036172788),
 ('Spanish flu', 0.01079742124676686),
 ('COVID-19 pandemic', 0.007718743464506874),
 ('Cholera', 0.005536651430236998),
 ('HIV/AIDS', 0.005207088845103844),
 ('Virus', 0.0042881908136149305)]