# Milestone 3: Implement an Inverted Index and Search

Import spacy and load the small English model.

In [26]:
import spacy
import json
import os
# Load the spaCy English pipeline once at start-up; the `nlp` object is reused
# by the tokenizer defined further down, so it is not reloaded on every call.
nlp = spacy.load("en_core_web_sm")

Load the corpus and the TF-IDF data produced in milestone 2.

In [27]:
# Read the milestone-2 outputs. The encoding is given explicitly because JSON
# is UTF-8 by specification, whereas open() would otherwise fall back to the
# platform's locale encoding and could mis-decode non-ASCII titles.
with open("./outputs/milestone-2/corpus.json", encoding="utf-8") as f:
    # Kept as a set rather than a list: duplicates are dropped and membership
    # tests are constant-time. Assumes the file holds a flat JSON array of
    # hashable items (presumably term strings) - TODO confirm against milestone 2.
    corpus = set(json.load(f))
with open("./outputs/milestone-2/tf-idf-data.json", encoding="utf-8") as f:
    # Used later as an iterable of records exposing "title" and "tf_idf" keys.
    data = json.load(f)

Build the inverted index.

In [28]:
# Build the inverted index: each corpus term maps to the list of titles of the
# documents in which that term has a strictly positive TF-IDF score.
#
# The loop walks every document's own term -> score mapping instead of probing
# every corpus term against every document. That way a term missing from a
# document's mapping is simply skipped (no KeyError), and both the posting
# lists and the index keys come out in a reproducible order that follows `data`.
#
# NOTE(review): the value read here is the score stored under "tf_idf", not a
# raw occurrence count. Depending on the IDF formula used in milestone 2, a
# term present in every document may score 0 and would then be left out of
# the index - confirm this is intended.
inverted_index = {}
for entry in data:
    title = entry["title"]
    for word, score in entry["tf_idf"].items():
        # Only terms that belong to the corpus vocabulary are indexed.
        if score > 0 and word in corpus:
            inverted_index.setdefault(word, []).append(title)

Implement a tokenizer function (we reuse the one developed for milestone 1).

In [29]:
def tokenize(text):
    """Turn raw text into a list of lemmas.

    The text is lowercased and run through the module-level spaCy pipeline
    ``nlp``. Stop words, punctuation, whitespace tokens and tokens whose
    part-of-speech tag is 'X' are discarded; the lemma of every remaining
    token is kept, in document order.

    Args:
        text: The string to tokenize.

    Returns:
        A list of lemma strings (possibly empty).
    """
    lemmas = []
    for tok in nlp(text.lower()):
        # Drop tokens that carry no searchable content.
        if tok.is_stop or tok.is_punct or tok.is_space:
            continue
        # 'X' is the catch-all tag for tokens the tagger could not classify.
        if tok.pos_ == 'X':
            continue
        lemmas.append(tok.lemma_)
    return lemmas

Implement the search function.

In [30]:
def search(query, *, index=None, tokenizer=None):
    """Return the titles of the documents matching ``query``, best match first.

    The query is tokenized and every token is looked up in the inverted
    index. A document scores one point for each time it appears in the
    posting list of a query token, so a token repeated in the query counts
    once per repetition. Titles are returned by descending score. Documents
    with the same score keep the order in which they were first encountered,
    which makes the ranking reproducible from run to run (ordering ties by
    iterating a set of strings is not, because string hashing is randomized
    per interpreter process).

    Args:
        query: Free-text search string.
        index: Optional mapping of token -> list of document titles.
            Defaults to the notebook-level ``inverted_index``.
        tokenizer: Optional callable turning a string into a list of tokens.
            Defaults to the notebook-level ``tokenize``.

    Returns:
        A list of document titles, without duplicates. It is empty when no
        query token is present in the index.
    """
    # Resolve the defaults at call time so that re-running the cells which
    # rebuild the index or redefine the tokenizer is picked up automatically.
    if index is None:
        index = inverted_index
    if tokenizer is None:
        tokenizer = tokenize

    # Single pass over the posting lists; dicts preserve insertion order, so
    # the keys record the order in which each title was first seen.
    match_counts = {}
    for token in tokenizer(query):
        for title in index.get(token, ()):
            match_counts[title] = match_counts.get(title, 0) + 1

    # sorted() is stable, including with reverse=True, so ties stay in
    # first-seen order.
    return sorted(match_counts, key=match_counts.get, reverse=True)

Test the search function with "symptoms of swine flu".

In [31]:
# Bare expression: the notebook displays the ranked list of matching titles.
search("symptoms of swine flu")

['Swine influenza', 'Spanish flu', 'Cholera', 'Pandemic']

Test the search function with "pandemic prevention organizations".

In [32]:
# Bare expression: the notebook displays the ranked list of matching titles.
search("pandemic prevention organizations")

['Pandemic prevention',
 'HIV/AIDS',
 'Science diplomacy and pandemics',
 'Crimson Contagion',
 'Event 201',
 'Pandemic',
 '1929–1930 psittacosis pandemic',
 'Spanish flu',
 'Cholera',
 'Pandemic Severity Assessment Framework',
 'HIV/AIDS in Yunnan',
 'PREDICT (USAID)',
 'COVID-19 pandemic',
 'Plague of Cyprian',
 'Epidemiology of HIV/AIDS',
 'Swine influenza',
 'Antonine Plague',
 'Pandemic severity index']