# Inverted Index and Search

## Setup

In [1]:
import json
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from collections import OrderedDict, Counter
from typing import List
import spacy
nlp = spacy.load("en_core_web_sm")

## Load vocabulary and corpus

In [2]:
with open("vectorized_data.json", "r") as read_file:
    CDC_data = json.load(read_file)

In [3]:
with open("CDC_vocabulary.json", "r") as read_file:
    CDC_vocabulary = json.load(read_file)

## Build inverted index

In [4]:
def build_inverted_index(vocabulary: list, corpus: List[dict]) -> 'OrderedDict[str, set]':
    """Build an inverted index for a given corpus with given vocabulary."""
    index_vector = OrderedDict((token, set()) for token in vocabulary)
    for num, article in enumerate(corpus):
        for key, value in article.get('tf_idf').items():
            if value != 0:
                index_vector[key].add(num)
    return index_vector  

In [5]:
CDC_index_vector = build_inverted_index(vocabulary=CDC_vocabulary, corpus=CDC_data)

In [6]:
#Example: index entries contained in more than 13 articles
{key: value for key, value in CDC_index_vector.items() if len(value)>=13}

{'disease': {0, 1, 2, 3, 5, 6, 8, 10, 11, 12, 13, 17, 18, 20, 22, 25},
 'include': {0, 3, 5, 6, 8, 10, 12, 13, 15, 17, 20, 21, 25},
 'pandemic': {0, 1, 2, 5, 6, 7, 10, 12, 13, 14, 15, 16, 17, 18, 19, 21, 23}}

## Example queries

In [7]:
with open("example_queries.json", "r") as read_file:
    example_queries = json.load(read_file)

In [8]:
example_queries

['black death',
 'zoonotic diseases',
 'swine flu',
 'cholera transmission',
 'classification of viruses',
 'economic impact of pandemics',
 'pandemic prevention organizations',
 'spread of infectious diseases',
 'prevention of viral infections',
 'common symptoms of coronavirus']

## Tokenize function from milestone 1

In [9]:
def tokenize_string(text: str) -> List[str]:
    """Function that cleans up tokenizes a string, i.e. transform to lowercase and remove punctuation, spaces, symbols
    and lemmas unclassified by spaCy. Requires a spaCy model loaded as nlp.
    
    Args:
        text: Input text to tekenize.
    
    Returns:
        clean_tokens: List of clean and informative tokens.   
    """  
    doc = nlp(text.lower())
    clean_tokens = [token.lemma_ for token in doc if (token.pos_ not in ['PUNCT', 'SPACE', 'SYM', 'X']) and (not token.is_stop)]
    return clean_tokens 

In [10]:
tokenize_string(example_queries[0])

['black', 'death']

In [11]:
tokenize_string(example_queries[9])

['common', 'symptom', 'coronavirus']

## Compute TF_IDF for a token list (inspired from milestone 2)

In [12]:
def compute_TF(*, vocabulary: list, tokenized_text: List[str]) -> 'OrderedDict[str, float]':
    """Compute TF given a corpus vocabulary and a tokenized document, and return the document’s TF-vector.
    Note: token that do not exist in the corpus vocabulary should not be taken into account."""
    counter = Counter([elem for elem in tokenized_text if elem in vocabulary])
    normalizer = sum(counter.values())
    TF_vector = OrderedDict((token, 0) for token in vocabulary)
    for key, value in counter.items():
        TF_vector[key] = value/normalizer
    return TF_vector

In [13]:
def compute_IDF(*, corpus: List[dict], vocabulary: list) -> 'OrderedDict[str, float]':
    """Compute DF for each word in the vocabulary given a corpus of tokenized texts."""
    IDF_vector = OrderedDict((token, 0) for token in vocabulary)
    corpus_size = len(corpus)
    for key in IDF_vector.keys():
        for _, article in enumerate(corpus):
            IDF_vector[key] += (key in article['tokenized_text'])
        IDF_vector[key] = np.log(corpus_size/IDF_vector[key])      
    return IDF_vector

In [14]:
def compute_TF_IDF_for_query(*, token_list: List[str], corpus: List[dict], vocabulary: list) -> 'OrderedDict[str, float]':
    """Given a token_list, a corpus and its vocabulary, compute TF_IDF vector for that token_list relative to the corpus vocabulary."""
    IDF_vector = compute_IDF(vocabulary=vocabulary, corpus=corpus)
    query_tf_idf = compute_TF(vocabulary=vocabulary, tokenized_text=token_list)
    query_tf_idf = OrderedDict({key: value*IDF_vector[key] for key, value in query_tf_idf.items()})
    return query_tf_idf

In [15]:
test_query_tfidf = compute_TF_IDF_for_query(token_list=example_queries[0], vocabulary=CDC_vocabulary, corpus=CDC_data)
set(test_query_tfidf.values())

{0.0, 0.6412373393653842, 0.8145241345053705}

In [16]:
len(test_query_tfidf)

1484

## Search function

In [17]:
def reshape_tfidf(tfidf_dict: 'OrderedDict[str, float]') -> np.ndarray:
    return np.array(list(tfidf_dict.values())).reshape(1, -1)

In [18]:
def search_function(query_text: str, index_vector: 'OrderedDict[str, set]', corpus: List[dict], vocabulary: list) -> set:
    """Given a query text, search inverted index and return sorted search results."""
    query_token_list = tokenize_string(query_text)
    query_tf_idf = compute_TF_IDF_for_query(token_list=query_token_list, corpus=corpus, vocabulary=vocabulary)
    matching_articles = set(range(0, len(corpus)))
    for _, token in enumerate(query_token_list):
        print(f'token: {token}')
        print(f'articles: {index_vector.get(token, set())}')
        matching_articles = matching_articles.union(index_vector.get(token, set()))
    matching_dictionary = {}
    for article_id in matching_articles:
        matching_dictionary[article_id] = (corpus[article_id].get('title'), cosine_similarity(reshape_tfidf(query_tf_idf), reshape_tfidf(corpus[article_id].get('tf_idf'))).item())
    matching_dictionary  = dict(sorted(matching_dictionary .items(), key=lambda x: x[1][1], reverse=True))
    return matching_dictionary 

### Example queries

In [19]:
search_function(query_text="symptoms of swine flu", index_vector=CDC_index_vector, corpus=CDC_data, vocabulary=CDC_vocabulary)

token: symptom
articles: {10, 21, 5, 6}
token: swine
articles: {19, 21}
token: flu
articles: {0, 19, 21, 23}


{21: ('Swine influenza', 0.6034175738441063),
 19: ('Spanish flu', 0.176017910117174),
 10: ('HIV/AIDS', 0.062357191121067516),
 5: ('Cholera', 0.05424748555959457),
 0: ('Pandemic', 0.04173810259134081),
 6: ('COVID-19 pandemic', 0.02627223421480461),
 23: ('Unified Victim Identification System', 0.01987191057765038),
 1: ('Epidemiology of HIV/AIDS', 0.0),
 2: ('Antonine Plague', 0.0),
 3: ('Basic reproduction number', 0.0),
 4: ('Bills of mortality', 0.0),
 7: ('Crimson Contagion', 0.0),
 8: ('Disease X', 0.0),
 9: ('Event 201', 0.0),
 11: ('HIV/AIDS in Yunnan', 0.0),
 12: ('Pandemic prevention', 0.0),
 13: ('Pandemic Severity Assessment Framework', 0.0),
 14: ('Pandemic severity index', 0.0),
 15: ('Plague of Cyprian', 0.0),
 16: ('PREDICT (USAID)', 0.0),
 17: ('1929–1930 psittacosis pandemic', 0.0),
 18: ('Science diplomacy and pandemics', 0.0),
 20: ('Superspreader', 0.0),
 22: ('Targeted immunization strategies', 0.0),
 24: ('Viral load', 0.0),
 25: ('Virus', 0.0)}

In [20]:
CDC_data[21].get('title')

'Swine influenza'

In [21]:
search_function(query_text='pandemic prevention organizations', index_vector=CDC_index_vector, corpus=CDC_data, vocabulary=CDC_vocabulary)

token: pandemic
articles: {0, 1, 2, 5, 6, 7, 10, 12, 13, 14, 15, 16, 17, 18, 19, 21, 23}
token: prevention
articles: {10, 11, 12, 13}
token: organization
articles: {7, 8, 9, 12, 18, 21}


{12: ('Pandemic prevention', 0.3041589800078785),
 10: ('HIV/AIDS', 0.09679149398950185),
 9: ('Event 201', 0.0645795139378938),
 13: ('Pandemic Severity Assessment Framework', 0.05938588132148788),
 11: ('HIV/AIDS in Yunnan', 0.04635920332145792),
 7: ('Crimson Contagion', 0.04617674320647011),
 8: ('Disease X', 0.035136725807399724),
 18: ('Science diplomacy and pandemics', 0.026848880940320015),
 0: ('Pandemic', 0.022972322447226474),
 21: ('Swine influenza', 0.022347937551331895),
 19: ('Spanish flu', 0.015811437475675677),
 14: ('Pandemic severity index', 0.01265838869003447),
 6: ('COVID-19 pandemic', 0.008262874561963156),
 15: ('Plague of Cyprian', 0.006859504249888185),
 16: ('PREDICT (USAID)', 0.006792123716773705),
 2: ('Antonine Plague', 0.003730765589772143),
 17: ('1929–1930 psittacosis pandemic', 0.0036260536629245223),
 1: ('Epidemiology of HIV/AIDS', 0.001989268044455747),
 23: ('Unified Victim Identification System', 0.0015624775482279542),
 5: ('Cholera', 0.001421780

In [22]:
CDC_data[12].get('text')

'Pandemic prevention is the organization and management of preventive measures against pandemics. Those include measures to reduce causes of new infectious diseases and measures to prevent outbreaks and epidemics from becoming pandemics.\nIt is not to be mistaken for pandemic preparedness or pandemic mitigation which largely seek to mitigate the magnitude of negative effects of pandemics and may overlap with pandemic prevention in some respects.'

In [23]:
for num in range(0, 9):
    print(f'Example query {num}: {example_queries[num]}')
    print(f"All matching articles: {search_function(query_text=example_queries[num], index_vector=CDC_index_vector, corpus=CDC_data, vocabulary=CDC_vocabulary)}\n")

Example query 0: black death
token: black
articles: {0}
token: death
articles: {0, 1, 2, 4, 5, 6, 10, 13, 17, 19, 21}
All matching articles: {0: ('Pandemic', 0.14796495667829496), 5: ('Cholera', 0.016709067271053694), 1: ('Epidemiology of HIV/AIDS', 0.011689151798555933), 2: ('Antonine Plague', 0.010961188821491111), 19: ('Spanish flu', 0.010323299752928604), 10: ('HIV/AIDS', 0.009603491208189157), 17: ('1929–1930 psittacosis pandemic', 0.007102359720328011), 13: ('Pandemic Severity Assessment Framework', 0.006365984145465946), 6: ('COVID-19 pandemic', 0.006069191838953452), 4: ('Bills of mortality', 0.005639229448548818), 21: ('Swine influenza', 0.004403582274724383), 3: ('Basic reproduction number', 0.0), 7: ('Crimson Contagion', 0.0), 8: ('Disease X', 0.0), 9: ('Event 201', 0.0), 11: ('HIV/AIDS in Yunnan', 0.0), 12: ('Pandemic prevention', 0.0), 14: ('Pandemic severity index', 0.0), 15: ('Plague of Cyprian', 0.0), 16: ('PREDICT (USAID)', 0.0), 18: ('Science diplomacy and pandemics',