In [1]:
import pandas as pd
import spacy

In [3]:
processed_text = pd.read_json("processed_text.json")
processed_text_tf_idf = pd.read_json("processed_text_tf_idf.json")

In [4]:
processed_text_tf_idf.head()

Unnamed: 0,title,text,url,processed_text,tfidf_vector
0,Pandemic,"A pandemic (from Greek πᾶν, pan, ""all"" and δῆμ...",https://en.wikipedia.org/wiki/Pandemic,"[pandemic, greek, pan, demo, people, epidemic,...","{'pandemic': 0.1814556331, 'greek': 26.0, 'pan..."
1,Epidemiology of HIV/AIDS,"HIV/AIDS, or Human Immunodeficiency Virus, is ...",https://en.wikipedia.org/wiki/Epidemiology_of_...,"[hiv, aid, human, immunodeficiency, virus, con...","{'pandemic': 0.0518444666, 'greek': 0.0, 'pan'..."
2,Antonine Plague,"The Antonine Plague of 165 to 180 AD, also kno...",https://en.wikipedia.org/wiki/Antonine_Plague,"[plague, ad, know, plague, galen, galen, physi...","{'pandemic': 0.0518444666, 'greek': 0.0, 'pan'..."
3,Basic reproduction number,"In epidemiology, the basic reproduction number...",https://en.wikipedia.org/wiki/Basic_reproducti...,"[epidemiology, basic, reproduction, number, ba...","{'pandemic': 0.0, 'greek': 0.0, 'pan': 0.0, 'd..."
4,Bills of mortality,Bills of mortality were the weekly mortality s...,https://en.wikipedia.org/wiki/Bills_of_mortality,"[bill, mortality, weekly, mortality, statistic...","{'pandemic': 0.0, 'greek': 0.0, 'pan': 0.0, 'd..."


In [5]:
# Create dictionary 
vocab = set([word for document in processed_text_tf_idf.processed_text for word in document])
vocab_dict = {word:[] for word in vocab}


In [6]:
for idx, document_idf in enumerate(processed_text_tf_idf.tfidf_vector):
    for key, value in document_idf.items():
        if value > 0:
            vocab_dict[key].append((processed_text.title[idx], value))

In [7]:
vocab_dict

{'typical': [('Superspreader', 26.0)],
 'indo': [('Antonine Plague', 26.0)],
 'user': [('HIV/AIDS in Yunnan', 26.0)],
 'deliberate': [('Basic reproduction number', 26.0)],
 'force': [('Virus', 26.0)],
 'feature': [('1929–1930 psittacosis pandemic', 26.0)],
 'attention': [('HIV/AIDS', 26.0)],
 'diagnose': [('Epidemiology of HIV/AIDS', 2.8888888889),
  ('Cholera', 2.8888888889),
  ('1929–1930 psittacosis pandemic', 2.8888888889)],
 'slow': [('HIV/AIDS', 26.0)],
 'allow': [('Science diplomacy and pandemics', 6.5), ('Swine influenza', 6.5)],
 'rotavirus': [('Virus', 26.0)],
 'wuhan': [('COVID-19 pandemic', 26.0)],
 'blueprint': [('Disease X', 26.0)],
 'disproportionately': [('Spanish flu', 26.0)],
 'expectancy': [('HIV/AIDS', 26.0)],
 'garden': [('Viral load', 26.0)],
 'shortage': [('COVID-19 pandemic', 6.5), ('Plague of Cyprian', 6.5)],
 'use': [('Epidemiology of HIV/AIDS', 0.28888888890000003),
  ('Basic reproduction number', 0.5777777778000001),
  ('Bills of mortality', 0.28888888890000

In [None]:
# Build a search funciton
# 1. Tokenize the new phrase
# 2. Filter to words in the dictionary that are in the phrase
# 3. Sum grouped by the documents to get the highest tf-idf

In [10]:
def make_query_tokens(text, vocab = vocab):
    """ Accepts any list of words and returns a tokenized list

    Args:
        text (string):

    Returns:
        list: Tokenizeed list of lemmatized words
    """
    nlp = spacy.load("en_core_web_lg") # Initialize the vocabulary
    doc = nlp(text.lower())
    filtered_sentence =[] 
    for word in doc:
        lexeme = nlp.vocab[str(word)]
        if lexeme.is_stop == False and lexeme.is_punct == False and lexeme.is_oov == False and lexeme.lower_ in vocab:
            filtered_sentence.append(word.lemma_) 
    return filtered_sentence

In [50]:
tokenized_search_term = make_query_tokens("pandemic prevention organizations")
tokenized_search_term
# if search word in vocab
relavent_documents_dictionary = {token:vocab_dict[token] for token in tokenized_search_term}
    

In [51]:
tokenized_search_term

['pandemic', 'prevention']

In [52]:
pd.DataFrame([doc for word in list(relavent_documents_dictionary.values()) for doc in word],
             columns=["title", "tf_idf_score"]).groupby("title").sum().sort_values(by = 'tf_idf_score', ascending = False)

Unnamed: 0_level_0,tf_idf_score
title,Unnamed: 1_level_1
HIV/AIDS,2.811637
Pandemic prevention,2.038598
Pandemic Severity Assessment Framework,1.135949
HIV/AIDS in Yunnan,0.928571
Spanish flu,0.2333
Pandemic,0.181456
COVID-19 pandemic,0.103689
Pandemic severity index,0.103689
Swine influenza,0.077767
1929–1930 psittacosis pandemic,0.077767


In [58]:
def inverted_index_search(search_string:str, inverted_index = vocab_dict):
    """Accept a search string and an inverted index, return the most sorted search results in teh form of a dataframe

    Args:
        search_string (str): _description_
        inverted_index (_type_): _description_
    """
    tokenized_search_term = make_query_tokens(search_string)
    relavent_documents_dictionary = {token:vocab_dict[token] for token in tokenized_search_term}
    
    search_results = pd.DataFrame([doc for word in list(relavent_documents_dictionary.values()) for doc in word],
             columns=["title", "tf_idf_score"]).groupby("title").sum().sort_values(by = 'tf_idf_score', ascending = False)
    
    return search_results


In [60]:
inverted_index_search("Black Death")

Unnamed: 0_level_0,tf_idf_score
title,Unnamed: 1_level_1
Pandemic,26.112554
Cholera,0.450216
Epidemiology of HIV/AIDS,0.450216
1929–1930 psittacosis pandemic,0.225108
Antonine Plague,0.225108
HIV/AIDS,0.225108
Spanish flu,0.225108
Bills of mortality,0.112554
COVID-19 pandemic,0.112554
Pandemic Severity Assessment Framework,0.112554
