In [59]:
import pandas as pd
import spacy

In [60]:
processed_text = pd.read_json("processed_text.json")
processed_text_tf_idf = pd.read_json("processed_text_tf_idf.json")

In [61]:
processed_text_tf_idf.head()

Unnamed: 0,title,text,url,processed_text,tfidf_vector
0,Pandemic,"A pandemic (from Greek πᾶν, pan, ""all"" and δῆμ...",https://en.wikipedia.org/wiki/Pandemic,"[pandemic, greek, pan, demo, people, epidemic,...","{'pandemic': 0.1814556331, 'greek': 26.0, 'pan..."
1,Epidemiology of HIV/AIDS,"HIV/AIDS, or Human Immunodeficiency Virus, is ...",https://en.wikipedia.org/wiki/Epidemiology_of_...,"[hiv, aid, human, immunodeficiency, virus, con...","{'pandemic': 0.0518444666, 'greek': 0.0, 'pan'..."
2,Antonine Plague,"The Antonine Plague of 165 to 180 AD, also kno...",https://en.wikipedia.org/wiki/Antonine_Plague,"[plague, ad, know, plague, galen, galen, physi...","{'pandemic': 0.0518444666, 'greek': 0.0, 'pan'..."
3,Basic reproduction number,"In epidemiology, the basic reproduction number...",https://en.wikipedia.org/wiki/Basic_reproducti...,"[epidemiology, basic, reproduction, number, ba...","{'pandemic': 0.0, 'greek': 0.0, 'pan': 0.0, 'd..."
4,Bills of mortality,Bills of mortality were the weekly mortality s...,https://en.wikipedia.org/wiki/Bills_of_mortality,"[bill, mortality, weekly, mortality, statistic...","{'pandemic': 0.0, 'greek': 0.0, 'pan': 0.0, 'd..."


In [62]:
# Create dictionary 
vocab = set([word for document in processed_text_tf_idf.processed_text for word in document])
vocab_dict = {word:[] for word in vocab}


In [63]:
for idx, document_idf in enumerate(processed_text_tf_idf.tfidf_vector):
    for key, value in document_idf.items():
        if value > 0:
            vocab_dict[key].append((processed_text.title[idx], value))

In [64]:
vocab_dict

{'involved': [('Viral load', 26.0)],
 'compile': [('Unified Victim Identification System', 26.0)],
 'earth': [('Virus', 26.0)],
 'wide': [('Bills of mortality', 26.0)],
 'study': [('Epidemiology of HIV/AIDS', 1.625),
  ('Cholera', 1.625),
  ('Swine influenza', 1.625),
  ('Virus', 1.625)],
 'morale': [('Spanish flu', 26.0)],
 'antiretroviral': [('HIV/AIDS', 26.0)],
 'dio': [('Antonine Plague', 26.0)],
 'immunity': [('Superspreader', 6.5), ('Virus', 6.5)],
 'material': [('Virus', 26.0)],
 'care': [('Epidemiology of HIV/AIDS', 26.0)],
 'base': [('Basic reproduction number', 0.8666666667),
  ('Cholera', 0.8666666667),
  ('Pandemic severity index', 0.8666666667),
  ('Science diplomacy and pandemics', 0.8666666667),
  ('Unified Victim Identification System', 1.7333333333)],
 'produce': [('Bills of mortality', 0.9285714286000001),
  ('Cholera', 0.9285714286000001),
  ('Viral load', 1.8571428571),
  ('Virus', 2.7857142857)],
 'key': [('Unified Victim Identification System', 6.5), ('Virus', 6.5

In [None]:
# Build a search funciton
# 1. Tokenize the new phrase
# 2. Filter to words in the dictionary that are in the phrase
# 3. Sum grouped by the documents to get the highest tf-idf

In [84]:
def make_tokens(text, vocab = vocab):
    """ Accepts any list of words and returns a tokenized list

    Args:
        text (string):

    Returns:
        list: Tokenizeed list of lemmatized words
    """
    nlp = spacy.load("en_core_web_lg") # Initialize the vocabulary
    doc = nlp(text.lower())
    filtered_sentence =[] 
    for word in doc:
        lexeme = nlp.vocab[str(word)]
        if lexeme.is_stop == False and lexeme.is_punct == False and lexeme.is_oov == False and lexeme.lower_ in vocab:
            filtered_sentence.append(word.lemma_) 
    return filtered_sentence

In [89]:
tokenized_search_term = make_tokens("Pandemic Covid Cough lung Greek")
tokenized_search_term

# if search word in vocab
{token:vocab_dict[token] for token in tokenized_search_term}
    

{'pandemic': [('Pandemic', 0.1814556331),
  ('Epidemiology of HIV/AIDS', 0.0518444666),
  ('Antonine Plague', 0.0518444666),
  ('Cholera', 0.0259222333),
  ('COVID-19 pandemic', 0.1036889332),
  ('Crimson Contagion', 0.0777666999),
  ('HIV/AIDS', 0.0259222333),
  ('Pandemic prevention', 0.1814556331),
  ('Pandemic Severity Assessment Framework', 0.2073778664),
  ('Pandemic severity index', 0.1036889332),
  ('Plague of Cyprian', 0.0518444666),
  ('PREDICT (USAID)', 0.0259222333),
  ('1929–1930 psittacosis pandemic', 0.0777666999),
  ('Science diplomacy and pandemics', 0.0259222333),
  ('Spanish flu', 0.2333000997),
  ('Swine influenza', 0.0777666999),
  ('Unified Victim Identification System', 0.0259222333)],
 'cough': [('COVID-19 pandemic', 4.3333333333),
  ('Swine influenza', 2.1666666667),
  ('Virus', 2.1666666667)],
 'greek': [('Pandemic', 26.0)]}

In [None]:
def inverted_index_search(search_string:str, inverted_index):
    """Accept a search string and an inverted index, return the most relaent document to the search string

    Args:
        search_string (str): _description_
        inverted_index (_type_): _description_
    """
    tokenized_search_term = make_tokens(search_string)
    for token in tokenized_search_term:
        
