In [1]:
import pandas as pd
import spacy

In [2]:
processed_text = pd.read_json("processed_text.json")
processed_text_tf_idf = pd.read_json("processed_text_tf_idf.json")

In [3]:
processed_text_tf_idf.head()

Unnamed: 0,title,text,url,processed_text,tfidf_vector
0,Pandemic,"A pandemic (from Greek πᾶν, pan, ""all"" and δῆμ...",https://en.wikipedia.org/wiki/Pandemic,"[pandemic, greek, pan, demo, people, epidemic,...","{'pandemic': 0.0504098705, 'greek': 3.25809653..."
1,Epidemiology of HIV/AIDS,"HIV/AIDS, or Human Immunodeficiency Virus, is ...",https://en.wikipedia.org/wiki/Epidemiology_of_...,"[hiv, aids, human, immunodeficiency, virus, co...","{'pandemic': 0.0144028201, 'greek': 0.0, 'pan'..."
2,Antonine Plague,"The Antonine Plague of 165 to 180 AD, also kno...",https://en.wikipedia.org/wiki/Antonine_Plague,"[plague, ad, know, plague, galen, galen, physi...","{'pandemic': 0.0144028201, 'greek': 0.0, 'pan'..."
3,Basic reproduction number,"In epidemiology, the basic reproduction number...",https://en.wikipedia.org/wiki/Basic_reproducti...,"[epidemiology, basic, reproduction, number, ba...","{'pandemic': 0.0, 'greek': 0.0, 'pan': 0.0, 'd..."
4,Bills of mortality,Bills of mortality were the weekly mortality s...,https://en.wikipedia.org/wiki/Bills_of_mortality,"[bill, mortality, weekly, mortality, statistic...","{'pandemic': 0.0, 'greek': 0.0, 'pan': 0.0, 'd..."


In [4]:
# Create dictionary 
# vocab dict contains 'cornovirus' but the lemmatized verion is turning into 'coronaviru'
vocab = set([word for document in processed_text_tf_idf.processed_text for word in document])
vocab_dict = {word:[] for word in vocab}


In [6]:
# Create Inverse Document object
for idx, document_idf in enumerate(processed_text_tf_idf.tfidf_vector):
    for key, value in document_idf.items():
        if value > 0:
            vocab_dict[key].append((processed_text.title[idx], value))

In [67]:
# Build a search funciton
# 1. Tokenize the new phrase
# 2. Filter to words in the dictionary that are in the phrase
# 3. Sum grouped by the documents to get the highest tf-idf

In [7]:
def make_query_tokens(text, vocab = vocab):
    """ Accepts any list of words and returns a tokenized list

    Args:
        text (string):

    Returns:
        list: Tokenizeed list of lemmatized words
    """
    nlp = spacy.load("en_core_web_lg") # Initialize the vocabulary
    doc = nlp(text.lower())
    filtered_sentence =[] 
    for word in doc:
        lexeme = nlp.vocab[str(word)]
        if lexeme.is_stop == False and lexeme.is_punct == False and lexeme.is_oov == False and lexeme.lower_ in vocab:
            filtered_sentence.append(word.lemma_) 
    return filtered_sentence

In [8]:
tokenized_search_term = make_query_tokens("coronavirus")
tokenized_search_term
# Need to implment fix here to check if search word in vocab
#relavent_documents_dictionary = {token:vocab_dict[token] for token in tokenized_search_term}
    

['coronavirus']

In [10]:
def inverted_index_search(search_string:str, inverted_index = vocab_dict):
    """Accept a search string and an inverted index, return the most sorted search results in teh form of a dataframe

    Args:
        search_string (str): _description_
        inverted_index (_type_): _description_
    """
    tokenized_search_term = make_query_tokens(search_string)
    relavent_documents_dictionary = {token:vocab_dict[token] for token in tokenized_search_term}
    
    search_results = pd.DataFrame([doc for word in list(relavent_documents_dictionary.values()) for doc in word],
             columns=["title", "tf_idf_score"]).groupby("title").sum().sort_values(by = 'tf_idf_score', ascending = False)
    
    return search_results


In [11]:
inverted_index_search("common symptoms of coronavirus")

Unnamed: 0_level_0,tf_idf_score
title,Unnamed: 1_level_1
COVID-19 pandemic,6.935147
Swine influenza,0.837907
Cholera,0.418953
HIV/AIDS,0.418953
HIV/AIDS in Yunnan,0.418953
Virus,0.418953
