# TF-IDF Search Using Cosine Similarity

In [4]:
import pandas as pd
import numpy as np
import spacy
from collections import Counter
import math
from sklearn.metrics.pairwise import cosine_similarity

In [5]:
# Load the preprocessed corpus. As the preview below shows, each row carries a
# title, the raw text, its URL and a `processed_text` list of tokens.
text_df = pd.read_json("processed_text.json")
text_df.head()

Unnamed: 0,title,text,url,processed_text
0,Pandemic,"A pandemic (from Greek πᾶν, pan, ""all"" and δῆμ...",https://en.wikipedia.org/wiki/Pandemic,"[pandemic, greek, pan, demo, people, epidemic,..."
1,Epidemiology of HIV/AIDS,"HIV/AIDS, or Human Immunodeficiency Virus, is ...",https://en.wikipedia.org/wiki/Epidemiology_of_...,"[hiv, aids, human, immunodeficiency, virus, co..."
2,Antonine Plague,"The Antonine Plague of 165 to 180 AD, also kno...",https://en.wikipedia.org/wiki/Antonine_Plague,"[plague, ad, know, plague, galen, galen, physi..."
3,Basic reproduction number,"In epidemiology, the basic reproduction number...",https://en.wikipedia.org/wiki/Basic_reproducti...,"[epidemiology, basic, reproduction, number, ba..."
4,Bills of mortality,Bills of mortality were the weekly mortality s...,https://en.wikipedia.org/wiki/Bills_of_mortality,"[bill, mortality, weekly, mortality, statistic..."


## Term Frequency

**The number of times a word appears in a document divided by the total number of times it appears in the corpus. Every document has its own term frequencies.**

In [6]:
def computeTF(single_doc, docs):
    """Compute corpus-relative term frequencies for a single document.

    For every distinct word in the corpus the value is the number of times the
    word occurs in ``single_doc`` divided by the number of times it occurs
    across all of ``docs``. Corpus words that do not occur in ``single_doc``
    keep the value 0.

    Args:
        single_doc (list): A tokenized document (list of words) to score.
        docs (iterable of list): All tokenized documents of the corpus; they
            define the vocabulary of the result.

    Returns:
        dict: Maps every distinct corpus word (in first-seen order) to its
        frequency ratio for ``single_doc``.
    """
    doc_counts = Counter(single_doc)
    corpus_counts = Counter(word for document in docs for word in document)

    # Counter keeps first-seen order, so every call over the same corpus
    # yields the same key order and the resulting vectors stay aligned.
    tf_dict = dict.fromkeys(corpus_counts, 0)
    for word, count in doc_counts.items():
        # Words that never occur in the corpus (typical for a search query)
        # are skipped: dividing by their corpus count of 0 would raise
        # ZeroDivisionError, and adding them would grow the vocabulary.
        if word in corpus_counts:
            tf_dict[word] = count / corpus_counts[word]

    return tf_dict

## Inverse Document Frequency

**The logarithm of the number of documents divided by the number of documents that contain the word w. Inverse document frequency determines the weight of rare words across all documents in the corpus.**

In [7]:
def computeIDF(docs):
    """Compute the inverse document frequency of every word in the corpus.

    The value for a word is ``log(N / df)``, where ``N`` is the number of
    documents and ``df`` is the number of documents containing the word.

    Args:
        docs (list): Tokenized documents, each represented as a list of words.

    Returns:
        dict: Maps every distinct corpus word to its inverse document frequency.
    """
    total_docs = len(docs)

    # Seed every vocabulary word with a document count of zero.
    doc_frequency = dict.fromkeys(
        (token for tokens in docs for token in tokens), 0
    )

    # A word counts once per document, however often it repeats inside it.
    for tokens in docs:
        for token in set(tokens):
            doc_frequency[token] += 1

    return {
        token: math.log(total_docs / float(count))
        for token, count in doc_frequency.items()
    }

## Putting TF and IDF together

In [8]:
# Combine term frequency and inverse document frequency into TF-IDF weights
def build_tf_idf(doc, docs):
    """Build the TF-IDF weights of one tokenized document against a corpus.

    Args:
        doc (list): A tokenized list of words.
        docs (list): The tokenized documents that make up the whole vocabulary.

    Returns:
        dict: Maps every key produced by ``computeTF(doc, docs)`` to its term
        frequency multiplied by the word's inverse document frequency.
    """
    term_frequencies = computeTF(doc, docs)
    inverse_doc_frequencies = computeIDF(docs)

    return {
        term: frequency * inverse_doc_frequencies[term]
        for term, frequency in term_frequencies.items()
    }
        


In [9]:
# Build one TF-IDF dictionary per document. `apply` forwards `docs=` to
# build_tf_idf, so each row is scored against the full processed_text column.
# NOTE(review): build_tf_idf calls computeIDF(docs) on every row, so the
# corpus-wide IDF is recomputed once per document.
text_df["tfidf_vector"] = text_df.processed_text.apply(build_tf_idf, docs=text_df.processed_text)
text_df.head()


Unnamed: 0,title,text,url,processed_text,tfidf_vector
0,Pandemic,"A pandemic (from Greek πᾶν, pan, ""all"" and δῆμ...",https://en.wikipedia.org/wiki/Pandemic,"[pandemic, greek, pan, demo, people, epidemic,...","{'pandemic': 0.05040987047045528, 'greek': 3.2..."
1,Epidemiology of HIV/AIDS,"HIV/AIDS, or Human Immunodeficiency Virus, is ...",https://en.wikipedia.org/wiki/Epidemiology_of_...,"[hiv, aids, human, immunodeficiency, virus, co...","{'pandemic': 0.014402820134415793, 'greek': 0...."
2,Antonine Plague,"The Antonine Plague of 165 to 180 AD, also kno...",https://en.wikipedia.org/wiki/Antonine_Plague,"[plague, ad, know, plague, galen, galen, physi...","{'pandemic': 0.014402820134415793, 'greek': 0...."
3,Basic reproduction number,"In epidemiology, the basic reproduction number...",https://en.wikipedia.org/wiki/Basic_reproducti...,"[epidemiology, basic, reproduction, number, ba...","{'pandemic': 0.0, 'greek': 0.0, 'pan': 0.0, 'd..."
4,Bills of mortality,Bills of mortality were the weekly mortality s...,https://en.wikipedia.org/wiki/Bills_of_mortality,"[bill, mortality, weekly, mortality, statistic...","{'pandemic': 0.0, 'greek': 0.0, 'pan': 0.0, 'd..."


In [10]:
# Serialize the frame (now including the tfidf_vector column) to a JSON string
# and write it to disk.
text_tf_idf = text_df.to_json()

with open('processed_text_tf_idf.json', 'w') as outfile:
    outfile.write(text_tf_idf)

## Create a Search Function using cosine similarity

In [1]:
# Loaded spaCy pipelines keyed by model name, so the large model is read from
# disk only once per kernel session instead of on every make_tokens call.
_NLP_CACHE = {}


def make_tokens(text):
    """Tokenize a raw string into a filtered list of lemmas.

    The text is lower-cased and run through the ``en_core_web_lg`` spaCy
    pipeline. A token's lemma is kept only when the vocabulary entry for the
    token's text is not a stop word, not punctuation and not out-of-vocabulary.

    Args:
        text (string): Any string of words.

    Returns:
        list: Lemmas of the tokens that passed the filter, in original order.
    """
    model_name = "en_core_web_lg"
    nlp = _NLP_CACHE.get(model_name)
    if nlp is None:
        nlp = _NLP_CACHE[model_name] = spacy.load(model_name)

    filtered_sentence = []
    for word in nlp(text.lower()):
        # The flags are read from the vocabulary entry for the token's text.
        lexeme = nlp.vocab[str(word)]
        if not (lexeme.is_stop or lexeme.is_punct or lexeme.is_oov):
            filtered_sentence.append(word.lemma_)
    return filtered_sentence

In [2]:
def get_cosine_similarities(string, titles, documents):
    """Rank documents by the cosine similarity of their TF-IDF vectors to a query.

    Args:
        string (string): Any string of words (the search query).
        titles (iterable): Document titles, in the same order as ``documents``.
        documents (iterable of list): Preprocessed, tokenized documents; they
            provide both the vocabulary and the vectors that are compared.

    Returns:
        dict: Maps each title to its similarity score, ordered from most to
        least similar. Empty when ``documents`` is empty.

    Raises:
        ValueError: If ``titles`` and ``documents`` differ in length.
    """
    # Plain lists avoid index-label alignment surprises when the caller passes
    # Series with a non-default index.
    titles = list(titles)
    documents = list(documents)
    if len(titles) != len(documents):
        raise ValueError("titles and documents must have the same length")
    if not documents:
        return {}

    # Query tokens outside the corpus vocabulary cannot match any document;
    # dropping them keeps the query vector aligned with the document vectors.
    vocabulary = {word for document in documents for word in document}
    query_tokens = [token for token in make_tokens(string) if token in vocabulary]

    query_vector = np.array([list(build_tf_idf(query_tokens, documents).values())])
    # Vectors are built from the `documents` argument (not a notebook global),
    # one row per document, so a single cosine_similarity call scores them all.
    document_matrix = np.array(
        [list(build_tf_idf(document, documents).values()) for document in documents]
    )
    scores = cosine_similarity(document_matrix, query_vector).ravel()

    scores_df = pd.DataFrame({"title": titles, "similarity_score": scores})
    # mergesort is stable, so tied scores keep their original document order.
    sorted_scores = scores_df.sort_values(
        by="similarity_score", ascending=False, kind="mergesort"
    )
    return sorted_scores.set_index("title")["similarity_score"].to_dict()

In [3]:
# NOTE(review): the execution counts show this cell ran as In [3] on a fresh
# kernel, before the import and data-loading cells (In [4]/In [5]) -- hence the
# NameError below. Run the notebook top to bottom so text_df exists first.
get_cosine_similarities("Current Covid-19 Pandemic", text_df.title, text_df.processed_text)

NameError: name 'text_df' is not defined