# TF-IDF Search Using Cosine Similarity

In [1]:
import pandas as pd
import spacy
from collections import Counter
import math

In [2]:
# Load the preprocessed articles; the preview below shows one row per article
# with its title, raw text, url and the tokenized processed_text list.
text_df = pd.read_json("processed_text.json")
text_df.head()

Unnamed: 0,title,text,url,processed_text
0,Pandemic,"A pandemic (from Greek πᾶν, pan, ""all"" and δῆμ...",https://en.wikipedia.org/wiki/Pandemic,"[pandemic, greek, pan, demo, people, epidemic,..."
1,Epidemiology of HIV/AIDS,"HIV/AIDS, or Human Immunodeficiency Virus, is ...",https://en.wikipedia.org/wiki/Epidemiology_of_...,"[hiv, aids, human, immunodeficiency, virus, co..."
2,Antonine Plague,"The Antonine Plague of 165 to 180 AD, also kno...",https://en.wikipedia.org/wiki/Antonine_Plague,"[plague, 165, 180, ad, know, plague, galen, ga..."
3,Basic reproduction number,"In epidemiology, the basic reproduction number...",https://en.wikipedia.org/wiki/Basic_reproducti...,"[epidemiology, basic, reproduction, number, ba..."
4,Bills of mortality,Bills of mortality were the weekly mortality s...,https://en.wikipedia.org/wiki/Bills_of_mortality,"[bill, mortality, weekly, mortality, statistic..."


In [46]:
# Flatten every document's token list into a single corpus-wide token list.
all_text_list = text_df["processed_text"].tolist()
corpus = [token for tokens in all_text_list for token in tokens]
# Vocabulary size: the number of distinct tokens in the corpus.
len(set(corpus))


1408

In [32]:
def computeTF(wordDict):
    """Compute the relative frequency of each word within one document.

    Args:
        wordDict (dict): Maps each word of the document to the number of
            times it occurs in that document.

    Returns:
        dict: Maps each word to its count divided by the total number of
        word occurrences in the document, so the values sum to 1. An empty
        ``wordDict`` yields an empty dict.
    """
    # Normalise by the document length (sum of all counts) rather than by the
    # number of distinct words; the latter lets frequencies exceed 1.
    total_words = sum(wordDict.values())
    if not total_words:
        # Nothing to normalise by (empty input or all-zero counts).
        return {word: 0.0 for word in wordDict}
    return {word: count / total_words for word, count in wordDict.items()}

## Term Frequency

**In this notebook, term frequency is the number of times a word appears in a document divided by the total number of times it appears in the whole corpus. Every document has its own set of term frequencies.**

In [22]:
def computeTF(doc, all_docs):
    """Compute corpus-relative term frequencies for one document.

    For every word in the vocabulary of ``all_docs``, the score is the number
    of times the word occurs in ``doc`` divided by the number of times it
    occurs across all of ``all_docs``.

    Args:
        doc (list): A preprocessed text tokenized into a list of words.
        all_docs (iterable of list): The tokenized documents that make up the
            corpus, e.g. a list or a pandas Series of word lists.

    Returns:
        dict: One entry per distinct corpus word, in order of first
        appearance in ``all_docs``. Words absent from ``doc`` map to 0.
        Words of ``doc`` that never occur in ``all_docs`` are ignored.
    """
    doc_counts = Counter(doc)
    corpus_counts = Counter(word for tokens in all_docs for word in tokens)
    tf_dict = dict.fromkeys(corpus_counts, 0)
    for word, count in doc_counts.items():
        # Out-of-vocabulary words have a corpus count of 0: dividing by it
        # would raise ZeroDivisionError, and they have no slot in the result.
        if word in corpus_counts:
            tf_dict[word] = count / corpus_counts[word]
    return tf_dict

In [27]:
# Score every document against the full corpus and keep the resulting
# word -> frequency dictionary alongside its tokens.
text_df["term_frequency"] = text_df["processed_text"].apply(
    lambda tokens: computeTF(tokens, all_docs=text_df["processed_text"])
)
text_df.head()

Unnamed: 0,title,text,url,processed_text,term_frequency
0,Pandemic,"A pandemic (from Greek πᾶν, pan, ""all"" and δῆμ...",https://en.wikipedia.org/wiki/Pandemic,"[pandemic, greek, pan, demo, people, epidemic,...","{'pandemic': 0.11864406779661017, 'greek': 1.0..."
1,Epidemiology of HIV/AIDS,"HIV/AIDS, or Human Immunodeficiency Virus, is ...",https://en.wikipedia.org/wiki/Epidemiology_of_...,"[hiv, aids, human, immunodeficiency, virus, co...","{'pandemic': 0.03389830508474576, 'greek': 0, ..."
2,Antonine Plague,"The Antonine Plague of 165 to 180 AD, also kno...",https://en.wikipedia.org/wiki/Antonine_Plague,"[plague, 165, 180, ad, know, plague, galen, ga...","{'pandemic': 0.03389830508474576, 'greek': 0, ..."
3,Basic reproduction number,"In epidemiology, the basic reproduction number...",https://en.wikipedia.org/wiki/Basic_reproducti...,"[epidemiology, basic, reproduction, number, ba...","{'pandemic': 0, 'greek': 0, 'pan': 0, 'demo': ..."
4,Bills of mortality,Bills of mortality were the weekly mortality s...,https://en.wikipedia.org/wiki/Bills_of_mortality,"[bill, mortality, weekly, mortality, statistic...","{'pandemic': 0, 'greek': 0, 'pan': 0, 'demo': ..."


## Inverse Document Frequency

The inverse document frequency of a word $w$ is the log of the total number of documents divided by the number of documents that contain $w$: $\mathrm{idf}(w) = \log\frac{N}{\mathrm{df}(w)}$. It gives more weight to words that are rare across the documents in the corpus.

In [None]:
def computeIDF(docs):
    """Compute the inverse document frequency of every word in ``docs``.

    Args:
        docs (iterable of list): Tokenized documents, each a list of words
            (e.g. a list or a pandas Series of word lists).

    Returns:
        dict: Maps each distinct word to ``log(N / df)``, where ``N`` is the
        number of documents and ``df`` is the number of documents containing
        the word (natural logarithm). Empty input yields an empty dict.
    """
    doc_freq = {}
    n_docs = 0
    for document in docs:
        n_docs += 1
        # dict.fromkeys deduplicates while keeping first-seen order, so each
        # document contributes at most 1 to a word's document frequency.
        for word in dict.fromkeys(document):
            doc_freq[word] = doc_freq.get(word, 0) + 1

    # Every recorded word occurs in at least one document, so count >= 1.
    return {word: math.log(n_docs / count) for word, count in doc_freq.items()}

In [53]:
# Compute Inverse Document Frequency
def build_tf_idf(string, vocab):
    """Count how many documents contain each vocabulary word.

    Work in progress: only the document-frequency counts are built so far;
    nothing is returned and ``string`` is not used yet.

    Args:
        string: Currently unused.
        vocab (iterable): Words to count. Every word found in the documents
            must be present here, otherwise a KeyError is raised.
    """
    document_frequency = {term: 0 for term in vocab}
    # NOTE: reads the notebook-level ``text_df`` rather than a parameter.
    for tokens in text_df.processed_text:
        # Deduplicate so each document counts at most once per word.
        for term in set(tokens):
            document_frequency[term] += 1


## Create a Search Function using cosine similarity