# TF-IDF Search Using Cosine Similarity

## Setup

In [1]:
import json
from collections import Counter, OrderedDict
from typing import List
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

## Open data

In [2]:
# Load the tokenized corpus; it is used below as a list of article dicts.
# The encoding is given explicitly so that reading does not depend on the
# platform's default text encoding.
with open("tokenized_data.json", "r", encoding="utf-8") as read_file:
    CDC_data = json.load(read_file)

## Create corpus vocabulary

In [3]:
def compute_vocabulary(corpus: List[dict]) -> List[str]:
    """Return the corpus vocabulary: every distinct token, in sorted order.

    The tokens of each article are read from its "tokenized_text" entry.
    """
    unique_tokens = set()
    for article in corpus:
        unique_tokens.update(article.get("tokenized_text"))
    return sorted(unique_tokens)

In [4]:
# Sorted list of the unique tokens found across the whole CDC corpus.
CDC_vocabulary = compute_vocabulary(corpus=CDC_data)

In [5]:
# Number of distinct tokens in the corpus vocabulary.
len(CDC_vocabulary)

1484

In [None]:
# Persist the corpus vocabulary. The object written is the vocabulary list
# itself (not the vectorized corpus, which is saved separately further down).
with open('CDC_vocabulary.json', 'w') as out:
    json.dump(CDC_vocabulary, out)

## TF-builder function

In [None]:
def compute_TF(vocabulary: list, tokenized_text: List[str]) -> 'OrderedDict[str, float]':
    """Compute the term-frequency (TF) vector of one tokenized document.

    Args:
        vocabulary: Tokens that define the keys of the vector and their order.
        tokenized_text: Tokens of the document.

    Returns:
        An OrderedDict with exactly one entry per vocabulary token, in
        vocabulary order. Each value is the token's count in the document
        divided by the document's total token count; tokens that do not
        occur in the document map to 0.
    """
    counter = Counter(tokenized_text)
    # The normalizer is the full document length, including any tokens that
    # are not part of the vocabulary.
    normalizer = sum(counter.values())
    TF_vector = OrderedDict((token, 0) for token in vocabulary)
    for token, count in counter.items():
        # Skip out-of-vocabulary tokens: inserting them would add extra keys
        # and break the alignment between this vector and the vocabulary
        # (and therefore with every other document's vector).
        if token in TF_vector:
            TF_vector[token] = count / normalizer
    return TF_vector

In [None]:
# TF vector of the article at index 2, expressed over the corpus vocabulary.
example_TF = compute_TF(
    vocabulary=CDC_vocabulary,
    tokenized_text=CDC_data[2].get('tokenized_text'),
)

In [None]:
# Term frequency of the token 'plague' in the example article.
example_TF['plague']

## IDF-builder function

In [None]:
def compute_IDF(vocabulary: list, corpus: List[dict]) -> 'OrderedDict[str, float]':
    """Compute the inverse document frequency (IDF) of every vocabulary token.

    Args:
        vocabulary: Tokens to score; defines the keys and order of the result.
        corpus: Articles, each holding its tokens under 'tokenized_text'.

    Returns:
        An OrderedDict mapping each vocabulary token to
        log(number of articles / number of articles containing the token).
        A token that occurs in no article gets 0.0 instead of triggering a
        division by zero.
    """
    corpus_size = len(corpus)

    # Count, for each token, the number of articles it appears in. Each
    # article is reduced to a set once, so a token is counted at most once
    # per article and no token list is rescanned per vocabulary entry.
    document_frequency = Counter()
    for article in corpus:
        document_frequency.update(set(article['tokenized_text']))

    IDF_vector = OrderedDict()
    for token in vocabulary:
        containing_articles = document_frequency[token]
        if containing_articles:
            IDF_vector[token] = np.log(corpus_size / containing_articles)
        else:
            # The token is absent from the corpus, so its IDF is undefined;
            # 0.0 keeps it from contributing any TF-IDF weight.
            IDF_vector[token] = 0.0
    return IDF_vector

In [None]:
# IDF value of every vocabulary token, computed over the whole CDC corpus.
CDC_IDF = compute_IDF(vocabulary=CDC_vocabulary, corpus=CDC_data)

In [None]:
# IDF of the token 'plague' across the corpus.
CDC_IDF['plague']

## Add TF_IDF field to CDC corpus

In [None]:
def compute_TF_IDF(corpus: List[dict]) -> List[dict]:
    """Attach a TF-IDF vector to every article of a tokenized corpus.

    The vocabulary and the IDF weights are derived from the corpus itself.
    Each article dict is modified in place: its 'tf_idf' entry is set to an
    OrderedDict mapping every vocabulary token to TF * IDF. The same corpus
    list is returned.
    """
    corpus_vocabulary = compute_vocabulary(corpus)
    idf_weights = compute_IDF(vocabulary=corpus_vocabulary, corpus=corpus)
    for article in corpus:
        term_frequencies = compute_TF(
            vocabulary=corpus_vocabulary,
            tokenized_text=article['tokenized_text'],
        )
        # Weight each term frequency by the corpus-wide IDF of its token.
        article['tf_idf'] = OrderedDict(
            (token, frequency * idf_weights[token])
            for token, frequency in term_frequencies.items()
        )
    return corpus

In [None]:
# compute_TF_IDF returns a single value (the corpus, with a 'tf_idf' entry
# added to each article), so the result is bound to one name.
vectorized_data = compute_TF_IDF(corpus=CDC_data)

In [None]:
# TF-IDF weight of the token 'plague' in the article at index 2.
vectorized_data[2]['tf_idf']['plague']

## Save corpus with TF_IDF vectors

In [None]:
# Write the corpus, including the 'tf_idf' entry of each article, to disk as JSON.
with open('vectorized_data.json', 'w') as out:
    json.dump(vectorized_data, out)

## Query comparison to corpus with cosine similarity 

In [None]:
def TF_IDF_dict_to_array(corpus: List[dict]) -> np.ndarray:
    """Stack the 'tf_idf' vectors of all articles into one array.

    Row i holds the values of the i-th article's 'tf_idf' mapping, in the
    mapping's own order.
    """
    rows = [list(article.get('tf_idf').values()) for article in corpus]
    return np.asarray(rows)

In [None]:
def reshape_tfidf(tfidf_dict: 'OrderedDict[str, float]') -> np.ndarray:
    """Turn one TF-IDF mapping into an array with a single row.

    The values are taken in the mapping's order and reshaped to (1, -1), the
    2-D layout passed to cosine_similarity for the query.
    """
    weights = list(tfidf_dict.values())
    vector = np.array(weights)
    return vector.reshape(1, -1)

In [None]:
def compute_similarity_to_corpus(query: dict, corpus: List[dict]):
    """Score a query article against every article of the corpus.

    The query and each corpus article supply their vector through the
    'tf_idf' entry. The result is a pandas Series of cosine similarities,
    one value per corpus article, in corpus order.
    """
    corpus_matrix = TF_IDF_dict_to_array(corpus)
    query_row = reshape_tfidf(query.get('tf_idf'))
    similarities = cosine_similarity(query_row, corpus_matrix)
    # cosine_similarity yields a 2-D result; flatten it to one value per article.
    return pd.Series(similarities.flatten())

In [None]:
# Cosine similarity between the article at index 2 and every article in the corpus.
distances_to_corpus_2 = compute_similarity_to_corpus(query=vectorized_data[2], corpus=vectorized_data)

In [None]:
# Cosine similarity between the article at index 10 and every article in the corpus.
distances_to_corpus_10 = compute_similarity_to_corpus(query=vectorized_data[10], corpus=vectorized_data)

In [None]:
# Rank the corpus for query article 2, highest similarity first.
distances_to_corpus_2.sort_values(ascending=False)

In [None]:
# Rank the corpus for query article 10, highest similarity first.
distances_to_corpus_10.sort_values(ascending=False)