# TF-IDF Search Using Cosine Similarity

## Setup

In [1]:
import json
from collections import Counter, OrderedDict
from typing import List
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

## Open data

In [2]:
# Read the tokenized corpus. The encoding is pinned to UTF-8 (the standard
# encoding for JSON files) so the load does not depend on the platform's
# default locale encoding.
with open("tokenized_data.json", "r", encoding="utf-8") as read_file:
    CDC_data = json.load(read_file)

## Create corpus vocabulary

In [3]:
def compute_vocabulary(corpus: List[dict]) -> List[str]:
    """Return the corpus vocabulary: every distinct token, in sorted order.

    Each article's tokens are read from its ``"tokenized_text"`` entry.
    """
    unique_tokens = set()
    for article in corpus:
        unique_tokens.update(article.get("tokenized_text"))
    return sorted(unique_tokens)

In [4]:
# Sorted list of the distinct tokens found across the loaded corpus.
CDC_vocabulary = compute_vocabulary(corpus=CDC_data)

In [5]:
# Number of distinct tokens in the corpus vocabulary.
len(CDC_vocabulary)

1484

In [6]:
# Write the vocabulary list to disk as JSON.
with open('CDC_vocabulary.json', 'w') as vocabulary_file:
    json.dump(CDC_vocabulary, vocabulary_file)

## TF-builder function

In [7]:
def compute_TF(vocabulary: list, tokenized_text: List[str]) -> 'OrderedDict[str, float]':
    """Compute the term-frequency (TF) vector of one tokenized document.

    Args:
        vocabulary: Tokens that define the entries of the vector and their order.
        tokenized_text: The tokens of the document.

    Returns:
        An ordered mapping with exactly one entry per vocabulary token, in
        vocabulary order. Each value is the token's count in the document
        divided by the document's total number of tokens; vocabulary tokens
        that do not occur in the document map to 0.0.
    """
    counter = Counter(tokenized_text)
    # Total token count of the document (out-of-vocabulary tokens included).
    normalizer = sum(counter.values())
    TF_vector = OrderedDict((token, 0.0) for token in vocabulary)
    for key, value in counter.items():
        # Ignore tokens outside the vocabulary: inserting them would append
        # new keys, so the vector would no longer line up with the vocabulary
        # (or with vectors built for other documents).
        if key in TF_vector:
            TF_vector[key] = value / normalizer
    return TF_vector

In [8]:
# Spot check: term-frequency vector of the article at index 2.
example_TF = compute_TF(
    vocabulary=CDC_vocabulary,
    tokenized_text=CDC_data[2].get('tokenized_text'),
)

In [9]:
# Term frequency of the token 'plague' in the example document.
example_TF['plague']

0.06153846153846154

## IDF-builder function

In [10]:
def compute_IDF(vocabulary: list, corpus: List[dict]) -> 'OrderedDict[str, float]':
    """Compute the inverse document frequency (IDF) of every vocabulary token.

    The IDF of a token is ``log(N / df)``, where ``N`` is the number of
    articles in the corpus and ``df`` is the number of articles whose
    ``'tokenized_text'`` contains the token at least once.

    Args:
        vocabulary: Tokens to compute an IDF value for.
        corpus: Articles, each holding its tokens under ``'tokenized_text'``.

    Returns:
        An ordered mapping from each vocabulary token to its IDF, in
        vocabulary order. A token that occurs in no article gets 0.0 instead
        of triggering a division by zero.
    """
    corpus_size = len(corpus)
    # Count, per token, how many articles contain it. Each article is reduced
    # to a set first so repeated tokens count once per article, and so the
    # whole corpus is scanned once rather than once per vocabulary token.
    document_frequency = Counter()
    for article in corpus:
        document_frequency.update(set(article['tokenized_text']))
    IDF_vector = OrderedDict()
    for token in vocabulary:
        containing_articles = document_frequency[token]
        if containing_articles:
            IDF_vector[token] = np.log(corpus_size / containing_articles)
        else:
            # Token never seen in the corpus: give it zero weight.
            IDF_vector[token] = 0.0
    return IDF_vector

In [11]:
# IDF value of every vocabulary token, computed over the whole corpus.
CDC_IDF = compute_IDF(
    vocabulary=CDC_vocabulary,
    corpus=CDC_data,
)

In [12]:
# IDF of the token 'plague' over the corpus.
CDC_IDF['plague']

2.159484249353372

## Add TF_IDF field to CDC corpus

In [13]:
def compute_TF_IDF(corpus: List[dict]) -> List[dict]:
    """Attach a ``'tf_idf'`` vector to every article of a tokenized corpus.

    The vocabulary and IDF values are derived from ``corpus`` itself. Each
    article's ``'tf_idf'`` entry maps every vocabulary token to the product
    of its term frequency in that article and its IDF. The articles are
    updated in place and the same corpus list is returned.
    """
    vocabulary = compute_vocabulary(corpus)
    IDF_vector = compute_IDF(vocabulary=vocabulary, corpus=corpus)
    for article in corpus:
        term_frequencies = compute_TF(
            vocabulary=vocabulary,
            tokenized_text=article['tokenized_text'],
        )
        weighted = OrderedDict()
        for token, frequency in term_frequencies.items():
            weighted[token] = frequency * IDF_vector[token]
        article['tf_idf'] = weighted
    return corpus

In [14]:
# compute_TF_IDF updates the articles in place and returns the list it was
# given, so vectorized_data and CDC_data refer to the same list.
vectorized_data = compute_TF_IDF(
    corpus=CDC_data,
)

In [15]:
# TF-IDF weight of 'plague' in the article at index 2 (its TF times its IDF).
vectorized_data[2]['tf_idf']['plague']

0.132891338421746

## Save corpus with TF_IDF vectors

In [16]:
# Write the corpus, including each article's 'tf_idf' vector, to disk as JSON.
with open('vectorized_data.json', 'w') as vectorized_file:
    json.dump(vectorized_data, vectorized_file)

## Query comparison to corpus with cosine similarity 

In [17]:
def TF_IDF_dict_to_array(corpus: List[dict]) -> np.ndarray:
    """Collect the ``'tf_idf'`` values of every article into a single array.

    Each article contributes one row holding the values of its ``'tf_idf'``
    mapping, in that mapping's own order.
    """
    rows = [list(article.get('tf_idf').values()) for article in corpus]
    return np.asarray(rows)

In [18]:
def reshape_tfidf(tfidf_dict: 'OrderedDict[str, float]') -> np.ndarray:
    """Turn one tf-idf mapping into a single-row 2-D array of its values."""
    weights = list(tfidf_dict.values())
    row_vector = np.array(weights)
    return row_vector.reshape(1, -1)

In [19]:
def compute_similarity_to_corpus(query: dict, corpus: List[dict]):
    """Score a query article against every article of the corpus.

    Returns a pandas Series holding the cosine similarity between the
    query's ``'tf_idf'`` vector and that of each corpus article, in corpus
    order (default integer index).
    """
    corpus_matrix = TF_IDF_dict_to_array(corpus)
    query_row = reshape_tfidf(query.get('tf_idf'))
    similarities = cosine_similarity(query_row, corpus_matrix)
    return pd.Series(similarities.flatten())

In [20]:
# Cosine similarity of the article at index 2 to every article in the corpus.
distances_to_corpus_2 = compute_similarity_to_corpus(
    query=vectorized_data[2],
    corpus=vectorized_data,
)

In [21]:
# Cosine similarity of the article at index 10 to every article in the corpus.
distances_to_corpus_10 = compute_similarity_to_corpus(
    query=vectorized_data[10],
    corpus=vectorized_data,
)

In [22]:
# Rank articles from most to least similar to article 2. Despite the variable
# name, the values are cosine similarities, so higher means closer.
distances_to_corpus_2.sort_values(ascending=False)

2     1.000000
15    0.349125
0     0.074371
11    0.065140
10    0.031332
5     0.031121
6     0.030052
4     0.028216
1     0.027421
7     0.026526
17    0.024144
3     0.017936
19    0.016914
25    0.014236
8     0.012659
21    0.011262
23    0.010593
20    0.008291
12    0.006328
22    0.006017
16    0.005696
18    0.005475
13    0.004698
14    0.002994
9     0.001559
24    0.001139
dtype: float64

In [23]:
# Rank articles from most to least similar to article 10. Despite the variable
# name, the values are cosine similarities, so higher means closer.
distances_to_corpus_10.sort_values(ascending=False)

10    1.000000
1     0.180081
11    0.151779
25    0.104621
6     0.089411
5     0.089371
0     0.085912
24    0.082944
19    0.074112
21    0.063049
16    0.058468
12    0.049429
20    0.037985
17    0.036746
14    0.035690
13    0.035402
2     0.031332
22    0.029978
8     0.028662
3     0.027454
23    0.024984
7     0.024258
18    0.022718
15    0.022150
9     0.015973
4     0.008713
dtype: float64