# Milestone 2: TF-IDF Search using Cosine Similarity

Import spaCy (loading its small English model) together with the JSON, OS, and scikit-learn cosine-similarity helpers used below.

In [None]:
import spacy
import json
import os
from sklearn.metrics.pairwise import cosine_similarity
# Load spaCy's small English pipeline once at module level;
# build_tf_idf_vector calls it to tokenize input text.
nlp = spacy.load('en_core_web_sm')

In [None]:
# Reads the JSON file at ./outputs/milestone-1/tokenized-data.json into `data`.
# The encoding is pinned to UTF-8 (the JSON interchange encoding) so the read
# does not depend on the platform's default text encoding.
with open('./outputs/milestone-1/tokenized-data.json', encoding='utf-8') as f:
    data = json.load(f)

In [None]:
# Build the vocabulary: the set of distinct tokens found in the
# 'tokenized_text' entry of every document in `data`.
corpus = set()
for item in data:
    corpus.update(item['tokenized_text'])

In [None]:
def build_tf_idf_vector(corpus, text_input):
    """Count how often each corpus term occurs in *text_input*.

    Despite the name, the values are raw occurrence counts; no inverse
    document frequency weighting is applied here.

    Args:
        corpus: Iterable of vocabulary terms; each becomes a key of the result.
        text_input: Text to tokenize with the module-level ``nlp`` pipeline.

    Returns:
        A dict mapping every term in *corpus* to the number of tokens in
        *text_input* whose ``.text`` equals that term (0 when absent).
        Tokens that are not in *corpus* are ignored.
    """
    # NOTE(review): matching is on the exact token.text; confirm the corpus
    # terms were produced with the same tokenization/normalization.
    counts = {term: 0 for term in corpus}
    for tok in nlp(text_input):
        word = tok.text
        if word not in counts:
            continue
        counts[word] = counts[word] + 1
    return counts

In [None]:
# Attach a term-count vector to every document, stored under the 'tf_idf' key
# and built from the document's raw 'text' over the shared corpus vocabulary.
for document in data:
    document['tf_idf'] = build_tf_idf_vector(corpus, document['text'])

In [None]:
def search(query, data):
    """Rank documents by cosine similarity to a free-text query.

    The query is vectorized with ``build_tf_idf_vector`` over the module-level
    ``corpus`` and compared against each document's precomputed
    ``item['tf_idf']`` mapping.

    Args:
        query: Text to search for.
        data: Iterable of dicts providing the keys 'title', 'text', 'url' and
            'tf_idf', where 'tf_idf' maps term -> weight.

    Returns:
        A new list of dicts with the keys 'title', 'text', 'url' and
        'similarity', sorted by descending similarity. The list is empty when
        *data* is empty.
    """
    query_tf_idf = build_tf_idf_vector(corpus, query)
    search_data = []
    for item in data:
        doc_tf_idf = item['tf_idf']
        # Pair the components by term rather than trusting that two separately
        # built dicts iterate in the same order (e.g. vectors reloaded from
        # JSON may be ordered differently from a freshly built corpus).
        doc_values = list(doc_tf_idf.values())
        query_values = [query_tf_idf.get(term, 0) for term in doc_tf_idf]
        similarity = cosine_similarity([doc_values], [query_values])
        search_data.append({
            'title': item['title'],
            'text': item['text'],
            'url': item['url'],
            # cosine_similarity returns a 1x1 matrix for one-vs-one input.
            'similarity': similarity[0][0],
        })
    # Most similar first; the sort is stable, so ties keep their input order.
    search_data.sort(key=lambda result: result['similarity'], reverse=True)
    return search_data

In [None]:
# Ensure the output directory ./outputs/milestone-2 exists. exist_ok=True
# replaces the separate exists() check, which could race with another process
# creating the directory between the check and makedirs().
os.makedirs('./outputs/milestone-2', exist_ok=True)
# Save the updated data (now including each document's 'tf_idf' vector) as
# JSON, with the encoding pinned so the output does not depend on the platform.
with open('./outputs/milestone-2/tf-idf-data.json', 'w', encoding='utf-8') as f:
    json.dump(data, f)