# TF-IDF

## Importing libraries

In [1]:
from nltk.corpus import stopwords
from collections import Counter

import numpy as np
import pandas as pd
import math

## Loading the data

In [2]:
def load_data(path):
    """Read the CSV file at ``path`` and return it as a pandas DataFrame."""
    return pd.read_csv(path)

In [3]:
# Absolute, machine-specific location of the processed TED Talks CSV.
PATH = 'C:\\Users\\maxel\\OneDrive\\Search_Engine\\Version_1\\processed_TED_Talks.csv'
df_info = load_data(PATH)
# N is the number of rows in the dataset; it is used below as the document
# count in the idf formulas and as the range of document ids.
N = df_info.shape[0]

In [4]:
def _split_token_column(column):
    """Return a 1-D object array holding one token array per row of ``column``.

    Each cell of ``df_info[column]`` is a string; its first character is
    dropped and the remainder is split on commas into an array of tokens.
    """
    texts = df_info[column]
    # Rows generally have different token counts. Calling np.array() directly
    # on such a ragged list raises ValueError on NumPy >= 1.24, so the outer
    # container is allocated explicitly with dtype=object and filled per row.
    rows = np.empty(len(texts), dtype=object)
    for position, text in enumerate(texts):
        rows[position] = np.array(text[1:].split(','))
    return rows


processed_exposition = _split_token_column('exposition')
processed_transcript = _split_token_column('transcript')

## Calculating DF for both the exposition and the transcript

In [5]:
# Document frequency: for every token, the number of documents whose
# exposition or transcript contains it.
DF = {}

for doc_id in range(N):
    # A document is counted once per token no matter how often, or in which
    # of the two fields, the token occurs -- hence a set of document ids.
    # The exposition is visited first so tokens enter DF in the same order
    # as before (total_vocab column order depends on it).
    for field_tokens in (processed_exposition[doc_id], processed_transcript[doc_id]):
        for token in field_tokens:
            DF.setdefault(token, set()).add(doc_id)

# Collapse each set of document ids into its size.
DF = {token: len(doc_ids) for token, doc_ids in DF.items()}

In [6]:
# Number of distinct tokens seen across all expositions and transcripts
# (one DF entry per token).
total_vocab_size = len(DF)
total_vocab_size

40737

In [7]:
# Ordered vocabulary: a token's position in this list is its column in the
# document and query vectors built further down.
total_vocab = list(DF)

In [8]:
def get_DF(word):
    """Return the document frequency of ``word``.

    Looks ``word`` up in the module-level ``DF`` mapping and returns 0 when
    it is not a key, so callers can apply the smoothed idf formula to
    unseen tokens without special-casing them.
    """
    # dict.get only covers the "missing key" case; unrelated errors (for
    # example DF not being defined yet) now surface instead of becoming 0.
    return DF.get(word, 0)

## Calculating TF-IDF for transcript

In [9]:
# Transcript weights, keyed by (document id, token).
tf_idf = {}

for doc_id in range(N):
    transcript_tokens = processed_transcript[doc_id]
    exposition_tokens = processed_exposition[doc_id]

    # Term frequency is measured over the whole document (transcript plus
    # exposition), but only tokens occurring in the transcript get an entry.
    term_counts = Counter(transcript_tokens)
    term_counts.update(exposition_tokens)
    doc_length = len(transcript_tokens) + len(exposition_tokens)

    for term in np.unique(transcript_tokens):
        # Smoothed idf with a +1 offset.
        # NOTE(review): the exposition pass and gen_vector compute idf
        # without the leading "1 +" -- confirm the difference is intended.
        smoothed_idf = 1 + np.log((N + 1) / (get_DF(term) + 1))
        tf_idf[doc_id, term] = (term_counts[term] / doc_length) * smoothed_idf

## Calculate the TF-IDF for the exposition

In [10]:
# Exposition weights, keyed by (document id, token).
tf_idf_expo = {}

for doc_id in range(N):
    exposition_tokens = processed_exposition[doc_id]
    transcript_tokens = processed_transcript[doc_id]

    # Term frequency is measured over the whole document (exposition plus
    # transcript), but only tokens occurring in the exposition get an entry.
    term_counts = Counter(exposition_tokens)
    term_counts.update(transcript_tokens)
    doc_length = len(exposition_tokens) + len(transcript_tokens)

    for term in np.unique(exposition_tokens):
        # NOTE(review): unlike the transcript pass, this idf has no leading
        # "1 +" -- confirm the difference is intended.
        inverse_doc_freq = np.log((N + 1) / (get_DF(term) + 1))
        tf_idf_expo[doc_id, term] = (term_counts[term] / doc_length) * inverse_doc_freq

In [11]:
# Spot check: weight stored in tf_idf for document 1948 and the token "quantum".
tf_idf[(1948,"quantum")]

0.2378347668491331

In [12]:
# Spot check: weight stored in tf_idf_expo for the same (document, token) pair.
tf_idf_expo[(1948,"quantum")]

0.1876029275910187

In [13]:
# Number of (document id, token) entries in the transcript weight table.
len(tf_idf)

1086962

## Merging the TF-IDF according to the weights

In [14]:
# Share of the merged score taken from the transcript weight; the exposition
# weight contributes the remaining (1 - alpha).
alpha = 0.7

In [15]:
# Blend the two weight tables in place:
#   merged = alpha * transcript_weight + (1 - alpha) * exposition_weight
# Entries with no exposition weight keep only the scaled transcript part.
# NOTE: this rescales tf_idf in place, so running the cell twice applies
# alpha twice.
for key, transcript_weight in tf_idf.items():
    merged = alpha * transcript_weight
    # Explicit membership test instead of a bare except, so only the
    # "token absent from the exposition" case is skipped.
    if key in tf_idf_expo:
        merged += (1 - alpha) * tf_idf_expo[key]
    # Only values are replaced (no keys added or removed), which is safe
    # while iterating over the dict.
    tf_idf[key] = merged

# Matching score similarity

In [16]:
from preprocess import preprocess_query

In [31]:
def matching_score(k, query):
    """Print the ``k`` documents with the highest matching score for ``query``.

    A document's matching score is the sum of its ``tf_idf`` weights over
    every (document id, token) entry whose token appears in the
    pre-processed query. Each result is printed as the document id followed
    by its ``headline`` from ``df_info``.

    Args:
        k: Maximum number of results to print.
        query: Raw query string; it is passed through ``preprocess_query``.

    Returns:
        None. Results are written to standard output.
    """
    tokens = preprocess_query(query)

    print("Matching Score")
    print("Search Query:", query, '\n')

    # Accumulate per-document scores; dict.get supplies the starting value
    # for a document's first hit instead of relying on a bare except.
    doc_scores = {}
    for (doc_id, token), weight in tf_idf.items():
        if token in tokens:
            doc_scores[doc_id] = doc_scores.get(doc_id, 0) + weight

    # Highest score first; sorted() is stable, so ties keep insertion order.
    ranked = sorted(doc_scores.items(), key=lambda item: item[1], reverse=True)

    for doc_id, _score in ranked[:k]:
        print(doc_id, df_info['headline'][doc_id])

In [32]:
# Example: print the ten best matching-score results for a sample query.
matching_score(10, "Quantum biology")

Matching Score
Search Query: Quantum biology 

1948 How quantum biology might explain life’s biggest questions
1198 The levitating superconductor
912 Making sense of a visible quantum object
970 Making matter come alive
706 The bio-future of joint replacement
1347 Print your own medicine
414 Stunning data visualization in the AlloSphere
1159 What's left to explore?
1528 Why our universe might exist on a knife-edge
2155 Clues to prehistoric times, found in blind cavefish


# TF-IDF Cosine Similarity Ranking

In [33]:
def cosine_sim(a, b):
    """Return the cosine similarity of vectors ``a`` and ``b``.

    Computed as ``dot(a, b) / (norm(a) * norm(b))``.

    If either vector has zero norm the similarity is undefined; 0.0 is
    returned in that case instead of NaN. NaN values are placed last by
    ``argsort``, so an undefined similarity would otherwise appear at the
    top of a descending ranking.
    """
    denominator = np.linalg.norm(a) * np.linalg.norm(b)
    if denominator == 0:
        return 0.0
    return np.dot(a, b) / denominator

In [34]:
# Dense document-term matrix: one row per document id, one column per
# position in total_vocab.
D = np.zeros((N, total_vocab_size))

# Resolve each token's column once. Calling total_vocab.index() for every
# tf_idf entry rescans the whole vocabulary list on each lookup.
# setdefault keeps the first position of a token, matching list.index().
vocab_index = {}
for column, term in enumerate(total_vocab):
    vocab_index.setdefault(term, column)

for (doc_id, term), weight in tf_idf.items():
    column = vocab_index.get(term)
    # Tokens missing from the vocabulary are skipped, as before.
    if column is not None:
        D[doc_id, column] = weight

In [42]:
def gen_vector(tokens):
    """Build the TF-IDF vector of a tokenised query.

    The vector has one slot per entry of ``total_vocab``. For every distinct
    query token found in the vocabulary, the slot at the token's vocabulary
    position is set to ``tf * idf``, where ``tf`` is the token's share of
    the query and ``idf = log((N + 1) / (df + 1))``. Tokens not in the
    vocabulary are ignored.

    NOTE(review): the transcript weights are built with ``1 + log(...)``
    while this query idf has no ``1 +`` -- confirm the difference is
    intended.

    Args:
        tokens: Sequence of pre-processed query tokens.

    Returns:
        A 1-D NumPy array of length ``len(total_vocab)``.
    """
    Q = np.zeros((len(total_vocab)))

    counter = Counter(tokens)
    words_count = len(tokens)

    for token in np.unique(tokens):
        try:
            ind = total_vocab.index(token)
        except ValueError:
            # Token not in the vocabulary: it has no slot in the vector.
            continue

        tf = counter[token] / words_count
        idf = math.log((N + 1) / (get_DF(token) + 1))
        Q[ind] = tf * idf

    return Q

In [50]:
def cosine_similarity(k, query):
    """Print the ``k`` documents most similar to ``query`` by cosine similarity.

    The query is pre-processed, turned into a vector with ``gen_vector`` and
    compared against every row of the document matrix ``D`` using
    ``cosine_sim``. Each result is printed as the document index followed by
    its ``headline`` from ``df_info``, best match first.

    Args:
        k: Maximum number of results to print.
        query: Raw query string; it is passed through ``preprocess_query``.

    Returns:
        None. Results are written to standard output.
    """
    print("Cosine Similarity")
    tokens = preprocess_query(query)

    print("Search Query:", query, '\n')

    query_vector = gen_vector(tokens)
    scores = np.array(
        [cosine_sim(query_vector, doc_vector) for doc_vector in D],
        dtype=float,
    )

    # An undefined similarity (NaN, from a zero-norm vector) is sorted to the
    # end by argsort and would therefore lead the descending ranking; treat
    # it as "no similarity" instead.
    scores[np.isnan(scores)] = 0.0

    # Reverse first, then slice: a "[-k:]" slice would select every document
    # when k is 0.
    best_first = scores.argsort()[::-1][:k]

    for doc_index in best_first:
        print(doc_index, df_info['headline'][doc_index])

In [51]:
# Example query. cosine_similarity only prints its results and has no return
# statement, so Q is bound to None here.
Q = cosine_similarity(10, "Groundwater")

Cosine Similarity
Search Query: Groundwater 

2035 The mysterious world of underwater caves
2031 4 ways we can avoid a catastrophic drought
1734 An engineer's vision for tiny forests, everywhere
173 Why aren't we more compassionate?
2155 Clues to prehistoric times, found in blind cavefish
940 Let's take back the Internet!
2020 What happens when a city runs out of room for its dead
2178 The taboo secret to better health
554 The ancient ingenuity of water harvesting
2105 Hunting for dinosaurs showed me our place in the universe


In [28]:
# Save the merged tf_idf dict to a .npy file. It is read back further down
# with np.load(..., allow_pickle=True).item() to recover the dict.
np.save('tf_idf_dict.npy', tf_idf)

In [34]:
import os
import pickle


def save_obj(obj, name ):
    """Pickle ``obj`` to ``obj/<name>.pkl``.

    The ``obj`` directory is created when it does not exist yet, so the
    first save in a fresh working directory does not fail.
    """
    os.makedirs('obj', exist_ok=True)
    with open('obj/'+ name + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)


def load_obj(name ):
    """Load and return the object pickled at ``obj/<name>.pkl``.

    Raises:
        FileNotFoundError: If no such file exists.
    """
    # NOTE: unpickling can execute arbitrary code. Only load files written
    # by save_obj above, never files from an untrusted source.
    with open('obj/' + name + '.pkl', 'rb') as f:
        return pickle.load(f)

In [35]:
# Write the merged tf_idf dict to obj/tf_idf_pickle.pkl.
save_obj(tf_idf, 'tf_idf_pickle')

In [37]:
import time

# Compare how long the two storage formats take to load.
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# intervals; time.time() can jump when the system clock is adjusted.

# NOTE: allow_pickle=True unpickles the file contents, which can execute
# arbitrary code -- only load files written by this notebook.
start = time.perf_counter()
read_tf_idf = np.load('tf_idf_dict.npy', allow_pickle = True).item()
print('Numpy loading took: %s' % (time.perf_counter() - start))

start = time.perf_counter()
load_obj('tf_idf_pickle')
print('Pickle loading took: %s' % (time.perf_counter() - start))

Numpy loading took: 7.867189407348633
Pickle loading took: 7.454195499420166
