# Document Retrieval using TF-IDF Weighted Rank and TF-IDF Cosine Similarity

## Imports

In [39]:
pip install num2words



In [40]:
# Libraries
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from collections import Counter
from num2words import num2words

import nltk
nltk.download('punkt')
nltk.download('stopwords')
import os
import string
import numpy as np
import copy
import pandas as pd
import pickle
import re
import math
import plotly.graph_objects as go

# %load_ext autotime

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Importing data

In [41]:
# Import csv

df = pd.read_csv (r'collars2.csv')

In [42]:
# Display rows
df.tail(10)

Unnamed: 0,collar_info_name,collar_info_url,collar_info_rating_count
278,"Lu&Ba Dog Shock Collar with Remote,2 Dogs Trai...",https://www.amazon.com/Lu-Ba-Rechargeable-Wate...,1693
279,Downtown Pet Supply Deluxe Adjustable Thick Do...,https://www.amazon.com/Downtown-Pet-Supply-Adj...,1803
280,PetSpy 1100 Yard Waterproof Rechargeable Remot...,https://www.amazon.com/PetSpy-Waterproof-Recha...,1294
281,Dog Bark Collar-7 Adjustable Sensitivity and I...,https://www.amazon.com/Adjustable-Sensitivity-...,3430
282,Dr.Trainer B1Pro Dog Bark Collar APP & Watch C...,https://www.amazon.com/Dr-Trainer-Adjustable-V...,79
283,"Tcoale Dog Training Collar, Rechargeable Dog S...",https://www.amazon.com/Tcoale-Dog-Rechargeable...,41
284,PetSafe Big Dog Remote Training Collar for Med...,https://www.amazon.com/PetSafe-Remote-Training...,1404
285,Ruzixt 15 Pack 6V Pet Collar Batteries Compati...,https://www.amazon.com/Ruzixt-Compatible-Repla...,($11.99/Count)
286,"Small Dog Bark Collar, MASBRILL Dog Bark Colla...",https://www.amazon.com/MASBRILL-5-15lbs-Rechar...,2
287,"Dog Training Collar with 1000Ft Remote,Dog Sho...",https://www.amazon.com/Training-Collar-1000Ft-...,1


# Preprocessing

In [43]:
# Convert entries from object to string
dataset = df['collar_info_name'].astype(str)

In [45]:
# Length of the dataset
N = len (dataset)

The following functions are self explanatory, with the exception of stemming. Stemming has the goal of removing suffixes in order to obtain the root of a word.

In [46]:
def convert_lower_case(data):
    return np.char.lower(data)

In [47]:
def remove_stop_words(data):
    stop_words = set(stopwords.words('english'))
    words = word_tokenize(str(data))
    new_text = ""
    for w in words:
        if w not in stop_words and len(w) > 1:
            new_text = new_text + " " + w
    return new_text

In [48]:
def remove_punctuation(data):
    symbols = "!\"#$%&()*+-./:;<=>?@[\]^™®_`{|}~\n"
    for i in range(len(symbols)):
        data = np.char.replace(data, symbols[i], ' ')
        data = np.char.replace(data, "  ", " ")
    data = np.char.replace(data, ',', '')
    return data

In [49]:
def remove_apostrophe(data):
    return np.char.replace(data, "'", "")

In [50]:
def stemming(data):
    stemmer= PorterStemmer()
    
    tokens = word_tokenize(str(data))
    new_text = ""
    for w in tokens:
        new_text = new_text + " " + stemmer.stem(w)
    return new_text

In [51]:
def convert_numbers(data):
    tokens = word_tokenize(str(data))
    new_text = ""
    for w in tokens:
        try:
            w = num2words(int(w))
        except:
            a = 0
        new_text = new_text + " " + w
    new_text = np.char.replace(new_text, "-", " ")
    return new_text

In [52]:
# All previous function are present here.
def preprocess(data):
    data = convert_lower_case(data)
    data = remove_punctuation(data) #remove comma seperately
    data = remove_apostrophe(data)
    data = remove_stop_words(data)
    data = convert_numbers(data)
    data = stemming(data)
    data = remove_punctuation(data)
    data = convert_numbers(data)
    data = stemming(data) #needed again as we need to stem the words
    data = remove_punctuation(data) #needed again as num2word is giving few hypens and commas fourty-one
    data = remove_stop_words(data) #needed again as num2word is giving stop words 101 - one hundred and one
    return data

## Extracting Data

In [53]:
# Apply the preprocessing to the column
processed_text = []

for i in range(N):
  processed_text.append(word_tokenize(str(preprocess(dataset[i]))))

## Calculating DF for all words

In [55]:
# Word count for each word
DF = {}

for i in range(N):
    tokens = processed_text[i]
    for w in tokens:
        try:
            DF[w].add(i)
        except:
            DF[w] = {i}

for i in DF:
    DF[i] = len(DF[i])

In [57]:
# Total vocabulary size
total_vocab_size = len(DF)
print(total_vocab_size)

In [59]:
# Array with every word
total_vocab = [x for x in DF]

In [61]:
# Function to obtain quantity of a word
def doc_freq(word):
    c = 0
    try:
        c = DF[word]
    except:
        pass
    return c

### Calculating TF-IDF.

In [62]:
# Term frequency - inverse document frequency
# Statistic for importance of words within a corpus in a certain section
doc = 0

tf_idf = {}

for i in range(N):
    
    tokens = processed_text[i]
    
    counter = Counter(tokens) #+ processed_title[i])
    words_count = len(tokens) #+ processed_title[i])
    
    for token in np.unique(tokens):
        
        tf = counter[token]/words_count
        df = doc_freq(token)
        idf = np.log((N+1)/(df+1))
        
        tf_idf[doc, token] = tf*idf

    doc += 1

In [63]:
# Values of the TF-IDF
tf_idf

{(0, 'activ'): 0.14097624230231345,
 (0, 'app'): 0.12618431500983035,
 (0, 'collar'): 0.0022570722929936803,
 (0, 'contain'): 0.17612123722201714,
 (0, 'dog'): 0.005334118328055107,
 (0, 'free'): 0.1576910050352824,
 (0, 'freedom'): 0.22605815943420396,
 (0, 'geofenc'): 0.22605815943420396,
 (0, 'gp'): 0.10450594809663447,
 (0, 'health'): 0.16304477938329984,
 (0, 'pet'): 0.08255814029101698,
 (0, 'phone'): 0.2076279272474692,
 (0, 'shock'): 0.032621218078830186,
 (0, 'smart'): 0.0729992580711824,
 (0, 'system'): 0.13153808935784778,
 (0, 'track'): 0.2523686300196607,
 (0, 'virtual'): 0.22605815943420396,
 (0, 'wagz'): 0.4521163188684079,
 (0, 'well'): 0.22605815943420396,
 (0, 'wireless'): 0.16911438813895996,
 (1, 'anti'): 0.06125653462610731,
 (1, 'app'): 0.09914481893629526,
 (1, 'b1spro'): 0.15286186882116218,
 (1, 'bark'): 0.12381076945666951,
 (1, 'collar'): 0.005320241833485103,
 (1, 'control'): 0.06642014994078972,
 (1, 'correct'): 0.1081203342320419,
 (1, 'custom'): 0.1281066

In [64]:
# Count of TF_IDF terms within a threshold
value_count = {}
for key, value in tf_idf.items():
  if value > 0.15:
    if key[1] not in value_count:
      value_count[key[1]] = 1
    else:
      value_count[key[1]] += 1

{'contain': 5, 'free': 7, 'freedom': 1, 'geofenc': 1, 'health': 6, 'phone': 2, 'track': 7, 'virtual': 1, 'wagz': 1, 'well': 1, 'wireless': 6, 'b1spro': 3, 'ip68': 3, 'report': 3, 'fit': 4, 'four': 4, 'gp': 9, 'model': 4, 'newest': 3, 'sleek': 1, 'switch': 2, 'tracker': 10, 'bluetooth': 3, 'handheld': 2, 'smartphon': 5, 'solut': 2, 'use': 6, 'petsaf': 16, 'play': 1, 'stay': 1, 'bark': 25, 'custom': 2, 'record': 4, 'set': 4, 'watch': 1, 'laptom': 1, 'pro': 2, 'v1': 1, 'app': 3, 'b1': 4, 'b1pro': 4, 'stop': 3, 'chip': 3, 'effect': 4, 'fast': 2, 'highli': 1, 'minut': 1, 'np': 1, '4g': 2, 'android': 1, 'coverag': 2, 'fitbark': 1, 'iphon': 1, 'lte': 3, 'nationwid': 2, 'seventeen': 1, 'us': 3, 'verizon': 1, 'detect': 4, 'devic': 6, 'harmless': 3, 'pain': 5, 'smartest': 1, 'strong': 2, 'breed': 1, 'pack': 6, 'upgrad': 5, 'augley': 1, 'ip67': 4, 'progress': 1, '425x': 1, 'brand': 10, 'five': 4, 'sportdog': 8, 'tone': 4, 'yard': 5, 'alert': 1, 'built': 1, 'escap': 1, 'explor': 2, 'go': 3, 'grey'

In [66]:
# Print information about the previous threshold
print(value_count)
print(len(value_count))
print("cat" in value_count)

724
True


Analyzing DF

In [67]:
# Mean appearance of each individual word
# loop to sum all values 
res = 0
for val in DF.values():
    res += val
  
# using len() to get total keys for mean computation
res = res / len(DF)
  
# printing result 
print("The computed mean : " + str(res)) 

The computed mean : 6.4013333333333335


In [68]:
# Storing words with count above the average
popular_words = []
for key, value in DF.items():
  if value > 6:
    popular_words.append([0,key,value/288*100])

In [69]:
# Sort by frequency
sorted_by_second = sorted(popular_words, key=lambda tup: tup[2], reverse = True)
i = 1
for j in sorted_by_second:
  j[0] = i
  i += 1
print(sorted_by_second[:30])

[[1, 'collar', 95.13888888888889], [2, 'dog', 88.88888888888889], [3, 'train', 54.861111111111114], [4, 'shock', 48.61111111111111], [5, 'vibrat', 48.61111111111111], [6, 'waterproof', 47.22222222222222], [7, 'recharg', 45.83333333333333], [8, 'remot', 43.40277777777778], [9, 'larg', 41.66666666666667], [10, 'medium', 38.54166666666667], [11, 'small', 36.80555555555556], [12, 'bark', 31.25], [13, 'beep', 30.555555555555557], [14, 'mode', 26.38888888888889], [15, 'smart', 19.791666666666664], [16, 'hundr', 19.09722222222222], [17, 'adjust', 18.75], [18, 'anti', 17.708333333333336], [19, 'one', 17.708333333333336], [20, 'rang', 17.36111111111111], [21, 'pet', 15.972222222222221], [22, 'control', 15.277777777777779], [23, 'level', 14.23611111111111], [24, 'trainer', 12.847222222222221], [25, 'static', 12.5], [26, 'mile', 11.11111111111111], [27, 'electr', 10.069444444444445], [28, 'gp', 9.722222222222223], [29, 'petsaf', 9.722222222222223], [30, 'thousand', 9.027777777777777]]


In [70]:
# Arrays for the table
s = [sorted_by_second[i][0] for i in range(len(sorted_by_second))]
d = [sorted_by_second[i][1] for i in range(len(sorted_by_second))]
t = [sorted_by_second[i][2] for i in range(len(sorted_by_second))]
print(s)

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115]


In [71]:
# Table
fig = go.Figure(data=[go.Table(header=dict(values=['Position', 'Word', 'Frequency (%)']),
                 cells=dict(values=[s, d, t]))
                     ])
fig.show()