# Document Retrieval using TF-IDF Weighted Rank and TF-IDF Cosine Similarity

## Imports

In [1]:
pip install num2words

Collecting num2words
  Downloading num2words-0.5.10-py3-none-any.whl (101 kB)
[?25l[K     |███▎                            | 10 kB 17.3 MB/s eta 0:00:01[K     |██████▌                         | 20 kB 1.7 MB/s eta 0:00:01[K     |█████████▊                      | 30 kB 2.4 MB/s eta 0:00:01[K     |█████████████                   | 40 kB 3.1 MB/s eta 0:00:01[K     |████████████████▏               | 51 kB 3.8 MB/s eta 0:00:01[K     |███████████████████▍            | 61 kB 4.5 MB/s eta 0:00:01[K     |██████████████████████▋         | 71 kB 5.1 MB/s eta 0:00:01[K     |█████████████████████████▉      | 81 kB 5.7 MB/s eta 0:00:01[K     |█████████████████████████████   | 92 kB 4.0 MB/s eta 0:00:01[K     |████████████████████████████████| 101 kB 3.0 MB/s 
Installing collected packages: num2words
Successfully installed num2words-0.5.10


In [2]:
# Libraries
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from collections import Counter
from num2words import num2words

import nltk
nltk.download('punkt')
nltk.download('stopwords')
import os
import string
import numpy as np
import copy
import pandas as pd
import pickle
import re
import math
import plotly.graph_objects as go

# %load_ext autotime

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


# Importing data

In [3]:
# Import csv

df = pd.read_csv (r'owner_problems.csv')

In [4]:
# Display rows
df.tail(10)

Unnamed: 0,problems_name
553,These owners (no matter the flavour) create an...
554,These owners (no matter the flavour) create an...
555,Dog-dog incidents: The lack of proper socializ...
556,Dog-dog incidents: The lack of proper socializ...
557,Dog-Human incidents: The lack of proper social...
558,Dog-Human incidents: The lack of proper social...
559,"I am confronted with stressed out, non-social,..."
560,"I am confronted with stressed out, non-social,..."
561,That would help me and my dog most. And theirs...
562,That would help me and my dog most. And theirs...


# Preprocessing

In [5]:
# Convert entries from object to string
dataset = df['problems_name'].astype(str)

In [6]:
# Length of the dataset
N = len (dataset)

The following functions are self explanatory, with the exception of stemming. Stemming has the goal of removing suffixes in order to obtain the root of a word.

In [7]:
def convert_lower_case(data):
    return np.char.lower(data)

In [8]:
def remove_stop_words(data):
    stop_words = set(stopwords.words('english'))
    words = word_tokenize(str(data))
    new_text = ""
    for w in words:
        if w not in stop_words and len(w) > 1:
            new_text = new_text + " " + w
    return new_text

In [9]:
def remove_punctuation(data):
    symbols = "!\"#$%&()*+-./:;<=>?@[\]^™®_`{|}~\n"
    for i in range(len(symbols)):
        data = np.char.replace(data, symbols[i], ' ')
        data = np.char.replace(data, "  ", " ")
    data = np.char.replace(data, ',', '')
    return data

In [10]:
def remove_apostrophe(data):
    return np.char.replace(data, "'", "")

In [11]:
def stemming(data):
    stemmer= PorterStemmer()
    
    tokens = word_tokenize(str(data))
    new_text = ""
    for w in tokens:
        new_text = new_text + " " + stemmer.stem(w)
    return new_text

In [12]:
def convert_numbers(data):
    tokens = word_tokenize(str(data))
    new_text = ""
    for w in tokens:
        try:
            w = num2words(int(w))
        except:
            a = 0
        new_text = new_text + " " + w
    new_text = np.char.replace(new_text, "-", " ")
    return new_text

In [13]:
# All previous function are present here.
def preprocess(data):
    data = convert_lower_case(data)
    data = remove_punctuation(data) #remove comma seperately
    data = remove_apostrophe(data)
    data = remove_stop_words(data)
    data = convert_numbers(data)
    data = stemming(data)
    data = remove_punctuation(data)
    data = convert_numbers(data)
    data = stemming(data) #needed again as we need to stem the words
    data = remove_punctuation(data) #needed again as num2word is giving few hypens and commas fourty-one
    data = remove_stop_words(data) #needed again as num2word is giving stop words 101 - one hundred and one
    return data

## Extracting Data

In [14]:
# Apply the preprocessing to the column
processed_text = []

for i in range(N):
  processed_text.append(word_tokenize(str(preprocess(dataset[i]))))

## Calculating DF for all words

In [15]:
# Word count for each word
DF = {}

for i in range(N):
    tokens = processed_text[i]
    for w in tokens:
        try:
            DF[w].add(i)
        except:
            DF[w] = {i}

for i in DF:
    DF[i] = len(DF[i])

In [16]:
# Total vocabulary size
total_vocab_size = len(DF)
print(total_vocab_size)

812


In [17]:
# Array with every word
total_vocab = [x for x in DF]

In [18]:
# Function to obtain quantity of a word
def doc_freq(word):
    c = 0
    try:
        c = DF[word]
    except:
        pass
    return c

### Calculating TF-IDF.

In [19]:
# Term frequency - inverse document frequency
# Statistic for importance of words within a corpus in a certain section
doc = 0

tf_idf = {}

for i in range(N):
    
    tokens = processed_text[i]
    
    counter = Counter(tokens) #+ processed_title[i])
    words_count = len(tokens) #+ processed_title[i])
    
    for token in np.unique(tokens):
        
        tf = counter[token]/words_count
        df = doc_freq(token)
        idf = np.log((N+1)/(df+1))
        
        tf_idf[doc, token] = tf*idf

    doc += 1

In [20]:
# Values of the TF-IDF
tf_idf

{(0, 'abil'): 0.06372408486004828,
 (0, 'afford'): 0.06372408486004828,
 (0, 'anim'): 0.05116824864675146,
 (0, 'ask'): 0.04478249367611536,
 (0, 'bill'): 0.04842575391175685,
 (0, 'care'): 0.0978809203459165,
 (0, 'continu'): 0.023419319122025592,
 (0, 'current'): 0.06372408486004828,
 (0, 'didnt'): 0.06372408486004828,
 (0, 'die'): 0.05845888400571756,
 (0, 'dont'): 0.05757451686110462,
 (0, 'euthanasia'): 0.06372408486004828,
 (0, 'ever'): 0.053050995059940986,
 (0, 'fault…'): 0.06372408486004828,
 (0, 'find'): 0.05516482819946395,
 (0, 'front'): 0.0614218194699703,
 (0, 'go'): 0.04437679544490511,
 (0, 'got'): 0.04894046017295825,
 (0, 'grow'): 0.06372408486004828,
 (0, 'idiot'): 0.05757451686110462,
 (0, 'inconveni'): 0.06372408486004828,
 (0, 'kill'): 0.05939107747589473,
 (0, 'limit'): 0.06372408486004828,
 (0, 'line'): 0.10745442850447787,
 (0, 'logic'): 0.06372408486004828,
 (0, 'medic'): 0.05939107747589473,
 (0, 'mine'): 0.0614218194699703,
 (0, 'move'): 0.05757451686110462,

In [21]:
# Count of TF_IDF terms within a threshold
value_count = {}
for key, value in tf_idf.items():
  if value > 0.15:
    if key[1] not in value_count:
      value_count[key[1]] = 1
    else:
      value_count[key[1]] += 1

In [22]:
# Print information about the previous threshold
print(value_count)
print(len(value_count))
print("cat" in value_count)

{'nan': 62, 'abil': 4, 'afford': 4, 'didnt': 4, 'die': 6, 'ever': 6, 'limit': 4, 'medic': 4, 'mine': 4, 'pack': 4, 'plan': 4, 'properli': 6, 'size': 4, 'ta': 4, 'though': 4, 'line': 4, 'often': 4, 'take': 4, 'current': 2, 'grow': 2, 'idiot': 4, 'neighbor': 2, 'owner': 18, 'pet': 4, 'pup': 2, 'continu': 70, 'read': 71, 'dog': 52, 'approach': 13, 'bite': 4, 'without': 4, 'warn': 2, 'differ': 2, 'made': 2, 'one': 45, 'save': 2, 'kid': 21, 'believ': 4, 'everyon': 8, 'frequent': 8, 'may': 6, 'problem': 22, 'reason': 6, 'relev': 4, 'still': 8, 'age': 4, 'allow': 4, 'daili': 8, 'list': 4, 'local': 4, 'park': 6, 'rule': 8, 'sign': 4, 'sixteen': 4, 'user': 4, 'ye': 4, 'actual': 4, 'anyon': 10, 'follow': 4, 'ive': 4, 'never': 10, 'seen': 4, 'enter': 4, 'flaw': 4, 'forc': 6, 'gener': 4, 'great': 4, 'pretti': 4, 'social': 8, 'thing': 6, 'way': 5, 'accept': 2, 'bring': 2, 'issu': 2, 'knock': 2, 'parent': 6, 'play': 2, 'soon': 2, 'steal': 2, 'toddler': 2, 'total': 2, 'toy': 4, 'upset': 2, 'anywh': 2

Analyzing DF

In [23]:
# Mean appearance of each individual word
# loop to sum all values 
res = 0
for val in DF.values():
    res += val
  
# using len() to get total keys for mean computation
res = res / len(DF)
  
# printing result 
print("The computed mean : " + str(res)) 

The computed mean : 12.316502463054187


In [24]:
# Storing words with count above the average
popular_words = []
for key, value in DF.items():
  if value > 12:
    popular_words.append([0,key,value/288*100])

In [25]:
# Sort by frequency
sorted_by_second = sorted(popular_words, key=lambda tup: tup[2], reverse = True)
i = 1
for j in sorted_by_second:
  j[0] = i
  i += 1
print(sorted_by_second[:30])

[[1, 'dog', 73.26388888888889], [2, 'read', 54.166666666666664], [3, 'continu', 50.0], [4, 'one', 41.66666666666667], [5, 'problem', 31.944444444444443], [6, 'owner', 27.430555555555557], [7, 'peopl', 25.0], [8, 'nan', 21.52777777777778], [9, 'get', 21.52777777777778], [10, 'never', 20.833333333333336], [11, 'know', 20.48611111111111], [12, 'like', 19.09722222222222], [13, 'mani', 19.09722222222222], [14, 'vet', 18.40277777777778], [15, 'time', 17.36111111111111], [16, 'us', 15.625], [17, 'human', 15.625], [18, 'friend', 15.625], [19, 'come', 15.277777777777779], [20, 'good', 15.277777777777779], [21, 'away', 14.930555555555555], [22, 'huge', 14.930555555555555], [23, 'go', 14.583333333333334], [24, 'may', 14.583333333333334], [25, 'biggest', 14.583333333333334], [26, 'ask', 14.23611111111111], [27, 'face', 14.23611111111111], [28, 'came', 14.23611111111111], [29, 'puppi', 13.88888888888889], [30, 'take', 13.541666666666666]]


In [26]:
# Arrays for the table
s = [sorted_by_second[i][0] for i in range(len(sorted_by_second))]
d = [sorted_by_second[i][1] for i in range(len(sorted_by_second))]
t = [sorted_by_second[i][2] for i in range(len(sorted_by_second))]
print(s)

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 22

In [27]:
# Table
fig = go.Figure(data=[go.Table(header=dict(values=['Position', 'Word', 'Frequency (%)']),
                 cells=dict(values=[s, d, t]))
                     ])
fig.show()