# **Roll No : 19BCE102**
# **Data Mining - 2CSDE71**

# Practical-9 Clustering using TF-IDF & Kmeans

In [None]:
import numpy as np
import pandas as pd
import math

### Data Generation

In [None]:
# Toy corpus: one sentence per list element
corpus = [
    "Simple example with Cats and Mouse",
    "Another simple example with dogs and cats",
    "Another simple example with horse and chana",
]

### Tf-Idf 

In [None]:
#tokenization and vocabulary generation
# Each sentence is lower-cased and split on whitespace. Entries that are
# already token lists are kept as they are, so re-running this cell does not
# fail with "'list' object has no attribute 'lower'".
corpus = [
  sent.lower().split() if isinstance(sent, str) else sent
  for sent in corpus
]

# Sort the unique tokens: plain set iteration order for strings changes between
# kernel sessions, which would shuffle the TF-IDF columns from run to run.
vocab = sorted({token for sent in corpus for token in sent})
print("Vocabulary generated - :")
vocab

Vocabulary generated - :


['mouse',
 'and',
 'simple',
 'cats',
 'chana',
 'another',
 'with',
 'example',
 'horse',
 'dogs']

In [None]:
# Count, for every sentence, how often each vocabulary word occurs in it
word_dict = [
  {term: tokens.count(term) for term in vocab}
  for tokens in corpus
]

In [None]:
# Show the per-sentence word counts (one dict per sentence, keyed by vocabulary word)
print(word_dict)

[{'mouse': 1, 'and': 1, 'simple': 1, 'cats': 1, 'chana': 0, 'another': 0, 'with': 1, 'example': 1, 'horse': 0, 'dogs': 0}, {'mouse': 0, 'and': 1, 'simple': 1, 'cats': 1, 'chana': 0, 'another': 1, 'with': 1, 'example': 1, 'horse': 0, 'dogs': 1}, {'mouse': 0, 'and': 1, 'simple': 1, 'cats': 0, 'chana': 1, 'another': 1, 'with': 1, 'example': 1, 'horse': 1, 'dogs': 0}]


In [None]:
def compute_tf(word_dict, l):
    """Return the term frequency of every word for one document.

    Parameters
    ----------
    word_dict : dict
        Maps each word to its raw count in the document.
    l : sequence
        The document's tokens; only its length is used, as the normaliser.

    Returns
    -------
    dict
        Maps each word to ``count / len(l)``. For an empty document every
        frequency is 0.0 (instead of raising ``ZeroDivisionError``).
    """
    total_terms = len(l)
    if total_terms == 0:
        # No tokens at all: nothing can have a non-zero frequency.
        return {word: 0.0 for word in word_dict}
    return {word: count / total_terms for word, count in word_dict.items()}

In [None]:
def compute_idf(strings_list):
    """Return the inverse document frequency of every word.

    Parameters
    ----------
    strings_list : list of dict
        One dict per document, mapping each word to its count in that
        document.

    Returns
    -------
    dict
        Maps each word to ``log(n / df)``, where ``n`` is the number of
        documents and ``df`` the number of documents whose count for the word
        is greater than zero. A word that occurs in no document gets 0.0.
        An empty ``strings_list`` returns an empty dict.
    """
    n = len(strings_list)
    if n == 0:
        return {}

    # Document frequency per word. Using .get() also covers words that are
    # missing from the first document's dict.
    doc_freq = {}
    for counts in strings_list:
        for word, count in counts.items():
            doc_freq[word] = doc_freq.get(word, 0) + (1 if count > 0 else 0)

    # df == 0 would divide by zero; such a word has tf == 0 everywhere, so its
    # weight is irrelevant and 0.0 is used.
    return {
        word: (math.log(n / float(df)) if df else 0.0)
        for word, df in doc_freq.items()
    }

In [None]:
def compute_tf_idf(tf, idf):
    """Combine term frequencies with idf weights.

    Every word in ``tf`` is mapped to ``tf[word] * idf[word]``; the result
    keeps the key order of ``tf``. A word missing from ``idf`` raises
    ``KeyError``.
    """
    return {word: freq * idf[word] for word, freq in tf.items()}

In [None]:
# Term frequencies per sentence, normalised by that sentence's token count
tf_list = [
  compute_tf(counts, corpus[doc_idx])
  for doc_idx, counts in enumerate(word_dict)
]

In [None]:
# Inverse document frequency of every vocabulary word across all sentences
idf = compute_idf(word_dict)

In [None]:
# Display the word -> idf mapping
idf

{'mouse': 1.0986122886681098,
 'and': 0.0,
 'simple': 0.0,
 'cats': 0.4054651081081644,
 'chana': 1.0986122886681098,
 'another': 0.4054651081081644,
 'with': 0.0,
 'example': 0.0,
 'horse': 1.0986122886681098,
 'dogs': 1.0986122886681098}

In [None]:
# Weight each sentence's term frequencies by the corpus-wide idf
tf_idf_list = [compute_tf_idf(doc_tf, idf) for doc_tf in tf_list]

In [None]:
# TF-IDF weights of the first sentence (values only, in the dict's key order)
tf_idf_list[0].values()

dict_values([0.1831020481113516, 0.0, 0.0, 0.06757751801802739, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0])

In [None]:
# One row of TF-IDF weights per sentence, taken in each dict's key order
tf_idf_vectors = [
  list(tf_idf_list[doc_idx].values())
  for doc_idx in range(len(tf_list))
]

In [None]:
# Convert the list of rows into a NumPy array (kmeans below indexes it as x[idx, :])
tf_idf_vectors = np.array(tf_idf_vectors)

In [None]:
# Display the TF-IDF matrix (one row per sentence)
tf_idf_vectors

array([[0.18310205, 0.        , 0.        , 0.06757752, 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.05792359, 0.        ,
        0.05792359, 0.        , 0.        , 0.        , 0.15694461],
       [0.        , 0.        , 0.        , 0.        , 0.15694461,
        0.05792359, 0.        , 0.        , 0.15694461, 0.        ]])

### K-Means

In [None]:
from scipy.spatial.distance import cdist 
def kmeans(x,k, no_of_iterations, seed=None):
    """Cluster the rows of ``x`` into ``k`` groups with Lloyd's k-means.

    Parameters
    ----------
    x : array-like, shape (n_samples, n_features)
        Data points, one per row.
    k : int
        Number of clusters; must not exceed the number of rows
        (``choice(..., replace=False)`` raises ``ValueError`` otherwise).
    no_of_iterations : int
        Maximum number of centroid-update / re-assignment rounds.
    seed : int or None, optional
        Seed for the random choice of initial centroids. ``None`` (default)
        draws from NumPy's global random state, as before.

    Returns
    -------
    numpy.ndarray
        Cluster index (``0 .. k-1``) for every row of ``x``.
    """
    x = np.asarray(x, dtype=float)
    rng = np.random if seed is None else np.random.RandomState(seed)

    # Step 1: pick k distinct data points as the initial centroids
    centroids = x[rng.choice(len(x), k, replace=False), :]

    # Steps 2-3: distance of every point to every centroid, then assign each
    # point to its nearest centroid
    points = cdist(x, centroids, 'euclidean').argmin(axis=1)

    # Step 4: alternate centroid update and re-assignment
    for _ in range(no_of_iterations):
        updated = centroids.copy()
        for cluster in range(k):
            members = x[points == cluster]
            # An empty cluster has no mean (it would become NaN and corrupt
            # every later distance), so its previous centroid is kept.
            if len(members) > 0:
                updated[cluster] = members.mean(axis=0)
        centroids = updated

        new_points = cdist(x, centroids, 'euclidean').argmin(axis=1)
        if np.array_equal(new_points, points):
            # Assignments are stable: further rounds would change nothing.
            break
        points = new_points

    return points

In [None]:
# Cluster the TF-IDF vectors with k=2 and no_of_iterations=5
kmeans_labels = kmeans(tf_idf_vectors,2, 5)

In [None]:
# Heading and label array on separate lines
print("The labels generated by kmeans algorithm :", kmeans_labels, sep="\n")

The labels generated by kmeans algorithm :
[0 0 1]


In [None]:
# List the sentences assigned to each of the two clusters
for cluster_id in range(2):
  print(f"sentence belong to class {cluster_id} are : ")
  for doc_idx in range(len(corpus)):
    if kmeans_labels[doc_idx] != cluster_id:
      continue
    print(" ".join(corpus[doc_idx]))
  # two blank lines after each cluster
  print("\n")

sentence belong to class 0 are : 
simple example with cats and mouse
another simple example with dogs and cats


sentence belong to class 1 are : 
another simple example with horse and chana




## **CONCLUSION**
Term Frequency-Inverse Document Frequency (TF-IDF) is a numerical statistic that indicates how important a word is to a document within a corpus. We can compute TF-IDF vectors and use K-means clustering to cluster documents and articles.