<div style="direction:rtl;text-align:center"><img src="https://mohammadkh.ir/github/logo.png" alt="Mohammadkh.ir" style="width: 250px;"/></div>
<h1><div style="direction:rtl;text-align:center">Clustering</div></h1>

In [2]:
import pandas as pd
import numpy as np
import random
import re
import umap
from collections import Counter
from nltk import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import silhouette_score
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture

# data - preproccessing

In [3]:
n = 216930 - 1 #number of records in file (excludes header)
s = 1000 #desired sample size
skip = sorted(random.sample(range(1,n+1),n-s)) #the 0-indexed header will not be included in the skip list

data = pd.read_csv('../__data/JEOPARDY_CSV.csv', skiprows=skip)

X = data[[' Question']]

data.head(2)

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer
0,2825,1996-12-06,Jeopardy!,TELEVISION,$300,This crime drama with Robert Wagner & Stefanie...,Hart to Hart
1,2735,1996-06-21,Double Jeopardy!,MEDICAL MILESTONES,$200,"In 1751, with Benjamin Franklin's help, the 1s...",Philadelphia


In [3]:
#stopworsd en
with open('stopwords.txt', encoding="utf8") as stopwords_file:
    stopwords = stopwords_file.readlines()
    
stopwords = [line.replace('\n', '') for line in stopwords]

In [4]:
# ex : re
s = 'this. is: a string32 2 333?'

s = re.sub('[^\w\s]', '', s) # del ,. ! ...
s = re.sub("\d+", '', s)     # del numbers
s

'this is a string  '

In [5]:
dataset = pd.DataFrame(columns=['title_body']) # create new dataset

for index, row in X.iterrows():
    title_body_tokenized = word_tokenize(row[' Question'])
    title_body_tokenized_filtered = [w.lower() for w in title_body_tokenized if not w.lower() in stopwords]
    s = re.sub('[^\w\s]', '', ' '.join(title_body_tokenized_filtered))
    s = re.sub("\d+", "", s)
    dataset.loc[index] = {'title_body': s}
    
dataset.head(2)

Unnamed: 0,title_body
0,corniche silver shadow
1,develop single ovum variety twins genetic makeup


In [6]:
vectorizer = TfidfVectorizer()
vectorizer.fit(dataset['title_body'])
X = vectorizer.transform(dataset['title_body'])
X

<1001x4026 sparse matrix of type '<class 'numpy.float64'>'
	with 6565 stored elements in Compressed Sparse Row format>

In [7]:
reducer = umap.UMAP(n_components=300)
X_reduced = reducer.fit_transform(X)

# MeanShift

In [8]:
kmeans = KMeans(n_clusters=4, max_iter=3000,random_state=1)
kmeans.fit(X) 
dataset['Cluster'] = kmeans.labels_

In [9]:
dataset['Cluster'].shape

(1001,)

In [10]:
silhouette_score(X, dataset['Cluster'])

0.006433026046825197

In [11]:
dataset[dataset['Cluster'] == 1][:3]

Unnamed: 0,title_body,Cluster
4,guyana s largest cities s york city,1
18,called declaration war germany safe democracy,1
56,natives city s ok okc,1


In [12]:
Counter(" ".join(dataset[dataset['Cluster'] == 1]["title_body"]).split()).most_common(20) # 20 top repeat word category

[('city', 37),
 ('s', 16),
 ('war', 12),
 ('called', 5),
 ('south', 4),
 ('largest', 3),
 ('york', 3),
 ('capital', 3),
 ('american', 3),
 ('alexander', 3),
 ('mayor', 2),
 ('january', 2),
 ('spanish', 2),
 ('george', 2),
 ('miles', 2),
 ('north', 2),
 ('port', 2),
 ('joseph', 2),
 ('national', 2),
 ('mark', 2)]

# GaussianMixture

In [13]:
gaussian = GaussianMixture(n_components=4, covariance_type="full",random_state=1)
dataset['Cluster'] = gaussian.fit_predict(X_reduced)

In [14]:
silhouette_score(X, dataset['Cluster'])

-0.0005529337739190555

In [15]:
dataset[dataset['Cluster'] == 1][:3]

Unnamed: 0,title_body,Cluster
0,corniche silver shadow,1
1,develop single ovum variety twins genetic makeup,1
5,hurt,1


In [16]:
Counter(" ".join(dataset[dataset['Cluster'] == 1]["title_body"]).split()).most_common(20) # 20 top repeat word category

[('s', 76),
 ('city', 17),
 ('href', 16),
 ('a', 16),
 ('target', 13),
 ('_blank', 13),
 ('named', 10),
 ('film', 9),
 ('country', 9),
 ('south', 8),
 ('wwwjarchivecommedia_j_jpg', 8),
 ('war', 8),
 ('th', 8),
 ('called', 7),
 ('title', 7),
 ('word', 6),
 ('king', 5),
 ('white', 5),
 ('classic', 5),
 ('german', 5)]

<div class="alert alert-block alert-info">
<div style="direction:rtl;text-align:left"><strong>Clustering</strong><br>MohammadReza <strong>Khajedaloi</strong><br><br>
</div>
<div style="direction:rtl;text-align:right">
<a href="http://mohammadkh.ir/">WebSite</a> - <a href="https://github.com/khajedaloi/">GitHub</a> - <a href="https://www.linkedin.com/in/mohammad-kh/">Linkedin</a>
</div>
</div>