In [21]:
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
import umap
import numpy
from sklearn.cluster import MeanShift, KMeans
from sklearn.metrics import silhouette_score

In [2]:
# Load the raw Jeopardy question dump into a DataFrame and preview the first rows.
# Every column header except 'Show Number' carries a leading space in this CSV
# (visible in the data.info() listing), so later cells must select ' Question'
# with the space included.
data = pd.read_csv('JEOPARDY_CSV.csv')
data.head()

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer
0,4680,2004-12-31,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was ...",Copernicus
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,No. 2: 1912 Olympian; football star at Carlisl...,Jim Thorpe
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,The city of Yuma in this state has a record av...,Arizona
3,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,$200,"In 1963, live on ""The Art Linkletter Show"", th...",McDonald's
4,4680,2004-12-31,Jeopardy!,EPITAPHS & TRIBUTES,$200,"Signer of the Dec. of Indep., framer of the Co...",John Adams


In [3]:
# Column names, dtypes and non-null counts of the raw frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 216930 entries, 0 to 216929
Data columns (total 7 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   Show Number  216930 non-null  int64 
 1    Air Date    216930 non-null  object
 2    Round       216930 non-null  object
 3    Category    216930 non-null  object
 4    Value       216930 non-null  object
 5    Question    216930 non-null  object
 6    Answer      216928 non-null  object
dtypes: int64(1), object(6)
memory usage: 11.6+ MB


In [4]:
# Keep only the clue text as a one-column DataFrame.
# The column label includes the leading space present in the CSV header.
X = data.loc[:, [' Question']]

In [5]:
# Preview the first rows of the question-only frame.
X.head()

Unnamed: 0,Question
0,"For the last 8 years of his life, Galileo was ..."
1,No. 2: 1912 Olympian; football star at Carlisl...
2,The city of Yuma in this state has a record av...
3,"In 1963, live on ""The Art Linkletter Show"", th..."
4,"Signer of the Dec. of Indep., framer of the Co..."


In [7]:
# Read the stopword list, one entry per line, dropping the newline characters.
with open('stopwords.txt', encoding='utf-8') as stopwords_file:
    stopwords = [line.replace('\n', '') for line in stopwords_file]
stopwords

['a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 "can't",
 'cannot',
 'could',
 "couldn't",
 'did',
 "didn't",
 'do',
 'does',
 "doesn't",
 'doing',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 "hadn't",
 'has',
 "hasn't",
 'have',
 "haven't",
 'having',
 'he',
 "he'd",
 "he'll",
 "he's",
 'her',
 'here',
 "here's",
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 "how's",
 'i',
 "i'd",
 "i'll",
 "i'm",
 "i've",
 'if',
 'in',
 'into',
 'is',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 "let's",
 'me',
 'more',
 'most',
 "mustn't",
 'my',
 'myself',
 'no',
 'nor',
 'not',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'ought',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'same',
 "shan't",
 'she',
 "she'd",
 "she'll",
 "she's",
 'should',
 "s

In [8]:
# Scratch check of the punctuation-stripping pattern used in preprocessing:
# remove every character that is neither a word character nor whitespace.
# The pattern is a raw string so that \w and \s reach the regex engine as-is
# instead of being parsed as (invalid) Python string escapes.
s = 'this: is a 4 hi'
s = re.sub(r'[^\w\s]', '', s)
s

'this is a 4 hi'

In [9]:
# Scratch check of the digit-stripping pattern: delete every run of digits.
# Raw string so \d is a regex escape rather than an invalid string escape.
s = re.sub(r"\d+", "", s)
s

'this is a  hi'

In [10]:
from nltk import word_tokenize
import nltk
import re
nltk.download('punkt')

# `stopwords` is a list; a set gives constant-time membership tests for the
# per-token filter below.
stopword_set = set(stopwords)

# Raw-string patterns, compiled once: strip everything that is neither a word
# character nor whitespace, then strip runs of digits.
punctuation_re = re.compile(r'[^\w\s]')
digits_re = re.compile(r'\d+')


def clean_question(text):
    """Normalise one clue for vectorisation.

    Tokenises `text`, lowercases the tokens, drops stopwords, re-joins the
    remaining tokens with single spaces, then removes punctuation characters
    and digit runs from the joined string.

    Returns the cleaned string (it may contain repeated spaces or stray
    single letters where punctuation/digits were removed).
    """
    kept = [w.lower() for w in word_tokenize(text) if w.lower() not in stopword_set]
    without_punctuation = punctuation_re.sub('', ' '.join(kept))
    return digits_re.sub('', without_punctuation)


# Read the clue text from `data` rather than `X`: a later cell rebinds `X` to
# the TF-IDF matrix, which would make this cell fail if it were re-run.
questions = data[' Question']

# Build the frame in one go instead of appending one row at a time with .loc,
# which re-allocates the frame on every insertion.
dataset = pd.DataFrame(
    {'title_body': [clean_question(q) for q in questions]},
    index=questions.index,
)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Hacker\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [11]:
# Display the cleaned-text frame (pandas truncates the middle rows).
dataset

Unnamed: 0,title_body
0,life galileo house arrest espousing s theory
1,olympian football star carlisle indian sc...
2,city yuma record average hours sunshine
3,live art linkletter company served billio...
4,signer dec indep framer constitution mass pr...
...,...
216925,puccini opera solution riddles posed heroine
216926,north america term properly applied species c...
216927,penny lane hellraiser grew barber shaves c...
216928,ft sill okla plea arizona land father s la...


In [12]:
# Turn the cleaned clue text into a sparse TF-IDF document-term matrix.
# Note: this rebinds `X`, which previously held the one-column question
# DataFrame; from here on `X` is the feature matrix.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(dataset.loc[:, 'title_body'])

In [13]:
# Show the sparse matrix summary (shape, dtype, number of stored elements).
X

<216930x93512 sparse matrix of type '<class 'numpy.float64'>'
	with 1430161 stored elements in Compressed Sparse Row format>

In [15]:
# Cluster the TF-IDF vectors into 10 groups.
# random_state is fixed because centroid initialisation is random: without a
# seed the labels and inertia differ on every kernel restart.
# n_init is stated explicitly because scikit-learn changed its default
# (10 -> 'auto') between releases; 10 restarts is the long-standing behaviour.
kmeans = KMeans(n_clusters=10, max_iter=300, n_init=10, random_state=42)
kmeans.fit(X)

In [16]:
# Cluster index assigned to each row of X by the fitted KMeans model.
kmeans.labels_

array([5, 5, 3, ..., 5, 5, 5])

In [17]:
# Sum of squared distances from each sample to its nearest cluster centre.
kmeans.inertia_

213583.9191268674

In [None]:
# The exact silhouette coefficient needs the distance between every pair of
# samples, which is not practical for the full ~217k-row matrix. Estimate it
# on a seeded random subsample instead so the cell finishes and is repeatable.
silhouette_score(X, kmeans.labels_, sample_size=10000, random_state=42)

In [None]:
# Reduce the TF-IDF matrix with UMAP.
# The first positional parameter of umap.UMAP is n_neighbors, so the bare
# `1000` configures the neighbourhood size; n_components stays at its default.
# NOTE(review): if 1000 was meant to be the output dimensionality, this should
# be n_components=1000 instead - confirm the intent.
reducer = umap.UMAP(n_neighbors=1000)
X_reduced = reducer.fit_transform(X)

In [None]:
import pickle

# Serialise the reduced embedding to disk in binary mode.
with open('X_reduced.mohamad', mode='wb') as f:
    pickle.dump(X_reduced, f)

In [None]:
# Dimensions of the reduced embedding. The notebook imports numpy without the
# `np` alias, so read the shape from the array itself rather than via np.shape.
X_reduced.shape

In [None]:
# Package name corrected: 'sklean' does not exist and made this cell fail on import.
from sklearn.cluster import MeanShift

# Fit MeanShift on the reduced embedding; with no bandwidth given, the
# estimator derives one from the data itself.
msh = MeanShift()
msh.fit(X_reduced)

In [None]:
# Cluster label assigned to each sample by the fitted MeanShift model.
msh.labels_

In [None]:
# Attach the MeanShift labels to the cleaned-text frame as a 'Cluster' column.
# This modifies `dataset` in place.
dataset['Cluster'] = msh.labels_

In [None]:
# Rows whose assigned cluster label is 1.
dataset[dataset['Cluster'] == 1]

In [None]:
from collections import Counter

In [None]:
# 20 most frequent words among the clues assigned to cluster 1.
# Select the text column by name: indexing with the integer label Series
# (as before) is interpreted as a list of column labels and raises KeyError.
cluster_text = dataset.loc[dataset['Cluster'] == 1, 'title_body']
Counter(' '.join(cluster_text).split()).most_common(20)