In [65]:
import pandas as pd # import for dataframe handle
import numpy as np # import for math and array operations
import matplotlib.pyplot as plt # import for visual representation
#import seaborn as sns # import for visual representation

import re
import string
from nltk.corpus import stopwords 
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer

from bs4 import BeautifulSoup
import requests

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import OrdinalEncoder
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans


%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [66]:
df = pd.read_csv("../raw_data/clean_df.csv")
df.head()

Unnamed: 0,url,name,developer,tags,languages,genre,game_description,mature_content,price,reviews,date,achievements,op_sys,metadata
0,https://store.steampowered.com/app/379720/DOOM/,DOOM,id Software,fps gore action demon shooter first person gre...,english french italian german spanish spain ja...,action,game developed id software studio pioneered fi...,0,17.991,Very Positive,2016-05-12 00:00:00,1.0,windows,soundtrack multiplayer singleplayer fast paced...
1,https://store.steampowered.com/app/578080/PLAY...,PLAYERUNKNOWN'S BATTLEGROUNDS,PUBG Corporation,survival shooter multiplayer battle royale pvp...,english korean simplified chinese french germa...,action adventure massively multiplayer,game playerunknown battleground battle royale ...,1,26.991,Mixed,2017-12-21 00:00:00,1.0,windows,multiplayer shooter action online person team ...
2,https://store.steampowered.com/app/637090/BATT...,BATTLETECH,Harebrained Schemes,mechs strategy turn based turn based tactic sc...,english french german russian,action adventure strategy,game original battletech mechwarrior creator j...,0,35.991,Mostly Positive,2018-04-24 00:00:00,1.0,windows macOS Linux,tactic sci fi turn based mechs strategy turn b...
3,https://store.steampowered.com/app/221100/DayZ/,DayZ,Bohemia Interactive,survival zombie open world multiplayer pvp mas...,english french italian german spanish spain cz...,action adventure massively multiplayer,game post soviet country chernarus struck unkn...,0,40.491,Mixed,2018-12-13 00:00:00,0.0,windows,access simulation fps post apocalyptic surviva...
4,https://store.steampowered.com/app/8500/EVE_On...,EVE Online,CCP,space massively multiplayer sci fi sandbox mmo...,english german russian french,action free play massively multiplayer rpg str...,game,0,,Mostly Positive,2003-05-06 00:00:00,0.0,windows macOS,multiplayer rpg strategy play economy strategy...


In [9]:
def clean (text):
    for punctuation in string.punctuation:
        text = text.replace(punctuation, ' ') # Remove Punctuation
    lowercased = text.lower() # Lower Case
    tokenized = word_tokenize(lowercased) # Tokenize
    words_only = [word for word in tokenized if word.isalpha()] # Remove numbers
    stop_words = set(stopwords.words('english')) # Make stopword list
    without_stopwords = [word for word in words_only if not word in stop_words] # Remove Stop Words
    lemma=WordNetLemmatizer() # Initiate Lemmatizer
    lemmatized = [lemma.lemmatize(word) for word in without_stopwords] # Lemmatize
    return " ".join(lemmatized)

In [10]:
clean_df=df.assign(clean_description=lambda x: x.game_description.astype('str').apply(clean))
clean_df.head()

Unnamed: 0,url,name,developer,tags,languages,genre,game_description,mature_content,price,reviews,date,achievements,op_sys,clean_description
0,https://store.steampowered.com/app/379720/DOOM/,DOOM,id Software,"FPS,Gore,Action,Demons,Shooter,First-Person,Gr...","English,French,Italian,German,Spanish - Spain,...",Action,"About This Game Developed by id software, the...",0,17.991,Very Positive,2016-05-12 00:00:00,1.0,windows,game developed id software studio pioneered fi...
1,https://store.steampowered.com/app/578080/PLAY...,PLAYERUNKNOWN'S BATTLEGROUNDS,PUBG Corporation,"Survival,Shooter,Multiplayer,Battle Royale,PvP...","English,Korean,Simplified Chinese,French,Germa...","Action,Adventure,Massively Multiplayer",About This Game PLAYERUNKNOWN'S BATTLEGROUND...,1,26.991,Mixed,2017-12-21 00:00:00,1.0,windows,game playerunknown battleground battle royale ...
2,https://store.steampowered.com/app/637090/BATT...,BATTLETECH,Harebrained Schemes,"Mechs,Strategy,Turn-Based,Turn-Based Tactics,S...","English,French,German,Russian","Action,Adventure,Strategy",About This Game From original BATTLETECH/Mec...,0,35.991,Mostly Positive,2018-04-24 00:00:00,1.0,windows macOS Linux,game original battletech mechwarrior creator j...
3,https://store.steampowered.com/app/221100/DayZ/,DayZ,Bohemia Interactive,"Survival,Zombies,Open World,Multiplayer,PvP,Ma...","English,French,Italian,German,Spanish - Spain,...","Action,Adventure,Massively Multiplayer",About This Game The post-soviet country of Ch...,0,40.491,Mixed,2018-12-13 00:00:00,0.0,windows,game post soviet country chernarus struck unkn...
4,https://store.steampowered.com/app/8500/EVE_On...,EVE Online,CCP,"Space,Massively Multiplayer,Sci-fi,Sandbox,MMO...","English,German,Russian,French","Action,Free to Play,Massively Multiplayer,RPG,...",About This Game,0,,Mostly Positive,2003-05-06 00:00:00,0.0,windows macOS,game


In [57]:
# Tuned TFidfvectorizer
vec = TfidfVectorizer(min_df = 0.04,ngram_range=(1,2)).fit(clean_df.clean_description)

vectors = vec.transform(clean_df.clean_description) # Transform text to vectors
sum_tfidf = vectors.sum(axis=0) # Sum of tfidf weighting by word
tfidf_list = [(word, sum_tfidf[0, idx]) for word, idx in vec.vocabulary_.items()]  # Get the word and associated weight
sorted_tfidf_list =sorted(tfidf_list, key = lambda x: x[1], reverse=True)  # Sort
sorted_tfidf_list

[('game', 2845.290446835472),
 ('world', 1181.8949337513448),
 ('level', 1153.3183669875718),
 ('player', 1130.0493535974222),
 ('new', 1062.0506699517246),
 ('play', 909.2270206751421),
 ('time', 856.1294658743665),
 ('mode', 849.6906607048571),
 ('enemy', 837.3558933645428),
 ('puzzle', 825.2479789138846),
 ('feature', 812.003801828686),
 ('one', 794.8166203826302),
 ('story', 770.9994662114602),
 ('different', 759.546136325372),
 ('character', 745.2977936933515),
 ('find', 701.191383880587),
 ('get', 690.4062216544905),
 ('weapon', 682.8067354946745),
 ('way', 663.4956241555884),
 ('unique', 661.8335478629541),
 ('battle', 653.2460131621135),
 ('take', 646.2192360097004),
 ('experience', 643.7232608246675),
 ('adventure', 621.8772377314147),
 ('make', 619.9135545000145),
 ('use', 610.8673657189966),
 ('gameplay', 577.1873052197064),
 ('like', 575.1902800059488),
 ('system', 560.2746193066391),
 ('life', 549.5986516084688),
 ('action', 546.9119723914426),
 ('control', 538.18679602417

In [58]:
X_proj = pd.DataFrame(vectors.toarray(), index=clean_df.name.tolist())

In [59]:
km = KMeans(n_clusters=50)
km.fit(X_proj)

KMeans(n_clusters=50)

In [60]:
km.cluster_centers_.shape

(50, 485)

In [61]:
km.cluster_centers_.shape

(50, 485)

In [62]:
labels = km.labels_
clustered_df = pd.DataFrame(data=labels, index=clean_df.name.tolist(), columns = ['cluster'])

In [63]:
clustered_df[clustered_df['cluster'] == 7]

Unnamed: 0,cluster
Beat Saber,7
Rocksmith™,7
BOXVR,7
Mixcraft 8 Home Studio,7
Beat Hazard 2,7
...,...
Beat Miner,7
Rhythm Doctor,7
Muse Dash,7
Zima uhodi!,7


In [64]:
clustered_df.value_counts()

cluster
18         1325
8          1129
9           950
19          890
16          889
23          876
49          864
4           767
24          748
27          707
37          653
35          643
48          637
3           623
0           620
39          616
12          594
46          524
30          498
47          496
29          457
15          442
20          430
25          423
10          413
43          389
2           379
41          370
40          359
33          352
45          349
28          348
34          346
6           336
21          330
5           314
26          307
11          304
7           296
44          293
42          282
32          276
38          271
22          258
17          229
14          226
13          198
36          190
1           188
31          163
dtype: int64

In [None]:

# inertias = []
# ks = range(1,100)
# for k in ks:
#     km_test = KMeans(n_clusters=k).fit(X_proj)
#     inertias.append(km_test.inertia_)
# plt.plot(ks, inertias)
# plt.xlabel('k cluster number')

KeyboardInterrupt: 

Exception ignored in: 'sklearn.cluster._k_means_common._relocate_empty_clusters_dense'
Traceback (most recent call last):
  File "<__array_function__ internals>", line 177, in where
KeyboardInterrupt: 


KeyboardInterrupt: 

Exception ignored in: 'sklearn.cluster._k_means_common._relocate_empty_clusters_dense'
Traceback (most recent call last):
  File "<__array_function__ internals>", line 177, in where
KeyboardInterrupt: 


KeyboardInterrupt: 

Exception ignored in: 'sklearn.cluster._k_means_common._relocate_empty_clusters_dense'
Traceback (most recent call last):
  File "<__array_function__ internals>", line 177, in where
KeyboardInterrupt: 


KeyboardInterrupt: 

Exception ignored in: 'sklearn.cluster._k_means_common._relocate_empty_clusters_dense'
Traceback (most recent call last):
  File "<__array_function__ internals>", line 177, in where
KeyboardInterrupt: 


KeyboardInterrupt: 

Exception ignored in: 'sklearn.cluster._k_means_common._relocate_empty_clusters_dense'
Traceback (most recent call last):
  File "<__array_function__ internals>", line 177, in where
KeyboardInterrupt: 


KeyboardInterrupt: 

Exception ignored in: 'sklearn.cluster._k_means_common._relocate_empty_clusters_dense'
Traceback (most recent call last):
  File "<__array_function__ internals>", line 177, in where
KeyboardInterrupt: 


## Testing different n_gram values

In [42]:
# Tuned TFidfvectorizer with different n_gram values
vec2 = TfidfVectorizer(min_df = 0.05,ngram_range=(2,2)).fit(clean_df.clean_description)

vectors2 = vec2.transform(clean_df.clean_description) # Transform text to vectors
sum_tfidf2 = vectors2.sum(axis=0) # Sum of tfidf weighting by word
tfidf_list2 = [(word, sum_tfidf2[0, idx]) for word, idx in vec2.vocabulary_.items()]  # Get the word and associated weight
sorted_tfidf_list2 =sorted(tfidf_list2, key = lambda x: x[1], reverse=True)  # Sort
sorted_tfidf_list2

[('key feature', 2356.0997995678854),
 ('game mode', 1431.248727608694),
 ('game feature', 1334.6418452150015),
 ('single player', 1241.3912167979283),
 ('game play', 1164.1713143197808),
 ('fast paced', 1079.1122243844788)]

In [68]:
X_proj2 = pd.DataFrame(vectors2.toarray(), index=clean_df.name.tolist())

In [44]:
km2 = KMeans(n_clusters=50)
km2.fit(X_proj2)

KMeans(n_clusters=50)

In [45]:
km2.cluster_centers_.shape

(50, 6)

In [46]:
labels2 = km2.labels_
clustered_df2 = pd.DataFrame(data=labels2, index=clean_df.name.tolist(), columns = ['cluster'])

In [47]:
clustered_df2[clustered_df2['cluster'] == 7]

Unnamed: 0,cluster
BioShock Infinite,7
Killing Floor 2,7
World War 3,7
Insurgency: Sandstorm,7
BATTALION 1944,7
...,...
Juggly,7
Stack Gun Heroes,7
Killer Clowns,7
Land of Chaos Online II: Revolution,7


In [48]:
clustered_df2.value_counts()

cluster
0          16651
1           1962
3           1083
2            936
4            872
5            780
6            710
8            141
9            119
7            107
17            84
11            79
13            69
15            68
10            61
18            60
12            59
14            59
16            55
20            53
21            47
19            41
29            35
22            34
39            31
23            29
32            28
24            24
34            24
45            24
36            22
27            21
41            18
25            17
33            16
47            16
44            15
26            15
38            13
30            11
48            11
35             9
31             9
42             9
49             8
40             7
28             7
37             6
43             6
46             6
dtype: int64

### testing clusters

In [89]:
vec3 = TfidfVectorizer(min_df = 0.05,ngram_range=(1,2)).fit(df.game_description)

vectors3 = vec3.transform(df.game_description) # Transform text to vectors
sum_tfidf3 = vectors3.sum(axis=0) # Sum of tfidf weighting by word
tfidf_list3 = [(word, sum_tfidf3[0, idx]) for word, idx in vec3.vocabulary_.items()]  # Get the word and associated weight
sorted_tfidf_list3 =sorted(tfidf_list2, key = lambda x: x[1], reverse=True)  # Sort
sorted_tfidf_list3

[('key feature', 2356.0997995678854),
 ('game mode', 1431.248727608694),
 ('game feature', 1334.6418452150015),
 ('single player', 1241.3912167979283),
 ('game play', 1164.1713143197808),
 ('fast paced', 1079.1122243844788)]

In [90]:
X_proj3 = pd.DataFrame(vectors3.toarray(), index=df.name.tolist())

In [91]:
km3 = KMeans(n_clusters=70)
km3.fit(X_proj3)

KMeans(n_clusters=70)

In [92]:
km3.cluster_centers_.shape

(70, 350)

In [93]:
labels3 = km3.labels_
clustered_df3 = pd.DataFrame(data=labels3, index=df.name.tolist(), columns = ['cluster'])

In [94]:
clustered_df3[clustered_df3['cluster'] == 7]

Unnamed: 0,cluster
EVE Online,7
Ring of Elysium,7
Neverwinter,7
Crypt of the NecroDancer,7
Star Trek Online,7
...,...
Mr Dirt Poor,7
東方輝針城 〜 Double Dealing Character.,7
東方神霊廟 〜 Ten Desires.,7
Вкудахте - Симулятор Соцсети,7


In [88]:
clustered_df3.value_counts()

cluster
54         1196
21          907
2           682
64          548
40          547
           ... 
42          191
3           190
63          178
48          176
7           167
Length: 70, dtype: int64

In [95]:
clustered_df3.value_counts()

cluster
39         727
50         692
0          649
2          635
13         589
          ... 
68         193
41         186
27         182
53         180
30         164
Length: 70, dtype: int64

In [106]:
pd.set_option('display.max_rows', 300)

In [107]:
clustered_df3[clustered_df3['cluster']==55]

Unnamed: 0,cluster
DOOM,55
Call of Duty®: Black Ops,55
Grand Theft Auto IV: Complete Edition,55
Men of War: Assault Squad 2,55
Stick Fight: The Game,55
...,...
Mask of Fury,55
SUPERVERSE,55
Bullet Force,55
HyperDot,55
