# Imports

In [102]:
import pandas as pd # import for dataframe handle
import numpy as np # import for math and array operations
import matplotlib.pyplot as plt # import for visual representation
import seaborn as sns # import for visual representation

from bs4 import BeautifulSoup
import requests

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords 
import string
import nltk

# pipeline imports
from sklearn import set_config; set_config(display='diagram')
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import make_union
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.preprocessing import FunctionTransformer
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

# scalers, encoder, knn, vectorizer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import RobustScaler, StandardScaler, OneHotEncoder



%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [103]:
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/pyfenix/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/pyfenix/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/pyfenix/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [110]:
from nltk.stem import WordNetLemmatizer

def clean_text(text):
    
    for punctuation in string.punctuation:
        text = text.replace(punctuation, ' ') # Remove Punctuation
        
    lowercased = text.lower() # Lower Case
    lemmatizer = WordNetLemmatizer()
    tokenized = word_tokenize(lowercased) # Tokenize
    words_only = [word for word in tokenized if word.isalpha()] # Remove numbers
    stop_words = set(stopwords.words('english')) # Make stopword list
    without_stopwords = [word for word in words_only if not word in stop_words]# Remove Stop Words
#     lemmatized = [lemmatizer.lemmatize(word) for word in without_stopwords]
    
    return " ".join(without_stopwords)

In [171]:
def re_clean(text):
    text_t=[text].copy()
    temp=text_t[0].split(' ')
    non_imp_words=["game","games","gaming","studios","inc","studio"]
    for word in non_imp_words:
        while word in temp:
            temp.remove(word)
    return " ".join(temp)

In [169]:
re_clean(clean_text(df['developer'][1]))

'indigoblue'

# Load Dataset 

In [172]:
df = pd.read_csv("../raw_data/clean_bigger_df.csv").drop(columns='Unnamed: 0').rename(columns={'clean_review':'reviews','mature_encoded':'mature_content','clean_description':'game_description'})
df.head()
df.head()

Unnamed: 0,url,img_url,developer,requirements,name,metadata,game_description,pegi_url,mature_content,all_reviews,reviews
0,https://store.steampowered.com/app/10/CounterS...,https://steamcdn-a.akamaihd.net/steam/apps/10/...,Valve,{},Counter-Strike,First- Shooter Violent+ Score Survival Team- P...,game play world number online action game enga...,,0,"Overwhelmingly Positive(94,680)- 96% of the 94...",Overwhelmingly Positive
1,https://store.steampowered.com/app/1000000/ASC...,https://steamcdn-a.akamaihd.net/steam/apps/100...,IndigoBlue Game Studio,{'minimum': {'windows': {'processor': ' Intel ...,ASCENXION,Indie Stick about game Minimalist Controller2 ...,game ascenxion game combining shoot em adventu...,,0,Winter 2020,Winter 2020
2,https://store.steampowered.com/app/1000010/Cro...,https://steamcdn-a.akamaihd.net/steam/apps/100...,NEXT Studios,{'minimum': {'windows': {'processor': ' Intel ...,Crown Trick,Replay Female Support Cards Magic+ Steam Value...,game crown trick beautifully animated rogue li...,,0,"16 Oct, 2020","16 Oct, 2020"
3,https://store.steampowered.com/app/1000030/Coo...,https://steamcdn-a.akamaihd.net/steam/apps/100...,Vertigo Gaming Inc.,"{'minimum': {'windows': {'processor': '', 'mem...","Cook, Serve, Delicious! 3?!",Typing Play on controller Tablet Family Campai...,game hit road massive sequel million selling c...,https://steamstore-a.akamaihd.net/public/share...,1,Overwhelmingly Positive(761)- 96% of the 761 u...,Overwhelmingly Positive
4,https://store.steampowered.com/app/1000040/_/,https://steamcdn-a.akamaihd.net/steam/apps/100...,DoubleC Games,{},细胞战争,Indie Simulation+ Features Casual Action Singl...,game qq com,,0,"30 Mar, 2019","30 Mar, 2019"


## Cleaning developer column

In [173]:
df['developer'] = df.developer.astype('str').apply(clean).apply(re_clean)
df.head()

Unnamed: 0,url,img_url,developer,requirements,name,metadata,game_description,pegi_url,mature_content,all_reviews,reviews
0,https://store.steampowered.com/app/10/CounterS...,https://steamcdn-a.akamaihd.net/steam/apps/10/...,valve,{},Counter-Strike,First- Shooter Violent+ Score Survival Team- P...,game play world number online action game enga...,,0,"Overwhelmingly Positive(94,680)- 96% of the 94...",Overwhelmingly Positive
1,https://store.steampowered.com/app/1000000/ASC...,https://steamcdn-a.akamaihd.net/steam/apps/100...,indigoblue,{'minimum': {'windows': {'processor': ' Intel ...,ASCENXION,Indie Stick about game Minimalist Controller2 ...,game ascenxion game combining shoot em adventu...,,0,Winter 2020,Winter 2020
2,https://store.steampowered.com/app/1000010/Cro...,https://steamcdn-a.akamaihd.net/steam/apps/100...,next,{'minimum': {'windows': {'processor': ' Intel ...,Crown Trick,Replay Female Support Cards Magic+ Steam Value...,game crown trick beautifully animated rogue li...,,0,"16 Oct, 2020","16 Oct, 2020"
3,https://store.steampowered.com/app/1000030/Coo...,https://steamcdn-a.akamaihd.net/steam/apps/100...,vertigo,"{'minimum': {'windows': {'processor': '', 'mem...","Cook, Serve, Delicious! 3?!",Typing Play on controller Tablet Family Campai...,game hit road massive sequel million selling c...,https://steamstore-a.akamaihd.net/public/share...,1,Overwhelmingly Positive(761)- 96% of the 761 u...,Overwhelmingly Positive
4,https://store.steampowered.com/app/1000040/_/,https://steamcdn-a.akamaihd.net/steam/apps/100...,doublec,{},细胞战争,Indie Simulation+ Features Casual Action Singl...,game qq com,,0,"30 Mar, 2019","30 Mar, 2019"


# Preprocessing

In [256]:
def kmeans_labels(df, n , mi):
    vec = TfidfVectorizer(min_df = mi ,ngram_range=(1,2))
    X = vec.fit_transform(df['game_description'])
    kmodel = KMeans(n_clusters=n)
    kmodel.fit(X)
    
    return kmodel.labels_

In [257]:
def create_pipeline(df, m=0.05 , c=1, n =50, mi = 0.04,mii=0.02):
    array_transf = FunctionTransformer(lambda array: array.toarray())
    df['cluster'] = kmeans_labels(df, n, mi)
    
    dev_transf=make_pipeline(
        TfidfVectorizer(min_df=mii),
        array_transf,
        RobustScaler()
    )
    
    meta_transf = make_pipeline(
        TfidfVectorizer(min_df=m), 
        array_transf,
        RobustScaler()
    )
    
    ord_encoder = OrdinalEncoder(
        categories=[
            [
                "Overwhelmingly Negative",
                "Very Negative",
                "Negative",
                "Mostly Negative",
                'Mixed',
                "Mostly Positive",
                "Positive",
                "Very Positive",
                "Overwhelmingly Positive"
            ]],
        dtype=np.int64,
        handle_unknown="use_encoded_value",
        unknown_value=-1
    )
    
    ord_transf = make_pipeline(
        ord_encoder, 
        StandardScaler())
    
    cluster_transf = make_pipeline(
        OneHotEncoder(sparse=False), 
         StandardScaler()
    )
    
    num_transf = make_pipeline(StandardScaler())


    preproc_basic = make_column_transformer(
        (meta_transf, 'metadata'),
        (dev_transf, 'developer')
        (cluster_transf, ['cluster']),
        (ord_transf, ['reviews']),
        (num_transf, ['mature_content']),
        remainder='drop'
    )
    
    full_pipe = make_pipeline(preproc_basic, PCA(n_components=c) )
    return full_pipe.fit_transform(df)
    
    #return preproc_basic.fit_transform(df)

SyntaxError: closing parenthesis ')' does not match opening parenthesis '[' (212663238.py, line 48)

In [209]:
def train(X, y):
    return KNeighborsRegressor().fit(X,y)

In [210]:
def recommending_system(model, X, game):
    
    neighbors_index = model.kneighbors(X.loc[[game]],n_neighbors=df.shape[0])[1][0]
    neighbors_distance = model.kneighbors(X.loc[[game]],n_neighbors=df.shape[0])[0][0]
    
    neighbors_list = list(neighbors_index)
    
    # new_df_values = {
    #     'distance': neighbors_distance,
    #     'url': [],
    #     'price': [],
    #     'reviews': [],
    #     'op_sys': [],
    #     'developer': [],
    # }
    
    # for index in neighbors_index:
    #     new_df_values['url'].append(df.loc[index, 'url'])
    #     new_df_values['price'].append(df.loc[index, 'price'])
    #     new_df_values['reviews'].append(df.loc[index, 'reviews'])
    #     new_df_values['op_sys'].append(df.loc[index, 'op_sys'])
    #     new_df_values['developer'].append(df.loc[index, 'developer'])
    
    return pd.DataFrame(neighbors_distance, index = X.iloc[neighbors_list, :].index, columns=['distance'])
    

In [61]:
pipe = create_pipeline(df, 0.03, 1, 70, 0.07,0.02)
pipe.shape
X = pd.DataFrame(pipe, index=df.name.tolist())
model = train(X, df['url'])


Unnamed: 0,distance
S.T.A.L.K.E.R.: Shadow of Chernobyl,0.000000
Tom Clancy's Ghost Recon® Wildlands,0.000169
Dying Light,0.001580
DayZ,0.002276
Quake Live™,0.002678
...,...
ASTERELIS Demo,8.649554
Realm of Perpetual Guilds Demo,10.073068
12 Labours of Hercules Demo,10.100776
Orion: A Sci-Fi Visual Novel Demo,10.265040


In [62]:
recommending_system(model, X, 'Left 4 Dead').head(10)

Unnamed: 0,distance
Left 4 Dead,0.0
Ultimate Epic Battle Simulator,0.001514
Dark Messiah of Might & Magic,0.001979
Shadow Man,0.002738
Wrench,0.002841
Hakuoki: Kyoto Winds,0.002917
War Thunder,0.002947
Hnefatafl,0.003367
Coffee Talk,0.003605
ZOMBI,0.00392


In [63]:
recommending_system(model, X, 'S.T.A.L.K.E.R.: Shadow of Chernobyl').head(10)

Unnamed: 0,distance
S.T.A.L.K.E.R.: Shadow of Chernobyl,0.0
Tom Clancy's Ghost Recon® Wildlands,0.000169
Dying Light,0.00158
DayZ,0.002276
Quake Live™,0.002678
APB Reloaded,0.005317
Call of Duty®: Infinite Warfare,0.005673
Men of Valor,0.007082
Sonic Adventure 2,0.007108
The Jackbox Party Pack 6,0.007351


In [64]:
recommending_system(model, X, 'QUAKE').head(10)

Unnamed: 0,distance
QUAKE,0.0
The Emulator,6.2e-05
Aloof,7.1e-05
We Are Legion,0.00036
The Magic Circle,0.00058
X4: Foundations,0.001001
Crimson Tide: Operation Online,0.001071
GTTOD: Get To The Orange Door,0.001261
Aztaka,0.001308
Disney Epic Mickey 2: The Power of Two,0.001316


In [65]:
recommending_system(model, X, 'DOOM').head(10)

Unnamed: 0,distance
DOOM,0.0
Wolfenstein II: The New Colossus,0.001183
How to Survive 2,0.004659
Hitman: Contracts,0.004779
DEAD OR ALIVE 6,0.005344
STAR WARS™ - The Force Unleashed™ Ultimate Sith Edition,0.00572
The Elder Scrolls®: Legends™,0.006417
Grounded,0.006522
Total War: WARHAMMER,0.006718
Grand Theft Auto III,0.008719


In [66]:
recommending_system(model, X, 'Counter-Strike').head(10)

Unnamed: 0,distance
Counter-Strike,0.0
Hearts of Iron III: Semper Fi,0.000351
Rush for Berlin Gold,0.000351
Silent Hunter®: Wolves of the Pacific U-Boat Missions,0.000392
Gauntlet™ Slayer Edition,0.000394
Kohan II: Kings of War,0.000792
Redneck Rampage Rides Again,0.001638
Knights and Merchants,0.001745
Castlevania Anniversary Collection,0.002197
Team Fortress 2,0.002216


# Testing Data

In [223]:
user_df = pd.read_csv('../raw_data/steam-200k.csv',usecols=[0,1,2,3],names=['userid','game','behavior','hoursplayed'])

In [224]:
# keeping only play entries

df_play = user_df[user_df['behavior']=='play']
df_play=df_play.drop(columns='behavior')

#keeping only games that are also in the main dataset
user_name= pd.DataFrame(df_play['game'].unique(),columns=['name']).merge(df, on = 'name')
join_name = list(user_name.name.unique())
df_play = df_play[df_play['game'].isin(join_name)]

In [225]:
# Creating DF of users favorites 2 games

def get_fav_games(df,user):
    db = df[df['userid']==user].sort_values(by='hoursplayed', ascending=False)
    return list(db['game'].iloc[0:2])

def get_user_list(df):
    temp_df=df.groupby('userid').count()[['game']]
    return list(temp_df[temp_df['game']>1].index)

def get_fav_list(df):
    user_list= get_user_list(df)
    fav_list=[]
    for user in user_list:
        fav_list.append(get_fav_games(df,user))
    fav1=[]
    fav2=[]
    for fav in fav_list:
        fav1.append(fav[0])
        fav2.append(fav[1])
    return pd.DataFrame(data=list(zip( fav1, fav2)),
                         columns=['most_fav_game', 'sec_fav_game'],index=user_list)

In [261]:
test_df = get_fav_list(df_play)

In [227]:
def testing_models(df, model):

    df['distance'] = ''
    for index, row in df.iterrows():
        neighbors_list = list(model.kneighbors(X.loc[[row['most_fav_game']]],n_neighbors=X.shape[0])[1][0])
        res = pd.DataFrame(model.kneighbors(X.loc[[row['most_fav_game']]],n_neighbors=X.shape[0])[0][0],\
                           index = X.iloc[neighbors_list, :]\
                        .index, columns = ['distance']).loc[row['sec_fav_game']][0]
        df.loc[index, 'distance'] = res
        
    return df

In [228]:
X.shape

(36412, 1)

# Testing Params

In [220]:
def test_params(test_df,df, X, model, name, index_name):
    for index, row in test_df.iterrows():
        neighbors_list = list(model.kneighbors(X.loc[[row['most_fav_game']]],n_neighbors=df.shape[0])[1][0])
        res = pd.DataFrame(model.kneighbors(X.loc[[row['most_fav_game']]],n_neighbors=df.shape[0])[0][0],\
                           index = X.iloc[neighbors_list, :]\
                        .index, columns = [name])
        test_df.loc[index, name] = res.loc[row['sec_fav_game']][0]
        test_df.loc[index, index_name]= res.index.get_loc(row['sec_fav_game'])
    return test_df

In [221]:
def check_params(df, m=0.05 , c=1, n =50, mi = 0.1, mii=0.02):
    pipe = create_pipeline(df, m=0.05 , c=1, n =50, mi = 0.04,mii=0.02)
    X = pd.DataFrame(pipe, index=df.name.tolist())
    model = train(X, df['url'])
    name = str(m)
    index_name = 'index' + str(m)
    test_params(test_df,df, X, model, name, index_name)
    return test_df

In [217]:
m_list = [0.03, 0.05, 0.7]
c_list = [1,5,10]
n_list = [10, 50, 70]
mi_list = [0.05, 0.07, 0.1]
mii_list=[0.01,0.02,0.03,0.04,0.05]

In [218]:
main_list = [m_list, c_list, n_list, mi_list, mii_list]

In [219]:
import itertools
params_list = list(itertools.product(*main_list))
len(params_list)

405

In [75]:
# fun loop to go through!
count=50
for params in params_list[51:75]:
    check_params(df, params)
    count +=1
    print(count)

51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74


In [76]:
test_df.to_csv("../raw_data/big_df_test_params_0_24_51_74.csv")

# Testing with developer in pipeline

In [211]:
pipe = create_pipeline(df, 0.03, 1, 70, 0.07)
pipe.shape
X = pd.DataFrame(pipe, index=df.name.tolist())


In [212]:
model = train(X, df['url'])

In [213]:
recommending_system(model, X, 'Left 4 Dead').head(10)

Unnamed: 0,distance
Left 4 Dead,0.0
Heroes & Generals,0.000292
TrackMania Nations Forever,0.00128
Colony Survival,0.001648
Hellsplit: Arena,0.001812
Full Metal Furies,0.002191
Rogue Legacy 2,0.002596
Days of War: Definitive Edition,0.002727
Painkiller: Resurrection,0.003121
Obscure II (Obscure: The Aftermath),0.00332


In [214]:
recommending_system(model, X, 'S.T.A.L.K.E.R.: Shadow of Chernobyl').head(10)

Unnamed: 0,distance
S.T.A.L.K.E.R.: Shadow of Chernobyl,0.0
Life is Feudal: MMO,0.001125
State of Decay: YOSE,0.003989
Max Payne 2: The Fall of Max Payne,0.004557
RUINER,0.004686
Command & Conquer™ Remastered Collection,0.005351
Galactic Civilizations III,0.005398
Deus Ex: Game of the Year Edition,0.005648
Battlestations Pacific,0.005907
Sword of the Stars II: Enhanced Edition,0.006506


In [215]:
recommending_system(model, X, 'QUAKE').head(10)

Unnamed: 0,distance
QUAKE,0.0
Archers,3e-05
Treasure Hunt VR,3.3e-05
Heroes of Annihilated Empires,0.000532
League of Angels-Heaven's Fury,0.000898
MegaRace 2,0.001009
Metal Heads,0.001095
Kona,0.001207
Princesses Never Lose!,0.001329
Genesis Alpha One Deluxe Edition,0.00142


In [255]:
# fun loop to go through! Again...
count=349
for params in params_list[350:]:
    check_params(df, params)
    count +=1
    print(count)

350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404


In [258]:
test_df.sum()

most_fav_game                    Portal 2Counter-StrikeFar Cry 3Counter-StrikeT...
sec_fav_game                     Alien SwarmBanishedLeft 4 Dead 2Worms Reloaded...
(0.7, 1, 70, 0.05, 0.01)                                               2967.314637
index(0.7, 1, 70, 0.05, 0.01)                                           13141142.0
(0.7, 1, 70, 0.05, 0.02)                                               2891.660083
                                                       ...                        
index(0.7, 10, 70, 0.1, 0.03)                                           13220838.0
(0.7, 10, 70, 0.1, 0.04)                                               2918.169104
index(0.7, 10, 70, 0.1, 0.04)                                           12671083.0
(0.7, 10, 70, 0.1, 0.05)                                               2914.308467
index(0.7, 10, 70, 0.1, 0.05)                                           12390256.0
Length: 212, dtype: object

In [259]:
test_df.shape

(3184, 212)

In [260]:
test_df.to_csv('../raw_data/dev_test_300_404.csv')