# Imports

In [328]:
import pandas as pd # import for dataframe handle
import numpy as np # import for math and array operations
import matplotlib.pyplot as plt # import for visual representation
import seaborn as sns # import for visual representation

from bs4 import BeautifulSoup
import requests

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords 
import string

# pipeline imports
from sklearn import set_config; set_config(display='diagram')
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import make_union
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.preprocessing import FunctionTransformer

# scalers, encoder, knn, vectorizer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import RobustScaler, StandardScaler



%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [329]:
from functools import lru_cache

from nltk.stem import WordNetLemmatizer

# Maps every ASCII punctuation character to a space in a single pass.
_PUNCT_TO_SPACE = str.maketrans({char: ' ' for char in string.punctuation})


@lru_cache(maxsize=1)
def _text_resources():
    """Build the stop-word set and the lemmatizer once and reuse them for every call."""
    # The corpus file id is lowercase; 'English' only resolves on
    # case-insensitive filesystems.
    return frozenset(stopwords.words('english')), WordNetLemmatizer()


def clean_text(text):
    """Normalise free text into a space-separated string of lemmatized words.

    Steps: replace punctuation with spaces, lowercase, tokenize, keep only
    purely alphabetic tokens, drop English stop words, lemmatize.

    Parameters
    ----------
    text : str
        Raw text. Missing values (``None`` / ``NaN``) are treated as empty text.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces (``''`` when nothing is left).
    """
    if not isinstance(text, str):
        if pd.isna(text):
            return ''
        text = str(text)

    stop_words, lemmatizer = _text_resources()

    lowercased = text.translate(_PUNCT_TO_SPACE).lower()
    tokens = word_tokenize(lowercased)
    # isalpha() discards numbers and any token mixing letters with digits.
    kept = [word for word in tokens if word.isalpha() and word not in stop_words]

    return " ".join(lemmatizer.lemmatize(word) for word in kept)

# Load Dataset 

In [330]:
# Load the game catalogue (path is relative to the notebook's working directory).
df = pd.read_csv('../raw_data/clean_df.csv' )
# Preview the first rows.
df.head()

Unnamed: 0,url,name,developer,tags,languages,genre,game_description,mature_content,price,reviews,date,achievements,op_sys,demo,trial,metadata
0,https://store.steampowered.com/app/379720/DOOM/,DOOM,id Software,fps gore action demon shooter first person gre...,english french italian german spanish spain ja...,action,game developed id software studio pioneered fi...,0,17.991,Very Positive,2016-05-12 00:00:00,1.0,windows,False,False,atmospheric co zombie demon fi fast blood gore...
1,https://store.steampowered.com/app/578080/PLAY...,PLAYERUNKNOWN'S BATTLEGROUNDS,PUBG Corporation,survival shooter multiplayer battle royale pvp...,english korean simplified chinese french germa...,action adventure massively multiplayer,game playerunknown battleground battle royale ...,1,26.991,Mixed,2017-12-21 00:00:00,1.0,windows,False,False,access simulation pvp tactical co adventure on...
2,https://store.steampowered.com/app/637090/BATT...,BATTLETECH,Harebrained Schemes,mechs strategy turn based turn based tactic sc...,english french german russian,action adventure strategy,game original battletech mechwarrior creator j...,0,35.991,Mostly Positive,2018-04-24 00:00:00,1.0,windows macOS Linux,False,False,character tactical adventure rpg robot rich ba...
3,https://store.steampowered.com/app/221100/DayZ/,DayZ,Bohemia Interactive,survival zombie open world multiplayer pvp mas...,english french italian german spanish spain cz...,action adventure massively multiplayer,game post soviet country chernarus struck unkn...,0,40.491,Mixed,2018-12-13 00:00:00,0.0,windows,False,False,access simulation open pvp atmospheric post ad...
4,https://store.steampowered.com/app/8500/EVE_On...,EVE Online,CCP,space massively multiplayer sci fi sandbox mmo...,english german russian french,action free play massively multiplayer rpg str...,game,0,,Mostly Positive,2003-05-06 00:00:00,0.0,windows macOS,False,False,atmospheric simulation open pvp tactical pve r...


In [331]:
# (rows, columns) of the loaded catalogue.
df.shape

(24567, 16)

In [332]:
# Number of missing values in each column.
df.isnull().sum()

url                     0
name                    3
developer              34
tags                    3
languages               0
genre                   6
game_description        0
mature_content          0
price                4558
reviews                 0
date                  259
achievements            0
op_sys              12383
demo                    0
trial                   0
metadata                3
dtype: int64

In [333]:
# Replace missing tags/genres with empty strings so they can be concatenated.
# Assign the result back instead of calling fillna(inplace=True) on a column
# selection: the chained in-place form is deprecated and does not modify `df`
# once pandas Copy-on-Write is enabled.
df['tags'] = df['tags'].fillna('')
df['genre'] = df['genre'].fillna('')

In [334]:
# `metadata` = the distinct words of `tags` and `genre`, in first-seen order.
# dict.fromkeys() removes duplicates while keeping a stable order; the word
# order produced by set() can differ from one kernel run to the next, which
# made the saved CSV non-reproducible.
df['metadata'] = (
    df['tags'].fillna('') + ' ' + df['genre'].fillna('')
).apply(lambda text: ' '.join(dict.fromkeys(text.split())))

In [335]:
# Preview the frame after rebuilding `metadata`.
df.head()

Unnamed: 0,url,name,developer,tags,languages,genre,game_description,mature_content,price,reviews,date,achievements,op_sys,demo,trial,metadata
0,https://store.steampowered.com/app/379720/DOOM/,DOOM,id Software,fps gore action demon shooter first person gre...,english french italian german spanish spain ja...,action,game developed id software studio pioneered fi...,0,17.991,Very Positive,2016-05-12 00:00:00,1.0,windows,False,False,atmospheric co zombie demon fi fast blood gore...
1,https://store.steampowered.com/app/578080/PLAY...,PLAYERUNKNOWN'S BATTLEGROUNDS,PUBG Corporation,survival shooter multiplayer battle royale pvp...,english korean simplified chinese french germa...,action adventure massively multiplayer,game playerunknown battleground battle royale ...,1,26.991,Mixed,2017-12-21 00:00:00,1.0,windows,False,False,access simulation pvp tactical co adventure on...
2,https://store.steampowered.com/app/637090/BATT...,BATTLETECH,Harebrained Schemes,mechs strategy turn based turn based tactic sc...,english french german russian,action adventure strategy,game original battletech mechwarrior creator j...,0,35.991,Mostly Positive,2018-04-24 00:00:00,1.0,windows macOS Linux,False,False,character tactical adventure rpg robot rich ba...
3,https://store.steampowered.com/app/221100/DayZ/,DayZ,Bohemia Interactive,survival zombie open world multiplayer pvp mas...,english french italian german spanish spain cz...,action adventure massively multiplayer,game post soviet country chernarus struck unkn...,0,40.491,Mixed,2018-12-13 00:00:00,0.0,windows,False,False,access simulation open pvp atmospheric post ad...
4,https://store.steampowered.com/app/8500/EVE_On...,EVE Online,CCP,space massively multiplayer sci fi sandbox mmo...,english german russian french,action free play massively multiplayer rpg str...,game,0,,Mostly Positive,2003-05-06 00:00:00,0.0,windows macOS,False,False,atmospheric simulation open pvp tactical pve r...


In [336]:
# Normalise the combined tag/genre words with clean_text.
df['metadata'] = df['metadata'].apply(clean_text)

# Normalise the free-text game description the same way.
df['game_description'] = df['game_description'].apply(clean_text)

In [337]:
# Normalise the tag words with clean_text.
df['tags'] = df['tags'].apply(clean_text)

In [338]:
# Normalise the supported-language names with clean_text.
df['languages'] = df['languages'].apply(clean_text)

In [339]:
# Normalise the genre words with clean_text.
df['genre'] = df['genre'].apply(clean_text)

In [340]:
# Preview the frame after text cleaning.
df.head()

Unnamed: 0,url,name,developer,tags,languages,genre,game_description,mature_content,price,reviews,date,achievements,op_sys,demo,trial,metadata
0,https://store.steampowered.com/app/379720/DOOM/,DOOM,id Software,fps gore action demon shooter first person gre...,english french italian german spanish spain ja...,action,game developed id software studio pioneered fi...,0,17.991,Very Positive,2016-05-12 00:00:00,1.0,windows,False,False,atmospheric co zombie demon fi fast blood gore...
1,https://store.steampowered.com/app/578080/PLAY...,PLAYERUNKNOWN'S BATTLEGROUNDS,PUBG Corporation,survival shooter multiplayer battle royale pvp...,english korean simplified chinese french germa...,action adventure massively multiplayer,game playerunknown battleground battle royale ...,1,26.991,Mixed,2017-12-21 00:00:00,1.0,windows,False,False,access simulation pvp tactical co adventure on...
2,https://store.steampowered.com/app/637090/BATT...,BATTLETECH,Harebrained Schemes,mechs strategy turn based turn based tactic sc...,english french german russian,action adventure strategy,game original battletech mechwarrior creator j...,0,35.991,Mostly Positive,2018-04-24 00:00:00,1.0,windows macOS Linux,False,False,character tactical adventure rpg robot rich ba...
3,https://store.steampowered.com/app/221100/DayZ/,DayZ,Bohemia Interactive,survival zombie open world multiplayer pvp mas...,english french italian german spanish spain cz...,action adventure massively multiplayer,game post soviet country chernarus struck unkn...,0,40.491,Mixed,2018-12-13 00:00:00,0.0,windows,False,False,access simulation open pvp atmospheric post ad...
4,https://store.steampowered.com/app/8500/EVE_On...,EVE Online,CCP,space massively multiplayer sci fi sandbox mmo...,english german russian french,action free play massively multiplayer rpg str...,game,0,,Mostly Positive,2003-05-06 00:00:00,0.0,windows macOS,False,False,atmospheric simulation open pvp tactical pve r...


In [341]:
# Persist the cleaned frame without the index column.
# NOTE(review): this writes to the same path the frame was loaded from, so the
# input is overwritten and every re-run cleans already-cleaned text. Consider a
# separate output file.
df.to_csv('../raw_data/clean_df.csv', index=False)

# Preprocessing

In [342]:
def create_pipeline(df, m=0.05, n=None):
    """Fit the preprocessing on ``df`` and return the transformed feature matrix.

    Features produced:
      * TF-IDF of the ``metadata`` text, densified and robust-scaled;
      * ``reviews`` ordinal-encoded from worst to best rating, then standardised
        (ratings not in the list are encoded as -1);
      * ``mature_content`` and ``achievements`` standardised.
    All other columns are dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``metadata``, ``reviews``, ``mature_content`` and
        ``achievements``.
    m : float or int, default 0.05
        ``min_df`` passed to ``TfidfVectorizer``.
    n : int or None, default None
        When given, the features are reduced with ``PCA(n_components=n)``.
        ``None`` skips PCA.

    Returns
    -------
    numpy.ndarray
        Transformed features, one row per row of ``df``.
    """
    # TfidfVectorizer returns a sparse matrix; RobustScaler cannot centre
    # sparse input, so convert to a dense array first.
    to_dense = FunctionTransformer(lambda array: array.toarray())

    meta_transf = make_pipeline(TfidfVectorizer(min_df=m), to_dense, RobustScaler())

    review_scale = [
        "Overwhelmingly Negative",
        "Very Negative",
        "Negative",
        "Mostly Negative",
        'Mixed',
        "Mostly Positive",
        "Positive",
        "Very Positive",
        "Overwhelmingly Positive",
    ]
    ord_encoder = OrdinalEncoder(
        categories=[review_scale],
        dtype=np.int64,
        handle_unknown="use_encoded_value",
        unknown_value=-1,
    )
    ord_transf = make_pipeline(ord_encoder, StandardScaler())

    num_transf = make_pipeline(StandardScaler())

    preproc_basic = make_column_transformer(
        # A bare string selects the column as 1-D text, as the vectorizer expects.
        (meta_transf, 'metadata'),
        (ord_transf, ['reviews']),
        (num_transf, ['mature_content', 'achievements']),
        remainder='drop',
    )

    if n is None:
        return preproc_basic.fit_transform(df)

    # Imported locally so this cell does not depend on a later import cell.
    from sklearn.decomposition import PCA
    return make_pipeline(preproc_basic, PCA(n_components=n)).fit_transform(df)

In [343]:
def train(X, y):
    """Fit a default-configured ``KNeighborsRegressor`` on ``X``/``y`` and return it."""
    knn = KNeighborsRegressor()
    knn.fit(X, y)
    return knn

In [344]:
def recommending_system(model, X, game, top_n=10):
    """Return the ``top_n`` nearest neighbours of ``game`` with their distances.

    Parameters
    ----------
    model : fitted estimator exposing ``kneighbors(X, n_neighbors=...)``
    X : pandas.DataFrame
        Feature matrix the model was fitted on, indexed by game name.
    game : label
        Index label of the query game. If the label occurs several times in
        ``X``, the first matching row is used.
    top_n : int, default 10
        Number of neighbours to return (capped at the number of rows in ``X``).

    Returns
    -------
    pandas.DataFrame
        One ``distance`` column indexed by neighbour name, nearest first. The
        query game itself is normally the first row, at distance 0.

    Raises
    ------
    KeyError
        If ``game`` is not in ``X.index``.
    """
    # Query once, and only for the neighbours that are returned. The size comes
    # from X rather than from a global frame.
    n_neighbors = min(top_n, X.shape[0])
    distances, positions = model.kneighbors(X.loc[[game]], n_neighbors=n_neighbors)

    return pd.DataFrame(
        distances[0],
        index=X.index[positions[0]],
        columns=['distance'],
    )


In [345]:
# Build the feature matrix from the catalogue and check its shape.
pipe = create_pipeline(df)
pipe.shape

(24567, 50)

In [346]:
# Index the feature rows by game name so a game can be looked up by title.
X = pd.DataFrame(pipe, index=df.name.tolist())

In [347]:
# Fit the neighbour model. The URL column is only passed because train() takes
# a target; the recommender below only calls model.kneighbors().
model = train(X, df['url'])

In [348]:
# Nearest games to 'Counter-Strike' in the full feature space.
recommending_system(model, X, 'Counter-Strike')

Unnamed: 0,distance
Counter-Strike,0.0
Armajet,1.170097
Horror Legends,1.21129
Team Fortress Classic,1.211677
The Light Keeps Us Safe,1.212547
Versus World,1.227177
Outworld Battlegrounds,1.236037
ArtPose Pro,1.262666
Hunt Showdown,1.346713
1001st Hyper Tower,1.354704


# Testing Models

In [349]:
# Load the first four columns of the user log. Because names= is given without
# header=, pandas treats the first line of the file as data.
user_df = pd.read_csv('../raw_data/steam-200k.csv',usecols=[0,1,2,3],names=['userid','game','behavior','hoursplayed'])

In [350]:
# Keep only the "play" rows; the behavior column is then constant, so drop it.
df_play = user_df.loc[user_df['behavior'] == 'play'].drop(columns='behavior')

# Restrict the play log to games that also exist in the main catalogue:
# inner-join the distinct played titles against `df` on the game name.
user_name = pd.DataFrame({'name': df_play['game'].unique()}).merge(df, on='name')
join_name = list(user_name['name'].unique())
df_play = df_play.loc[df_play['game'].isin(join_name)]

In [351]:
# Creating DF of users favorites 2 games

def get_fav_games(df,user):
    """Return the (up to) two games on which ``user`` has the most hours played.

    ``df`` needs the columns ``userid``, ``game`` and ``hoursplayed``. The
    result is a list ordered from most to least played.
    """
    user_rows = df.loc[df['userid'] == user]
    ranked = user_rows.sort_values(by='hoursplayed', ascending=False)
    return list(ranked['game'].head(2))

def get_user_list(df):
    """Return the ids of users having more than one (non-null) ``game`` row."""
    games_per_user = df.groupby('userid')['game'].count()
    multi_game_users = games_per_user[games_per_user > 1]
    return list(multi_game_users.index)

def get_fav_list(df):
    """Build a table of each user's two most-played games.

    Only users with more than one (non-null) ``game`` row are included.

    Parameters
    ----------
    df : pandas.DataFrame
        Play log with the columns ``userid``, ``game`` and ``hoursplayed``.

    Returns
    -------
    pandas.DataFrame
        Indexed by user id (ascending), with the columns ``most_fav_game`` and
        ``sec_fav_game``.

    Notes
    -----
    The frame is sorted once and ranked per user, instead of being filtered and
    sorted again for every user. The sort is stable, so games tied on hours keep
    their order of appearance in ``df``.
    """
    games_per_user = df.groupby('userid')['game'].count()
    eligible = games_per_user[games_per_user > 1].index

    ranked = (
        df[df['userid'].isin(eligible)]
        .sort_values(by='hoursplayed', ascending=False, kind='stable')
    )
    # 0 for a user's most-played row, 1 for the runner-up, and so on.
    rank_in_user = ranked.groupby('userid').cumcount().to_numpy()

    most = ranked[rank_in_user == 0].set_index('userid')['game']
    second = ranked[rank_in_user == 1].set_index('userid')['game']

    favourites = pd.DataFrame({'most_fav_game': most, 'sec_fav_game': second})
    return favourites.sort_index().rename_axis(None)


In [352]:
# One row per user with more than one played game: their two most-played titles.
test_df = get_fav_list(df_play)

In [353]:
def testing_models(df, model):

    df['distance'] = ''
    for index, row in df.iterrows():
        neighbors_list = list(model.kneighbors(X.loc[[row['most_fav_game']]],n_neighbors=df.shape[0])[1][0])
        res = pd.DataFrame(model.kneighbors(X.loc[[row['most_fav_game']]],n_neighbors=df.shape[0])[0][0],\
                           index = X.iloc[neighbors_list, :]\
                        .index, columns = ['distance']).loc[row['sec_fav_game']][0]
        df.loc[index, 'distance'] = res
        
    return df

In [354]:
# Sanity check: the catalogue shape is unchanged.
df.shape

(24567, 16)

# PCA

In [356]:
# Fit a PCA that keeps a single component of the feature matrix.
from sklearn.decomposition import PCA
pca = PCA(n_components=1)
pca.fit(X)

In [357]:
# Project every game onto the fitted component.
X_proj = pca.transform(X)

In [358]:
# Index the projected rows by game name, like X.
X_proj = pd.DataFrame(X_proj, index=df.name.tolist())

In [359]:

# Fit a second neighbour model on the PCA-projected features.
model_pca = train(X_proj, df['url'])

In [360]:
# Nearest games to 'Far Cry 3' in the one-component PCA space.
recommending_system(model_pca, X_proj, 'Far Cry 3')

Unnamed: 0,distance
Far Cry 3,0.0
Return to Castle Wolfenstein,1.7e-05
MEGALOMANIAC,5.9e-05
The Castle Disaster,5.9e-05
MOAI 2: Path to Another World,5.9e-05
Crimson Keep,0.000122
Car Mechanic Simulator 2015,0.000219
Serious Sam 4: Planet Badass,0.000287
Call of Duty: World at War,0.00033
Far Cry®,0.000346


In [317]:
def test(test_df, df, model , X , name ):
    for index, row in test_df.iterrows():
            neighbors_list = list(model.kneighbors(X.loc[[row['most_fav_game']]],n_neighbors=df.shape[0])[1][0])
            res = pd.DataFrame(model.kneighbors(X.loc[[row['most_fav_game']]],n_neighbors=df.shape[0])[0][0],\
                               index = X.iloc[neighbors_list, :]\
                            .index, columns = name).loc[row['sec_fav_game']][0]
            test_df.loc[index, name] = res
            
    return test_df



# Testing params

In [362]:
def create_pipeline(df, m=0.05, n=None):
    """Fit preprocessing (optionally followed by PCA) on ``df`` and return the
    transformed feature matrix.

    Features produced before PCA:
      * TF-IDF of the ``metadata`` text, densified and robust-scaled;
      * ``reviews`` ordinal-encoded from worst to best rating, then standardised
        (ratings not in the list are encoded as -1);
      * ``mature_content`` and ``achievements`` standardised.
    All other columns are dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``metadata``, ``reviews``, ``mature_content`` and
        ``achievements``.
    m : float or int, default 0.05
        ``min_df`` passed to ``TfidfVectorizer``.
    n : int or None, default None
        Number of PCA components. ``None`` skips PCA, so the one-argument call
        ``create_pipeline(df)`` used earlier in the notebook keeps working
        after this redefinition.

    Returns
    -------
    numpy.ndarray
        Transformed features, one row per row of ``df``.
    """
    # TfidfVectorizer returns a sparse matrix; RobustScaler cannot centre
    # sparse input, so convert to a dense array first.
    to_dense = FunctionTransformer(lambda array: array.toarray())

    meta_transf = make_pipeline(TfidfVectorizer(min_df=m), to_dense, RobustScaler())

    review_scale = [
        "Overwhelmingly Negative",
        "Very Negative",
        "Negative",
        "Mostly Negative",
        'Mixed',
        "Mostly Positive",
        "Positive",
        "Very Positive",
        "Overwhelmingly Positive",
    ]
    ord_encoder = OrdinalEncoder(
        categories=[review_scale],
        dtype=np.int64,
        handle_unknown="use_encoded_value",
        unknown_value=-1,
    )
    ord_transf = make_pipeline(ord_encoder, StandardScaler())

    num_transf = make_pipeline(StandardScaler())

    preproc_basic = make_column_transformer(
        # A bare string selects the column as 1-D text, as the vectorizer expects.
        (meta_transf, 'metadata'),
        (ord_transf, ['reviews']),
        (num_transf, ['mature_content', 'achievements']),
        remainder='drop',
    )

    if n is None:
        return preproc_basic.fit_transform(df)

    full_pipe = make_pipeline(preproc_basic, PCA(n_components=n))
    return full_pipe.fit_transform(df)

In [430]:
# Features with min_df=0.05 reduced to a single PCA component; check the shape.
pipe = create_pipeline(df, 0.05, 1)
pipe.shape

(24567, 1)

In [431]:
# Rebuild X and the model from the reduced features.
# NOTE: this replaces the X and model defined earlier in the notebook.
X = pd.DataFrame(pipe, index=df.name.tolist())
model = train(X, df['url'])

In [434]:
# Nearest games to 'Mad Games Tycoon' with the current X and model.
recommending_system(model, X, 'Mad Games Tycoon')

Unnamed: 0,distance
Mad Games Tycoon,0.0
Blueprint Tycoon,0.0
Boson X,0.000267
Farming Simulator 17,0.000287
The Dark Eye: Chains of Satinav,0.000307
Unforeseen Incidents,0.00033
Metro: Last Light Redux,0.00037
Left 4 Dead,0.000465
Year Walk,0.000659
Red Orchestra 2: Heroes of Stalingrad with Rising Storm,0.001031


In [392]:
def test_params(test_df,df, X, model, name):
    for index, row in test_df.iterrows():
        neighbors_list = list(model.kneighbors(X.loc[[row['most_fav_game']]],n_neighbors=df.shape[0])[1][0])
        res = pd.DataFrame(model.kneighbors(X.loc[[row['most_fav_game']]],n_neighbors=df.shape[0])[0][0],\
                           index = X.iloc[neighbors_list, :]\
                        .index, columns = [name]).loc[row['sec_fav_game']][0]
        test_df.loc[index, name] = res
    return test_df

In [398]:
def check_params(df, m, n):
    """Evaluate one (``m``, ``n``) setting against the notebook-level ``test_df``.

    Builds the features with ``create_pipeline(df, m, n)``, fits a model on
    them, and lets ``test_params`` write the favourite-to-second-favourite
    distances into a ``test_df`` column named ``"<m>_<n>"``. Returns ``test_df``.
    """
    features = pd.DataFrame(create_pipeline(df, m, n), index=df.name.tolist())
    fitted = train(features, df['url'])
    column = '_'.join((str(m), str(n)))
    test_params(test_df, df, features, fitted, column)
    return test_df

In [418]:
# Evaluate min_df=0.05 with two PCA components; adds the '0.05_2' column to test_df.
check_params(df, 0.05, 2)

Unnamed: 0,most_fav_game,sec_fav_game,distance,0.05_0.05,0.05_1,0.05_2
5250,Portal 2,Alien Swarm,0.232685,0.250193,0.231537,0.232685
76767,Counter-Strike,Banished,4.061292,4.063091,1.264747,4.061292
86540,Far Cry 3,Left 4 Dead 2,3.257763,4.088480,1.435829,3.257763
229911,Counter-Strike,Worms Reloaded,4.038592,4.044377,1.228317,4.038592
298950,Team Fortress 2,Far Cry 3,3.03484,4.004455,0.996047,3.034840
...,...,...,...,...,...,...
308468736,Magic Duels,War Thunder,0.226368,0.227665,0.208186,0.226368
308695132,Champions Online,Brawlhalla,0.369265,0.369413,0.368585,0.369265
308760273,Toribash,Unturned,0.214967,0.272196,0.205379,0.214967
309052991,Brawlhalla,Heroes & Generals,1.843958,2.218917,1.794470,1.843958


In [423]:
# Sum of the favourite-to-second-favourite distances for the '0.05_2' setting.
test_df['0.05_2'].sum()

6228.462223104942

In [420]:
# Rebuild the single-component features (min_df=0.05).
pipe = create_pipeline(df, 0.05, 1)

In [421]:
# Rebuild X and the model from those features (replaces the earlier X and model).
X = pd.DataFrame(pipe, index=df.name.tolist())
model = train(X, df['url'])

In [426]:
# Nearest games to 'Left 4 Dead 2' with the current X and model.
recommending_system(model, X, 'Left 4 Dead 2')

Unnamed: 0,distance
Left 4 Dead 2,0.0
Hexcells Infinite,0.000446
Another Brick in The Mall,0.001737
RESIDENT EVIL 2 / BIOHAZARD RE:2,0.001738
Magic 2014 — Duels of the Planeswalkers,0.001925
Contradiction - Spot The Liar!,0.003112
Kingdom Rush Frontiers,0.003742
SENRAN KAGURA Burst Re:Newal,0.004515
Hyperdevotion Noire: Goddess Black Heart (Neptunia),0.004747
Saints Row: The Third,0.006201


In [None]:
# Grid over min_df (m) and the number of PCA components (n). Each call refits
# the pipeline and model, and adds an '<m>_<n>' distance column to test_df.
for m in [0.02, 0.05, 0.08]:
    for n in range (1,10):
        check_params(df, m, n)