# Imports

In [1]:
import pandas as pd # import for dataframe handle
import numpy as np # import for math and array operations
import matplotlib.pyplot as plt # import for visual representation
#import seaborn as sns # import for visual representation

from bs4 import BeautifulSoup
import requests

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords 
import string

# pipeline imports
from sklearn import set_config; set_config(display='diagram')
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import make_union
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.preprocessing import FunctionTransformer

# scalers, encoder, knn, vectorizer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import RobustScaler, StandardScaler

%load_ext autoreload
%autoreload 2

In [2]:
from nltk.stem import WordNetLemmatizer

def clean_text(text):
    
    for punctuation in string.punctuation:
        text = text.replace(punctuation, ' ') # Remove Punctuation
        
    lowercased = text.lower() # Lower Case
    lemmatizer = WordNetLemmatizer()
    #lemmatized = [lemmatizer.lemmatize(word) for word in lowercased]
    #lowercased = lemmatized     
    tokenized = word_tokenize(lowercased) # Tokenize
    words_only = [word for word in tokenized if word.isalpha()] # Remove numbers
    stop_words = set(stopwords.words('English')) # Make stopword list
    without_stopwords = [word for word in words_only if not word in stop_words]# Remove Stop Words
    lemmatized = [lemmatizer.lemmatize(word) for word in without_stopwords]
    
    return " ".join(lemmatized)

# Load Dataset 

In [3]:
df = pd.read_csv('../raw_data/clean_df.csv' )
df.head()

Unnamed: 0,url,name,developer,tags,languages,genre,game_description,mature_content,price,reviews,date,achievements,op_sys,demo,trial
0,https://store.steampowered.com/app/379720/DOOM/,DOOM,id Software,"FPS,Gore,Action,Demons,Shooter,First-Person,Gr...","English,French,Italian,German,Spanish - Spain,...",Action,"About This Game Developed by id software, the...",0,17.991,Very Positive,2016-05-12 00:00:00,1.0,windows,False,False
1,https://store.steampowered.com/app/578080/PLAY...,PLAYERUNKNOWN'S BATTLEGROUNDS,PUBG Corporation,"Survival,Shooter,Multiplayer,Battle Royale,PvP...","English,Korean,Simplified Chinese,French,Germa...","Action,Adventure,Massively Multiplayer",About This Game PLAYERUNKNOWN'S BATTLEGROUND...,1,26.991,Mixed,2017-12-21 00:00:00,1.0,windows,False,False
2,https://store.steampowered.com/app/637090/BATT...,BATTLETECH,Harebrained Schemes,"Mechs,Strategy,Turn-Based,Turn-Based Tactics,S...","English,French,German,Russian","Action,Adventure,Strategy",About This Game From original BATTLETECH/Mec...,0,35.991,Mostly Positive,2018-04-24 00:00:00,1.0,windows macOS Linux,False,False
3,https://store.steampowered.com/app/221100/DayZ/,DayZ,Bohemia Interactive,"Survival,Zombies,Open World,Multiplayer,PvP,Ma...","English,French,Italian,German,Spanish - Spain,...","Action,Adventure,Massively Multiplayer",About This Game The post-soviet country of Ch...,0,40.491,Mixed,2018-12-13 00:00:00,0.0,windows,False,False
4,https://store.steampowered.com/app/8500/EVE_On...,EVE Online,CCP,"Space,Massively Multiplayer,Sci-fi,Sandbox,MMO...","English,German,Russian,French","Action,Free to Play,Massively Multiplayer,RPG,...",About This Game,0,,Mostly Positive,2003-05-06 00:00:00,0.0,windows macOS,False,False


In [4]:
df.isnull().sum()

url                     0
name                    3
developer              34
tags                    3
languages               0
genre                   5
game_description        0
mature_content          0
price                4558
reviews                 0
date                  259
achievements            0
op_sys              12383
demo                    0
trial                   0
dtype: int64

In [5]:
df['tags'].fillna('', inplace=True)
df['genre'].fillna('', inplace=True)

In [6]:
df['metadata'] = df[['tags', 'genre']].apply(lambda x: ' '.join(x), axis = 1)


df['metadata'] = df['metadata'].apply(
    lambda x: ' '.join(list(set(x.split())))
)

In [7]:
df.head()

Unnamed: 0,url,name,developer,tags,languages,genre,game_description,mature_content,price,reviews,date,achievements,op_sys,demo,trial,metadata
0,https://store.steampowered.com/app/379720/DOOM/,DOOM,id Software,"FPS,Gore,Action,Demons,Shooter,First-Person,Gr...","English,French,Italian,German,Spanish - Spain,...",Action,"About This Game Developed by id software, the...",0,17.991,Very Positive,2016-05-12 00:00:00,1.0,windows,False,False,"FPS,Gore,Action,Demons,Shooter,First-Person,Gr..."
1,https://store.steampowered.com/app/578080/PLAY...,PLAYERUNKNOWN'S BATTLEGROUNDS,PUBG Corporation,"Survival,Shooter,Multiplayer,Battle Royale,PvP...","English,Korean,Simplified Chinese,French,Germa...","Action,Adventure,Massively Multiplayer",About This Game PLAYERUNKNOWN'S BATTLEGROUND...,1,26.991,Mixed,2017-12-21 00:00:00,1.0,windows,False,False,"Co-Op,Tactical,Co-op,First-Person,Early Royale..."
2,https://store.steampowered.com/app/637090/BATT...,BATTLETECH,Harebrained Schemes,"Mechs,Strategy,Turn-Based,Turn-Based Tactics,S...","English,French,German,Russian","Action,Adventure,Strategy",About This Game From original BATTLETECH/Mec...,0,35.991,Mostly Positive,2018-04-24 00:00:00,1.0,windows macOS Linux,False,False,"Customization,Management,Adventure,Space,Story..."
3,https://store.steampowered.com/app/221100/DayZ/,DayZ,Bohemia Interactive,"Survival,Zombies,Open World,Multiplayer,PvP,Ma...","English,French,Italian,German,Spanish - Spain,...","Action,Adventure,Massively Multiplayer",About This Game The post-soviet country of Ch...,0,40.491,Mixed,2018-12-13 00:00:00,0.0,windows,False,False,"World,Multiplayer,PvP,Massively Survival,Zombi..."
4,https://store.steampowered.com/app/8500/EVE_On...,EVE Online,CCP,"Space,Massively Multiplayer,Sci-fi,Sandbox,MMO...","English,German,Russian,French","Action,Free to Play,Massively Multiplayer,RPG,...",About This Game,0,,Mostly Positive,2003-05-06 00:00:00,0.0,windows macOS,False,False,"World,RPG,PvP,Multiplayer,Free Action,Free Mul..."


In [8]:
df['metadata'] = df['metadata'].apply(clean_text)

df['game_description'] = df['game_description'].apply(clean_text)

In [9]:
df['tags'] = df['tags'].apply(clean_text)

In [10]:
df['languages'] = df['languages'].apply(clean_text)

In [11]:
df['genre'] = df['genre'].apply(clean_text)

In [12]:
df.head()

Unnamed: 0,url,name,developer,tags,languages,genre,game_description,mature_content,price,reviews,date,achievements,op_sys,demo,trial,metadata
0,https://store.steampowered.com/app/379720/DOOM/,DOOM,id Software,fps gore action demon shooter first person gre...,english french italian german spanish spain ja...,action,game developed id software studio pioneered fi...,0,17.991,Very Positive,2016-05-12 00:00:00,1.0,windows,False,False,fps gore action demon shooter first person gre...
1,https://store.steampowered.com/app/578080/PLAY...,PLAYERUNKNOWN'S BATTLEGROUNDS,PUBG Corporation,survival shooter multiplayer battle royale pvp...,english korean simplified chinese french germa...,action adventure massively multiplayer,game playerunknown battleground battle royale ...,1,26.991,Mixed,2017-12-21 00:00:00,1.0,windows,False,False,co op tactical co op first person early royale...
2,https://store.steampowered.com/app/637090/BATT...,BATTLETECH,Harebrained Schemes,mechs strategy turn based turn based tactic sc...,english french german russian,action adventure strategy,game original battletech mechwarrior creator j...,0,35.991,Mostly Positive,2018-04-24 00:00:00,1.0,windows macOS Linux,False,False,customization management adventure space story...
3,https://store.steampowered.com/app/221100/DayZ/,DayZ,Bohemia Interactive,survival zombie open world multiplayer pvp mas...,english french italian german spanish spain cz...,action adventure massively multiplayer,game post soviet country chernarus struck unkn...,0,40.491,Mixed,2018-12-13 00:00:00,0.0,windows,False,False,world multiplayer pvp massively survival zombi...
4,https://store.steampowered.com/app/8500/EVE_On...,EVE Online,CCP,space massively multiplayer sci fi sandbox mmo...,english german russian french,action free play massively multiplayer rpg str...,game,0,,Mostly Positive,2003-05-06 00:00:00,0.0,windows macOS,False,False,world rpg pvp multiplayer free action free mul...


In [13]:
df.to_csv('../raw_data/clean_df.csv', index=False)

# Preprocessing

In [42]:
def create_pipeline(df):
    array_transf = FunctionTransformer(lambda array: array.toarray())
    
    meta_transf = make_pipeline(TfidfVectorizer(min_df=0.05), array_transf,RobustScaler())
    desc_transf = make_pipeline(TfidfVectorizer(min_df=0.1),array_transf, RobustScaler())
    
    ord_encoder = OrdinalEncoder(
        categories=[
            [
                "Overwhelmingly Negative",
                "Very Negative",
                "Negative",
                "Mostly Negative",
                'Mixed',
                "Mostly Positive",
                "Positive",
                "Very Positive",
                "Overwhelmingly Positive"
            ]],
        dtype=np.int64,
        handle_unknown="use_encoded_value",
        unknown_value=-1
    )
    
    ord_transf = make_pipeline(ord_encoder, StandardScaler())
    
    num_transf = make_pipeline(StandardScaler())


    preproc_basic = make_column_transformer(
        (meta_transf, 'metadata'),
        (desc_transf, 'game_description'),
        (ord_transf, ['reviews']),
        (num_transf, ['mature_content', 'achievements']),
        remainder='drop'
    )
    
    return preproc_basic.fit_transform(df)

In [43]:
def train(X, y):
    return KNeighborsRegressor().fit(X,y)

In [44]:
def recommending_system(model, X, game):
    
    neighbors_index = model.kneighbors(X.loc[[game]],n_neighbors=df.shape[0])[1][0]
    neighbors_distance = model.kneighbors(X.loc[[game]],n_neighbors=df.shape[0])[0][0]
    
    neighbors_list = list(neighbors_index)
    return pd.DataFrame(neighbors_distance, index = X.iloc[neighbors_list, :].index, columns=['distance']).head(10)
    

In [45]:
pipe = create_pipeline(df)
pipe.shape

(24567, 172)

In [46]:
X = pd.DataFrame(pipe, index=df.name.tolist())

In [47]:
model = train(X, df['url'])

In [48]:
recommending_system(model, X, 'Counter-Strike')

Unnamed: 0,distance
Counter-Strike,0.0
10-4 Indirect Contact,3.616833
Long Gone Days,4.009126
Ultramegon,4.128793
Soul Saber 2,4.209247
Titan Outpost,4.215098
Sekiro™: Shadows Die Twice,4.262409
Battle Grounds III,4.375661
Sakura Clicker,4.483817
Feral Fury,4.49075


# Testing Models

In [27]:
user_df = pd.read_csv('../raw_data/steam-200k.csv',usecols=[0,1,2,3],names=['userid','game','behavior','hoursplayed'])

In [28]:
# keeping only play entries

df_play = user_df[user_df['behavior']=='play']
df_play=df_play.drop(columns='behavior')

#keeping only games that are also in the main dataset
user_name= pd.DataFrame(df_play['game'].unique(),columns=['name']).merge(df, on = 'name')
join_name = list(user_name.name.unique())
df_play = df_play[df_play['game'].isin(join_name)]

In [29]:
# Creating DF of users favorites 2 games

def get_fav_games(df,user):
    db = df[df['userid']==user].sort_values(by='hoursplayed', ascending=False)
    return list(db['game'].iloc[0:2])

def get_user_list(df):
    temp_df=df.groupby('userid').count()[['game']]
    return list(temp_df[temp_df['game']>1].index)

def get_fav_list(df):
    user_list= get_user_list(df)
    fav_list=[]
    for user in user_list:
        fav_list.append(get_fav_games(df,user))
    fav1=[]
    fav2=[]
    for fav in fav_list:
        fav1.append(fav[0])
        fav2.append(fav[1])
    return pd.DataFrame(data=list(zip( fav1, fav2)),
                         columns=['most_fav_game', 'sec_fav_game'],index=user_list)


In [30]:
test_df = get_fav_list(df_play)

In [31]:
def testing_models(df, model):

    df['distance'] = ''
    for index, row in df.iterrows():
        neighbors_list = list(model.kneighbors(X.loc[[row['most_fav_game']]],n_neighbors=df.shape[0])[1][0])
        res = pd.DataFrame(model.kneighbors(X.loc[[row['most_fav_game']]],n_neighbors=df.shape[0])[0][0],\
                           index = X.iloc[neighbors_list, :]\
                        .index, columns = ['distance']).loc[row['sec_fav_game']][0]
        df.loc[index, 'distance'] = res
        
    return df

In [38]:
test_df['distance'] = ''

In [None]:
for index, row in test_df.iterrows():
        neighbors_list = list(model.kneighbors(X.loc[[row['most_fav_game']]],n_neighbors=df.shape[0])[1][0])
        res = pd.DataFrame(model.kneighbors(X.loc[[row['most_fav_game']]],n_neighbors=df.shape[0])[0][0],\
                           index = X.iloc[neighbors_list, :]\
                        .index, columns = ['distance']).loc[row['sec_fav_game']][0]
        test_df.loc[index, 'distance'] = res

In [None]:
test_df['distance'].sum()