In [2]:
import numpy as np # import for math and array operations
import pandas as pd # import for dataframe handle
import matplotlib.pyplot as plt # import for visual representation
import seaborn as sns # import for visual representation
import math
from bs4 import BeautifulSoup
import requests
import datetime as dt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsRegressor
from sklearn import preprocessing

%load_ext autoreload
%autoreload 2

In [3]:
df = pd.read_csv('../raw_data/combine_df.csv')

In [4]:
def clean_tags(tag):
    if ',' in str(tag):
        tag = ','.join(list(set(str(tag).split(','))))
        return tag
df['tags'] = df["popular_tags"].astype(str) +',' + df['genre'].astype(str)
df['tags'] = df['tags'].apply(lambda x : clean_tags(x))

In [5]:
df.shape[0]

38021

In [6]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords 
from unidecode import unidecode
import string

def clean (text):
    
    for punctuation in string.punctuation:
        text = text.replace(punctuation, ' ') # Remove Punctuation
        
    lowercased = text.lower() # Lower Case
    tokenized = word_tokenize(lowercased) # Tokenize
    words_only = [word for word in tokenized if word.isalpha()] # Remove numbers
    stop_words = set(stopwords.words('English')) # Make stopword list
    without_stopwords = [word for word in words_only if not word in stop_words] # Remove Stop Words
    return " ".join(without_stopwords)

In [7]:
df['clean_tag'] = df['tags'].apply(clean)

In [8]:
df = df.drop(['tags','genre' , 'popular_tags'],axis  =1)

In [9]:
# Tuned TFidfvectorizer
vec = TfidfVectorizer(min_df = 0.05).fit(df.clean_tag)
vectors = vec.transform(df.clean_tag) # Transform text to vectors


In [10]:
X_proj = pd.DataFrame(vectors.toarray(), index=df.name.tolist())

# Define X and y
y = X_proj[0]
knn_model = KNeighborsRegressor().fit(X_proj,y) 

In [11]:
#Test
neighbors_list = list(knn_model.kneighbors(X_proj.loc[['Counter-Strike']],n_neighbors=df.shape[0])[1][0])
pd.DataFrame(knn_model.kneighbors(X_proj.loc[['Counter-Strike']],n_neighbors=df.shape[0])[0][0], index = X_proj.iloc[neighbors_list, :].index, columns = ['distance']).head(10)

Unnamed: 0,distance
Counter-Strike,0.0
Project Xandata,0.365622
Holdfast: Nations At War,0.50544
Primal Carnage: Extinction,0.551667
ARMA: Cold War Assault,0.567837
Primal Carnage,0.577521
Nexuiz,0.600543
BLACK CLOVER: QUARTET KNIGHTS,0.600543
Arma Tactics,0.618359
Fortress Forever,0.638026


### Testing

In [28]:
user_df= pd.read_csv('../raw_data/steam-200k.csv',usecols=[0,1,2,3],names=['userid','game','behavior','hoursplayed'])
user_df.head()

Unnamed: 0,userid,game,behavior,hoursplayed
0,151603712,The Elder Scrolls V Skyrim,purchase,1.0
1,151603712,The Elder Scrolls V Skyrim,play,273.0
2,151603712,Fallout 4,purchase,1.0
3,151603712,Fallout 4,play,87.0
4,151603712,Spore,purchase,1.0


In [29]:
user_df['name']=user_df['game']
user_df = user_df[user_df['behavior']== 'play']

In [30]:
user_test= pd.DataFrame(user_df['name'].unique(),columns=['name'])
user_test

Unnamed: 0,name
0,The Elder Scrolls V Skyrim
1,Fallout 4
2,Spore
3,Fallout New Vegas
4,Left 4 Dead 2
...,...
3595,Space Colony
3596,Life is Hard
3597,Executive Assault
3598,O.R.B.


In [31]:
join_df=df.merge(user_test, on='name')
join_name = list(join_df.name.unique())

In [32]:
len(join_name)

1513

In [33]:
test_df = user_df[user_df['game'].isin(join_name)]

In [34]:
test_df = test_df.drop(columns = ['game', 'behavior'])

In [35]:
test_df

Unnamed: 0,userid,hoursplayed,name
3,151603712,87.0,Fallout 4
9,151603712,8.9,Left 4 Dead 2
11,151603712,8.5,HuniePop
13,151603712,8.1,Path of Exile
15,151603712,7.5,Poly Bridge
...,...,...,...
199987,128470551,2.6,Rogue Legacy
199989,128470551,2.5,Mortal Kombat Komplete Edition
199993,128470551,2.2,Magic Duels
199995,128470551,1.5,Titan Souls


In [36]:
test_df.groupby(['userid', 'name']).sum().sort_values(['userid','hoursplayed'], ascending = [True, False]).reset_index().groupby(['userid']).first()

Unnamed: 0_level_0,name,hoursplayed
userid,Unnamed: 1_level_1,Unnamed: 2_level_1
5250,Portal 2,13.6
76767,Counter-Strike,365.0
86540,Far Cry 3,17.8
144736,Counter-Strike,0.1
181212,Counter-Strike,1.8
...,...,...
309265377,Brawlhalla,2.0
309404240,Unturned,13.0
309434439,Dota 2,0.8
309824202,Dota 2,0.7


In [37]:
test_df.groupby(['userid', 'name']).sum().sort_values(['userid','hoursplayed'], ascending = [True, False])

Unnamed: 0_level_0,Unnamed: 1_level_0,hoursplayed
userid,name,Unnamed: 2_level_1
5250,Portal 2,13.6
5250,Alien Swarm,4.9
5250,Team Fortress 2,0.8
5250,Dota 2,0.2
76767,Counter-Strike,365.0
...,...,...
309404240,AdVenture Capitalist,0.7
309404240,Transformice,0.3
309434439,Dota 2,0.8
309824202,Dota 2,0.7


In [41]:
count_df = test_df[test_df['hoursplayed'] > 1]

In [42]:
list_user_more_hour = list(count_df[count_df['hoursplayed'] > 1].index)

In [43]:
len(list(set(count_df[count_df['hoursplayed'] > 1].reset_index().userid)))

7551

In [44]:
sum_df = test_df[test_df['userid'].isin(list_user_more_hour) == True].groupby('userid').count()[['game']]

KeyError: "None of [Index(['game'], dtype='object')] are in the [columns]"

In [None]:
sum_df

In [None]:
list_user_more_game = list(sum_df[sum_df['game'] > 1].index)

In [None]:
final_test_df = test_df[test_df['userid'].isin(list_user_more_game) == True]

In [45]:
game_df = test_df.groupby('userid').count()[['game']]

KeyError: "None of [Index(['game'], dtype='object')] are in the [columns]"

In [None]:
list(game_df[game_df['game'] > 1].index)

In [None]:
final_test_df.groupby(['userid','game']).sum().sort_values('hoursplayed')

## Game Description

In [46]:
from nltk.stem import WordNetLemmatizer

def clean_2 (text):
    
    for punctuation in string.punctuation:
        text = text.replace(punctuation, ' ') # Remove Punctuation
        
    lowercased = text.lower() # Lower Case
    lemmatizer = WordNetLemmatizer()
    #lemmatized = [lemmatizer.lemmatize(word) for word in lowercased]
    #lowercased = lemmatized     
    tokenized = word_tokenize(lowercased) # Tokenize
    words_only = [word for word in tokenized if word.isalpha()] # Remove numbers
    stop_words = set(stopwords.words('English')) # Make stopword list
    without_stopwords = [word for word in words_only if not word in stop_words]# Remove Stop Words
    lemmatized = [lemmatizer.lemmatize(word) for word in without_stopwords]
    
    return " ".join(lemmatized)



In [47]:
df['game_description']= df['game_description'].fillna('')
df['clean_des'] = df['game_description'].apply(clean_2)
#df = df.drop(['game_description'],axis  =1)

In [65]:
# Tuned TFidfvectorizer
vec_2 = TfidfVectorizer(min_df = 0.2).fit(df.clean_des)
vectors_2 = vec_2.transform(df.clean_des) # Transform text to vectors


In [66]:
X_proj_2 = pd.DataFrame(vectors_2.toarray(), index=df.name.tolist())

# Define X and y
y = X_proj_2[0]
knn_model = KNeighborsRegressor().fit(X_proj_2,y) 

In [67]:
neighbors_list = list(knn_model.kneighbors(X_proj_2.loc[['QUAKE II']],n_neighbors=38021)[1][0])
pd.DataFrame(knn_model.kneighbors(X_proj_2.loc[['QUAKE II']],n_neighbors=38021)[0][0], index = X_proj_2.iloc[neighbors_list, :].index, columns = ['distance']).loc['Blood of Patriots']

distance    0.513091
Name: Blood of Patriots, dtype: float64

In [68]:
neighbors_list = list(knn_model.kneighbors(X_proj_2.loc[['DOOM']],n_neighbors=10)[1][0])
pd.DataFrame(knn_model.kneighbors(X_proj_2.loc[['DOOM']],n_neighbors=10)[0][0], index = X_proj_2.iloc[neighbors_list, :].index, columns = ['distance'])

Unnamed: 0,distance
DOOM,0.0
Pumped BMX +,0.367757
ShapeRockets,0.416261
The Isle,0.418785
Pop Island - Let's Code !!!,0.427378
Braveland Heroes,0.429762
Dragon Souls,0.435963
DayZ,0.436352
Eden Rising,0.439171
Onirism,0.448336


In [52]:
knn_model.predict(X_proj_2.loc[['Battlegun']])

array([0.41313401])

In [69]:
neighbors_list

[0, 7249, 18005, 155, 32474, 4171, 34838, 3, 916, 3625]

In [70]:
X_proj_2.iloc[neighbors_list, :].index

Index(['DOOM', 'Pumped BMX +', 'ShapeRockets', 'The Isle',
       'Pop Island - Let's Code !!!', 'Braveland Heroes', 'Dragon Souls',
       'DayZ', 'Eden Rising', 'Onirism'],
      dtype='object')

In [53]:
knn_model.kneighbors(X_proj_2.loc[['DOOM']],n_neighbors=10)[0][0]

array([0.        , 0.36775738, 0.41626061, 0.41878506, 0.42737789,
       0.42976182, 0.43596269, 0.43635249, 0.43917078, 0.44833614])

## TESTING MODELS

In [54]:
user_df = pd.read_csv('../raw_data/steam-200k.csv',usecols=[0,1,2,3],names=['userid','game','behavior','hoursplayed'])

In [55]:
df_play = user_df[user_df['behavior']=='play']

In [56]:
df_play=df_play.drop(columns='behavior')

In [57]:
user_name= pd.DataFrame(df_play['game'].unique(),columns=['name'])

In [58]:
join_df=df.merge(user_name, on='name')
join_name = list(join_df.name.unique())

In [59]:
df_play = df_play[df_play['game'].isin(join_name)]

In [60]:
game_df = df_play.groupby('userid').count()[['game']]
user_list=list(game_df[game_df['game'] > 1].index)

In [61]:
def get_fav_games(df,user):
    db = df[df['userid']==user].sort_values(by='hoursplayed', ascending=False)
    return list(db['game'].iloc[0:2])

def get_user_list(df):
    temp_df=df.groupby('userid').count()[['game']]
    return list(temp_df[temp_df['game']>1].index)

def get_fav_list(df):
    user_list= get_user_list(df)
    fav_list=[]
    for user in user_list:
        fav_list.append(get_fav_games(df,user))
    fav1=[]
    fav2=[]
    for fav in fav_list:
        fav1.append(fav[0])
        fav2.append(fav[1])
    return pd.DataFrame(data=list(zip( fav1, fav2)),
                         columns=['most_fav_game', 'sec_fav_game'],index=user_list)

In [62]:
test_df = get_fav_list(df_play)

In [296]:
test_df['distance_1'] = ''

In [297]:
for index, row in test_df.iterrows():
    res = pd.DataFrame(knn_model.kneighbors(X_proj.loc[[row['most_fav_game']]],n_neighbors=df.shape[0])[0][0], index = X_proj.iloc[neighbors_list, :]\
             .index, columns = ['distance_1']).loc[row['sec_fav_game']][0]
    test_df.loc[index, 'distance_1'] = res


In [63]:
X_proj.iloc[neighbors_list, :]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,22,23,24,25,26,27,28,29,30,31
DOOM,0.0,0.146101,0.0,0.275495,0.0,0.0,0.298439,0.307517,0.0,0.0,...,0.0,0.309003,0.0,0.214804,0.271478,0.0,0.0,0.0,0.0,0.0
Pumped BMX +,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ShapeRockets,0.0,0.313192,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.6624,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
The Isle,0.264294,0.144095,0.1582,0.0,0.299058,0.0,0.294341,0.0,0.264294,0.0,...,0.204469,0.0,0.187222,0.0,0.0,0.0,0.193135,0.0,0.0,0.309852
Pop Island - Let's Code !!!,0.0,0.548985,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.713297,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Braveland Heroes,0.208129,0.0,0.124581,0.213971,0.47101,0.126895,0.231791,0.0,0.208129,0.236091,...,0.322034,0.0,0.0,0.0,0.0,0.0,0.304184,0.0,0.0,0.0
Dragon Souls,0.0,0.242566,0.26631,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.344198,0.0,0.0,0.0,0.0,0.0,0.32512,0.0,0.0,0.0
DayZ,0.234482,0.127841,0.140355,0.241064,0.0,0.0,0.26114,0.0,0.234482,0.0,...,0.0,0.270383,0.166104,0.0,0.0,0.0,0.0,0.0,0.0,0.274902
Eden Rising,0.226078,0.123259,0.135325,0.0,0.0,0.0,0.503561,0.0,0.226078,0.0,...,0.174903,0.0,0.0,0.0,0.0,0.0,0.165209,0.0,0.0,0.265049
Onirism,0.593103,0.323363,0.355017,0.0,0.0,0.0,0.0,0.0,0.593103,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [64]:
neighbors_list

[0, 7249, 18005, 155, 32474, 4171, 34838, 3, 916, 3625]

In [323]:
test_df['distance_05'].sum()

3032.2850842778075

In [72]:
test_df_2 = get_fav_list(df_play)

In [74]:
test_df_2['distance_02'] = ''

for index, row in test_df_2.iterrows():
    neighbors_list = list(knn_model.kneighbors(X_proj_2.loc[[row['most_fav_game']]],n_neighbors=df.shape[0])[1][0])
    res = pd.DataFrame(knn_model.kneighbors(X_proj_2.loc[[row['most_fav_game']]],n_neighbors=df.shape[0])[0][0], index = X_proj_2.iloc[neighbors_list, :]\
             .index, columns = ['distance_02']).loc[row['sec_fav_game']][0]
    test_df_2.loc[index, 'distance_02'] = res

In [75]:
test_df_2

Unnamed: 0,most_fav_game,sec_fav_game,distance_02
5250,Portal 2,Alien Swarm,0.880324
76767,Counter-Strike,Banished,1.212521
86540,Far Cry 3,Left 4 Dead 2,0.997489
229911,Counter-Strike,Worms Reloaded,1.12462
298950,Team Fortress 2,Far Cry 3,1.074816
...,...,...,...
308468736,Magic Duels,War Thunder,1.032863
308695132,Champions Online,Brawlhalla,0.681081
308760273,Toribash,Unturned,1.138748
309052991,Brawlhalla,Heroes & Generals,1.024469


In [76]:
test_df_2['distance_02'].sum()

3123.457132881227