# Steam Recommender System

**Reading in the Steam dataset and libraries/packages**

In [1]:
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last
# one; later cells rely on this to show several results without print().
InteractiveShell.ast_node_interactivity = "all"

import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
# TensorFlow is loaded through its v1 compatibility API with v2 behaviour
# switched off. NOTE(review): nothing in the visible cells uses `tf` — confirm
# it is needed further down before keeping this import.
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior() 
import os

# Print the full path of every file found under the Kaggle input directory.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(folder, name)
        print(full_path)

## Data Loading and Exploration

In [2]:
# Read the first four columns of the header-less CSV and name them explicitly.
steam_raw = pd.read_csv(
    "../input/steam-video-games/steam-200k.csv",
    usecols=[0, 1, 2, 3],
    names=['userid', 'game', 'behavior', 'hoursplayed'],
)
steam_raw.head()

# True if any cell in the frame is missing.
steam_raw.isnull().values.any()

# User ids are identifiers, not quantities: store them as strings.
steam_raw = steam_raw.assign(userid=steam_raw['userid'].astype(str))
steam_raw.describe()

In [3]:
# Aggregate only the numeric column. The frame also holds the string columns
# `userid` and `behavior`; reducing those either raises (mean) or concatenates
# strings (sum) on pandas >= 2.0, where non-numeric columns are no longer
# dropped silently.
# Note: rows of every behavior are included here, so each game's mean/sum is
# taken over all of its rows in steam_raw.
hours_by_game = steam_raw.groupby('game')[['hoursplayed']]

# Top five games by average hours per row, then by total hours.
hours_by_game.mean().sort_values(by="hoursplayed", ascending=False).head()
hours_by_game.sum().sort_values(by="hoursplayed", ascending=False).head()

**Eastside Hockey Manager has the highest average hours played, while Dota 2 has the most total hours played.**

In [4]:
# Number of distinct games and distinct users (missing values, if any, count
# as one distinct value, exactly as with len(unique())).
steam_raw['game'].nunique(dropna=False)
steam_raw['userid'].nunique(dropna=False)

**There are 5155 unique games and 12393 unique players in the dataset.**

In [5]:
# Count distinct users per game and keep the five games with the most users.
players_per_game = steam_raw.groupby('game')['userid'].nunique()
gb = players_per_game.sort_values(ascending=False).head()

# Bar chart of those five games.
ax = gb.plot.bar(
    title='Number of players for Most Popular Games',
    xlabel='Game',
    ylabel='No. of players',
    figsize=(6, 5),
)

# Annotate each bar with its player count.
ax.bar_label(ax.containers[0])

## Feature Engineering and Metrics
**We assume that a user who plays a game for more than 40 hours enjoys it.<br>
Thus, we define a binary column "like" that is 1 if the user enjoys the game and 0 otherwise.**

In [6]:
# Hours above which a user is considered to like a game.
LIKE_HOURS_THRESHOLD = 40

# Work on a copy so steam_raw stays untouched for re-runs of earlier cells.
steam_df = steam_raw.copy()

# Vectorised comparison instead of a Python-level loop over every row.
steam_df['like'] = (steam_df['hoursplayed'] > LIKE_HOURS_THRESHOLD).astype(int)
steam_df['like'].value_counts()
steam_df.head()

# `like` is 0/1, so summing it per game counts the likes directly.
bg = steam_df.groupby('game')['like'].sum().sort_values(ascending=False)
bg.head()
gb.head()

# Grouped bar chart for the most popular games: unique players (`userid`)
# next to number of likes (`like`). The log scale keeps both visible.
gbbg = pd.merge(gb, bg, on='game')
gbc = gbbg.plot.bar(
    logy=True,
    title='Unique players vs. likes for Most Popular Games',
    ylabel='Count (log scale)',
)



**From the graph, Half-Life 2 Lost Coast had one of the highest numbers of unique players, 981 (purchased and played), but none of them played the game for more than 40 hours.<br>
Now, let's find the players who purchased a game but didn't play it at all.<br>
For these players we reassign hoursplayed to 0 instead of 1.<br>
We then change their behavior to play and finally drop the rows whose behavior is purchase.<br>
<br>
This leaves the dataframe containing only play behaviors; for games that were purchased but not played, hoursplayed is 0.**

In [7]:
# Number of rows per (user, game) pair; pairs with exactly one row have only
# a single recorded behavior.
x = steam_df.groupby(['userid', 'game'])['behavior'].size()
s = x[x == 1]
len(s)
len(x)

# A purchase with no accompanying play row: the pair has a single row AND that
# row is a purchase. The explicit behavior check makes this cell safe to
# re-run: once purchase rows have been dropped below, every pair has one row,
# and a size-only mask would wipe the hours of every remaining play row.
pair_size = steam_df.groupby(['userid', 'game'])['behavior'].transform('size')
boolean_index = (pair_size < 2) & (steam_df['behavior'] == 'purchase')

# Record these as "played for 0 hours" so they survive the purchase filter.
steam_df.loc[boolean_index, 'hoursplayed'] = 0
steam_df.loc[boolean_index, 'behavior'] = 'play'

# Preview only; the full selection is far too large to display.
steam_df.loc[steam_df['hoursplayed'] == 0].head()

# Keep play rows only. copy() detaches the result from the original frame so
# later column assignments do not raise SettingWithCopyWarning.
steam_df = steam_df[steam_df['behavior'] != 'purchase'].copy()

**There are 57904 games purchased that have not been played yet.<br>
Next, we define the metrics to calculate a simple recommendation based on popularity and what other players like.**

In [8]:
# Per-game metrics: total likes and mean hours played, under readable names.
d = {'like': 'Sum Likes', 'hoursplayed': 'Avg Hours Played'}
per_game = steam_df.groupby(['game'], as_index=False)
metrics_df = per_game.agg({'like': 'sum', 'hoursplayed': 'mean'})
metrics_df = metrics_df.rename(columns=d)
metrics_df[metrics_df['game'] == "Dota 2"]  # sanity check on a well-known title

# C: mean of the per-game average hours played.
c = metrics_df['Avg Hours Played'].mean()
print(f"Average hours played across all games is {round(c, 2)}")

# m: minimum number of likes required, taken as the 95th percentile.
m = metrics_df['Sum Likes'].quantile(0.95)
print(f"Minimum number of likes for a game is {m}")

**Here the cut-off for the minimum number of likes is 5, which means that at least 5 users must have played the game for more than 40 hours. <br> 
If a game has fewer than 5 likes, we won't recommend it to others.<br>
Now, we can filter the dataframe down to the games that meet this minimum number of likes.**

In [9]:
# Shape before filtering.
metrics_df.shape

# Keep only games whose like count reaches the cut-off m.
enough_likes = metrics_df['Sum Likes'].ge(m)
metrics_df = metrics_df[enough_likes]

# Shape after filtering, and a preview.
metrics_df.shape
metrics_df.head()

## Simple Recommender
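**Next, we will create the scoring system for each game.<br>
The score is a weighted average of the game's own average hours played and the average hours played across all games:**

$$\text{score} = \frac{l}{l+m}\,a + \frac{m}{l+m}\,C$$

**where $l$ is the game's Sum Likes, $a$ is the game's Avg Hours Played, $m$ is the minimum number of likes, and $C$ is the mean of Avg Hours Played across all games.**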

In [10]:
def weighted_rating(df, m=m, C=c):
    """Blend a game's average hours with the overall average, weighted by likes.

    Parameters
    ----------
    df : mapping with 'Sum Likes' and 'Avg Hours Played' entries
        A single row of the metrics table (or the table itself, in which case
        the arithmetic is applied element-wise).
    m : number
        Like-count weight given to the overall average; defaults to the
        module-level `m` at definition time.
    C : number
        Overall average hours played; defaults to the module-level `c` at
        definition time.

    Returns
    -------
    likes/(likes+m) * avg_hours + m/(likes+m) * C
    """
    likes = df['Sum Likes']
    avg_hours = df['Avg Hours Played']
    own_part = likes / (likes + m) * avg_hours
    prior_part = m / (likes + m) * C
    return own_part + prior_part

# weighted_rating only uses element-wise arithmetic on the two metric columns,
# so it can be evaluated once on the whole frame instead of row by row.
# assign() returns a new frame, which avoids SettingWithCopyWarning when
# metrics_df is a filtered slice of an earlier frame.
metrics_df = metrics_df.assign(score=weighted_rating(metrics_df))
metrics_df.head()

**Renaming the metrics_df index to 'index' and resetting the index after sorting by score**

In [11]:
# Name the index so it can be addressed by level name below.
metrics_df.index.name = 'index'

# Rank games by score (best first) and discard the old index so that the row
# labels of newIndex are the rank positions 0, 1, 2, ...
ranked = metrics_df.sort_values(by='score', ascending=False)
newIndex = ranked.reset_index(level='index', drop=True)

newIndex.head(20)
newIndex.shape

# Taking In User Input
**Reading In File with User Inputted Game**

In [12]:
# List the available input files.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# Read the first comma-separated field of each line as the user's game title.
userInput_df = pd.read_csv(
    "../input/userinput/UserGameInput.txt",
    sep=",",
    usecols=[0],
    names=['game'],
)
userInput_df.head()



**Verifying if user game is in newIndex dataframe**

In [13]:
# Keep only the user-supplied games that appear in the ranking table.
in_ranking = userInput_df['game'].isin(newIndex['game'])
gameRec = userInput_df[in_ranking]

print (gameRec)

# Printing Out User's 2 Recommended Games Based Off User's Inputted Game

**Obtaining where game is in newIndex dataframe and assigning this row to a new dataframe called location and printing location**

**Printing the game above and the game below the user's game, using location to find it in newIndex (the rating table). If the user's game is at index 0, which is the game Football Manager 2012, the two games below it in newIndex are printed instead.**

**Likewise, if the user's game is the last game in newIndex, the two games before it are printed.**

In [25]:
def neighbouring_games(ranking, position):
    """Return the rows of `ranking` adjacent to the row at `position`.

    Parameters
    ----------
    ranking : pandas.DataFrame
        Ranking table, ordered best first.
    position : int
        Zero-based row position of the reference game.

    Returns
    -------
    pandas.DataFrame
        The row above and the row below `position`; the two rows below when
        `position` is the first row; the two rows above when it is the last.
        Fewer rows are returned when the table is too short.
    """
    last = len(ranking) - 1
    if position == 0:
        candidates = [1, 2]
    elif position == last:
        candidates = [last - 2, last - 1]
    else:
        candidates = [position - 1, position + 1]
    # Drop positions that fall outside a very short table.
    valid = [i for i in candidates if 0 <= i <= last and i != position]
    return ranking.iloc[valid]


# Rows of the ranking table that match a game supplied by the user.
in_input = newIndex['game'].isin(userInput_df['game'])
location = newIndex.loc[in_input]
print (location)

if location.empty:
    print("None of the input games appear in the ranking table.")
else:
    # Use row positions rather than index labels, and loop so that zero, one
    # or several matching games are all handled.
    for position in np.flatnonzero(in_input.to_numpy()):
        position = int(position)
        print(f"Recommendations for {newIndex['game'].iloc[position]}:")
        print(neighbouring_games(newIndex, position)['game'].to_string(index=False))