In [37]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
import difflib
from sklearn import linear_model
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import linear_kernel
from sklearn.feature_extraction.text import CountVectorizer
from surprise import Reader, Dataset
from surprise import SVD
from surprise import accuracy
from surprise.model_selection import train_test_split

In [38]:
df = pd.read_excel(r'C:\4) Fourth Year\Capstone\Game analysis Dataset\Metacritic Data\Video_Game_Sales.xlsx')

In [39]:
df.head()

Unnamed: 0,Index,Name,User_Id,Platform,Year_of_Release,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales,Critic_Score,Critic_Count,User_Score,User_Count,Rating
0,0,Wii Sports,10240,Wii,2006,Sports,Nintendo,41.36,28.96,3.77,8.45,82.54,76,51,8.0,324,E
1,1,Super Mario Bros.,10258,NES,1985,Platform,Nintendo,29.08,3.58,6.81,0.77,40.24,76,51,8.0,324,E
2,2,Mario Kart Wii,10268,Wii,2008,Racing,Nintendo,15.68,12.8,3.79,3.29,35.57,82,73,8.3,712,E
3,3,Wii Sports Resort,10253,Wii,2009,Sports,Nintendo,15.61,10.95,3.28,2.95,32.78,80,73,8.0,193,E
4,4,Pokemon Red/Pokemon Blue,10213,G,1996,Role-Playing,Nintendo,11.27,8.89,10.22,1.0,31.37,80,73,8.0,193,E


In [40]:
df['Platform']=df['Platform'].astype(str)
df['Genre']=df['Genre'].astype(str)
df['Publisher']=df['Publisher'].astype(str)
df['Rating']=df['Rating'].astype(str)

In [41]:
df.info() #Dataset info

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17416 entries, 0 to 17415
Data columns (total 17 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Index            17416 non-null  int64  
 1   Name             17416 non-null  object 
 2   User_Id          17416 non-null  int64  
 3   Platform         17416 non-null  object 
 4   Year_of_Release  17416 non-null  int64  
 5   Genre            17416 non-null  object 
 6   Publisher        17416 non-null  object 
 7   NA_Sales         17416 non-null  float64
 8   EU_Sales         17416 non-null  float64
 9   JP_Sales         17416 non-null  float64
 10  Other_Sales      17416 non-null  float64
 11  Global_Sales     17416 non-null  float64
 12  Critic_Score     17416 non-null  int64  
 13  Critic_Count     17416 non-null  int64  
 14  User_Score       17416 non-null  float64
 15  User_Count       17416 non-null  int64  
 16  Rating           17416 non-null  object 
dtypes: float64(6

In [42]:
df['combined_features'] = df[['Platform', 'Publisher', 'Genre', 'Rating']].apply(lambda x: ' '.join(x), axis=1)

In [43]:
# Gettingcount of each word in the combined column
cv = CountVectorizer()
count_matrix = cv.fit_transform(df['combined_features'])

In [44]:
cosine_sim = cosine_similarity(count_matrix)

In [45]:
def recommend_by_content(title, cosine_sim = cosine_sim):
    recommended_games = []
    idx = df[df['Name'] == title].index[0]
    score_series = pd.Series(cosine_sim[idx]).sort_values(ascending = False)
    top_20_indexes = list(score_series.iloc[1:21].index)
    for i in top_20_indexes:
        recommended_games.append(df['Name'][i])
    return recommended_games

In [46]:
# Loading the dataset
reader = Reader()
data = Dataset.load_from_df(df[['User_Id', 'Name', 'User_Score']], reader)

In [47]:
# using the SVD algorithm to get the predictions
svd = SVD()
trainset = data.build_full_trainset()
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1796da1f220>

In [48]:
def recommend_by_cf(user_id):
    recommended_games = []
    # Get the inner id of the user
    user_inner_id = svd.trainset.to_inner_uid(user_id)
    # Get the games that the user has rated
    user_games = df[df['User_Id'] == user_id]
    user_games_names = user_games['Name'].tolist()
    all_games = df['Name'].unique()
    #get the games that the user has not played
    not_played_games = set(all_games) - set(user_games_names)
    for game in not_played_games:
        iid = svd.trainset.to_inner_iid(game)
        # Predict the rating for the given user and game
        user_rating = svd.predict(user_inner_id, iid)
        # Append the game to the recommended games list if the estimated rating is greater than or equal to 6
        if user_rating.est >= 6:
            recommended_games.append(game)
    return recommended_games

In [49]:
def recommend(user_id, title):
    recommended_games_content = recommend_by_content(title)
    recommended_games_cf = recommend_by_cf(user_id)
    final_recommendation = list(set(recommended_games_content + recommended_games_cf))
    return final_recommendation

In [51]:
user_id = 10240
title = "Wii Sports"
output = recommend(user_id, title)

for i, game in enumerate(output, 1):
    print(f"{i}. {game}")

1. My Fitness Coach
2. Punch-Out!!
3. Wii Fit
4. Cabelas Legendary Adventures
5. Mario Tennis
6. Tennis
7. Mario Sports Mix
8. New Play Control! Mario Power Tennis
9. Active Health with Carol Vorderman
10. Wii Sports Resort
11. Super Swing Golf
12. Mario Strikers Charged
13. Remington Super Slam Hunting: North America
14. Golf
15. Wii Fit Plus
16. Mario Super Sluggers
17. Baseball


In [52]:
train_data, test_data = train_test_split(data, test_size=0.2)

In [53]:
svd = SVD()

In [54]:
svd.fit(train_data)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1796b8a03d0>

In [55]:
predictions = svd.test(test_data)

In [56]:
accuracy.rmse(predictions)

RMSE: 2.5537


2.553674300601176

In [57]:
accuracy.mae(predictions)

MAE:  2.3063


2.3063359834454347