In [4]:
import numpy as np
import pandas as pd

In [5]:
# Read the game catalogue from disk and preview its first five rows.
games_df = pd.read_csv(filepath_or_buffer="games.csv")
games_df.head(n=5)

Unnamed: 0,app_id,title,date_release,win,mac,linux,rating,positive_ratio,user_reviews,price_final,price_original,discount,steam_deck
0,13500,Prince of Persia: Warrior Within™,2008-11-21,True,False,False,Very Positive,84,2199,9.99,9.99,0.0,True
1,22364,BRINK: Agents of Change,2011-08-03,True,False,False,Positive,85,21,2.99,2.99,0.0,True
2,113020,Monaco: What's Yours Is Mine,2013-04-24,True,True,True,Very Positive,92,3722,14.99,14.99,0.0,True
3,226560,Escape Dead Island,2014-11-18,True,False,False,Mixed,61,873,14.99,14.99,0.0,True
4,249050,Dungeon of the ENDLESS™,2014-10-27,True,True,False,Very Positive,88,8784,11.99,11.99,0.0,True


In [6]:
# Read the per-user table from disk and preview its first five rows.
users_df = pd.read_csv(filepath_or_buffer="users.csv")
users_df.head(n=5)

Unnamed: 0,user_id,products,reviews
0,7360263,359,0
1,14020781,156,1
2,8762579,329,4
3,4820647,176,4
4,5167327,98,2


In [7]:
# Read the review/recommendation table from disk and preview its first five rows.
recommendations_df = pd.read_csv(filepath_or_buffer="recommendations.csv")
recommendations_df.head(n=5)

Unnamed: 0,app_id,helpful,funny,date,is_recommended,hours,user_id,review_id
0,975370,0,0,2022-12-12,True,36.3,51580,0
1,304390,4,0,2017-02-17,False,11.5,2586,1
2,1085660,2,0,2019-11-17,True,336.5,253880,2
3,703080,0,0,2022-09-23,True,27.4,259432,3
4,526870,0,0,2021-01-10,True,7.9,23869,4


In [8]:
# Report (rows, columns) for each of the three loaded tables, in load order.
for frame in (games_df, users_df, recommendations_df):
    print(frame.shape)

(50872, 13)
(14306064, 3)
(41154794, 8)


In [9]:
# Print the schema of the games table: columns, non-null counts, dtypes and memory use.
games_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50872 entries, 0 to 50871
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   app_id          50872 non-null  int64  
 1   title           50872 non-null  object 
 2   date_release    50872 non-null  object 
 3   win             50872 non-null  bool   
 4   mac             50872 non-null  bool   
 5   linux           50872 non-null  bool   
 6   rating          50872 non-null  object 
 7   positive_ratio  50872 non-null  int64  
 8   user_reviews    50872 non-null  int64  
 9   price_final     50872 non-null  float64
 10  price_original  50872 non-null  float64
 11  discount        50872 non-null  float64
 12  steam_deck      50872 non-null  bool   
dtypes: bool(4), float64(3), int64(3), object(3)
memory usage: 3.7+ MB


In [10]:
# Print the schema (columns, dtypes) and memory footprint of the users table.
users_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14306064 entries, 0 to 14306063
Data columns (total 3 columns):
 #   Column    Dtype
---  ------    -----
 0   user_id   int64
 1   products  int64
 2   reviews   int64
dtypes: int64(3)
memory usage: 327.4 MB


In [11]:
# Print the schema (columns, dtypes) and memory footprint of the reviews table.
recommendations_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41154794 entries, 0 to 41154793
Data columns (total 8 columns):
 #   Column          Dtype  
---  ------          -----  
 0   app_id          int64  
 1   helpful         int64  
 2   funny           int64  
 3   date            object 
 4   is_recommended  bool   
 5   hours           float64
 6   user_id         int64  
 7   review_id       int64  
dtypes: bool(1), float64(1), int64(5), object(1)
memory usage: 2.2+ GB


In [12]:
# Count missing values in every column of the reviews table.
recommendations_df.isna().sum()

app_id            0
helpful           0
funny             0
date              0
is_recommended    0
hours             0
user_id           0
review_id         0
dtype: int64

In [13]:
# List the column labels of the games table.
games_df.columns

Index(['app_id', 'title', 'date_release', 'win', 'mac', 'linux', 'rating',
       'positive_ratio', 'user_reviews', 'price_final', 'price_original',
       'discount', 'steam_deck'],
      dtype='object')

In [14]:
import json
# Load the per-game metadata records and key them by app_id (as a string) so each
# catalogue row can be matched with a single dictionary lookup.
with open("games_metadata.json", 'r', encoding='utf-8') as f:
    metadata_list = json.load(f)
metadata = {str(item['app_id']): item for item in metadata_list}

# Look the metadata up column-wise rather than with a row-by-row ``iterrows`` loop.
# Games without a metadata record fall back to an empty dict.
records = [metadata.get(app_id, {}) for app_id in games_df['app_id'].astype(str)]

# ``or`` also covers a key that is present but null, so a missing description never
# ends up as the literal text "None" in the combined column.
descriptions = [record.get('description') or '' for record in records]
tags = [record.get('tags') or [] for record in records]

games_df['description'] = descriptions
games_df['genres'] = tags
# Free-text field later fed to TF-IDF: the description followed by the tag list's repr.
games_df['description and genres combined'] = [str(d) + ' ' + str(g) for d, g in zip(descriptions, tags)]

In [15]:
# Preview the catalogue again, now with the metadata-derived columns attached.
games_df.head(n=5)

Unnamed: 0,app_id,title,date_release,win,mac,linux,rating,positive_ratio,user_reviews,price_final,price_original,discount,steam_deck,description,genres,description and genres combined
0,13500,Prince of Persia: Warrior Within™,2008-11-21,True,False,False,Very Positive,84,2199,9.99,9.99,0.0,True,Enter the dark underworld of Prince of Persia ...,"[Action, Adventure, Parkour, Third Person, Gre...",Enter the dark underworld of Prince of Persia ...
1,22364,BRINK: Agents of Change,2011-08-03,True,False,False,Positive,85,21,2.99,2.99,0.0,True,,[Action],['Action']
2,113020,Monaco: What's Yours Is Mine,2013-04-24,True,True,True,Very Positive,92,3722,14.99,14.99,0.0,True,Monaco: What's Yours Is Mine is a single playe...,"[Co-op, Stealth, Indie, Heist, Local Co-Op, St...",Monaco: What's Yours Is Mine is a single playe...
3,226560,Escape Dead Island,2014-11-18,True,False,False,Mixed,61,873,14.99,14.99,0.0,True,Escape Dead Island is a Survival-Mystery adven...,"[Zombies, Adventure, Survival, Action, Third P...",Escape Dead Island is a Survival-Mystery adven...
4,249050,Dungeon of the ENDLESS™,2014-10-27,True,True,False,Very Positive,88,8784,11.99,11.99,0.0,True,Dungeon of the Endless is a Rogue-Like Dungeon...,"[Roguelike, Strategy, Tower Defense, Pixel Gra...",Dungeon of the Endless is a Rogue-Like Dungeon...


In [16]:
# Normalise case so that later token matching is case-insensitive.
combined_col = 'description and genres combined'
games_df[combined_col] = games_df[combined_col].str.lower()

In [17]:
import re
# Shortest-possible ``<...>`` span; ``.`` does not cross newlines, so a tag that is
# split over several lines is left untouched.
_HTML_TAG_RE = re.compile('<.*?>')


def remove_html_tags(text):
    """Return ``text`` with every ``<...>`` span deleted (nothing is inserted in its place)."""
    return _HTML_TAG_RE.sub('', text)
# Strip HTML tags from every row of the combined text column.
games_df['description and genres combined'] = games_df['description and genres combined'].map(remove_html_tags)

In [18]:
# Matches http(s):// links and bare www. hosts, up to the next whitespace character.
_URL_RE = re.compile(r'https?://\S+|www\.\S+')


def remove_url(text):
    """Return ``text`` with every http(s):// or www. link removed."""
    return _URL_RE.sub('', text)
# Strip URLs from every row of the combined text column.
games_df['description and genres combined'] = games_df['description and genres combined'].map(remove_url)

In [19]:
import string
# ASCII punctuation characters that are deleted from the text.
exclude = string.punctuation
# Deletion table built once, so each call is a single pass over the string instead of
# one ``str.replace`` pass per punctuation character.
_PUNCT_TABLE = str.maketrans('', '', exclude)


def remove_punc(text):
    """Return ``text`` with every ASCII punctuation character deleted.

    Characters are removed rather than replaced by a space, so hyphenated words
    collapse into one token (e.g. ``"rogue-like"`` becomes ``"roguelike"``).
    """
    return text.translate(_PUNCT_TABLE)
# Strip punctuation from every row of the combined text column.
games_df['description and genres combined'] = games_df['description and genres combined'].map(remove_punc)

In [20]:
# Character class over the code-point ranges treated as emoji, compiled once.
_EMOJI_RE = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # flags (iOS)
    "]+",
    flags=re.UNICODE,
)


def remove_emoji(text):
    """Return ``text`` with every run of characters in the listed emoji ranges removed."""
    return _EMOJI_RE.sub('', text)
# Strip emoji from every row of the combined text column.
games_df['description and genres combined'] = games_df['description and genres combined'].map(remove_emoji)

In [21]:
# Spot-check the cleaned text next to each game's title.
games_df.loc[:, ['title', 'description and genres combined']].head(n=5)

Unnamed: 0,title,description and genres combined
0,Prince of Persia: Warrior Within™,enter the dark underworld of prince of persia ...
1,BRINK: Agents of Change,action
2,Monaco: What's Yours Is Mine,monaco whats yours is mine is a single player ...
3,Escape Dead Island,escape dead island is a survivalmystery advent...
4,Dungeon of the ENDLESS™,dungeon of the endless is a roguelike dungeond...


In [22]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Vectorise the cleaned text with TF-IDF, dropping English stop words, and show the
# resulting (documents, vocabulary terms) shape.
tfidf = TfidfVectorizer(stop_words='english')
combined_text = games_df['description and genres combined']
tfidf_matrix = tfidf.fit_transform(combined_text)
tfidf_matrix.shape

(50872, 58163)

In [23]:
# Lookup table from game title to the game's row label in games_df.
# Series.drop_duplicates() compares the *values* (the row labels, which are already
# unique), so it never removed repeated titles. De-duplicate on the title index
# instead, keeping the first occurrence, so that indexes[title] is always one scalar.
indexes = pd.Series(games_df.index, index=games_df['title'])
indexes = indexes[~indexes.index.duplicated(keep='first')]

In [24]:
from sklearn.metrics.pairwise import linear_kernel
def get_recommendations(title, n=6):
    """Return the games whose combined text is most similar to ``title``.

    Similarity is the dot product between TF-IDF rows of ``tfidf_matrix``.

    Args:
        title: Game title to look up in ``indexes``.
        n: Size of the neighbourhood *including* the queried game itself, so
            ``n - 1`` recommendations are returned (5 with the default of 6).

    Returns:
        DataFrame with the ``title`` and ``genres`` columns of the ``n - 1`` most
        similar games, most similar first. The queried game is never included.

    Raises:
        KeyError: If ``title`` is not present in ``indexes``.
    """
    # A title shared by several games makes the lookup return a Series rather than
    # a scalar; atleast_1d handles both cases and the first match is used.
    game_index = int(np.atleast_1d(indexes[title])[0])
    cosine_similarities = linear_kernel(tfidf_matrix[game_index:game_index + 1], tfidf_matrix).flatten()
    ranked = cosine_similarities.argsort()[::-1]
    # Drop the queried game *before* truncating. Otherwise, when it is not among the
    # top n (similarity ties, or an all-zero vector), n rows came back instead of n - 1.
    ranked = ranked[ranked != game_index]
    related_docs_indices = ranked[:max(n - 1, 0)]
    return games_df.iloc[related_docs_indices][['title', 'genres']]

# Example: content-based neighbours of one catalogue title.
get_recommendations(title="Prince of Persia: Warrior Within™", n=6)

Unnamed: 0,title,genres
19579,Prince of Persia®,"[Action, Adventure, Parkour, Platformer, Third..."
12694,Prince of Persia: The Forgotten Sands™,"[Action, Adventure, Platformer, Parkour, Third..."
19153,Prince of Persia: The Two Thrones™,"[Action, Adventure, Platformer, Parkour, Third..."
9419,Prince of Persia®: The Sands of Time,"[Action, Adventure, Parkour, Platformer, Third..."
1901,Persian Nights: Sands of Wonders,"[Adventure, Casual, Hidden Object, Point & Cli..."


In [25]:
# Attach each review's game title; the left join keeps every review even when its
# app_id has no match in the catalogue.
user_recommendations = pd.merge(
    recommendations_df,
    games_df[['app_id', 'title']],
    on='app_id',
    how='left',
)

In [26]:
# Preview the reviews with their game titles attached.
user_recommendations.head(n=5)

Unnamed: 0,app_id,helpful,funny,date,is_recommended,hours,user_id,review_id,title
0,975370,0,0,2022-12-12,True,36.3,51580,0,Dwarf Fortress
1,304390,4,0,2017-02-17,False,11.5,2586,1,FOR HONOR™
2,1085660,2,0,2019-11-17,True,336.5,253880,2,Destiny 2
3,703080,0,0,2022-09-23,True,27.4,259432,3,Planet Zoo
4,526870,0,0,2021-01-10,True,7.9,23869,4,Satisfactory


In [27]:
def get_recommendations_from_users(game_title, n=5):
    """Recommend games that were also liked by users who recommended ``game_title``.

    Args:
        game_title: Title of the game to start from.
        n: Maximum number of recommendations to return.

    Returns:
        DataFrame with columns ``title`` and ``predicted_score``, where the score is
        the number of positive reviews of that title written by users who also
        recommended ``game_title``; at most ``n`` rows, highest score first.
        If nobody recommended ``game_title``, the content-based result of
        ``get_recommendations`` is returned instead (columns ``title`` and ``genres``).

    Raises:
        KeyError: From the content-based fallback when ``game_title`` has no
            positive reviews and is also unknown to ``get_recommendations``.
    """
    # Build the row masks once and reuse them for both selections below.
    is_liked = user_recommendations['is_recommended']
    is_target = user_recommendations['title'] == game_title

    users_who_liked = user_recommendations.loc[is_liked & is_target, 'user_id'].unique()
    if len(users_who_liked) == 0:
        # get_recommendations counts the queried game in its ``n`` and so returns
        # n - 1 rows; ask for one more so the fallback still yields ``n``.
        return get_recommendations(game_title, n=n + 1)

    by_same_users = user_recommendations['user_id'].isin(users_who_liked)
    # Only the two columns needed for counting are materialised.
    co_liked = user_recommendations.loc[by_same_users & is_liked & ~is_target, ['title', 'is_recommended']]
    recommendations = co_liked.groupby('title').agg({'is_recommended': 'count'}).reset_index()
    recommendations = recommendations.nlargest(n, 'is_recommended')
    return recommendations.rename(columns={'is_recommended': 'predicted_score'})

In [28]:
# Example: titles most often co-recommended by fans of one catalogue title.
get_recommendations_from_users(game_title="Prince of Persia: Warrior Within™", n=5)

Unnamed: 0,title,predicted_score
6585,Prince of Persia®: The Sands of Time,314
6583,Prince of Persia: The Two Thrones™,248
9302,Tomb Raider,176
9144,The Witcher® 3: Wild Hunt,167
2091,DOOM,144


In [29]:
# Bundle the tables, the TF-IDF matrix and the title lookup into a single dict.
model_data = dict(
    games_df=games_df,
    users_df=users_df,
    recommendations_df=recommendations_df,
    tfidf_matrix=tfidf_matrix,
    indexes=indexes,
)

In [30]:
import pickle
# Serialise the bundled artefacts to model.pkl in the working directory.
with open('model.pkl', mode='wb') as model_file:
    pickle.dump(model_data, model_file)