In [24]:
# Import Libraries
import json
import pickle
import warnings

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from scipy.sparse import hstack, csr_matrix, save_npz
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics.pairwise import cosine_similarity

from sentence_transformers import SentenceTransformer

warnings.filterwarnings('ignore')

In [25]:
# Step 1: Read the raw Steam catalogue.
# The encoding is given explicitly: the descriptions contain non-ASCII
# characters (e.g. "™", "®"), and without it open() falls back to the
# platform's default codec, which is not UTF-8 everywhere.
with open('assets/steam_games_backup.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

# Top-level JSON keys become the row index, one row per game.
df = pd.DataFrame.from_dict(data, orient='index')

df.head(5)

Unnamed: 0,name,release_date,price,short_description,header_image,genres,tags,screenshots,movies,review_count,review_score_text,review_score,platforms,developers,publishers
10,Counter-Strike,"1 Nov, 2000",69999.0,Play the world's number 1 online action game. ...,https://shared.akamai.steamstatic.com/store_it...,[Action],"[Multi-player, PvP, Online PvP, Shared/Split S...",[https://shared.akamai.steamstatic.com/store_i...,[],158708,Overwhelmingly Positive,8,"[windows, mac, linux]",[Valve],[Valve]
20,Team Fortress Classic,"1 Apr, 1999",47499.0,One of the most popular online action games of...,https://shared.akamai.steamstatic.com/store_it...,[Action],"[Multi-player, PvP, Online PvP, Shared/Split S...",[https://shared.akamai.steamstatic.com/store_i...,[],6492,Very Positive,7,"[windows, mac, linux]",[Valve],[Valve]
30,Day of Defeat,"1 May, 2003",47499.0,Enlist in an intense brand of Axis vs. Allied ...,https://shared.akamai.steamstatic.com/store_it...,[Action],"[Multi-player, Valve Anti-Cheat enabled, Famil...",[https://shared.akamai.steamstatic.com/store_i...,[],4234,Very Positive,7,"[windows, mac, linux]",[Valve],[Valve]
40,Deathmatch Classic,"1 Jun, 2001",47499.0,Enjoy fast-paced multiplayer gaming with Death...,https://shared.akamai.steamstatic.com/store_it...,[Action],"[Multi-player, PvP, Online PvP, Shared/Split S...",[https://shared.akamai.steamstatic.com/store_i...,[],2291,Very Positive,7,"[windows, mac, linux]",[Valve],[Valve]
50,Half-Life: Opposing Force,"1 Nov, 1999",47499.0,Return to the Black Mesa Research Facility as ...,https://shared.akamai.steamstatic.com/store_it...,[Action],"[Single-player, Multi-player, Valve Anti-Cheat...",[https://shared.akamai.steamstatic.com/store_i...,[],21741,Overwhelmingly Positive,8,"[windows, mac, linux]",[Gearbox Software],[Valve]


In [26]:
# Show how many rows the dataset contains
print(df.shape[0])

6004


In [27]:
# Check for duplicated titles; keep=False flags every member of a duplicated group
duplicates = df.loc[df.duplicated(subset='name', keep=False)]

print(f"Total duplicate entries: {len(duplicates)}")
print(duplicates.loc[:, 'name'])

Total duplicate entries: 2
2676180    Fog of War
574080     Fog of War
Name: name, dtype: object


In [28]:
# Drop duplicated titles, keeping the last row of each duplicated name.
# For the single duplicate reported by the check above ('Fog of War') this
# removes the same row as dropping its first occurrence, but it needs no
# hard-coded title and re-running the cell is a no-op instead of deleting
# the surviving row.
df = df[~df.duplicated(subset='name', keep='last')]

In [29]:
# Re-run the duplicate check to confirm no duplicated titles remain
duplicates = df.loc[df.duplicated(subset='name', keep=False)]

print(f"Total duplicate entries: {len(duplicates)}")
print(duplicates.loc[:, 'name'])

Total duplicate entries: 0
Series([], Name: name, dtype: object)


In [30]:
# Percentage of missing (NaN/None) values per column
missing = df.isna().sum().div(len(df)).mul(100)
missing

name                 0.0
release_date         0.0
price                0.0
short_description    0.0
header_image         0.0
genres               0.0
tags                 0.0
screenshots          0.0
movies               0.0
review_count         0.0
review_score_text    0.0
review_score         0.0
platforms            0.0
developers           0.0
publishers           0.0
dtype: float64

In [31]:
# Detect empty-list cells
def check_empty(value):
    """Return True only when `value` is a list with no elements.

    Any non-list value (strings, numbers, None, tuples, ...) yields False.
    """
    if not isinstance(value, list):
        return False
    return len(value) == 0

# Count empty-list cells per column.
# DataFrame.applymap is deprecated since pandas 2.1; mapping each column with
# Series.map performs the same element-wise check without the deprecated API.
empty = df.apply(lambda column: column.map(check_empty)).sum()

# Express the counts as a percentage of all rows.
percentage_empty = (empty / len(df)) * 100
print(percentage_empty)

name                 0.000000
release_date         0.000000
price                0.000000
short_description    0.000000
header_image         0.000000
genres               0.266533
tags                 0.000000
screenshots          0.033317
movies               6.446777
review_count         0.000000
review_score_text    0.000000
review_score         0.000000
platforms            0.000000
developers           0.199900
publishers           0.666333
dtype: float64


In [32]:
# Inspect the most frequent values of the list-valued features (three most common genres)
df['genres'].explode().value_counts().iloc[:3]

genres
Indie        3829
Action       2610
Adventure    2545
Name: count, dtype: int64

In [33]:
# Three most common tags
df['tags'].explode().value_counts().iloc[:3]

tags
Single-player         5728
Family Sharing        5727
Steam Achievements    4333
Name: count, dtype: int64

In [34]:
# Three most common developers
df['developers'].explode().value_counts().iloc[:3]

developers
Square Enix                   37
Valve                         27
KOEI TECMO GAMES CO., LTD.    25
Name: count, dtype: int64

In [35]:
# Three most common publishers
df['publishers'].explode().value_counts().iloc[:3]

publishers
SEGA           55
Square Enix    52
THQ Nordic     45
Name: count, dtype: int64

In [36]:
# 1. Drop the columns that are not used as model features
data_df = df.drop(columns=['release_date', 'price', 'header_image', 'screenshots', 'movies', 'review_count', 'review_score_text', 'review_score'])

# 2. Normalise the free-text columns: lower-case and strip surrounding whitespace
for text_col in ('name', 'short_description'):
    data_df[text_col] = data_df[text_col].str.lower().str.strip()

# 3. Lower-case every entry of the list-valued columns (they remain Python lists)
for list_col in ('genres', 'tags', 'platforms', 'developers', 'publishers'):
    data_df[list_col] = data_df[list_col].apply(lambda items: [item.lower() for item in items])

data_df.head(5)

Unnamed: 0,name,short_description,genres,tags,platforms,developers,publishers
10,counter-strike,play the world's number 1 online action game. ...,[action],"[multi-player, pvp, online pvp, shared/split s...","[windows, mac, linux]",[valve],[valve]
20,team fortress classic,one of the most popular online action games of...,[action],"[multi-player, pvp, online pvp, shared/split s...","[windows, mac, linux]",[valve],[valve]
30,day of defeat,enlist in an intense brand of axis vs. allied ...,[action],"[multi-player, valve anti-cheat enabled, famil...","[windows, mac, linux]",[valve],[valve]
40,deathmatch classic,enjoy fast-paced multiplayer gaming with death...,[action],"[multi-player, pvp, online pvp, shared/split s...","[windows, mac, linux]",[valve],[valve]
50,half-life: opposing force,return to the black mesa research facility as ...,[action],"[single-player, multi-player, valve anti-cheat...","[windows, mac, linux]",[gearbox software],[valve]




---

**Modeling - Content-based Filtering**

Fitur yang akan Digunakan:

- name

- short_description

- genres

- tags

- platforms

- developers

- publishers


Fitur untuk Filtering:

- release_date

- price

- review_count

- review_score_text

- review_score

In [37]:
def create_features(df, weights=None, model_name='all-MiniLM-L6-v2'):
    """Build the weighted feature matrix used for content-based similarity.

    The text columns ('name', 'short_description') are embedded with a
    SentenceTransformer model; the list-valued columns ('genres', 'tags',
    'platforms', 'developers', 'publishers') are multi-hot encoded with
    MultiLabelBinarizer. Each block is scaled by its weight and all blocks
    are stacked horizontally in the order: name, short_description, genres,
    tags, platforms, developers, publishers.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the seven columns listed above.
    weights : dict, optional
        Per-column overrides of the default weights, e.g. ``{'tags': 2.0}``.
        Columns that are not mentioned keep their default weight.
    model_name : str, optional
        Name of the SentenceTransformer model to load.

    Returns
    -------
    scipy sparse matrix
        One row per row of `df`, as returned by `scipy.sparse.hstack`.

    Raises
    ------
    KeyError
        If `weights` names a column that is not one of the feature columns.
    """
    # Default weights: features judged more important for recommendations get more weight.
    feature_weights = {
        'name': 1.5,
        'short_description': 1.25,
        'genres': 1.25,
        'tags': 1.0,
        'platforms': 0.2,
        'developers': 0.75,
        'publishers': 0.5,
    }
    if weights:
        unknown = set(weights) - set(feature_weights)
        if unknown:
            raise KeyError(f"Unknown feature weight(s): {sorted(unknown)}")
        feature_weights.update(weights)

    # Sentence embeddings capture the meaning of a whole sentence (contextual),
    # rather than individual word counts.
    sbert_model = SentenceTransformer(model_name)

    blocks = []
    for column in ('name', 'short_description'):
        # encode() is documented for a str or a list of str; passing a plain list
        # avoids any dependence on how a pandas Series resolves integer keys.
        embeddings = sbert_model.encode(df[column].tolist())
        # Wrapped in CSR only so it stacks with the sparse multi-hot blocks;
        # dense embeddings do not get smaller by being stored in a sparse format.
        blocks.append(csr_matrix(embeddings) * feature_weights[column])

    for column in ('genres', 'tags', 'platforms', 'developers', 'publishers'):
        # Multi-hot encoding: 1 where the row has the label, 0 otherwise.
        # sparse_output=True avoids materialising a dense rows x labels array.
        encoded = MultiLabelBinarizer(sparse_output=True).fit_transform(df[column])
        blocks.append(encoded * feature_weights[column])

    # Combine the weighted blocks into one wide matrix.
    return hstack(blocks)

combined_features = create_features(data_df)

In [38]:
# Persist the feature matrix as an .npz file so it can be used in app.py
# (save_npz is imported with the other libraries in the notebook's import cell).
save_npz('combined_features.npz', combined_features)

In [None]:
# Convert the stacked feature matrix to CSR before fitting and indexing;
# the row slices taken below need a format that supports indexing.
combined_features_csr = combined_features.tocsr()

# NOTE: scikit-learn overrides `algorithm` with brute force when the model is
# fitted on sparse input, so 'brute' and 'auto' are expected to score the same.
param_grid = {
    'metric': ['cosine', 'euclidean', 'manhattan'],
    'algorithm': ['brute', 'auto']
}

best_score = -1
best_model = None
best_params = None

n_neighbors = 20
sample_indices = [0, 5, 10, 20, 50]

# Loop over every parameter combination and keep the best one.
# NOTE(review): candidates are scored by the mean *cosine* similarity between a
# sample row and its retrieved neighbours, which structurally favours
# metric='cosine'; treat the "best" metric as a sanity check, not a benchmark.
for metric in param_grid['metric']:
    for algo in param_grid['algorithm']:
        try:
            model = NearestNeighbors(n_neighbors=n_neighbors, metric=metric, algorithm=algo)
            model.fit(combined_features_csr)

            total_sim = 0
            count = 0

            for idx in sample_indices:
                query = combined_features_csr[idx:idx+1]
                distances, indices = model.kneighbors(query)

                # Remove the query row by value rather than by position: with
                # distance ties it is not guaranteed to come back first.
                neighbours = indices[0][indices[0] != idx][:n_neighbors - 1]
                if len(neighbours) == 0:
                    continue

                # One vectorised call instead of one call per neighbour.
                total_sim += cosine_similarity(query, combined_features_csr[neighbours]).sum()
                count += len(neighbours)

            avg_sim = total_sim / count if count else 0

            print(f"Metric: {metric}, Algorithm: {algo}, Avg Similarity: {avg_sim:.4f}")

            if avg_sim > best_score:
                best_score = avg_sim
                best_model = model
                best_params = {
                    'n_neighbors': n_neighbors,
                    'metric': metric,
                    'algorithm': algo
                }

        except Exception as e:
            # Best effort: report the failing combination and try the next one.
            print(f"Error with metric={metric}, algo={algo}: {e}")

# Refuse to write a pickle containing None when every combination failed.
if best_model is None:
    raise RuntimeError("No parameter combination could be fitted; no model was saved.")

# Save to a .pkl file so it can be used in app.py
with open('knn_game_model_tuned.pkl', 'wb') as file:
    pickle.dump(best_model, file)

print(f"Best Parameters: {best_params}")

Best Parameters: {'n_neighbors': 20, 'metric': 'cosine', 'algorithm': 'brute'}


In [55]:
def recommend_games_knn(game_title, df, model, combined_features_csr, top_n=20):
    """Return the `top_n` games most similar to `game_title`.

    Parameters
    ----------
    game_title : str
        Title to look up. It is lower-cased and stripped before matching, so
        `df['name']` is expected to hold titles normalised the same way.
    df : pandas.DataFrame
        Frame with a 'name' column whose row order matches the rows of
        `combined_features_csr`.
    model : fitted nearest-neighbour model
        Must expose `kneighbors(X, n_neighbors=...)` returning
        `(distances, indices)`.
    combined_features_csr : sparse matrix or array
        Feature matrix supporting row slicing and `.shape`.
    top_n : int, optional
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.DataFrame or None
        Columns 'name' and 'similarity' (computed as 1 - distance, which is a
        cosine similarity only when the model uses the cosine metric), ordered
        as returned by the model. None when the title is not found.
    """
    game_title = game_title.lower().strip()

    # Positional row(s) whose name matches; the first match is used.
    matches = np.flatnonzero((df['name'] == game_title).to_numpy())
    if matches.size == 0:
        print(f"Game '{game_title}' not found in the dataset.")
        return None
    row_number = int(matches[0])

    # Ask for one extra neighbour (the game itself), but never for more rows
    # than the feature matrix has.
    n_query = min(top_n + 1, combined_features_csr.shape[0])
    distances, indices = model.kneighbors(
        combined_features_csr[row_number:row_number + 1], n_neighbors=n_query
    )

    # Drop the queried game by value: with distance ties it is not guaranteed
    # to be the first neighbour returned.
    keep = indices[0] != row_number
    recommended_indices = indices[0][keep][:top_n]
    recommended_distances = distances[0][keep][:top_n]

    # Convert distances to similarity scores.
    similarity_scores = 1 - recommended_distances

    recommended_games = df.iloc[recommended_indices][['name']].copy()
    recommended_games['similarity'] = similarity_scores

    return recommended_games.reset_index(drop=True)

# Try the recommender on one known title
search_game = "fallout 3"
recommendations = recommend_games_knn(
    game_title=search_game,
    df=data_df,
    model=best_model,
    combined_features_csr=combined_features_csr,
)
print(f"Game Input: {search_game}")
print(recommendations)

Game Input: fallout 3
                                                 name  similarity
0                fallout 3 - game of the year edition    0.741061
1                      the elder scrolls iv: oblivion    0.608068
2                                           fallout 4    0.595366
3                                  atom rpg trudograd    0.584909
4                    the elder scrolls iii: morrowind    0.581858
5                                  fallout: new vegas    0.561016
6                       final fantasy iii (3d remake)    0.554117
7                        final fantasy iv (3d remake)    0.547484
8                                final fantasy xiii-2    0.539578
9                                         dream quest    0.525206
10                                         silverfall    0.522412
11                         zanki zero: last beginning    0.516929
12                                   dungeon dreams 2    0.515947
13                            start again: a prologue 

In [41]:
# Pick one random title from the preprocessed data (unseeded, so it changes per run)
import random

random_name = data_df['name'].sample(n=1).iloc[0]
print("Random game:", random_name)

Random game: ground control anthology


In [42]:
# Checking data untuk suatu game
# Inspect the raw (un-normalised) record of a single game
print(df.loc[df['name'] == "Fallout 4"])

             name release_date     price  \
377160  Fallout 4  9 Nov, 2015  266000.0   

                                        short_description  \
377160  Bethesda Game Studios, the award-winning creat...   

                                             header_image genres  \
377160  https://shared.akamai.steamstatic.com/store_it...  [RPG]   

                                                     tags  \
377160  [Single-player, Steam Achievements, Full contr...   

                                              screenshots  \
377160  [https://shared.akamai.steamstatic.com/store_i...   

                                                   movies  review_count  \
377160  [http://video.akamai.steamstatic.com/store_tra...        258591   

       review_score_text  review_score              platforms  \
377160     Very Positive             7  [windows, mac, linux]   

                     developers            publishers  
377160  [Bethesda Game Studios]  [Bethesda Softworks]  


In [43]:
# Raw record of the top recommendation, for comparison with the queried game
print(df.loc[df['name'] == "Fallout 3 - Game of the Year Edition"])

                                       name  release_date     price  \
22370  Fallout 3 - Game of the Year Edition  17 Dec, 2009  164900.0   

                                       short_description  \
22370  Prepare for the Future™ With Fallout 3: Game o...   

                                            header_image genres  \
22370  https://shared.akamai.steamstatic.com/store_it...  [RPG]   

                                       tags  \
22370  [Single-player, Family Sharing, RPG]   

                                             screenshots  \
22370  [https://shared.akamai.steamstatic.com/store_i...   

                                                  movies  review_count  \
22370  [http://video.akamai.steamstatic.com/store_tra...         39136   

      review_score_text  review_score              platforms  \
22370     Very Positive             7  [windows, mac, linux]   

                    developers            publishers  
22370  [Bethesda Game Studios]  [Bethesda Softworks]

In [44]:
# The raw title carries a trailing space, so it is included to match the un-normalised `df`
print(df.loc[df['name'] == "The Elder Scrolls IV: Oblivion "])

                                  name  release_date     price  \
22330  The Elder Scrolls IV: Oblivion   16 Jun, 2009  135999.0   

                                       short_description  \
22330  The Elder Scrolls IV: Oblivion® Game of the Ye...   

                                            header_image genres  \
22330  https://shared.akamai.steamstatic.com/store_it...  [RPG]   

                                                    tags  \
22330  [Single-player, Steam Cloud, Family Sharing, RPG]   

                                             screenshots  \
22330  [https://shared.akamai.steamstatic.com/store_i...   

                                                  movies  review_count  \
22330  [http://video.akamai.steamstatic.com/store_tra...         39469   

             review_score_text  review_score              platforms  \
22330  Overwhelmingly Positive             8  [windows, mac, linux]   

                    developers            publishers  
22330  [Bethesda Game 