In [1]:
import pandas as pd
import numpy as np
import requests
import json
import os
import time
import ast
import sklearn
import pickle
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import r2_score, mean_squared_error, root_mean_squared_error, mean_absolute_error, mean_absolute_percentage_error
from imblearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import joblib
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer 
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import ExtraTreesRegressor, GradientBoostingRegressor, HistGradientBoostingRegressor 
from imblearn.over_sampling import SMOTE
import shap
from xgboost import plot_importance
from xgboost import XGBClassifier

In [2]:
data = pd.read_csv('../data/rawg_cleaned_games_no_dev_data.csv')

In [3]:
data.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,id,slug,name,released,tba,rating,rating_top,ratings_count,...,PC_platform,PlayStation 5_platform,Xbox Series S/X_platform,PlayStation 4_platform,Xbox One_platform,macOS_platform,Nintendo Switch_platform,Linux_platform,Android_platform,iOS_platform
0,0,0,3498,grand-theft-auto-v,Grand Theft Auto V,2013-09-17,False,4.47,5,7118,...,1,1,1,1,1,0,0,0,0,0
1,1,1,3328,the-witcher-3-wild-hunt,The Witcher 3: Wild Hunt,2015-05-18,False,4.64,5,6913,...,1,1,1,1,1,1,1,0,0,0
2,2,2,4200,portal-2,Portal 2,2011-04-18,False,4.59,5,5908,...,1,0,0,0,1,1,0,1,0,0
3,3,3,4291,counter-strike-global-offensive,Counter-Strike: Global Offensive,2012-08-21,False,3.57,4,3574,...,1,0,0,0,0,0,0,1,0,0
4,4,4,5286,tomb-raider,Tomb Raider (2013),2013-03-05,False,4.06,4,4007,...,1,0,0,1,1,1,0,0,0,0


In [4]:
data = data.drop(columns=['parent_platforms', 'user_game', 'updated', 'suggestions_count', 'community_rating', 'tba','Unnamed: 0.1', 'Unnamed: 0'])

In [5]:
data.columns

Index(['id', 'slug', 'name', 'released', 'rating', 'rating_top',
       'ratings_count', 'reviews_text_count', 'added', 'metacritic',
       ...
       'PC_platform', 'PlayStation 5_platform', 'Xbox Series S/X_platform',
       'PlayStation 4_platform', 'Xbox One_platform', 'macOS_platform',
       'Nintendo Switch_platform', 'Linux_platform', 'Android_platform',
       'iOS_platform'],
      dtype='object', length=109)

In [6]:
data = data[(data['owned'] > 150)]

In [7]:
def remove_string(list_str):
        extracted_list = ast.literal_eval(list_str)
        return extracted_list

## Creating Target

In [9]:
conditions = [
    (data['rating_exceptional'] >= 60),
    (data['rating_exceptional'] < 60) & (data['rating_exceptional'] >= 45),
    (data['rating_exceptional'] < 45) & (data['rating_exceptional'] >= 25),
    (data['rating_exceptional'] < 25)
]
choices = [4,3,2,1]
data['target'] = np.select(conditions, choices, default=0)

In [10]:
data

Unnamed: 0,id,slug,name,released,rating,rating_top,ratings_count,reviews_text_count,added,metacritic,...,PlayStation 5_platform,Xbox Series S/X_platform,PlayStation 4_platform,Xbox One_platform,macOS_platform,Nintendo Switch_platform,Linux_platform,Android_platform,iOS_platform,target
0,3498,grand-theft-auto-v,Grand Theft Auto V,2013-09-17,4.47,5,7118,65,22059,92.0,...,1,1,1,1,0,0,0,0,0,3
1,3328,the-witcher-3-wild-hunt,The Witcher 3: Wild Hunt,2015-05-18,4.64,5,6913,79,21702,92.0,...,1,1,1,1,1,1,0,0,0,4
2,4200,portal-2,Portal 2,2011-04-18,4.59,5,5908,38,20576,95.0,...,0,0,0,1,1,0,1,0,0,4
3,4291,counter-strike-global-offensive,Counter-Strike: Global Offensive,2012-08-21,3.57,4,3574,29,18267,81.0,...,0,0,0,0,0,0,1,0,0,1
4,5286,tomb-raider,Tomb Raider (2013),2013-03-05,4.06,4,4007,14,17600,86.0,...,0,0,1,1,1,0,0,0,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10895,375233,cartoon-network-journeys-vr,Cartoon Network Journeys VR,,0.00,0,1,0,153,,...,0,0,0,0,0,0,0,0,0,0
10916,18819,driftwood-the-visual-novel,Driftwood The Visual Novel,2016-08-08,0.00,0,1,0,153,,...,0,0,0,0,1,0,1,0,0,0
10921,14152,prism-collider,Prism Collider,2017-01-31,0.00,0,1,0,153,,...,0,0,0,0,0,0,0,0,0,0
10943,49986,mytd-wo-de-ta-fang,MyTD 我的塔防,,0.00,0,1,0,152,,...,0,0,0,0,0,0,0,0,0,0


In [11]:
data.iloc[:, 12:-1]

Unnamed: 0,esrb_rating,rating_exceptional,rating_recommended,rating_meh,rating_skip,owned,beaten,dropped,playing,Action_genre,...,PC_platform,PlayStation 5_platform,Xbox Series S/X_platform,PlayStation 4_platform,Xbox One_platform,macOS_platform,Nintendo Switch_platform,Linux_platform,Android_platform,iOS_platform
0,Mature,59.00,32.70,6.36,1.94,12714,6254.0,1154.0,753.0,1,...,1,1,1,1,1,0,0,0,0,0
1,Mature,76.74,16.34,4.23,2.69,12714,5091.0,995.0,893.0,1,...,1,1,1,1,1,1,1,0,0,0
2,Everyone 10+,69.33,25.16,3.02,2.50,13016,5716.0,627.0,161.0,0,...,1,0,0,0,1,1,0,1,0,0
3,Mature,16.25,46.57,25.94,11.24,14084,1109.0,2097.0,631.0,0,...,1,0,0,0,0,0,0,1,0,0
4,Mature,25.80,60.44,10.79,2.97,11545,4402.0,558.0,116.0,1,...,1,0,0,1,1,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10895,,,100.00,,,151,1.0,,,1,...,1,0,0,0,0,0,0,0,0,0
10916,,,,100.00,,151,,1.0,,0,...,1,0,0,0,0,1,0,1,0,0
10921,,,,,100.00,153,,,,1,...,1,0,0,0,0,0,0,0,0,0
10943,,,,100.00,,152,,,,0,...,1,0,0,0,0,0,0,0,0,0


In [12]:
data.iloc[:, 12:-1].drop(columns=['rating_exceptional', 'rating_recommended', 'rating_meh', 'rating_skip','owned', 'beaten', 'dropped', 'playing'])

Unnamed: 0,esrb_rating,Action_genre,RPG_genre,Shooter_genre,Puzzle_genre,Adventure_genre,Indie_genre,Platformer_genre,Massively Multiplayer_genre,Sports_genre,...,PC_platform,PlayStation 5_platform,Xbox Series S/X_platform,PlayStation 4_platform,Xbox One_platform,macOS_platform,Nintendo Switch_platform,Linux_platform,Android_platform,iOS_platform
0,Mature,1,0,0,0,0,0,0,0,0,...,1,1,1,1,1,0,0,0,0,0
1,Mature,1,1,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,0,0,0
2,Everyone 10+,0,0,1,1,0,0,0,0,0,...,1,0,0,0,1,1,0,1,0,0
3,Mature,0,0,1,0,0,0,0,0,0,...,1,0,0,0,0,0,0,1,0,0
4,Mature,1,0,0,0,0,0,0,0,0,...,1,0,0,1,1,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10895,,1,0,0,0,1,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
10916,,0,0,0,0,0,1,0,0,0,...,1,0,0,0,0,1,0,1,0,0
10921,,1,0,0,0,0,1,0,0,0,...,1,0,0,0,0,0,0,0,0,0
10943,,0,0,0,0,0,1,0,0,0,...,1,0,0,0,0,0,0,0,0,0


In [13]:
X = data.iloc[:, 12:-1].drop(columns=['rating_exceptional', 'rating_recommended', 'rating_meh', 'rating_skip','owned', 'beaten', 'dropped', 'playing'])
y = data['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=123)

In [14]:
# imputer = IterativeImputer(estimator = HistGradientBoostingRegressor(verbose=2, random_state=434), max_iter=1, random_state=343)
# classifier = GradientBoostingClassifier(n_estimators=200, learning_rate=0.8, max_depth=2, random_state=123)
# encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
# smote = SMOTE(random_state=3434)

In [15]:
imputer = SimpleImputer(strategy='most_frequent')
classifier = XGBClassifier(n_estimators=200, learning_rate=0.1, max_depth=2, random_state=123)
encoder = OneHotEncoder()

In [16]:
ct = ColumnTransformer([
    ('encoder', encoder, ['esrb_rating']),
], remainder = 'passthrough')

In [17]:
#X_train_smoted, y_train_smoted = smote.fit_resample(X_train, y_train)

In [18]:
pipe = Pipeline([
    ('ct', ct),
    ('imputer', imputer),
    ('smote', SMOTE(random_state=42)),
    ('xgboost', classifier)
]).fit(X_train, y_train)

In [19]:
# pipe = Pipeline([
#     ('encoder', encoder),
#     ('imputer', imputer),
#     ('smote', SMOTE(random_state=3434)),
#     ('xgboost', classifier)
# ]).fit(X_train, y_train)

In [20]:
pipe

In [21]:
joblib.dump(pipe, '../models/xgboostclassifier_v1.joblib')

['../models/xgboostclassifier_v1.joblib']

In [22]:
model = joblib.load('../models/xgboostclassifier_v1.joblib')

In [23]:
model

In [24]:
#joblib.dump(pipe, '../models/xgboostclassifier_v2.joblib')

In [25]:
y_pred = pipe.predict(X_test)

In [26]:
y_pred

array([4, 1, 1, ..., 1, 1, 1], dtype=int64)

In [27]:
confusion_matrix(y_test, y_pred)

array([[ 588,  232,    7,   10,    2],
       [ 392, 1097,   51,   62,   37],
       [  62,  208,   20,   33,    9],
       [  19,   54,    7,   14,   11],
       [   1,   17,    2,    8,    4]], dtype=int64)

In [28]:
pipe.score(X_test, y_test)

0.5846623685103495

In [29]:
print(f'MSE: {mean_squared_error(y_test, y_pred)}')
print(f'RMSE: {root_mean_squared_error(y_test, y_pred)}')
print(f'MAE: {mean_absolute_error(y_test, y_pred)}')
print(f'MAPE: {mean_absolute_percentage_error(y_test, y_pred)}')
print(f'R2: {r2_score(y_test, y_pred)}')

MSE: 0.8554462164913471
RMSE: 0.9249033552168286
MAE: 0.5412283678316933
MAPE: 434008243696376.6
R2: -0.35153188677776215


In [30]:
export_df = X.head(1)
export_df.iloc[0] = 0

In [31]:
export_df

Unnamed: 0,esrb_rating,Action_genre,RPG_genre,Shooter_genre,Puzzle_genre,Adventure_genre,Indie_genre,Platformer_genre,Massively Multiplayer_genre,Sports_genre,...,PC_platform,PlayStation 5_platform,Xbox Series S/X_platform,PlayStation 4_platform,Xbox One_platform,macOS_platform,Nintendo Switch_platform,Linux_platform,Android_platform,iOS_platform
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [32]:
export_df

Unnamed: 0,esrb_rating,Action_genre,RPG_genre,Shooter_genre,Puzzle_genre,Adventure_genre,Indie_genre,Platformer_genre,Massively Multiplayer_genre,Sports_genre,...,PC_platform,PlayStation 5_platform,Xbox Series S/X_platform,PlayStation 4_platform,Xbox One_platform,macOS_platform,Nintendo Switch_platform,Linux_platform,Android_platform,iOS_platform
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [33]:
y_pred

array([4, 1, 1, ..., 1, 1, 1], dtype=int64)

In [34]:
#export_df.to_csv('../data/zero_df.csv')

In [35]:
explainer = shap.Explainer(pipe['xgboost'].predict_proba, pipe[:-2].transform(X_train))

In [36]:
# print(type(pipe[:-1].transform(X_train)))
# print(type(pipe[:-1].transform(X_test)))

In [37]:
# pipe[:-1].transform(X_train)

In [39]:
explainer = shap.Explainer(pipe['xgboost'].predict_proba, pipe[:-2].transform(X_train), feature_names=pipe[:-2].get_feature_names_out())
explanation = explainer(pipe[:-2].transform(X_test))

PermutationExplainer explainer: 2948it [02:13, 20.93it/s]                          


In [40]:
joblib.dump(explanation, '../models/explanation_v1.joblib')

['../models/explanation_v1.joblib']

In [41]:
joblib.dump(explainer, '../models/explainer_v1.joblib')

['../models/explainer_v1.joblib']

In [None]:
target_list = list(pipe['xgboost'].classes_)

In [None]:
probs = model.predict_proba(X_test)
target = 4
class_index = target_list.index(target)
i = np.argmax(probs[:, class_index])
shap.plots.waterfall(explanation[i, :, class_index])

In [None]:
target_list = list(pipe['xgboost'].classes_)
i = 0

target=4

class_index = list(pipe['xgboost'].classes_).index(target)
shap.plots.waterfall(explanation[i, :, class_index])

In [None]:
preds = pipe.predict(X_train)          # X is your input data
target = 4

# Get index of an instance where model predicted class 4
i = np.where(preds == target)[0][0]  # pick first such example

# Use SHAP waterfall plot
class_index = target_list.index(target)
shap.plots.waterfall(explanation[i, :, class_index])

In [None]:
preds = pipe.predict(X_train)          # X is your input data
target = 3

# Get index of an instance where model predicted class 4
i = np.where(preds == target)[0][0]  # pick first such example

# Use SHAP waterfall plot
class_index = target_list.index(target)
shap.plots.waterfall(explanation[i, :, class_index])

In [None]:
list(feature_names)

In [None]:
plot_importance(pipe['xgboost'])

In [None]:
model = pipe['xgboost']
feature_names = pipe[:-1].get_feature_names_out()

In [None]:
importances = model.feature_importances_

# Create DataFrame
importance_df = pd.DataFrame({
    'Feature': feature_names,
    'Importance': importances
}).sort_values(by='Importance', ascending=False)

In [None]:
model.feature_importances_

In [None]:
importance_df