# 1. Importing the libraries and dataset

In [1]:
import warnings

import pandas as pd
import xgboost as xgb
import dill as pickle

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

warnings.filterwarnings("ignore")

In [2]:
dataframe = pd.read_csv("Datasets/steam_games_correlated.csv")
dataframe.head()

Unnamed: 0,Great Soundtrack,Singleplayer,Fast-Paced,Classic,Atmospheric,Difficult,Survival,PvP,Early Access,Strategy,...,detail_Profile Features Limited \r\n\t\t\t\t\t\t\t\t\t,detail_Steam is learning about this game \r\n\t\t\t\t\t\t\t\t\t,detail_Downloadable Content,genre_Massively Multiplayer,genre_Strategy,genre_Free to Play,genre_Indie,genre_Early Access,genre_Simulation,positive_reviews_share
0,True,True,True,True,True,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,0.92
1,False,False,False,False,False,True,True,True,True,True,...,False,False,False,True,False,False,False,False,False,0.49
2,True,True,False,False,False,True,False,False,False,True,...,False,False,False,False,True,False,False,False,False,0.71
3,False,False,False,False,True,False,True,True,True,False,...,False,False,False,True,False,False,False,False,False,0.61
4,False,False,False,False,True,True,False,True,False,True,...,False,False,False,True,True,True,False,False,False,0.74


# 2. RandomForestRegressor

**Define the target and source data.**

In [3]:
X_RF = dataframe.drop("positive_reviews_share", axis=1)
y_RF = dataframe["positive_reviews_share"]

**Spliting the data.**

In [4]:
X_train_RF, X_test_RF, y_train_RF, y_test_RF = train_test_split(X_RF, y_RF, test_size=0.2, random_state=42)

**Creating a grid of hyper-parameters**

In [5]:
#param_grid_RF = {
#    "n_estimators": [100, 200, 300],
#    "max_depth": [None, 5, 10],
#    "min_samples_split": [2, 5, 10]
#}

**Applying the best hyper-parameters for a grid on the previous runs performed.**

In [6]:
best_param_grid_RF = {
    "n_estimators": [300],
    "max_depth": [None],
    "min_samples_split": [10]
}

**Defining regressor.**

In [7]:
regressor_RF = RandomForestRegressor(random_state=42)

**Creating a model for a regression. Hyper-parameters are defined with a library "GridSearchCV".**

In [8]:
model_RF = GridSearchCV(regressor_RF, param_grid=best_param_grid_RF, cv=5, scoring="r2")
model_RF.fit(X_train_RF, y_train_RF)

In [9]:
best_params_RF = model_RF.best_params_

**Creating a prediction.**

In [10]:
y_pred_RF = model_RF.best_estimator_.predict(X_test_RF)

**Creating metrics and printing them out.**

In [11]:
mae_RF = mean_absolute_error(y_test_RF, y_pred_RF)
mse_RF = mean_squared_error(y_test_RF, y_pred_RF)
rmse_RF = mean_squared_error(y_test_RF, y_pred_RF, squared=False)
r2_RF = r2_score(y_test_RF, y_pred_RF)

In [12]:
print("Best Hyperparameters:", best_params_RF)
print("Mean Absolute Error (MAE):", mae_RF)
print("Mean Squared Error (MSE):", mse_RF)
print("Root Mean Squared Error (RMSE):", rmse_RF)
print("R-squared (R2):", r2_RF)

Best Hyperparameters: {'max_depth': None, 'min_samples_split': 10, 'n_estimators': 300}
Mean Absolute Error (MAE): 0.12911972977054884
Mean Squared Error (MSE): 0.027910630676672963
Root Mean Squared Error (RMSE): 0.16706474995244497
R-squared (R2): 0.18214157534030384


# 3. XGBRegressor

**Define the target and source data.**

In [13]:
X_XGB = dataframe.drop("positive_reviews_share", axis=1)
y_XGB = dataframe["positive_reviews_share"]

**Spliting the data.**

In [14]:
X_train_XGB, X_test_XGB, y_train_XGB, y_test_XGB = train_test_split(X_XGB, y_XGB, test_size=0.2, random_state=42)

**Creating a grid of hyper-parameters**

In [15]:
#param_grid_XGB = {
#    "learning_rate": [0.1, 0.01, 0.001],
#    "n_estimators": [100, 200, 300],
#    "max_depth": [3, 4, 5],
#    "subsample": [0.8, 0.9, 1.0],
#    "colsample_bytree": [0.8, 0.9, 1.0],
#    "gamma": [0, 0.1, 0.2],
#    "reg_alpha": [0, 0.1, 0.2],
#    "reg_lambda": [0, 0.1, 0.2]
#}

**Applying the best hyper-parameters for a grid on the previous runs performed.**

In [16]:
best_param_grid_XGB = {
    "learning_rate": [0.1],
    "n_estimators": [200],
    "max_depth": [4],
    "subsample": [0.9],
    "colsample_bytree": [0.8],
    "gamma": [0],
    "reg_alpha": [0.2],
    "reg_lambda": [0]
}

**Defining regressor.**

In [17]:
regressor_XGB = xgb.XGBRegressor(random_state=42)

**Creating a model for a regression. Hyper-parameters are defined with a library "GridSearchCV".**

In [18]:
model_XGB = GridSearchCV(regressor_XGB, best_param_grid_XGB, scoring="r2", cv=5)
model_XGB.fit(X_train_XGB, y_train_XGB)

In [19]:
best_params_XGB = model_XGB.best_estimator_.get_params()

**Creating a prediction.**

In [20]:
y_pred_XGB = model_XGB.predict(X_test_XGB)

**Creating metrics and printing them out.**

In [21]:
mae_XGB = mean_absolute_error(y_test_XGB, y_pred_XGB)
mse_XGB = mean_squared_error(y_test_XGB, y_pred_XGB)
rmse_XGB = mean_squared_error(y_test_XGB, y_pred_XGB, squared=False)
r2_XGB = r2_score(y_test_XGB, y_pred_XGB)

In [22]:
print("Best Hyperparameters:", best_params_XGB)
print("Mean Absolute Error (MAE):", mae_XGB)
print("Mean Squared Error (MSE):", mse_XGB)
print("Root Mean Squared Error (RMSE):", rmse_XGB)
print("R-squared (R2):", r2_XGB)

Best Hyperparameters: {'objective': 'reg:squarederror', 'base_score': None, 'booster': None, 'callbacks': None, 'colsample_bylevel': None, 'colsample_bynode': None, 'colsample_bytree': 0.8, 'early_stopping_rounds': None, 'enable_categorical': False, 'eval_metric': None, 'feature_types': None, 'gamma': 0, 'gpu_id': None, 'grow_policy': None, 'importance_type': None, 'interaction_constraints': None, 'learning_rate': 0.1, 'max_bin': None, 'max_cat_threshold': None, 'max_cat_to_onehot': None, 'max_delta_step': None, 'max_depth': 4, 'max_leaves': None, 'min_child_weight': None, 'missing': nan, 'monotone_constraints': None, 'n_estimators': 200, 'n_jobs': None, 'num_parallel_tree': None, 'predictor': None, 'random_state': 42, 'reg_alpha': 0.2, 'reg_lambda': 0, 'sampling_method': None, 'scale_pos_weight': None, 'subsample': 0.9, 'tree_method': None, 'validate_parameters': None, 'verbosity': None}
Mean Absolute Error (MAE): 0.12692244874698083
Mean Squared Error (MSE): 0.026884940685813048
Root