# 1. Importing the libraries and dataset

In [2]:
import warnings

import pandas as pd
import xgboost as xgb
import dill as pickle

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.ensemble import VotingRegressor

warnings.filterwarnings("ignore")

In [3]:
dataframe = pd.read_csv("Datasets/steam_games_cleaned.csv")
dataframe.head()

Unnamed: 0,release_date,has_setting,published_by_developer,multiple_languages,FPS,Gore,Action,Shooter,First-Person,Great Soundtrack,...,genre_Free to Play,genre_RPG,genre_Indie,genre_Early Access,genre_Simulation,genre_Racing,genre_Casual,genre_Sports,total_reviews,positive_reviews_share
0,2016,False,False,True,True,True,True,True,True,True,...,False,False,False,False,False,False,False,False,42550.0,0.92
1,2017,False,True,True,True,False,True,True,True,False,...,False,False,False,False,False,False,False,False,836608.0,0.49
2,2018,False,False,True,False,False,True,False,False,True,...,False,False,False,False,False,False,False,False,7030.0,0.71
3,2018,False,True,True,True,False,True,True,False,False,...,False,False,False,False,False,False,False,False,167115.0,0.61
4,2003,False,True,True,False,False,True,False,False,False,...,True,True,False,False,False,False,False,False,11481.0,0.74


# 2. RandomForestRegressor

**Define the target and source data.**

In [4]:
X_RF = dataframe.drop("positive_reviews_share", axis=1)
y_RF = dataframe["positive_reviews_share"]

**Spliting the data.**

In [5]:
X_train_RF, X_test_RF, y_train_RF, y_test_RF = train_test_split(X_RF, y_RF, test_size=0.2, random_state=42)

**Creating a grid of hyper-parameters**

In [6]:
#param_grid_RF = {
#    "n_estimators": [100, 200, 300],
#    "max_depth": [None, 5, 10],
#    "min_samples_split": [2, 5, 10]
#}

**Applying the best hyper-parameters for a grid on the previous runs performed.**

In [7]:
best_param_grid_RF = {
    "n_estimators": [300],
    "max_depth": [None],
    "min_samples_split": [10]
}

**Defining regressor.**

In [8]:
regressor_RF = RandomForestRegressor(random_state=42)

**Creating a model for a regression. Hyper-parameters are defined with a library "GridSearchCV".**

In [9]:
model_RF = GridSearchCV(regressor_RF, param_grid=best_param_grid_RF, cv=5, scoring="r2")
model_RF.fit(X_train_RF, y_train_RF)

In [10]:
best_params_RF = model_RF.best_params_

**Creating a prediction.**

In [11]:
y_pred_RF = model_RF.best_estimator_.predict(X_test_RF)

**Creating metrics and printing them out.**

In [12]:
mae_RF = mean_absolute_error(y_test_RF, y_pred_RF)
mse_RF = mean_squared_error(y_test_RF, y_pred_RF)
rmse_RF = mean_squared_error(y_test_RF, y_pred_RF, squared=False)
r2_RF = r2_score(y_test_RF, y_pred_RF)

In [13]:
print("Best Hyperparameters:", best_params_RF)
print("Mean Absolute Error (MAE):", mae_RF)
print("Mean Squared Error (MSE):", mse_RF)
print("Root Mean Squared Error (RMSE):", rmse_RF)
print("R-squared (R2):", r2_RF)

Best Hyperparameters: {'max_depth': None, 'min_samples_split': 10, 'n_estimators': 300}
Mean Absolute Error (MAE): 0.1190569881321025
Mean Squared Error (MSE): 0.02432968470649142
Root Mean Squared Error (RMSE): 0.15597975736130448
R-squared (R2): 0.287073164450252


# 3. XGBRegressor

**Define the target and source data.**

In [14]:
X_XGB = dataframe.drop("positive_reviews_share", axis=1)
y_XGB = dataframe["positive_reviews_share"]

**Spliting the data.**

In [15]:
X_train_XGB, X_test_XGB, y_train_XGB, y_test_XGB = train_test_split(X_XGB, y_XGB, test_size=0.2, random_state=42)

**Creating a grid of hyper-parameters**

In [16]:
#param_grid_XGB = {
#    "learning_rate": [0.1, 0.01, 0.001],
#    "n_estimators": [100, 200, 300],
#    "max_depth": [3, 4, 5],
#    "subsample": [0.8, 0.9, 1.0],
#    "colsample_bytree": [0.8, 0.9, 1.0],
#    "gamma": [0, 0.1, 0.2],
#    "reg_alpha": [0, 0.1, 0.2],
#    "reg_lambda": [0, 0.1, 0.2]
#}

**Applying the best hyper-parameters for a grid on the previous runs performed.**

In [17]:
best_param_grid_XGB = {
    "learning_rate": [0.1],
    "n_estimators": [200],
    "max_depth": [4],
    "subsample": [0.9],
    "colsample_bytree": [0.8],
    "gamma": [0],
    "reg_alpha": [0.2],
    "reg_lambda": [0]
}

**Defining regressor.**

In [18]:
regressor_XGB = xgb.XGBRegressor(random_state=42)

**Creating a model for a regression. Hyper-parameters are defined with a library "GridSearchCV".**

In [19]:
model_XGB = GridSearchCV(regressor_XGB, best_param_grid_XGB, scoring="r2", cv=5)
model_XGB.fit(X_train_XGB, y_train_XGB)

In [20]:
best_params_XGB = model_XGB.best_estimator_.get_params()

**Creating a prediction.**

In [21]:
y_pred_XGB = model_XGB.predict(X_test_XGB)

**Creating metrics and printing them out.**

In [22]:
mae_XGB = mean_absolute_error(y_test_XGB, y_pred_XGB)
mse_XGB = mean_squared_error(y_test_XGB, y_pred_XGB)
rmse_XGB = mean_squared_error(y_test_XGB, y_pred_XGB, squared=False)
r2_XGB = r2_score(y_test_XGB, y_pred_XGB)

In [23]:
print("Best Hyperparameters:", best_params_XGB)
print("Mean Absolute Error (MAE):", mae_XGB)
print("Mean Squared Error (MSE):", mse_XGB)
print("Root Mean Squared Error (RMSE):", rmse_XGB)
print("R-squared (R2):", r2_XGB)

Best Hyperparameters: {'objective': 'reg:squarederror', 'base_score': None, 'booster': None, 'callbacks': None, 'colsample_bylevel': None, 'colsample_bynode': None, 'colsample_bytree': 0.8, 'early_stopping_rounds': None, 'enable_categorical': False, 'eval_metric': None, 'feature_types': None, 'gamma': 0, 'gpu_id': None, 'grow_policy': None, 'importance_type': None, 'interaction_constraints': None, 'learning_rate': 0.1, 'max_bin': None, 'max_cat_threshold': None, 'max_cat_to_onehot': None, 'max_delta_step': None, 'max_depth': 4, 'max_leaves': None, 'min_child_weight': None, 'missing': nan, 'monotone_constraints': None, 'n_estimators': 200, 'n_jobs': None, 'num_parallel_tree': None, 'predictor': None, 'random_state': 42, 'reg_alpha': 0.2, 'reg_lambda': 0, 'sampling_method': None, 'scale_pos_weight': None, 'subsample': 0.9, 'tree_method': None, 'validate_parameters': None, 'verbosity': None}
Mean Absolute Error (MAE): 0.11837147831843876
Mean Squared Error (MSE): 0.02415938801126711
Root 

# 4. Ensemble of RandomForest and  XGBoost Regressors

**Define the target and source data.**

In [24]:
X_en = dataframe.drop("positive_reviews_share", axis=1)
y_en = dataframe["positive_reviews_share"]

**Spliting the data.**

In [25]:
X_train_en, X_test_en, y_train_en, y_test_en = train_test_split(X_en, y_en, test_size=0.2, random_state=42)

**Creating a model for an ensemble.**

In [26]:
model_en = VotingRegressor(estimators=[("rf", model_RF.best_estimator_), ("xgb", model_XGB.best_estimator_)])

In [27]:
model_en.fit(X_train_en, y_train_en)

**Creating a prediction.**

In [28]:
y_pred_en = model_en.predict(X_test_en)

**Creating metrics and printing them out.**

In [29]:
mae_en = mean_absolute_error(y_test_en, y_pred_en)
mse_en = mean_squared_error(y_test_en, y_pred_en)
rmse_en = mean_squared_error(y_test_en, y_pred_en, squared=False)
r2_en = r2_score(y_test_en, y_pred_en)

In [30]:
print("Mean Absolute Error (MAE):", mae_en)
print("Mean Squared Error (MSE):", mse_en)
print("Root Mean Squared Error (RMSE):", rmse_en)
print("R-squared (R2):", r2_en)

Mean Absolute Error (MAE): 0.11700650989580823
Mean Squared Error (MSE): 0.023628446581069384
Root Mean Squared Error (RMSE): 0.15371547280956913
R-squared (R2): 0.3076213747437698


# 4. Saving the models

**Creating a function to save model.**

In [31]:
def save_model(model, filename):
    with open("Models/" + filename, "wb") as file:
        pickle.dump(model, file)

In [32]:
save_model(model_RF, "random_forest_model.pkl")

In [33]:
save_model(model_XGB, "XGBoost_model.pkl")

In [34]:
save_model(model_en, "Ensemble_model.pkl")