In [None]:
import joblib
import optuna
import pandas as pd

from sklearn.base import clone
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import mutual_info_regression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, root_mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Play Time Imputation Model

In [None]:
# Load the cleaned BoardGameGeek data set (path is relative to this notebook).
df = pd.read_excel(
    '../../../data/BGG_Cleaned_Data_Set_Mechanics_Imputing_Embedded.xlsx'
)

In [None]:
# Display the loaded frame as this cell's output to sanity-check the import.
df

## Feature Selection
#### Calculating the mutual information for the play time column

In [None]:
# Build the frame used for the mutual-information analysis:
#   1. drop the per-domain indicator columns and the columns named '0'..'6'
#      (presumably embedding dimensions -- confirm against the data set),
#   2. drop every row that still has a missing value,
#   3. replace the two categorical columns with their integer category codes.
df_mutual = (
    df
    .drop(columns=[
        'Strategy Games', 'Abstract Games', 'Thematic Games', 'Party Games',
        'Wargames', 'Customizable Games', 'Children\'s Games', 'Family Games',
        '0', '1', '2', '3', '4', '5', '6',
    ])
    .dropna()
    .assign(**{
        'Domains': lambda frame: frame['Domains'].astype('category').cat.codes,
        'mechanic_cluster': lambda frame: frame['mechanic_cluster'].astype('category').cat.codes,
    })
)

# Everything except 'Play Time' is a candidate feature; 'Play Time' is the target.
features = df_mutual.drop(columns=['Play Time'])
target = df_mutual['Play Time'].values

# Estimate the mutual information between each feature and the target.
mutual_info = mutual_info_regression(
    features,
    target,
    discrete_features='auto',
    n_neighbors=5,
    random_state=42,
)

# One row per feature, strongest dependency first.
mutual_info_df = pd.DataFrame(
    {'Feature': features.columns, 'Mutual Information': mutual_info}
).sort_values(by='Mutual Information', ascending=False)

mutual_info_df

## Data Preprocessing
#### Choosing the best features

In [None]:
# Keep the features whose mutual information with 'Play Time' is above 0.2.
selected_features = mutual_info_df[mutual_info_df['Mutual Information'] > 0.2]['Feature'].values

# Exclude 'Domains' from the model inputs (presumably because it is only
# integer-encoded in `df_mutual`, not in `df` -- confirm).
# Filtering, instead of calling `list.remove`, is a no-op when 'Domains' did not
# pass the threshold; the previous guard checked `df.columns` rather than the
# list itself and raised ValueError in that case.
columns_to_select = [feature for feature in selected_features if feature != 'Domains']

# Keep only the selected columns plus the target, then drop rows with missing values.
df = df[columns_to_select + ['Play Time']].dropna()
df

#### Removing possible outliers

In [None]:
# Trim the upper tail: keep only the rows whose Play Time is strictly below the
# 95th percentile of the column.
df = df.loc[df['Play Time'].lt(df['Play Time'].quantile(0.95))]

## Model Training
#### Splitting the data

In [None]:
# Model inputs: the selected columns without 'mechanic_cluster'.
# `errors='ignore'` keeps this cell working when 'mechanic_cluster' did not pass
# the mutual-information threshold and is therefore absent from the selection;
# a plain drop would raise KeyError in that case.
X = df[columns_to_select].drop(columns=['mechanic_cluster'], errors='ignore')
y = df['Play Time']

# 80/20 split into train-full/test, then 80/20 again into train/validation.
# The validation part is used for hyperparameter search, the test part only for
# the final evaluation.
X_train_full, X_test, y_train_full, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train_full, y_train_full, test_size=0.2, random_state=42)

#### Define the Optuna objective function

In [None]:
def objective(trial):
    """Optuna objective: validation MSE of a scaled random-forest pipeline.

    Samples one random-forest configuration from `trial`, fits a
    StandardScaler + RandomForestRegressor pipeline on the notebook-level
    `X_train` / `y_train`, and scores it on `X_val` / `y_val`.

    Args:
        trial: the Optuna trial used to sample the hyperparameters.

    Returns:
        The mean squared error of the fitted pipeline on the validation split.
    """
    # Sample the hyperparameters; the dict keys match the estimator's keyword names.
    rf_params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 500),
        'max_depth': trial.suggest_int('max_depth', 5, 30, step=5),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 10),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 10),
        'max_features': trial.suggest_categorical('max_features', ['sqrt', 'log2', None]),
        'bootstrap': trial.suggest_categorical('bootstrap', [True, False]),
    }

    forest = RandomForestRegressor(random_state=42, **rf_params)
    pipeline = Pipeline([('scaler', StandardScaler()), ('rf', forest)])
    pipeline.fit(X_train, y_train)

    # Lower is better: the study minimises this value.
    return mean_squared_error(y_val, pipeline.predict(X_val))

#### Running the Bayesian Optimization

In [None]:
# Minimise the validation MSE returned by `objective` over 75 trials.
# TPESampler is Optuna's default sampler; it is created explicitly only to fix
# its seed, so the search (and therefore the chosen hyperparameters) is
# reproducible across "Restart & Run All", like the other seeded steps.
bayesian_optimization = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
bayesian_optimization.optimize(objective, n_trials=75)

#### The best parameters

In [None]:
# Hyperparameters of the best trial found by the study (a dict keyed by the
# names passed to `trial.suggest_*` in `objective`).
best_parameters = bayesian_optimization.best_params
print(best_parameters)

#### Training the model with the best parameters

In [None]:
# Rebuild the pipeline with the hyperparameters of the best trial. Only the
# six tuned settings are forwarded to the estimator; the seed stays fixed.
final_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('rf', RandomForestRegressor(
        random_state=42,
        **{
            name: best_parameters[name]
            for name in (
                'n_estimators',
                'max_depth',
                'min_samples_split',
                'min_samples_leaf',
                'max_features',
                'bootstrap',
            )
        },
    )),
])

# Fit on the full training portion (train + validation rows).
final_pipeline.fit(X_train_full, y_train_full)

## Model Evaluation
#### Evaluating the model

In [None]:
# In-sample metrics: the final pipeline scored on the data it was fitted on.
train_predictions = final_pipeline.predict(X_train_full)
train_mse, train_mae, train_rmse, train_r2 = (
    metric(y_train_full, train_predictions)
    for metric in (mean_squared_error, mean_absolute_error, root_mean_squared_error, r2_score)
)
print(
    f'Train MSE for the most Optimized Pipeline: {train_mse:.2f}',
    f'Train MAE for the most Optimized Pipeline: {train_mae:.2f}',
    f'Train RMSE for the most Optimized Pipeline: {train_rmse:.2f}',
    f'Train R2 for the most Optimized Pipeline: {train_r2:.2f}',
    sep='\n',
)

In [None]:
# `final_pipeline` was fitted on X_train_full, which contains the validation
# rows, so scoring it on X_val would report in-sample numbers. Refit an
# unfitted copy with the same hyperparameters on X_train only, so that X_val
# is genuinely held out.
validation_pipeline = clone(final_pipeline).fit(X_train, y_train)
validation_predictions = validation_pipeline.predict(X_val)

# Stored under `val_*` names so they are not confused with (or overwritten by)
# the test-set metrics.
val_mse = mean_squared_error(y_val, validation_predictions)
val_mae = mean_absolute_error(y_val, validation_predictions)
val_rmse = root_mean_squared_error(y_val, validation_predictions)
val_r2 = r2_score(y_val, validation_predictions)
print(f'Validation MSE for the most Optimized Pipeline: {val_mse:.2f}')
print(f'Validation MAE for the most Optimized Pipeline: {val_mae:.2f}')
print(f'Validation RMSE for the most Optimized Pipeline: {val_rmse:.2f}')
print(f'Validation R2 for the most Optimized Pipeline: {val_r2:.2f}')

In [None]:
# Hold-out metrics: the final pipeline scored on the test split.
test_predictions = final_pipeline.predict(X_test)
test_mse, test_mae, test_rmse, test_r2 = (
    metric(y_test, test_predictions)
    for metric in (mean_squared_error, mean_absolute_error, root_mean_squared_error, r2_score)
)
print(
    f'Test MSE for the most Optimized Pipeline: {test_mse:.2f}',
    f'Test MAE for the most Optimized Pipeline: {test_mae:.2f}',
    f'Test RMSE for the most Optimized Pipeline: {test_rmse:.2f}',
    f'Test R2 for the most Optimized Pipeline: {test_r2:.2f}',
    sep='\n',
)