In [None]:
import math
from pathlib import Path

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_selection import mutual_info_regression
from sklearn.metrics import classification_report
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

from scripts.custom_models import WeightedRandomForest
from scripts.weight_calculation import calculate_one_hot_class_weights
from scripts.MLSmote import get_minority_instace, MLSMOTE

# Mechanics Weighted Classifier
## Loading the dataset

In [None]:
# Relative path of the Excel workbook that holds the dataset.
dataset_path = '../../../data/BGG_Cleaned_Data_Set_Mechanics_Imputing_Custom_Imputed_Domains.xlsx'
# Load the workbook into the DataFrame used by the following cells.
df = pd.read_excel(dataset_path)

In [None]:
# Display the first rows as a quick sanity check of the loaded data.
df.head()

## Feature Selection
#### Calculating the mutual information for the mechanics column

In [None]:
# Columns left out of the mutual-information analysis (the raw 'Mechanics'
# column plus the per-domain / per-category indicator columns listed here).
columns_to_exclude = [
    'Strategy Games', 'Abstract Games', 'Thematic Games', 'Party Games',
    'Wargames', 'Customizable Games', 'Children\'s Games', 'Family Games',
    'Mechanics', 'Other', 'Action and Turn Management', 'Resource Management',
    'Interaction and Conflict', 'Game Progression and Mechanics',
    'Auxiliary Mechanics', 'Narrative and Thematic', 'Movement and Positioning',
    'Specialized Mechanics', 'Strategic Elements',
]

# Work on a copy without the excluded columns and without incomplete rows.
df_mutual = df.drop(columns=columns_to_exclude).dropna()

# Turn the two text columns into integer category codes so they can be scored.
df_mutual['Domains'] = df_mutual['Domains'].astype('category').cat.codes
df_mutual['Mechanics_Categories'] = df_mutual['Mechanics_Categories'].astype('category').cat.codes

# Define target and features
target = df_mutual['Mechanics_Categories'].values
features = df_mutual.drop(columns=['Mechanics_Categories'])

# The target is a nominal label: the integer codes are arbitrary identifiers,
# not quantities. The discrete-target estimator is therefore used;
# mutual_info_regression would treat the codes as a continuous variable.
mutual_info = mutual_info_classif(features, target, random_state=42, n_neighbors=5, discrete_features='auto')

# One row per feature, highest mutual information first.
mutual_info_df = pd.DataFrame({
    'Feature': features.columns,
    'Mutual Information': mutual_info
})

mutual_info_df = mutual_info_df.sort_values(by='Mutual Information', ascending=False)

# Optional bar chart of the scores (disabled; the table below is the output).
# plt.figure(figsize=(10, 6))
# plt.barh(mutual_info_df['Feature'], mutual_info_df['Mutual Information'])
# plt.xlabel('Mutual Information')
# plt.ylabel('Feature')
# plt.title('Mutual Information for the Mechanics Column')
# plt.gca().invert_yaxis()
# plt.show()

mutual_info_df

## Data Preprocessing
#### Choosing the best features

In [None]:
# Features are kept only when their mutual information exceeds this value.
MUTUAL_INFO_THRESHOLD = 0.15

# Every individual label that appears in the comma-separated 'Domains' column
all_domains = df['Domains'].str.split(', ').explode().unique()

# Selecting the features that have a mutual information higher than the threshold
selected_features = mutual_info_df.loc[
    mutual_info_df['Mutual Information'] > MUTUAL_INFO_THRESHOLD, 'Feature'
].values

# Selecting the valid domains (labels that also exist as dataframe columns)
valid_domains = [domain for domain in all_domains if domain in df.columns]

# Selecting the columns to use.
# 'Domains' is excluded because `df` still stores it as comma-separated text
# (it is only integer-encoded in `df_mutual`). Filtering is used instead of
# list.remove() so that no ValueError is raised when 'Domains' did not pass
# the threshold in the first place.
columns_to_select = [
    column
    for column in list(valid_domains) + list(selected_features)
    if column != 'Domains'
]

In [None]:
# Show the final list of input columns chosen in the previous cell.
print(columns_to_select)

#### Removing possible noise related to the domains

In [None]:
# Calculating the frequency of each mechanics category
targets = list(df['Mechanics_Categories'].str.split(', ').explode().unique())
targets = [item for item in targets if not (isinstance(item, float) and math.isnan(item))]

mechanics_frequencies = df[targets].sum()
total_mechanics_occurrences = df[targets].sum().sum()
mechanics_frequencies

#### Removing some of the rows that are overrepresented to balance the dataset

In [None]:
# Maximum number of over-represented rows to discard.
num_rows_to_remove = 7000

# Candidate rows: those flagged with the 'Game Progression and Mechanics' category.
overrepresented_rows = df[df['Game Progression and Mechanics'] == 1]

# DataFrame.sample raises ValueError when asked for more rows than exist
# (sampling without replacement), so never request more than are available.
sample_size = min(num_rows_to_remove, len(overrepresented_rows))
rows_to_remove = overrepresented_rows.sample(sample_size, random_state=42).index

# NOTE: this rebinds `df`; re-running the cell without reloading the data
# removes a further batch of rows.
df = df.drop(rows_to_remove)

#### Encoding the mechanics column

In [None]:
# Discard every row that still contains a missing value.
df = df.dropna()

# Inputs: the selected feature columns. Outputs: one column per entry of `targets`.
X = df.loc[:, columns_to_select]
y = df.loc[:, targets]

## Model Training
#### Splitting the data


In [None]:
# Hold out 20% of the rows for the final evaluation; the fixed seed keeps the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

#### Using SMOTE to balance the classes

In [None]:
# Subset of the training data returned by the project helper as minority instances.
X_sub, y_sub = get_minority_instace(X_train, y_train)

# Generate synthetic samples from that subset.
# NOTE(review): 500 is presumably the number of samples to generate - confirm
# against scripts.MLSmote.
X_resampled, y_resampled = MLSMOTE(X_sub, y_sub, 500)

# Append the synthetic rows to the training set (row-wise concatenation).
# NOTE: this rebinds X_train / y_train, so re-running the cell appends again.
X_train = pd.concat([X_train, X_resampled])
y_train = pd.concat([y_train, y_resampled])

#### Calculating the class weights

In [None]:
# Compute the class weights from the training targets with the project helper
# (the structure of the result is defined in scripts.weight_calculation).
class_weights = calculate_one_hot_class_weights(y_train)
# Display the weights for inspection.
class_weights

#### Defining the model

In [None]:
# Base estimator: the project's WeightedRandomForest, configured with the
# class weights held in `class_weights`.
rf_model = WeightedRandomForest(class_weights=class_weights)

#### Wrapping the model in a MultiOutputClassifier

In [None]:
# Fit one copy of the base estimator per target column.
multi_output_model = MultiOutputClassifier(estimator=rf_model)

#### Defining the pipeline

In [None]:
# Two-stage pipeline: standardise the inputs, then fit the multi-output model.
pipeline_steps = [
    ('scaler', StandardScaler()),
    ('model', multi_output_model),
]
pipeline = Pipeline(steps=pipeline_steps)

## Hyperparameter Tuning
#### Defining the hyperparameters

In [None]:
# Search space for the randomized search. The 'model__estimator__' prefix
# routes each value through the pipeline step named 'model' to the estimator
# wrapped by MultiOutputClassifier.
rf_parameters = dict(
    model__estimator__n_estimators=[50, 100, 150],
    model__estimator__max_depth=[None, 10, 20, 30],
    model__estimator__min_samples_split=[2, 5, 10],
    model__estimator__min_samples_leaf=[1, 2, 4],
    model__estimator__max_features=[None, 'sqrt', 'log2'],
)

#### Randomized search

In [None]:
# Randomized hyperparameter search over the pipeline.
random_search = RandomizedSearchCV(
    pipeline,
    rf_parameters,
    # 50 sampled candidates, each scored with 5-fold cross-validation
    n_iter=50,
    cv=5,
    # micro-averaged F1 is the selection metric
    scoring='f1_micro',
    # fixed seed so the sampled candidates are reproducible
    random_state=42,
    # use every available core and log progress
    n_jobs=-1,
    verbose=2,
)

#### Fitting the model

In [None]:
# Progress message: the search below fits 50 candidates x 5 folds.
print("Starting the training...")
# Run the randomized search on the training data.
random_search.fit(X_train, y_train)

## Model Evaluation
#### Best model and its parameters

In [None]:
# Winning hyperparameter combination and the pipeline fitted with it.
best_params = random_search.best_params_
best_model = random_search.best_estimator_
print(f"Best parameters: {best_params}")

#### Evaluating the model

In [None]:
# Predict the held-out rows.
y_pred = np.asarray(best_model.predict(X_test))

# The original code always collapsed a 3-D result to
# (shape[1], shape[2]). Do that only when the result really is 3-D, so a
# regular 2-D (n_samples, n_outputs) prediction no longer raises IndexError.
# NOTE(review): presumably the custom estimator yields a leading singleton
# axis - confirm against scripts.custom_models.
if y_pred.ndim == 3:
    y_pred = y_pred.reshape(y_pred.shape[1], y_pred.shape[2])

# Per-label precision / recall / F1 on the test split.
print(classification_report(y_test, y_pred))

In [None]:
# Destination of the serialised best pipeline.
model_path = '../../../saved/mechanics_imputing/multi_output_classifier_weighted_custom.pkl'

# Create the output folder first: a missing directory would otherwise raise
# FileNotFoundError only after the whole search has finished.
Path(model_path).parent.mkdir(parents=True, exist_ok=True)

joblib.dump(best_model, model_path)