In [None]:
import pandas as pd
import joblib
import lightgbm as lgb
from sklearn.feature_selection import mutual_info_regression
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier, StackingClassifier, VotingClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report

from scripts.MLSmote import get_minority_instace, MLSMOTE
from scripts.custom_models import WeightedRandomForest
from scripts.weight_calculation import calculate_one_hot_class_weights

# Domains column Imputing
## Loading the dataset

In [None]:
# Load the cleaned BGG spreadsheet; the path is relative to the notebook's
# working directory.
dataset_path = '../../../data/BGG_Cleaned_Data_Set_Mechanics_Imputing_Custom.xlsx'
df = pd.read_excel(dataset_path)

In [None]:
# Preview the first rows to sanity-check the columns that were loaded.
df.head()

## Feature Selection
#### Calculating the mutual information for the domains column

In [None]:
# Columns left out of the mutual-information analysis: the eight one-hot domain
# columns (the same names used later as targets), the 'Mechanics' column and
# further mechanics-related columns (presumably cluster indicators - confirm).
excluded_columns = [
    'Strategy Games', 'Abstract Games', 'Thematic Games', 'Party Games',
    'Wargames', 'Customizable Games', 'Children\'s Games', 'Family Games',
    'Mechanics', 'Strategic Elements', 'Interaction and Conflict',
    'Action and Turn Management', 'Resource Management',
    'Narrative and Thematic', 'Movement and Positioning',
    'Specialized Mechanics', 'Auxiliary Mechanics', 'Other',
    'Game Progression and Mechanics',
]

# Drop the excluded columns and every row containing a missing value, then
# encode 'Domains' and 'Mechanics_Categories' as integer category codes.
df_mutual = (
    df.drop(columns=excluded_columns)
    .dropna()
    .assign(
        Domains=lambda frame: frame['Domains'].astype('category').cat.codes,
        Mechanics_Categories=lambda frame: frame['Mechanics_Categories'].astype('category').cat.codes,
    )
)

# Features are every remaining column except the encoded 'Domains' target.
features = df_mutual.drop(columns=['Domains'])
target = df_mutual['Domains'].to_numpy()

# NOTE(review): the target is a set of category codes, yet the regression
# variant of the estimator is used; mutual_info_classif is the variant meant
# for discrete targets. Confirm the choice before relying on the 0.2 cut-off
# applied to these scores later in the notebook.
mutual_info = mutual_info_regression(
    features, target, random_state=42, n_neighbors=5, discrete_features='auto'
)

# One row per feature, highest mutual information first.
mutual_info_df = pd.DataFrame(
    {'Feature': features.columns, 'Mutual Information': mutual_info}
).sort_values(by='Mutual Information', ascending=False)

# Optional bar chart, kept disabled (no `plt` import is visible in the import cell):
# plt.figure(figsize=(10, 6))
# plt.barh(mutual_info_df['Feature'], mutual_info_df['Mutual Information'])
# plt.xlabel('Mutual Information')
# plt.ylabel('Feature')
# plt.title('Mutual Information for the Domains Column')
# plt.gca().invert_yaxis()
# plt.show()

mutual_info_df

## Data Preprocessing
#### Choosing the best features

In [None]:
# Unique labels obtained by splitting 'Mechanics_Categories' on commas.
all_mechanics = df['Mechanics_Categories'].str.split(',').explode().unique()

# Keep the features whose mutual information with 'Domains' is above 0.2.
above_threshold = mutual_info_df['Mutual Information'] > 0.2
selected_features = mutual_info_df.loc[above_threshold, 'Feature'].values

# Mechanic labels that also exist as a column of the dataframe.
valid_mechanics = [label for label in all_mechanics if label in df.columns]

# Model inputs: the selected features as a plain list.
columns_to_select = list(selected_features)

# Model outputs: one binary column per domain.
targets = [
    'Strategy Games',
    'Abstract Games',
    'Thematic Games',
    'Party Games',
    'Wargames',
    'Customizable Games',
    'Children\'s Games',
    'Family Games',
]
columns_to_select

#### Removing the NaN values

In [None]:
# Keep only the rows that have a value for every selected feature and target.
required_columns = columns_to_select + targets
df = df.dropna(subset=required_columns)

#### Checking the spread of the domain categories

In [None]:
# Column-wise sums of the domain indicator columns (per-domain counts when the
# columns hold 0/1 values) and their grand total across all domains.
domains_frequencies = df[targets].sum()
total_domain_occurrences = domains_frequencies.sum()
print(domains_frequencies)

## Model training for imputing the 'Domains' column
#### Splitting the data

In [None]:
# Feature matrix: the selected columns without 'Mechanics_Categories'.
# errors='ignore' keeps this cell working when 'Mechanics_Categories' did not
# pass the mutual-information threshold and is therefore absent from
# columns_to_select (a plain drop would raise KeyError in that case).
X = df[columns_to_select].drop(columns=['Mechanics_Categories'], errors='ignore')

# Label matrix: one column per domain.
y = df[targets]

# Hold out 20% of the rows for evaluation; the seed is fixed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#### Using MLSMOTE to oversample the minority classes

In [None]:
# Extract the minority-label rows of the training split and let MLSMOTE build
# synthetic rows from them. The third MLSMOTE argument is presumably the number
# of rows to generate - confirm in scripts.MLSmote.
n_sample = 500
X_minority, y_minority = get_minority_instace(X_train, y_train)
X_resampled, y_resampled = MLSMOTE(X_minority, y_minority, n_sample)

# Append the generated rows to the training split only (the test split is left
# untouched). NOTE: re-running this cell appends another batch, because
# X_train / y_train are overwritten in place.
X_train = pd.concat([X_train, X_resampled])
y_train = pd.concat([y_train, y_resampled])

#### Calculating the class weights

In [None]:
# Class weights computed by the project helper from the one-hot domain labels.
# NOTE(review): the weights are derived from the full label frame `y`
# (train and test rows, before oversampling) rather than from `y_train` -
# confirm this is intended.
class_weights = calculate_one_hot_class_weights(y)
class_weights

### Random Forest
#### Defining the WeightedRandomForest classifier

In [None]:
# Base estimator: the project-defined WeightedRandomForest, configured with the
# class weights computed from the domain labels.
rf_model = WeightedRandomForest(class_weights=class_weights)

#### Wrapping the model in a MultiOutputClassifier

In [None]:
# MultiOutputClassifier fits one copy of the base estimator per target column,
# turning the single-output model into a multi-label classifier.
rf_multi_output_model = MultiOutputClassifier(rf_model)

#### Defining the hyperparameter search space for the Random Forest

In [None]:
# Candidate values sampled by the randomized search. Each key is addressed as
# <pipeline step>__estimator__<parameter>: the 'model' step is the multi-output
# wrapper and 'estimator' is the model it wraps.
rf_parameters = dict([
    ('model__estimator__n_estimators', [50, 100, 150]),
    ('model__estimator__max_depth', [None, 10, 20, 30]),
    ('model__estimator__min_samples_split', [2, 5, 10]),
    ('model__estimator__min_samples_leaf', [1, 2, 4]),
    ('model__estimator__max_features', [None, 'sqrt', 'log2']),
])

#### Defining the pipeline

In [None]:
# Two-step pipeline: standardise the features, then fit the multi-output
# random forest. The step names are referenced by the search-space keys.
rf_steps = [
    ('scaler', StandardScaler()),
    ('model', rf_multi_output_model),
]
rf_pipeline = Pipeline(steps=rf_steps)

#### Randomized search

In [None]:
# Randomized search over the random-forest pipeline: 20 sampled candidates,
# 3-fold cross-validation, all CPU cores, fixed seed.
#
# scoring='f1_macro' replaces the plain 'f1' scorer. 'f1' uses binary averaging,
# which scikit-learn rejects for multilabel-indicator targets such as the
# one-hot domain columns, so every candidate would fail to score. Macro
# averaging scores each domain separately and gives them equal weight.
#
# NOTE(review): the evaluation cell reshapes this pipeline's predictions from a
# 3-D array, which suggests WeightedRandomForest.predict adds an extra axis; if
# so, built-in scorers may still fail on it - confirm against
# scripts.custom_models.
rf_random_search = RandomizedSearchCV(
    estimator=rf_pipeline,
    param_distributions=rf_parameters,
    n_iter=20,
    cv=3,
    scoring='f1_macro',
    n_jobs=-1,
    verbose=2,
    random_state=42
)

#### Fitting the Random Forest model

In [None]:
# Run the randomized search on the (oversampled) training split, keep the
# refitted best pipeline and report the winning parameter combination.
print("Starting the training...")
rf_random_search.fit(X_train, y_train)
best_rf_model = rf_random_search.best_estimator_
print(f"Best parameters: {rf_random_search.best_params_}")

#### Evaluating the Random Forest model

In [None]:
# Predict the held-out rows with the tuned random-forest pipeline.
y_pred = best_rf_model.predict(X_test)

# Bring the predictions to shape (n_samples, n_targets). The pipeline may return
# them with an extra leading axis of length 1 (presumably caused by
# WeightedRandomForest.predict - confirm). Reshaping by row count gives the same
# result for that layout and also accepts a plain 2-D array, where indexing
# shape[2] would raise IndexError.
y_pred = y_pred.reshape(len(X_test), -1)

# target_names labels each report row with its domain instead of 0..7.
print(classification_report(y_test, y_pred, target_names=targets))

### LightGBM
#### Defining the LightGBM model

In [None]:
# Base estimator: a LightGBM classifier with a fixed seed for reproducibility.
lgb_model = lgb.LGBMClassifier(random_state=42)

#### Wrapping the model in a MultiOutputClassifier

In [None]:
# MultiOutputClassifier fits one copy of the LightGBM classifier per target
# column, turning it into a multi-label classifier.
lgbm_multi_output_model = MultiOutputClassifier(lgb_model)

#### Defining the hyperparameter search space for the LightGBM model

In [None]:
# Candidate values sampled by the randomized search. Each key is addressed as
# <pipeline step>__estimator__<parameter>: the 'model' step is the multi-output
# wrapper and 'estimator' is the LightGBM classifier it wraps.
lgbm_param_grid = {
    'model__estimator__n_estimators': [50, 100, 150],
    # LightGBM documents max_depth as an int where values <= 0 mean "no limit";
    # -1 is the documented way to request unlimited depth (None is not).
    'model__estimator__max_depth': [-1, 5, 10, 15],
    'model__estimator__min_child_samples': [1, 5, 10],
    'model__estimator__num_leaves': [31, 63, 127],
    'model__estimator__learning_rate': [0.1, 0.01, 0.001],
}

#### Defining the pipeline

In [None]:
# Two-step pipeline: standardise the features, then fit the multi-output
# LightGBM model. The step names are referenced by the search-space keys.
lgbm_steps = [
    ('scaler', StandardScaler()),
    ('model', lgbm_multi_output_model),
]
lgbm_pipeline = Pipeline(steps=lgbm_steps)

#### Randomized search

In [None]:
# Randomized search over the LightGBM pipeline: 20 sampled candidates, 3-fold
# cross-validation, all CPU cores, fixed seed.
#
# scoring='f1_macro' replaces the plain 'f1' scorer. 'f1' uses binary averaging,
# which scikit-learn rejects for multilabel-indicator targets such as the
# one-hot domain columns, so every candidate would fail to score. Macro
# averaging scores each domain separately and gives them equal weight.
lgbm_random_search = RandomizedSearchCV(
    estimator=lgbm_pipeline,
    param_distributions=lgbm_param_grid,
    n_iter=20,
    cv=3,
    scoring='f1_macro',
    n_jobs=-1,
    verbose=2,
    random_state=42
)

#### Fitting the LightGBM model

In [None]:
# Run the randomized search on the (oversampled) training split, keep the
# refitted best pipeline and report the winning parameter combination.
print("Starting the training...")
lgbm_random_search.fit(X_train, y_train)
best_lgbm_model = lgbm_random_search.best_estimator_
print(f"Best parameters: {lgbm_random_search.best_params_}")

## Model evaluation
#### Classification report for the LightGBM model

In [None]:
# Predict the held-out rows with the tuned LightGBM pipeline and print the
# per-domain precision / recall / F1. target_names labels each report row with
# its domain instead of the positional indices 0..7.
y_pred = best_lgbm_model.predict(X_test)
print(classification_report(y_test, y_pred, target_names=targets))

## Saving the models

In [None]:
# Persist both tuned pipelines (scaler + multi-output model) with joblib.
# The target directory must already exist; joblib.dump does not create it.
rf_model_path = '../../../saved/domain_imputing/Multi_Output_Random_Forest.pkl'
lgbm_model_path = '../../../saved/domain_imputing/Multi_Output_LightGBM.pkl'
joblib.dump(best_rf_model, rf_model_path)
joblib.dump(best_lgbm_model, lgbm_model_path)