In [None]:
from pathlib import Path

import joblib
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

from scripts.mechanics_reduction import map_mechanics_to_categories

# Imputing the 'Mechanics' column
## Loading the dataset

In [None]:
# Load the cleaned BGG dataset used for the imputing step (relative path).
DATASET_PATH = '../../../data/BGG_Cleaned_Data_Set_Imputing.xlsx'
df = pd.read_excel(DATASET_PATH)

In [None]:
# Columns removed from the dataframe before the 'Mechanics' text column is
# re-encoded into broader categories further down in this notebook.
# NOTE(review): these look like pre-existing one-hot columns, one per individual
# mechanic — confirm against the spreadsheet's schema.
to_drop_columns = [
    'Action Queue', 'Action Retrieval', 'Campaign / Battle Card Driven',
    'Card Play Conflict Resolution', 'Communication Limits', 'Cooperative Game',
    'Deck Construction', 'Deck Bag and Pool Building', 'Grid Movement', 'Hand Management',
    'Hexagon Grid', 'Legacy Game', 'Modular Board', 'Once-Per-Game Abilities',
    'Scenario / Mission / Campaign Game', 'Simultaneous Action Selection', 'Solo / Solitaire Game',
    'Storytelling', 'Variable Player Powers', 'Action Points', 'Point to Point Movement',
    'Set Collection', 'Trading', 'Income', 'Loans', 'Market', 'Network and Route Building',
    'Score-and-Reset Game', 'Tech Trees / Tech Tracks', 'Turn Order: Stat-Based', 'Variable Set-up',
    'Card Drafting', 'Drafting', 'End Game Bonuses', 'Take That', 'Tile Placement',
    'Turn Order: Progressive', 'Critical Hits and Failures', 'Line of Sight',
    'Area Majority / Influence', 'Area Movement', 'Area-Impulse', 'Delayed Purchase',
    'Dice Rolling', 'Team-Based Game', 'Action/Event', 'Advantage Token', 'Simulation',
    'Sudden Death Ending', 'Tug of War', 'Ownership', 'Rondel', 'Track Movement',
    'Hidden Movement', 'Movement Points', 'Events', 'Grid Coverage',
    'Worker Placement with Dice Workers', 'Increase Value of Unchosen Resources',
    'Turn Order: Pass Order', 'Victory Points as a Resource',
    'Automatic Resource Growth', 'Pattern Building', 'Push Your Luck', 'Worker Placement',
    'Role Playing', 'Hidden Roles', 'Player Elimination', 'Semi-Cooperative Game',
    'Traitor Game', 'Race', 'Action Drafting', 'Follow', 'Hidden Victory Points',
    'Variable Phase Order', 'Turn Order: Claim Action', 'Enclosure', 'Memory',
    'Map Addition', 'Pick-up and Deliver', 'Contracts', 'Narrative Choice / Paragraph',
    'Stock Holding', 'Auction/Bidding', 'Auction: Turn Order Until Pass', 'Catch the Leader',
    'Bias', 'Trick-taking', 'Die Icon Resolution', 'Resource to Move',
    'Roles with Asymmetric Information', 'Stat Check Resolution', 'Turn Order: Role Order',
    'Auction: Dutch', 'Secret Unit Deployment', 'Commodity Speculation', 'Investment',
    'Highest-Lowest Scoring', 'Flicking', 'Moving Multiple Units', 'Different Worker Types',
    'Voting', 'Lose a Turn', 'Alliances', 'Auction: Sealed Bid', 'Betting and Bluffing',
    'Force Commitment', 'Negotiation', 'Connections', 'Pieces as Map', 'Square Grid',
    'Mancala', 'Targeted Clues', 'Order Counters', 'Movement Template', 'Constrained Bidding',
    'Multiple Maps', 'Bingo', 'Line Drawing', 'Paper-and-Pencil', 'Selection Order Bid',
    'Turn Order: Auction', 'Re-rolling and Locking', 'Kill Steal', 'Passed Action Token',
    'Command Cards', 'Interrupts', 'Prisoner\'s Dilemma', 'Move Through Deck',
    'Roll / Spin and Move', 'Real-Time', 'Ladder Climbing', 'Predictive Bid',
    'Auction: Once Around', 'Closed Economy Auction', 'Auction: Fixed Placement',
    'Relative Movement', 'Cube Tower', 'Random Production', 'Elapsed Real Time Ending',
    'Auction: English', 'Time Track', 'Melding and Splaying', 'Chaining', 'Three Dimensional Movement',
    'Pattern Movement', 'Static Capture', 'Deduction', 'Finale Ending', 'Pattern Recognition',
    'Rock-Paper-Scissors', 'Map Reduction', 'Turn Order: Random', 'Map Deformation',
    'Ratio / Combat Results Table', 'Minimap Resolution', 'Layering', 'Acting', 'Singing',
    'Bribery', 'Programmed Movement', 'King of the Hill', 'Stacking and Balancing',
    'Different Dice Movement', 'Multiple-Lot Auction', 'Measurement Movement',
    'Slide/Push', 'Auction: Dutch Priority', 'I Cut You Choose', 'Single Loser Game',
    'Action Timer', 'Physical Removal', 'Induction', 'Player Judge', 'Speed Matching',
    'Chit-Pull System', 'Zone of Control', 'Crayon Rail System', 'Matching',
    'Auction: Dexterity', 'Impulse Movement', 'Hot Potato'
]
# Report how many rows have no value in the 'Domains' and 'Mechanics' columns.
total_domain_nans = df['Domains'].isna().sum()
print(f"Number of rows containing NaN values in the 'Domains' column: {total_domain_nans}")
total_mechanics_nans = df['Mechanics'].isna().sum()
print(f"Number of rows containing NaN values in the 'Mechanics' column: {total_mechanics_nans}")

# Rows without a 'Mechanics' value are removed: the targets built later in this
# notebook are derived from that column.
df = df.dropna(subset=['Mechanics'])
print(f"Number of rows after dropping NaN values in the 'Mechanics' column: {df.shape[0]}")

# No rows are filtered on 'Domains'; the column itself is removed, together with
# 'Unknown' and every column named in to_drop_columns. A single non-inplace drop
# keeps the cell free of in-place mutation.
df = df.drop(columns=['Domains', 'Unknown', *to_drop_columns])
print(f"Number of columns after dropping 'Domains', 'Unknown' and the listed mechanic columns: {df.shape[1]}")

#### Mapping the mechanics to categories

In [None]:
# Translate each game's raw mechanics entry into its category labels.
category_labels = df['Mechanics'].apply(map_mechanics_to_categories)
df['Mechanics_Categories'] = category_labels

# One indicator column per category; labels within an entry are separated by ', '.
mechanics_dummies = category_labels.str.get_dummies(sep=', ')

# Append the indicator columns to the right-hand side of the dataframe.
df = pd.concat((df, mechanics_dummies), axis=1)

#### Removing possible noise

In [None]:
# Number of rows in which each mechanic category occurs (column sums).
mechanics_frequencies = mechanics_dummies.sum(axis=0)
print(mechanics_frequencies)

# Minimum number of occurrences a category needs in order to be kept.
threshold = 100

# Categories occurring at least `threshold` times are kept; rarer ones are treated as noise.
is_frequent = mechanics_frequencies >= threshold
mechanics_to_keep = mechanics_frequencies.index[is_frequent.to_numpy()]
filtered_mechanics_dummies = mechanics_dummies.loc[:, mechanics_to_keep]

# Replace the full set of indicator columns in the dataframe with the filtered set.
df_without_dummies = df.drop(columns=mechanics_dummies.columns)
df = pd.concat([df_without_dummies, filtered_mechanics_dummies], axis=1)

In [None]:
# NOTE(review): presumably the one-hot game-domain indicator columns — confirm
# against the dataset's schema.
domain_flag_columns = [
    "Strategy Games", "Thematic Games", "Wargames", "Family Games",
    "Customizable Games", "Abstract Games", "Party Games", "Children's Games",
]

# Remaining per-game attributes used as predictors.
game_attribute_columns = [
    "Min Age", "Play Time", "BGG Rank", "Users Rated", "Owned Users",
    "Max Players", "Complexity Average",
]

# Model inputs, in the same column order as before (flags first, attributes second).
feature_columns = domain_flag_columns + game_attribute_columns

# Model outputs: one binary target per retained mechanic category.
targets = filtered_mechanics_dummies.columns.tolist()
targets

## Model training for imputing the 'Mechanics' column
#### Splitting the data

In [None]:
# Feature matrix and multi-column target matrix taken from the prepared dataframe.
X, y = df[feature_columns], df[targets]

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

#### Defining the model

In [None]:
# Base estimator with default hyperparameters; random_state is fixed so that
# repeated runs build the same forest.
rf_model = RandomForestClassifier(random_state=42)

#### Wrapping the model in a MultiOutputClassifier

In [None]:
# Wrap the forest so that it can be trained on several target columns at once
# (scikit-learn's MultiOutputClassifier fits one classifier per target).
multi_output_model = MultiOutputClassifier(rf_model)

#### Defining the pipeline

In [None]:
# Two stages: standardise the features, then fit the multi-output forest.
# The step names are significant: 'model' is the prefix the hyperparameter
# grid uses to address the wrapped estimator.
pipeline_steps = [
    ('scaler', StandardScaler()),
    ('model', multi_output_model),
]
pipeline = Pipeline(steps=pipeline_steps)

## Hyperparameter tuning
#### Defining the parameters

In [None]:
# Candidate values for each random-forest hyperparameter, keyed by the bare
# parameter name.
forest_search_space = {
    'n_estimators': [50, 100, 150],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': [None, 'sqrt', 'log2'],
}

# Address each parameter through the pipeline: the 'model' step, then the
# wrapper's 'estimator', then the forest parameter itself.
rf_parameters = {
    f'model__estimator__{param_name}': candidates
    for param_name, candidates in forest_search_space.items()
}

#### Randomized search

In [None]:
# Randomised hyperparameter search over the whole pipeline.
#
# The targets form a multi-label indicator matrix (one binary column per
# mechanic category). The plain 'f1' scorer uses average='binary', which is
# rejected for multi-label targets, so every candidate would score NaN and the
# search could not rank them. Micro-averaged F1 pools the true/false
# positives/negatives over all labels and is valid for multi-label input;
# 'f1_macro', 'f1_weighted' or 'f1_samples' are alternatives if a different
# weighting of the labels is preferred.
random_search = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=rf_parameters,
    n_iter=30,  # number of parameter combinations sampled from rf_parameters
    cv=5,       # 5-fold cross-validation per combination
    scoring='f1_micro',
    n_jobs=-1,  # use every available core
    verbose=2,
    random_state=42
)

#### Fitting the model

In [None]:
# Run the search on the training split only; the test split stays untouched
# until the evaluation cell. With n_iter=30 and cv=5 this performs 150 fits
# plus a final refit, so the cell can take a while.
print("Starting the training...")
random_search.fit(X_train, y_train)

## Model evaluation
#### Best model and its parameters

In [None]:
# Winning hyperparameter combination found by the search ...
best_params = random_search.best_params_
# ... and the pipeline that the search refitted with that combination.
best_model = random_search.best_estimator_

print(f"Best parameters: {best_params}")

#### Evaluating the model

In [None]:
# Predict every target column for the held-out rows.
y_pred = best_model.predict(X_test)

# For multi-label targets the report otherwise labels its rows 0..n-1 by column
# position; passing the target column names makes each row identify its
# mechanic category.
print(classification_report(y_test, y_pred, target_names=list(y_test.columns)))

## Saving the model

In [None]:
# Persist the tuned pipeline (scaler + multi-output forest) for later reuse.
# The file name is kept exactly as it was, spelling included, because other
# code may load the model under this name.
model_path = '../../../saved/mechanics_imputing/Multi_Ouput_Random_Forest.pkl'

# joblib.dump does not create missing directories; make sure the target folder
# exists so that a long search is not followed by a failed save.
Path(model_path).parent.mkdir(parents=True, exist_ok=True)

joblib.dump(best_model, model_path)