In [7]:
import pandas as pd
import numpy as np
from xgboost import XGBClassifier, plot_importance
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold, cross_val_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder, MinMaxScaler, PowerTransformer, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, balanced_accuracy_score, f1_score
from sklearn.pipeline import Pipeline
from sklearn.utils.class_weight import compute_class_weight
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
df = pd.read_csv('../db/monster_fights.csv')
random_state = 37

X = df.drop('Outcome', axis=1)
y = df['Outcome']

# Check feature variances
# print(X.describe())

# Encode the target variable
label_encoder_y = LabelEncoder()
y = label_encoder_y.fit_transform(y)

# Identify column types automatically
float_cols = X.select_dtypes(include=['float'], exclude=['int']).columns.tolist()
cat_cols = X.select_dtypes(include=['object', 'category', 'int']).columns.tolist()

## Create preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('num', PowerTransformer(method='yeo-johnson'), float_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols)
    ],
    remainder='passthrough'  # Safety for any unprocessed columns
)

## Create complete pipeline with Random Forest
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=random_state))
])

## Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_state)

## Basic model training
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

print(f"Initial Accuracy: {accuracy_score(y_test, y_pred):.2f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

## Hyperparameter Tuning
param_grid = {
    'classifier__n_estimators': [10, 20, 40, 80, 100, 150, 200, 240, 300, 360, 400, 450], 
    'classifier__max_depth': [None, 5, 6, 7, 10, 13, 20, 30, 40],  
    'classifier__min_samples_split': [0.01, 0.05, 0.1],
    'classifier__min_samples_leaf': [1, 2, 4],
    'classifier__max_features': ['sqrt', 'log2', 0.3, 0.5, 0.8],  
    'classifier__bootstrap': [True],
    'classifier__class_weight': [None, 'balanced'],
    'classifier__criterion': ['gini', 'entropy'],
}

# Use RandomizedSearchCV for faster tuning with many features
from sklearn.model_selection import RandomizedSearchCV

search = RandomizedSearchCV(
    pipeline,
    param_grid,
    n_iter=300,
    cv=7,
    scoring='accuracy',
    n_jobs=-1,
    random_state=random_state
)

search.fit(X_train, y_train)

def evaluate_model(search, X_test, y_test):
    best_model = search.best_estimator_
    y_pred = best_model.predict(X_test)
    
    print(f"\nBest Params: {search.best_params_}")
    print(f"CV Accuracy: {search.best_score_:.2f}")
    print(f"Test Accuracy: {accuracy_score(y_test, y_pred):.2f}")
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))
    
    # Feature importance
    if hasattr(best_model.named_steps['classifier'], 'feature_importances_'):
        importances = best_model.named_steps['classifier'].feature_importances_
        features = best_model.named_steps['preprocessor'].get_feature_names_out()
        print("\nTop 10 Features:")
        for feat, imp in sorted(zip(features, importances), 
                            key=lambda x: x[1], reverse=True)[:10]:
            print(f"{feat}: {imp:.3f}")

evaluate_model(search, X_test, y_test)

# # Get feature names after preprocessing
# numeric_feature_names = numeric_features.tolist()
# categorical_transformer = search.best_estimator_.named_steps['preprocessor'].named_transformers_['cat']
# categorical_feature_names = categorical_transformer.get_feature_names_out(categorical_features).tolist()
# all_feature_names = numeric_feature_names + categorical_feature_names

# # Get feature importances
# importances = search.best_estimator_.named_steps['classifier'].feature_importances_
# feature_importance = pd.DataFrame({'Feature': all_feature_names, 'Importance': importances})
# feature_importance = feature_importance.sort_values('Importance', ascending=False)

# # Plot top 15 features
# import matplotlib.pyplot as plt
# import seaborn as sns

# plt.figure(figsize=(12, 8))
# sns.barplot(x='Importance', y='Feature', data=feature_importance.head(15))
# plt.title('Top 15 Feature Importances')
# plt.tight_layout()
# plt.show()

# # Evaluate on test set
# best_model = search.best_estimator_
# y_pred = best_model.predict(X_test)

# print("\nTest Set Evaluation:")
# print("Accuracy:", accuracy_score(y_test, y_pred))
# print("\nClassification Report:")
# print(classification_report(y_test, y_pred))

# # Confusion matrix
# from sklearn.metrics import confusion_matrix

# cm = confusion_matrix(y_test, y_pred)
# sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
# plt.xlabel('Predicted')
# plt.ylabel('Actual')
# plt.show()

In [8]:
# Load and prepare data
df = pd.read_csv('../db/monster_fights.csv')
random_state = 42

X = df.drop('Outcome', axis=1)
y = df['Outcome']

# Identify column types
float_cols = X.select_dtypes(include=['float']).columns.tolist()
bool_cols = [col for col in X.columns 
            if set(X[col].unique()).issubset({0, 1})]
object_cols = X.select_dtypes(include=['object']).columns.tolist()

# Convert boolean columns to int (0/1)
X[bool_cols] = X[bool_cols].astype(int)

# One-hot encode categorical (object) columns
if object_cols:
    ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
    object_encoded = ohe.fit_transform(X[object_cols])
    object_encoded_df = pd.DataFrame(object_encoded,
                                columns=ohe.get_feature_names_out(object_cols),
                                index=X.index)
    X = pd.concat([X.drop(object_cols, axis=1), object_encoded_df], axis=1)

# Convert target
if y.dtype == 'object':
    y = LabelEncoder().fit_transform(y)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=random_state, stratify=y
)

# --- Hyperparameter Tuning ---
# Define the parameter grid
param_grid = {
    'n_estimators': [50, 75, 100, 125, 150, 200],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7],
    'subsample': [0.8, 1.0]
}

# Initialize base model
base_model = XGBClassifier(
    n_estimators=500,
    objective='multi:softprob' if len(np.unique(y)) > 2 else 'binary:logistic',
    eval_metric='mlogloss' if len(np.unique(y)) > 2 else 'logloss',
    random_state=random_state,
    early_stopping_rounds=10,
    n_jobs=-1  # Use all CPU cores
)

# Setup GridSearchCV
grid_search = GridSearchCV(
    
    estimator=base_model,
    param_grid=param_grid,
    cv=5,  # 5-fold cross-validation
    scoring='accuracy',
    verbose=False  # Show detailed progress
)

# Run grid search (using eval_set for early stopping)
grid_search.fit(
    X_train, y_train,
    eval_set=[(X_test, y_test)],
    verbose=False  # Disables duplicate progress bars
)

# --- Results ---
print("\nBest parameters found:")
print(grid_search.best_params_)

best_model = grid_search.best_estimator_

# Evaluate on test set
y_pred = best_model.predict(X_test)
print(f"\nTest Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


# Convert to numpy arrays if they aren't already
X_array = X.values if hasattr(X, 'values') else X
y_array = y.values if hasattr(y, 'values') else y

# Initialize CV
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
f1_scores = []

# Manual cross-validation loop
for train_idx, val_idx in cv.split(X_array, y_array):
    X_train, X_val = X_array[train_idx], X_array[val_idx]
    y_train, y_val = y_array[train_idx], y_array[val_idx]
    
    # Train with early stopping
    model = XGBClassifier(
        **grid_search.best_params_,  # Your tuned parameters
        early_stopping_rounds=10
    )
    model.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],
        verbose=False
    )
    
    # Predict and score
    y_pred = model.predict(X_val)
    f1_scores.append(f1_score(y_val, y_pred, average='weighted'))

print(f"CV F1: {np.mean(f1_scores):.2f} ± {np.std(f1_scores):.2f}")

# Feature importance
# plt.figure(figsize=(10, 8))
# plot_importance(best_model, max_num_features=20)
# plt.show()


Best parameters found:
{'learning_rate': 0.2, 'max_depth': 3, 'n_estimators': 100, 'subsample': 1.0}

Test Accuracy: 0.7623

Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.61      0.72        28
           1       0.78      0.64      0.70        11
           2       0.73      0.89      0.80        54
           3       0.75      0.72      0.74        29

    accuracy                           0.76       122
   macro avg       0.79      0.71      0.74       122
weighted avg       0.78      0.76      0.76       122

CV F1: 0.71 ± 0.04
