In [7]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.svm import LinearSVC, SVC
from sklearn import preprocessing
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

In [2]:
df = pd.read_csv('../data/MOBA_df.csv')

feature_cols = [c for c in df.columns if c not in ['name', 'gender']]
X = df[feature_cols]
y = df['gender'] #in future - include the name with pred

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

#Using standard scalar instead
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

df['gender'].value_counts()

gender
0    425
1    215
Name: count, dtype: int64

In [6]:
#Random forests with Standard Scaling and cross validation and grid search cv- chat gpt assisted for grid search
rf_complex = RandomForestClassifier(
    n_estimators=500,          
    max_depth=20,              
    min_samples_split=2,
    min_samples_leaf=1,
    max_features="sqrt",
    class_weight="balanced",   
    random_state=42,
    n_jobs=-1
)
cv = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=42
)
param_grid = {
    "n_estimators": [300, 500],
    "max_depth": [20, None],
    "min_samples_split": [2, 5],
    "min_samples_leaf": [1, 2],
    "max_features": ["sqrt", "log2"],
    "class_weight": ["balanced", None],
    "bootstrap": [True]
}
grid = GridSearchCV(
    estimator=rf_complex,
    param_grid=param_grid,
    scoring="f1_macro",   
    cv=cv,
    n_jobs=-1,
    verbose=0
)
pd.set_option('display.max_rows', None)
grid.fit(X, y)
print("Best CV macro-F1:", grid.best_score_)
print("Best params:", grid.best_params_)
best_idx = grid.best_index_
mean = grid.cv_results_["mean_test_score"][best_idx]
std  = grid.cv_results_["std_test_score"][best_idx]
print(f"Best CV macro-F1: {mean:.3f} ± {std:.3f}")
best_rf = grid.best_estimator_
best_rf.fit(X_train, y_train)
y_pred_rf = best_rf.predict(X_test)
print()
print("-----------------------------------------------")
print("Random Forest (Best via CV) – Quantitative analysis")
print("-----------------------------------------------")
print(classification_report(y_test, y_pred_rf))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_rf))
print()
print("-----------------------------------------------")
print("Random Forest (Best via CV) – Test Set Results")
print("-----------------------------------------------")
rf_results = X_test.copy()
rf_results["actual_gender"] = y_test.values
rf_results["predicted_gender"] = y_pred_rf
rf_results["name"] = df.loc[X_test.index, "name"]
rf_results = rf_results[["name", "actual_gender", "predicted_gender"]]
print(rf_results)

Best CV macro-F1: 0.6147447401791896
Best params: {'bootstrap': True, 'class_weight': 'balanced', 'max_depth': None, 'max_features': 'log2', 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 300}
Best CV macro-F1: 0.615 ± 0.025

-----------------------------------------------
Random Forest (Best via CV) – Quantitative analysis
-----------------------------------------------
              precision    recall  f1-score   support

           0       0.69      0.86      0.77        84
           1       0.50      0.27      0.35        44

    accuracy                           0.66       128
   macro avg       0.60      0.56      0.56       128
weighted avg       0.63      0.66      0.62       128

Confusion Matrix:
[[72 12]
 [32 12]]

-----------------------------------------------
Random Forest (Best via CV) – Test Set Results
-----------------------------------------------
                          name  actual_gender  predicted_gender
570                   Hun Batz        

In [None]:
#AdaBoosting model with Standard scalar
