# Model Training
---
In this notebook, I train the model on the prepared dataset using scikit-learn.

In [1]:
from Utils import setup_database_connection, true, false
from Utils import load_all_players
from Utils import COLOUR_BANNED, COLOUR_NON_BANNED, COLOUR_BLUE
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from imblearn.combine import SMOTEENN

# Connect to the database and load the player table through the Utils helpers.
engine = setup_database_connection()
player_data = load_all_players(engine)

# Split the rows by ban status; `true` / `false` are the values imported from Utils.
banned_player_data = player_data.loc[player_data['has_ban'] == true]
non_banned_player_data = player_data.loc[player_data['has_ban'] == false]

Connecting to database...
Connection successful!
Loaded 216547 players


## Data Cleaning
---
Apply data cleaning to prepare the dataset for training:
1. Remove features with >50% zero values in banned player data
2. Remove players with >2 zero values across features

In [2]:
# Step 1: collect every int64/float64 column whose value is 0 for more than
# 50% of the banned players, then drop those columns from the full table.
# NOTE(review): this selection is driven by the ban label and is computed on
# the whole dataset, before the train/test split in a later cell - confirm
# that is acceptable for the evaluation.
features_to_exclude = [
    column
    for column in banned_player_data.select_dtypes(include=['int64', 'float64']).columns
    if ((banned_player_data[column] == 0).sum() / len(banned_player_data)) * 100 > 50
]

thresholded_player_data = player_data.drop(columns=features_to_exclude)

# Step 2: keep only players with at most ZERO_THRESHOLD zero-valued numeric columns.
ZERO_THRESHOLD = 2

numeric_features = (
    thresholded_player_data
    .select_dtypes(include=['int64', 'float64'])
    .columns
    .tolist()
)
zero_counts_per_player = thresholded_player_data[numeric_features].eq(0).sum(axis=1)

mask = zero_counts_per_player.le(ZERO_THRESHOLD)

filtered_player_data = thresholded_player_data.loc[mask].copy()

# Per-class row counts before and after the row filter.
ban_flags_before = thresholded_player_data['has_ban']
ban_flags_after = filtered_player_data['has_ban']

original_banned_count = (ban_flags_before == true).sum()
original_non_banned_count = (ban_flags_before == false).sum()
filtered_banned_count = (ban_flags_after == true).sum()
filtered_non_banned_count = (ban_flags_after == false).sum()

total_original = len(thresholded_player_data)
total_filtered = len(filtered_player_data)
total_removed = total_original - total_filtered
banned_removed = original_banned_count - filtered_banned_count
non_banned_removed = original_non_banned_count - filtered_non_banned_count

# Retention table: one line per category.
print(f"{'Category':<20} {'Original':<15} {'Filtered':<15} {'Removed':<15} {'% Retained':<15}")
print("-" * 80)
summary_rows = (
    ('Banned Players', original_banned_count, filtered_banned_count, banned_removed),
    ('Non-Banned Players', original_non_banned_count, filtered_non_banned_count, non_banned_removed),
    ('Total Players', total_original, total_filtered, total_removed),
)
for row_label, n_before, n_after, n_removed in summary_rows:
    print(f"{row_label:<20} {n_before:<15,} {n_after:<15,} {n_removed:<15,} {(n_after/n_before*100):.2f}%")

# Class proportions before and after filtering.
print("\nClass Balance:")
print("-" * 50)
print(f"Original - Banned: {(original_banned_count/total_original*100):.2f}% | Non-Banned: {(original_non_banned_count/total_original*100):.2f}%")
print(f"Filtered - Banned: {(filtered_banned_count/total_filtered*100):.2f}% | Non-Banned: {(filtered_non_banned_count/total_filtered*100):.2f}%")

print(f"\nData cleaning complete. Ready for training with {len(filtered_player_data):,} players and {filtered_player_data.shape[1]} features")

Category             Original        Filtered        Removed         % Retained     
--------------------------------------------------------------------------------
Banned Players       43,969          22,237          21,732          50.57%
Non-Banned Players   172,578         170,585         1,993           98.85%
Total Players        216,547         192,822         23,725          89.04%

Class Balance:
--------------------------------------------------
Original - Banned: 20.30% | Non-Banned: 79.70%
Filtered - Banned: 11.53% | Non-Banned: 88.47%

Data cleaning complete. Ready for training with 192,822 players and 29 features


# Training/Test Splitting
---

In [3]:
from sklearn.model_selection import train_test_split

# Columns left out of the feature matrix (the label `has_ban` among them).
columns_to_exclude = ['steam_id', 'created_at', 'name', 'total_matches', 'updated_at', 'has_ban']
X = filtered_player_data.drop(columns=columns_to_exclude)
# Encode the label: 1 = banned, 0 = not banned.
y = filtered_player_data['has_ban'].map({true: 1, false: 0})

# 80% train / 20% test, seeded, stratified on the label so both splits keep
# the same class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

print(f"Total dataset: {X.shape[0]:,} samples")
print(f"Training set: {X_train.shape[0]:,} samples")
print(f"Test set: {X_test.shape[0]:,} samples")

# Report the class mix of each split.
for split_name, split_labels in (("Training", y_train), ("Test", y_test)):
    n_samples = len(split_labels)
    n_banned = (split_labels == 1).sum()
    n_non_banned = (split_labels == 0).sum()
    print(f"\n{split_name} class distribution:")
    print(f"  Banned (1): {n_banned:,} ({n_banned/n_samples*100:.2f}%)")
    print(f"  Non-banned (0): {n_non_banned:,} ({n_non_banned/n_samples*100:.2f}%)")


Total dataset: 192,822 samples
Training set: 154,257 samples
Test set: 38,565 samples

Training class distribution:
  Banned (1): 17,790 (11.53%)
  Non-banned (0): 136,467 (88.47%)

Test class distribution:
  Banned (1): 4,447 (11.53%)
  Non-banned (0): 34,118 (88.47%)


# Handling Class Imbalance
---
I will use a combination of oversampling and undersampling to handle the class imbalance in the dataset: there are far fewer banned players than non-banned players. I will use the SMOTEENN method from imbalanced-learn (`imblearn`), which combines SMOTE oversampling with Edited Nearest Neighbours cleaning, to oversample the banned players and undersample the non-banned players.

In [4]:
# Resample the training split only; the test split is never passed to SMOTEENN.
smote_enn = SMOTEENN(random_state=42)
X_train_resampled, y_train_resampled = smote_enn.fit_resample(X_train, y_train)

# Shapes before/after resampling, plus the untouched test split.
for shape_label, features in (
    ("Original training set shape", X_train),
    ("Resampled training set shape", X_train_resampled),
    ("Test set shape (unchanged)", X_test),
):
    print(f"{shape_label}: {features.shape}")

# Class mix of each label vector, in the same order as the shapes above.
for heading, labels in (
    ("Original training class distribution:", y_train),
    ("Resampled training class distribution:", y_train_resampled),
    ("Test set class distribution (unchanged - represents real-world data):", y_test),
):
    n_samples = len(labels)
    n_banned = (labels == 1).sum()
    n_non_banned = (labels == 0).sum()
    print(f"\n{heading}")
    print(f"  Banned (1): {n_banned:,} ({n_banned/n_samples*100:.2f}%)")
    print(f"  Non-banned (0): {n_non_banned:,} ({n_non_banned/n_samples*100:.2f}%)")


Original training set shape: (154257, 23)
Resampled training set shape: (245643, 23)
Test set shape (unchanged): (38565, 23)

Original training class distribution:
  Banned (1): 17,790 (11.53%)
  Non-banned (0): 136,467 (88.47%)

Resampled training class distribution:
  Banned (1): 135,485 (55.16%)
  Non-banned (0): 110,158 (44.84%)

Test set class distribution (unchanged - represents real-world data):
  Banned (1): 4,447 (11.53%)
  Non-banned (0): 34,118 (88.47%)


# Data Resampling Results
---
The training set went from 17,790 banned players up to 135,485, and the non-banned players were undersampled from 136,467 to 110,158, giving me a much more balanced dataset to train on (approximately a 55/45 split).

# Performance Metrics
---
Before I start training the model and choosing an algorithm, I will decide which metrics to prioritise. Because the goal is to identify cheaters without incorrectly flagging legitimate players, I want to minimise the number of false positives, so Precision is my primary metric: I would rather miss some cheaters (false negatives) than incorrectly flag legitimate players (false positives). I will also monitor Recall as a secondary metric; ideally the model achieves a good balance between the two.

# Algorithm Selection
---
I will start with Random Forest; if I am not happy with its performance, I may try other algorithms such as XGBoost.

# Model Training
---

In [8]:
from sklearn.ensemble import RandomForestClassifier

# Hyper-parameters for the forest, collected in one place.
rf_params = {
    'n_estimators': 100,
    'max_depth': 100,
    'random_state': 42,
    'n_jobs': -1,
}
clf = RandomForestClassifier(**rf_params)

# Fit on the resampled training data. As the cell's last expression, the
# fitted estimator returned by `fit` is what the notebook displays.
clf.fit(X_train_resampled, y_train_resampled)

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,100
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


# Model Evaluation
---
Testing the trained model on the held-out test set to evaluate performance.

In [9]:
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, precision_score, recall_score, f1_score

# Evaluate the fitted classifier on the held-out test split.
# Predicted class labels, and the predicted probability taken from column 1
# of `predict_proba` (used for ROC-AUC).
y_pred = clf.predict(X_test)
y_pred_proba = clf.predict_proba(X_test)[:, 1]

# Print results
print("=" * 80)
print("RANDOM FOREST CLASSIFIER - TEST SET RESULTS")
print("=" * 80)

# `labels=[0, 1]` pins the label order explicitly so the two `target_names`
# always line up with the classes, instead of relying on which labels happen
# to appear in y_test / y_pred.
print("\nClassification Report:")
print(classification_report(y_test, y_pred, labels=[0, 1], target_names=['Non-Banned (0)', 'Banned (1)']))

# Same pinning here: guarantees a 2x2 matrix (rows = actual, columns =
# predicted, order [0, 1]) so the four-way unpack below cannot fail on shape.
print("\nConfusion Matrix:")
cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
print(f"{'':>15} {'Predicted Non-Banned':>25} {'Predicted Banned':>20}")
print(f"{'Actual Non-Banned':<15} {cm[0][0]:>25,} {cm[0][1]:>20,}")
print(f"{'Actual Banned':<15} {cm[1][0]:>25,} {cm[1][1]:>20,}")

# Calculate detailed metrics
# Row-major flattening of the 2x2 matrix yields (tn, fp, fn, tp).
tn, fp, fn, tp = cm.ravel()
fpr = fp / (fp + tn)  # share of non-banned players that were flagged
fnr = fn / (fn + tp)  # share of banned players that were missed

print("\n" + "=" * 80)
print("KEY METRICS (focus on Precision to minimize false positives)")
print("=" * 80)
print(f"  ROC-AUC Score:           {roc_auc_score(y_test, y_pred_proba):.4f}")
print(f"  Precision (Banned):      {precision_score(y_test, y_pred):.4f}  <- PRIMARY METRIC")
print(f"  Recall (Banned):         {recall_score(y_test, y_pred):.4f}")
print(f"  F1-Score (Banned):       {f1_score(y_test, y_pred):.4f}")

print("\n" + "=" * 80)
print("ERROR ANALYSIS")
print("=" * 80)
print(f"  False Positive Rate:     {fpr:.4f} ({fp:,} legit players incorrectly flagged)")
print(f"  False Negative Rate:     {fnr:.4f} ({fn:,} cheaters missed)")
print(f"  True Positives:          {tp:,} cheaters correctly identified")
print(f"  True Negatives:          {tn:,} legit players correctly identified")

print("\n" + "=" * 80)
print("INTERPRETATION")
print("=" * 80)
print(f"  Out of {len(y_test):,} test players:")
print(f"    - {tp:,} out of {tp+fn:,} actual cheaters were caught ({tp/(tp+fn)*100:.1f}%)")
print(f"    - {fp:,} out of {tn+fp:,} legit players were wrongly flagged ({fp/(tn+fp)*100:.2f}%)")


RANDOM FOREST CLASSIFIER - TEST SET RESULTS

Classification Report:
                precision    recall  f1-score   support

Non-Banned (0)       0.98      0.89      0.93     34118
    Banned (1)       0.50      0.88      0.64      4447

      accuracy                           0.88     38565
     macro avg       0.74      0.88      0.78     38565
  weighted avg       0.93      0.88      0.90     38565


Confusion Matrix:
                     Predicted Non-Banned     Predicted Banned
Actual Non-Banned                    30,205                3,913
Actual Banned                         555                3,892

KEY METRICS (focus on Precision to minimize false positives)
  ROC-AUC Score:           0.9467
  Precision (Banned):      0.4987  <- PRIMARY METRIC
  Recall (Banned):         0.8752
  F1-Score (Banned):       0.6353

ERROR ANALYSIS
  False Positive Rate:     0.1147 (3,913 legit players incorrectly flagged)
  False Negative Rate:     0.1248 (555 cheaters missed)
  True Positives: 