Objective: Predict the outcome of UFC fights using machine learning algorithms.

Dataset: Ultimate UFC Fight Predictor Dataset (March 2010 – December 2024)

Features: Fighter statistics, fight outcomes, event details

Model:

In [10]:
!pip install catboost
!pip install optuna



Collecting optuna
  Downloading optuna-4.5.0-py3-none-any.whl.metadata (17 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Downloading optuna-4.5.0-py3-none-any.whl (400 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m400.9/400.9 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, optuna
Successfully installed colorlog-6.9.0 optuna-4.5.0


In [None]:
# ===============================================
# UFC Fight Outcome Prediction - Ultimate Research-Level
# ===============================================
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import StackingClassifier
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostClassifier
import optuna
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

# ---------------------------
# 1️⃣ Load Dataset
# ---------------------------
df = pd.read_csv('ufc_fight_data.csv')
df.columns = df.columns.str.strip()

# ---------------------------
# 2️⃣ Keep only clear outcomes
# ---------------------------
df = df[df['winner'].isin(['blue','red'])]
df['winner'] = df['winner'].map({'blue':0,'red':1})

print("After mapping, class counts:\n", df['winner'].value_counts())

# ---------------------------
# 3️⃣ Drop rows with missing essential numeric values
# ---------------------------
numeric_cols = ['B_Weight','R_Weight','B_Height','R_Height','B_Age','R_Age','BStreak','RStreak']
df.dropna(subset=numeric_cols, inplace=True)

# ---------------------------
# 4️⃣ Feature Engineering
# ---------------------------
df['weight_diff'] = df['B_Weight'] - df['R_Weight']
df['height_diff'] = df['B_Height'] - df['R_Height']
df['age_diff'] = df['B_Age'] - df['R_Age']
df['streak_diff'] = df['BStreak'] - df['RStreak']

df.drop(columns=numeric_cols, inplace=True)

# ---------------------------
# 5️⃣ Per-round stats differences
# ---------------------------
rounds = [1,2,3,4,5]
stat_types = ['SigStrikes_Landed','SigStrikes_Attempted',
              'Takedowns_Landed','Takedowns_Attempted',
              'Submissions_Attempted','Grappling_Control_Time']

for r in rounds:
    for s in stat_types:
        b_col = f'B__Round{r}_{s}'
        r_col = f'R__Round{r}_{s}'
        if b_col in df.columns and r_col in df.columns:
            df[f'{s}_diff_R{r}'] = df[b_col] - df[r_col]

drop_round_cols = [col for col in df.columns if '__Round' in col]
df.drop(columns=drop_round_cols, inplace=True, errors='ignore')

# ---------------------------
# 6️⃣ Advanced Features
# ---------------------------
df['B_Strike_Acc'] = df.get('B_SigStrikes_Landed',0) / df.get('B_SigStrikes_Attempted',1)
df['R_Strike_Acc'] = df.get('R_SigStrikes_Landed',0) / df.get('R_SigStrikes_Attempted',1)
df['strike_acc_diff'] = df['B_Strike_Acc'] - df['R_Strike_Acc']

df['B_TD_Acc'] = df.get('B_Takedowns_Landed',0) / df.get('B_Takedowns_Attempted',1)
df['R_TD_Acc'] = df.get('R_Takedowns_Landed',0) / df.get('R_Takedowns_Attempted',1)
df['td_acc_diff'] = df['B_TD_Acc'] - df['R_TD_Acc']

df['recent_streak_diff'] = df.get('BStreak',0) - df.get('RStreak',0)

for col in ['B_SigStrikes_Landed','B_SigStrikes_Attempted','R_SigStrikes_Landed','R_SigStrikes_Attempted',
            'B_Takedowns_Landed','B_Takedowns_Attempted','R_Takedowns_Landed','R_Takedowns_Attempted']:
    if col in df.columns:
        df.drop(columns=col, inplace=True)

# ---------------------------
# 7️⃣ Drop noisy categorical columns
# ---------------------------
drop_cols = [col for col in df.columns if 'Name' in col]
df.drop(columns=drop_cols, inplace=True, errors='ignore')

categorical_cols = [col for col in df.columns if df[col].dtype=='object']
df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

# ---------------------------
# 8️⃣ Features and target
# ---------------------------
X = df.drop('winner', axis=1)
y = df['winner']

scaler = StandardScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

# ---------------------------
# 9️⃣ Train-Test Split
# ---------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42, stratify=y
)

# ---------------------------
# 🔹 10️⃣ Optuna Hyperparameter Tuning
# ---------------------------
def tune_xgb(trial):
    param = {
        'n_estimators': trial.suggest_int('n_estimators', 500, 2000),
        'max_depth': trial.suggest_int('max_depth', 3, 8),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'scale_pos_weight': y_train.value_counts()[0]/y_train.value_counts()[1],
        'random_state':42,
        'eval_metric':'logloss'
    }
    model = xgb.XGBClassifier(**param)
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    return accuracy_score(y_test, preds)

study = optuna.create_study(direction='maximize')
study.optimize(tune_xgb, n_trials=25)
best_params_xgb = study.best_trial.params
print("Best XGBoost params:", best_params_xgb)

# ---------------------------
# 11️⃣ Base Models with tuned hyperparameters
# ---------------------------
xgb_model = xgb.XGBClassifier(**best_params_xgb)
lgb_model = lgb.LGBMClassifier(
    n_estimators=2000,
    max_depth=6,
    learning_rate=0.03,
    subsample=0.8,
    colsample_bytree=0.8,
    class_weight='balanced',
    random_state=42
)
cat_model = CatBoostClassifier(
    iterations=2000,
    depth=6,
    learning_rate=0.03,
    verbose=0,
    class_weights=[y_train.value_counts()[0], y_train.value_counts()[1]],
    random_state=42
)

# ---------------------------
# 12️⃣ Feature Selection via XGBoost
# ---------------------------
selector = SelectFromModel(xgb_model)
selector.fit(X_train, y_train)
X_train_sel = selector.transform(X_train)
X_test_sel = selector.transform(X_test)
selected_features = X.columns[selector.get_support()]

# ---------------------------
# 13️⃣ Stacking Ensemble
# ---------------------------
estimators = [('xgb', xgb_model), ('lgb', lgb_model), ('cat', cat_model)]
stack_model = StackingClassifier(
    estimators=estimators,
    final_estimator=xgb.XGBClassifier(
        n_estimators=500,
        max_depth=4,
        learning_rate=0.05,
        random_state=42,
        eval_metric='logloss'
    ),
    cv=5,
    passthrough=True
)

stack_model.fit(X_train_sel, y_train)

# ---------------------------
# 14️⃣ Evaluation
# ---------------------------
y_pred = stack_model.predict(X_test_sel)
acc = accuracy_score(y_test, y_pred)
print(f"Ultimate Ensemble Accuracy: {acc*100:.2f}%\n")
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

# Feature importance from XGBoost final estimator
final_xgb = stack_model.final_estimator_
feat_importances = pd.Series(final_xgb.feature_importances_, index=selected_features)
top_features = feat_importances.sort_values(ascending=False).head(25)
plt.figure(figsize=(12,8))
sns.barplot(x=top_features.values, y=top_features.index)
plt.title("Top 25 Feature Importances - Ultimate Ensemble XGBoost")
plt.show()

# ---------------------------
# 15️⃣ Real-time Fight Prediction Function
# ---------------------------
def predict_fight_ultimate(fighter_blue: dict, fighter_red: dict):
    fight_data = {}
    fight_data['weight_diff'] = fighter_blue['B_Weight'] - fighter_red['R_Weight']
    fight_data['height_diff'] = fighter_blue['B_Height'] - fighter_red['R_Height']
    fight_data['age_diff'] = fighter_blue['B_Age'] - fighter_red['R_Age']
    fight_data['streak_diff'] = fighter_blue['BStreak'] - fighter_red['RStreak']
    fight_data['strike_acc_diff'] = fighter_blue.get('B_Strike_Acc',0) - fighter_red.get('R_Strike_Acc',0)
    fight_data['td_acc_diff'] = fighter_blue.get('B_TD_Acc',0) - fighter_red.get('R_TD_Acc',0)
    fight_data['recent_streak_diff'] = fighter_blue.get('BStreak',0) - fighter_red.get('RStreak',0)

    for r in rounds:
        for s in stat_types:
            b_key = f'B__Round{r}_{s}'
            r_key = f'R__Round{r}_{s}'
            diff_key = f'{s}_diff_R{r}'
            fight_data[diff_key] = fighter_blue.get(b_key,0) - fighter_red.get(r_key,0)

    fight_df = pd.DataFrame([fight_data])
    fight_df = pd.DataFrame(scaler.transform(fight_df.reindex(columns=X[selected_features].columns, fill_value=0)),
                            columns=selected_features)
    pred = stack_model.predict(fight_df)[0]
    return "Red Wins" if pred==1 else "Blue Wins"


After mapping, class counts:
 winner
1    867
0    584
Name: count, dtype: int64


[I 2025-10-02 14:55:18,639] A new study created in memory with name: no-name-2f79f8b4-55d0-448f-923f-a21a258f2dc2
[I 2025-10-02 14:56:00,171] Trial 0 finished with value: 0.5275862068965518 and parameters: {'n_estimators': 1548, 'max_depth': 7, 'learning_rate': 0.023325652869518473, 'subsample': 0.9290482985314895, 'colsample_bytree': 0.7191139301656968}. Best is trial 0 with value: 0.5275862068965518.
[I 2025-10-02 14:56:19,493] Trial 1 finished with value: 0.5379310344827586 and parameters: {'n_estimators': 1538, 'max_depth': 3, 'learning_rate': 0.01899087833383067, 'subsample': 0.8153990803230606, 'colsample_bytree': 0.8196187766728538}. Best is trial 1 with value: 0.5379310344827586.
[I 2025-10-02 14:56:33,729] Trial 2 finished with value: 0.5275862068965518 and parameters: {'n_estimators': 967, 'max_depth': 6, 'learning_rate': 0.010570411805161068, 'subsample': 0.9281837771888524, 'colsample_bytree': 0.7456058536107999}. Best is trial 1 with value: 0.5379310344827586.
[I 2025-10-0