In [2]:
import pandas as pd
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, accuracy_score

In [3]:
# Load the fighter statistics table that the classifier is trained on.
FIGHTER_STATS_CSV = 'data/fighter_stats_2.csv'
fighters_stats = pd.read_csv(FIGHTER_STATS_CSV)

In [4]:
# Preview the loaded table: as the cell's last expression it is rendered as the cell output.
fighters_stats

Unnamed: 0,knockdowns,significant_strikes_landed,significant_strikes_attempted,significant_strikes_rate,total_strikes_landed,total_strikes_attempted,takedown_successful,takedown_attempted,takedown_rate,submission_attempt,...,body_attempted_1,leg_landed_1,leg_attempted_1,distance_landed_1,distance_attempted_1,clinch_landed_1,clinch_attempted_1,ground_landed_1,ground_attempted_1,winner
0,0.15,25.05,51.05,0.507,31.75,60.1,0.1,0.45,0.0665,0.15,...,8.644737,4.618421,5.921053,17.144737,39.460526,0.828947,1.026316,0.789474,1.210526,1
1,0.15,25.05,51.05,0.507,31.75,60.1,0.1,0.45,0.0665,0.15,...,6.500000,1.750000,2.750000,7.500000,18.500000,4.000000,5.750000,1.750000,2.000000,0
2,0.15,25.05,51.05,0.507,31.75,60.1,0.1,0.45,0.0665,0.15,...,6.500000,2.391892,2.918919,16.581081,34.418919,1.689189,2.297297,1.148649,1.702703,0
3,0.15,25.05,51.05,0.507,31.75,60.1,0.1,0.45,0.0665,0.15,...,4.187500,1.343750,2.093750,13.812500,34.875000,0.562500,1.343750,0.968750,1.562500,1
4,0.15,25.05,51.05,0.507,31.75,60.1,0.1,0.45,0.0665,0.15,...,3.795918,2.367347,2.979592,15.183673,42.653061,0.877551,1.591837,1.346939,2.408163,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14349,0.00,1.00,7.00,0.140,1.00,7.0,0.0,0.00,0.0000,0.00,...,0.750000,0.250000,0.500000,1.500000,3.500000,0.000000,0.000000,0.250000,0.250000,1
14350,1.00,7.50,15.00,0.275,19.00,28.0,0.0,0.00,0.0000,0.00,...,0.333333,1.000000,1.000000,2.000000,3.666667,0.000000,0.000000,2.666667,3.666667,1
14351,1.00,7.50,15.00,0.275,19.00,28.0,0.0,0.00,0.0000,0.00,...,3.500000,1.000000,1.500000,0.500000,4.500000,5.000000,9.500000,1.000000,1.000000,0
14352,0.00,0.00,0.00,0.000,0.00,0.0,0.0,0.00,0.0000,0.00,...,1.000000,0.785714,2.000000,0.714286,2.857143,1.000000,1.642857,0.928571,1.214286,1


In [5]:
# Features: every column except the label.
# 'Unnamed: 0' is the row counter (0, 1, 2, ...) that was saved with the CSV,
# not a fighter statistic, so it must not be used as a feature. clean_columns()
# drops the same column on the prediction side; keeping it here would leave the
# training and prediction feature vectors misaligned.
# errors='ignore' keeps this working if the CSV is re-exported without it.
X = (
    fighters_stats
    .drop(columns='winner')
    .drop(columns='Unnamed: 0', errors='ignore')
)
# Label as a 1-D Series (not a one-column DataFrame), the shape estimators expect.
y = fighters_stats['winner']

# Hold out a test split; random_state fixes the shuffle for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [6]:
# Candidate values explored for each hyperparameter during tuning
# (3 values x 4 hyperparameters = 81 combinations).
param_grid = dict(
    learning_rate=[0.05, 0.1, 0.2],
    max_depth=[3, 6, 9],
    min_child_weight=[1, 10, 100],
    n_estimators=[100, 400, 800],
)

In [8]:
# Base estimator whose hyperparameters are searched
xgb = XGBClassifier(objective='binary:logistic', random_state=42)

# Exhaustive search over param_grid with 5-fold cross-validation;
# verbose=2 logs one line per fit
grid_search = GridSearchCV(estimator=xgb, param_grid=param_grid, cv=5, verbose=2)
grid_search.fit(X_train, y_train)

# Keep the best combination found, then report it
xgb_best_param = grid_search.best_params_
print("Best hyperparameters : ", xgb_best_param)

Fitting 5 folds for each of 81 candidates, totalling 405 fits
[CV] END learning_rate=0.05, max_depth=3, min_child_weight=1, n_estimators=100; total time=   0.0s
[CV] END learning_rate=0.05, max_depth=3, min_child_weight=1, n_estimators=100; total time=   0.0s
[CV] END learning_rate=0.05, max_depth=3, min_child_weight=1, n_estimators=100; total time=   0.0s
[CV] END learning_rate=0.05, max_depth=3, min_child_weight=1, n_estimators=100; total time=   0.0s
[CV] END learning_rate=0.05, max_depth=3, min_child_weight=1, n_estimators=100; total time=   0.0s
[CV] END learning_rate=0.05, max_depth=3, min_child_weight=1, n_estimators=400; total time=   0.3s
[CV] END learning_rate=0.05, max_depth=3, min_child_weight=1, n_estimators=400; total time=   0.3s
[CV] END learning_rate=0.05, max_depth=3, min_child_weight=1, n_estimators=400; total time=   0.3s
[CV] END learning_rate=0.05, max_depth=3, min_child_weight=1, n_estimators=400; total time=   0.3s
[CV] END learning_rate=0.05, max_depth=3, min_c

In [9]:
# Hyperparameters used to build the final model.
# NOTE(review): presumably copied from the grid search result; confirm
# against grid_search.best_params_ / xgb_best_param.
parameters = dict(
    learning_rate=0.2,
    max_depth=3,
    min_child_weight=10,
    n_estimators=100,
)


In [10]:
# Instantiate the final XGBoost model with the selected hyperparameters.
# random_state=42 mirrors the estimator used in the grid search, so the final
# model is configured the same way as the candidates that were evaluated.
xgb_optimized = XGBClassifier(
    **parameters,
    objective='binary:logistic',
    random_state=42,
)

# Train on the training split
xgb_optimized.fit(X_train, y_train)

# Predict class labels for the held-out test split
y_pred = xgb_optimized.predict(X_test)

In [11]:
# Evaluate the test-set predictions.
# RMSE is taken as the square root of the MSE explicitly: the `squared=False`
# argument of mean_squared_error was deprecated in scikit-learn 1.4 and removed
# in 1.6, whereas this form works on every version.
rmse = mean_squared_error(y_test, y_pred) ** 0.5
print(f"RMSE of the base model: {rmse:.3f}")

# Share of test fights whose predicted label equals the true label
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

RMSE of the base model: 0.565
Accuracy: 68.10%


# Test model

In [59]:
# Load the fight statistics table that the helper functions below query by fighter name.
UFC_STATS_CSV = 'data/ufc_stats_2.csv'
ufc_stats = pd.read_csv(UFC_STATS_CSV)

def get_fights(fighter_name):
    """Return the rows of the global `ufc_stats` table for one fighter.

    Args:
        fighter_name: Value to match exactly against the 'fighter' column.

    Returns:
        DataFrame with the matching rows (empty when no row matches).
    """
    is_fighter = ufc_stats['fighter'] == fighter_name
    return ufc_stats[is_fighter]

def clean_columns(data):
    """Drop descriptive columns and parse 'fight_date' as datetimes.

    Args:
        data: DataFrame containing at least the columns dropped below and a
            'fight_date' column.

    Returns:
        A new DataFrame without the descriptive columns and with
        'fight_date' converted by `pd.to_datetime`; `data` is not modified.
    """
    # Descriptive / bookkeeping columns that are removed
    unwanted = [
        'fighter', 'Unnamed: 0', 'event', 'location', 'attendance',
        'time', 'scheduled_rounds', 'weight_class',
    ]
    return (
        data
        .drop(columns=unwanted)
        # 'fight_date' is kept (it is used later for sorting) but converted to datetime
        .assign(fight_date=lambda frame: pd.to_datetime(frame['fight_date']))
    )

def get_last_fights(data, nb_fights):
    """Return the rows belonging to the most recent fights in `data`.

    Rows are sorted by 'fight_date' (newest first) and grouped by fight 'id'
    in order of first appearance; the rows of the first `nb_fights` groups
    are concatenated.

    Args:
        data: DataFrame with 'fight_date' and 'id' columns.
        nb_fights: Number of fights (distinct ids) to keep.

    Returns:
        DataFrame with the selected rows and a fresh 0..n-1 index.

    Raises:
        ValueError: raised by `pd.concat` when no fight is selected
            (for example when `data` is empty).
    """
    newest_first = data.sort_values(by=['fight_date'], ascending=False)

    # sort=False keeps the groups in newest-first order instead of sorting by id
    rows_per_fight = []
    for _, fight_rows in newest_first.groupby('id', sort=False):
        rows_per_fight.append(fight_rows)

    selected = rows_per_fight[:nb_fights]
    return pd.concat(selected).reset_index(drop=True)

def get_fighter_data(fighter_name, nb_fights=5):
    """Build a one-row table of a fighter's averaged recent statistics.

    The fighter's rows are fetched with `get_fights`, cleaned with
    `clean_columns`, restricted to the most recent fights with
    `get_last_fights`, stripped of bookkeeping columns, and averaged
    column by column over all remaining rows.

    Args:
        fighter_name: Name matched against the 'fighter' column.
        nb_fights: Number of most recent fights to average over (default 5).

    Returns:
        DataFrame with a single row holding the mean of each remaining column.

    Raises:
        ValueError: if no row exists for `fighter_name`.
    """
    all_fights = get_fights(fighter_name)
    # Fail early with an explicit message; otherwise an unknown name only
    # surfaces later as pd.concat's "No objects to concatenate" ValueError.
    if all_fights.empty:
        raise ValueError(f"No fights found for fighter {fighter_name!r}")

    all_fights = clean_columns(all_fights)
    last_fights = get_last_fights(all_fights, nb_fights=nb_fights)
    # Remove identifiers, outcome columns and the date so they are not averaged
    last_fights = last_fights.drop(columns=['round', 'last_round', 'id', 'result', 'winner', 'fight_date'])
    # mean() yields a Series; turn it back into a one-row DataFrame
    last_fights = last_fights.mean().to_frame().transpose()
    return last_fights

In [90]:
def predict_fight(fighter_1, fighter_2):
    """Print the model's class probabilities for a matchup.

    The averaged statistics of both fighters (from `get_fighter_data`) are
    concatenated, `fighter_1` first, into a single feature row that is
    passed to the global `xgb_optimized` model.

    Args:
        fighter_1: Name of the first fighter.
        fighter_2: Name of the second fighter.

    Prints:
        `fighter_1` with the first probability column and `fighter_2` with
        the second one.
        NOTE(review): this assumes class 0 of `winner` means the first
        fighter wins and class 1 the second; confirm against how the
        training labels were built.
    """
    feature_row = []
    for name in (fighter_1, fighter_2):
        feature_row.extend(get_fighter_data(name).values[0].tolist())

    probabilities = xgb_optimized.predict_proba([feature_row])[0]
    print(fighter_1, probabilities[0])
    print(fighter_2, probabilities[1])

# Example matchup: prints one probability per fighter
predict_fight(fighter_1='Conor McGregor', fighter_2='Benoit Saint Denis')

Conor McGregor 0.35497195
Benoit Saint Denis 0.64502805
