In [1]:
import pandas as pd
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, accuracy_score

In [2]:
# Load the per-fight feature table used to train the classifier
fighters_stats = pd.read_csv(filepath_or_buffer='data/fighter_stats_3.csv')

In [3]:
# Display the loaded table (bare last expression -> rich notebook rendering)
fighters_stats

Unnamed: 0,nb_fights,L,W,D,NC,knockdowns,significant_strikes_landed,significant_strikes_attempted,significant_strikes_rate,total_strikes_landed,...,body_attempted_1,leg_landed_1,leg_attempted_1,distance_landed_1,distance_attempted_1,clinch_landed_1,clinch_attempted_1,ground_landed_1,ground_attempted_1,winner
0,8.0,2.0,6.0,0.0,0.0,0.0,26.0,49.5,0.510,30.0,...,7.0,3.0,5.0,16.0,36.0,0.0,0.0,0.0,0.0,1
1,8.0,2.0,6.0,0.0,0.0,0.0,26.0,49.5,0.510,30.0,...,5.0,1.0,1.5,6.5,15.0,3.0,5.0,1.0,1.0,0
2,8.0,2.0,6.0,0.0,0.0,0.0,26.0,49.5,0.510,30.0,...,6.0,2.0,3.0,15.0,34.0,1.0,2.0,0.0,0.0,0
3,8.0,2.0,6.0,0.0,0.0,0.0,26.0,49.5,0.510,30.0,...,3.0,0.0,0.5,11.5,33.5,0.0,0.5,0.0,0.0,1
4,8.0,2.0,6.0,0.0,0.0,0.0,26.0,49.5,0.510,30.0,...,4.0,1.0,2.0,15.0,46.0,0.0,1.0,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14349,1.0,1.0,0.0,0.0,0.0,0.0,1.0,7.0,0.140,1.0,...,0.5,0.0,0.5,1.5,2.5,0.0,0.0,0.0,0.0,1
14350,2.0,1.0,1.0,0.0,0.0,1.0,7.5,15.0,0.275,19.0,...,0.0,0.0,0.0,1.0,3.0,0.0,0.0,2.0,2.0,1
14351,2.0,1.0,1.0,0.0,0.0,1.0,7.5,15.0,0.275,19.0,...,3.5,1.0,1.5,0.5,4.5,5.0,9.5,1.0,1.0,0
14352,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000,0.0,...,0.0,0.0,1.5,0.0,2.0,0.0,0.0,0.0,0.0,1


In [4]:
# Separate the feature matrix from the 'winner' target column
X = fighters_stats.drop(columns=['winner'])
y = fighters_stats[['winner']]

# Hold out a test split; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [6]:
# Candidate values explored during hyperparameter tuning
param_grid = dict(
    learning_rate=[0.05, 0.1, 0.20],
    max_depth=[3, 6, 9],
    min_child_weight=[1, 10, 100],
    n_estimators=[100, 400, 800],
)

In [8]:
# Base XGBoost classifier whose hyperparameters are tuned
xgb = XGBClassifier(objective='binary:logistic', random_state=42)

# Exhaustive search over param_grid, scored with 5-fold cross-validation
grid_search = GridSearchCV(estimator=xgb, param_grid=param_grid, cv=5, verbose=2)
grid_search.fit(X_train, y_train)

# Keep the best combination, then report it
xgb_best_param = grid_search.best_params_
print("Best hyperparameters : ", xgb_best_param)

Fitting 5 folds for each of 81 candidates, totalling 405 fits
[CV] END learning_rate=0.05, max_depth=3, min_child_weight=1, n_estimators=100; total time=   0.0s
[CV] END learning_rate=0.05, max_depth=3, min_child_weight=1, n_estimators=100; total time=   0.0s
[CV] END learning_rate=0.05, max_depth=3, min_child_weight=1, n_estimators=100; total time=   0.0s
[CV] END learning_rate=0.05, max_depth=3, min_child_weight=1, n_estimators=100; total time=   0.0s
[CV] END learning_rate=0.05, max_depth=3, min_child_weight=1, n_estimators=100; total time=   0.0s
[CV] END learning_rate=0.05, max_depth=3, min_child_weight=1, n_estimators=400; total time=   0.3s
[CV] END learning_rate=0.05, max_depth=3, min_child_weight=1, n_estimators=400; total time=   0.3s
[CV] END learning_rate=0.05, max_depth=3, min_child_weight=1, n_estimators=400; total time=   0.3s
[CV] END learning_rate=0.05, max_depth=3, min_child_weight=1, n_estimators=400; total time=   0.3s
[CV] END learning_rate=0.05, max_depth=3, min_c

In [5]:
# Hyperparameters for the final model. Hard-coded rather than read from
# grid_search.best_params_ -- presumably copied from an earlier grid-search
# run so the search need not be repeated; verify they are still the best ones.
parameters = dict(learning_rate=0.2, max_depth=3, min_child_weight=10, n_estimators=100)


In [6]:
# Instantiate the XGBoost model with the selected hyperparameters
xgb_optimized = XGBClassifier(objective='binary:logistic', **parameters)

# Fit on the training split, then predict labels for the held-out test split
xgb_optimized.fit(X_train, y_train)
y_pred = xgb_optimized.predict(X_test)

In [7]:
# Evaluate the test-set predictions.
# RMSE is computed as sqrt(MSE) rather than with `squared=False`: that keyword
# was deprecated in scikit-learn 1.4 and removed in 1.6, so the original call
# raises a TypeError on current releases. sqrt(MSE) works on every version.
# NOTE(review): the label below says "base model", but y_pred is produced by
# xgb_optimized -- confirm whether the wording should change.
rmse = mean_squared_error(y_test, y_pred) ** 0.5
print(f"RMSE of the base model: {rmse:.3f}")

# Share of test fights whose predicted label matches the true label
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

RMSE of the base model: 0.466
Accuracy: 78.24%


# Test model

In [13]:
# Raw per-fighter fight statistics, queried by the helper functions below
ufc_stats = pd.read_csv(filepath_or_buffer='data/ufc_stats.csv')

def get_fights(fighter_name):
    """Return the rows of ``ufc_stats`` whose 'fighter' column equals ``fighter_name``.

    The result keeps the original row labels; it is empty when the name is
    not present.
    """
    is_fighter = ufc_stats['fighter'].eq(fighter_name)
    return ufc_stats.loc[is_fighter]

def clean_columns(data):
    """Return ``data`` without the identifier, metadata and outcome columns.

    Every column listed below must be present in ``data``; pandas raises a
    KeyError otherwise. The input frame itself is not modified.
    """
    dropped_columns = [
        'fighter', 'Unnamed: 0', 'event', 'location', 'attendance', 'time',
        'scheduled_rounds', 'weight_class', 'round', 'last_round', 'id',
        'result', 'winner', 'fight_date',
    ]
    return data.drop(columns=dropped_columns)

def get_last_fights(data, nb_fights=None):
    """Return the rows of ``data`` regrouped by fight, most recent fight first.

    Rows are ordered by descending 'fight_date'; rows sharing an 'id' are kept
    together, and the index is reset to 0..n-1.

    Parameters
    ----------
    data : pd.DataFrame
        Must contain the columns 'fight_date' and 'id'.
        NOTE(review): sorting uses the raw 'fight_date' values -- assumes they
        are datetimes or sortable date strings (e.g. ISO format); confirm.
    nb_fights : int or None, default None
        When given, keep only the ``nb_fights`` most recent fights. ``None``
        keeps every fight, which is what the function did before this
        parameter existed.

    Returns
    -------
    pd.DataFrame
        A new frame; empty (same columns) when ``data`` is empty or when
        ``nb_fights`` is 0.

    Raises
    ------
    ValueError
        If ``nb_fights`` is negative.
    """
    if nb_fights is not None and nb_fights < 0:
        raise ValueError("nb_fights must be non-negative or None")

    # pd.concat([]) raises on an empty list, so answer empty input directly.
    if data.empty:
        return data.reset_index(drop=True)

    # A stable sort keeps the original row order among rows sharing a date.
    by_date = data.sort_values(by=['fight_date'], ascending=False, kind='mergesort')

    # sort=False makes groups appear in order of first occurrence, i.e. newest fight first.
    fight_groups = [group for _, group in by_date.groupby('id', sort=False)]
    if nb_fights is not None:
        fight_groups = fight_groups[:nb_fights]

    if not fight_groups:
        return by_date.iloc[0:0].reset_index(drop=True)

    return pd.concat(fight_groups).reset_index(drop=True)

def get_all_oponents(fighter_name):
    """Return the unique names of fighters who share a fight id with ``fighter_name``.

    The result is the array produced by ``Series.unique()``: names appear in
    order of first occurrence in ``ufc_stats``, and the array is empty when
    the fighter has no recorded fight.
    """
    fight_ids = get_fights(fighter_name)['id'].unique()
    in_same_fight = ufc_stats['id'].isin(fight_ids)
    is_other_fighter = ufc_stats['fighter'] != fighter_name
    return ufc_stats.loc[in_same_fight & is_other_fighter, 'fighter'].unique()

def get_winner(fighter_name_1, fighter_name_2):
    """Summarise the recorded outcome of the fights between two fighters.

    The outcome is read from ``fighter_name_2``'s rows (one row per fight id)
    in the fights that ``fighter_name_1`` took part in.
    NOTE(review): presumably the 'winner' column holds the result from the
    row's own fighter's point of view ('W', 'L', ...) -- confirm.

    Returns
    -------
    int
        -1 when the two fighters share no recorded fight (or no outcome is
        recorded); 0 when the most frequent 'winner' value on
        ``fighter_name_2``'s rows is 'L'; 1 for any other value.
    """
    fights_ids = get_fights(fighter_name_1)['id'].unique()
    opponent_rows = ufc_stats[
        (ufc_stats['id'].isin(fights_ids))
        & (ufc_stats['fighter'] != fighter_name_1)
        & (ufc_stats['fighter'] == fighter_name_2)
    ]

    # One representative row per fight (a fight may span several rows).
    first_rows = [group.iloc[0] for _, group in opponent_rows.groupby('id', sort=False)]

    # Return before touching the 'winner' column: pd.DataFrame([]) has no
    # columns, so the original code raised KeyError here and never reached
    # its "-1 if no fight found" branch.
    if not first_rows:
        return -1

    outcome_counts = pd.DataFrame(first_rows)['winner'].value_counts()
    if len(outcome_counts) == 0:
        # Fights exist but every recorded outcome is missing.
        return -1

    most_frequent = outcome_counts.index[0]
    return 0 if most_frequent == 'L' else 1

def get_all_fighters_name():
    """Return the distinct values of the 'fighter' column of ``ufc_stats``."""
    fighter_column = ufc_stats['fighter']
    return fighter_column.unique()

def compute_extra_features(data: pd.DataFrame) -> pd.DataFrame:
    """Build a one-row frame with the fight count and the outcome tally.

    ``data`` must contain the columns 'id' and 'winner'. The result has the
    columns 'nb_fights', 'L', 'W', 'D', 'NC' (in that order); each outcome
    column counts how many fights (one row taken per 'id') carry that value
    in 'winner', defaulting to 0. Any other value found in 'winner' is
    appended as an extra column.
    """
    # One representative row per fight id.
    first_rows = pd.DataFrame([rows.iloc[0] for _, rows in data.groupby('id')])
    outcome_counts = first_rows['winner'].value_counts()

    record = {'nb_fights': [len(data['id'].unique())]}
    record.update(dict.fromkeys(('L', 'W', 'D', 'NC'), 0))
    for outcome, count in outcome_counts.items():
        record[outcome] = count

    return pd.DataFrame(record)

def get_fighter_data(fighter_name):
    """Build the one-row feature frame describing ``fighter_name``.

    The row is the fight-record features from ``compute_extra_features``
    followed by the per-column median of the fighter's rows after
    ``clean_columns`` has been applied.

    Raises
    ------
    KeyError
        If no row exists for ``fighter_name``. The original code failed in
        this case too, but with an unrelated-looking ``KeyError: 'winner'``
        raised from inside ``compute_extra_features``; the exception type is
        kept and the message now names the fighter.
    """
    fights = get_fights(fighter_name)
    if fights.empty:
        raise KeyError(f"no fights found for fighter {fighter_name!r}")

    extra_features = compute_extra_features(fights)

    # Collapse all of the fighter's rows into a single row of medians.
    median_stats = clean_columns(fights).median().to_frame().transpose()

    # Both frames have the single row label 0, so axis=1 places them side by side.
    return pd.concat([extra_features, median_stats], axis=1)

In [14]:
def predict_fight(fighter_1, fighter_2):
    """Print the model's class probabilities for a fight between two fighters.

    The feature row of ``fighter_1`` is concatenated with the feature row of
    ``fighter_2`` and passed to ``xgb_optimized.predict_proba``. The first
    returned probability is printed next to ``fighter_1`` and the second next
    to ``fighter_2``. Nothing is returned.
    """
    features = []
    for name in (fighter_1, fighter_2):
        features.extend(get_fighter_data(name).values[0].tolist())

    probabilities = xgb_optimized.predict_proba([features])[0]
    print(fighter_1, probabilities[0])
    print(fighter_2, probabilities[1])

# Example: print the predicted probabilities for one match-up
predict_fight(fighter_1='Conor McGregor', fighter_2='Benoit Saint Denis')

Unnamed: 0,nb_fights,L,W,D,NC,knockdowns,significant_strikes_landed,significant_strikes_attempted,significant_strikes_rate,total_strikes_landed,...,body_landed,body_attempted,leg_landed,leg_attempted,distance_landed,distance_attempted,clinch_landed,clinch_attempted,ground_landed,ground_attempted
0,14,4,10,0,0,0.0,20.0,39.5,0.54,30.0,...,3.0,4.0,1.0,2.0,15.0,31.5,1.0,1.0,1.0,1.5


Conor McGregor 0.6735128
Benoit Saint Denis 0.3264872
