In [13]:
import pandas as pd
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, accuracy_score

In [2]:
fighters_stats = pd.read_csv('data/fighter_stats.csv')

In [3]:
fighters_stats

Unnamed: 0,knockdowns,significant_strikes_landed,significant_strikes_attempted,significant_strikes_rate,total_strikes_landed,total_strikes_attempted,takedown_successful,takedown_attempted,takedown_rate,submission_attempt,...,body_attempted_1,leg_landed_1,leg_attempted_1,distance_landed_1,distance_attempted_1,clinch_landed_1,clinch_attempted_1,ground_landed_1,ground_attempted_1,winner
0,0.066667,24.733333,50.066667,0.512667,30.533333,58.666667,0.133333,0.4,0.088667,0.2,...,10.733333,3.666667,4.600000,19.800000,42.733333,1.733333,1.866667,0.600000,0.733333,1
1,0.066667,24.733333,50.066667,0.512667,30.533333,58.666667,0.133333,0.4,0.088667,0.2,...,6.500000,1.750000,2.750000,7.500000,18.500000,4.000000,5.750000,1.750000,2.000000,0
2,0.066667,24.733333,50.066667,0.512667,30.533333,58.666667,0.133333,0.4,0.088667,0.2,...,9.583333,3.166667,3.833333,18.416667,37.916667,1.833333,2.500000,0.583333,1.166667,0
3,0.066667,24.733333,50.066667,0.512667,30.533333,58.666667,0.133333,0.4,0.088667,0.2,...,3.928571,1.500000,2.142857,16.142857,38.714286,0.642857,1.285714,0.142857,0.571429,1
4,0.066667,24.733333,50.066667,0.512667,30.533333,58.666667,0.133333,0.4,0.088667,0.2,...,2.583333,1.250000,1.583333,14.083333,35.500000,0.416667,0.666667,0.833333,1.083333,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14349,0.000000,1.000000,7.000000,0.140000,1.000000,7.000000,0.000000,0.0,0.000000,0.0,...,0.750000,0.250000,0.500000,1.500000,3.500000,0.000000,0.000000,0.250000,0.250000,1
14350,1.000000,7.500000,15.000000,0.275000,19.000000,28.000000,0.000000,0.0,0.000000,0.0,...,0.333333,1.000000,1.000000,2.000000,3.666667,0.000000,0.000000,2.666667,3.666667,1
14351,1.000000,7.500000,15.000000,0.275000,19.000000,28.000000,0.000000,0.0,0.000000,0.0,...,3.500000,1.000000,1.500000,0.500000,4.500000,5.000000,9.500000,1.000000,1.000000,0
14352,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.0,...,2.000000,1.166667,2.833333,1.333333,5.333333,1.833333,2.666667,1.500000,2.166667,1


In [14]:
X, y = fighters_stats.drop('winner', axis=1), fighters_stats[['winner']]
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [15]:
# Paramètres à tester pour le réglage des hyperparamètres
param_grid = {
    'learning_rate': [0.01, 0.1, 1],
    'max_depth': [100, 200, 300],
    'min_child_weight': [1, 3, 5],
    'gamma': [1, 3, 5],
    'n_estimators': [300, 500, 700],
    'alpha': [0.01, 0.1, 1],
    'colsample_bytree': [0.8, 0.9, 1]
}

In [17]:
# Initialize XGBoost model
xgb = XGBClassifier(random_state=42)

# Create grid search instance
grid_search = GridSearchCV(xgb, param_grid, cv=5, verbose=2)

# Fit grid search
grid_search.fit(X_train, y_train)

# Print best results
print("Best hyperparameters : ", grid_search.best_params_)

# Save best hyperparameters
xgb_best_param = grid_search.best_params_

Fitting 5 folds for each of 2187 candidates, totalling 10935 fits
[CV] END alpha=0.01, colsample_bytree=0.8, gamma=1, learning_rate=0.01, max_depth=100, min_child_weight=1, n_estimators=300; total time=  15.0s
[CV] END alpha=0.01, colsample_bytree=0.8, gamma=1, learning_rate=0.01, max_depth=100, min_child_weight=1, n_estimators=300; total time=  16.7s
[CV] END alpha=0.01, colsample_bytree=0.8, gamma=1, learning_rate=0.01, max_depth=100, min_child_weight=1, n_estimators=300; total time=  14.4s
[CV] END alpha=0.01, colsample_bytree=0.8, gamma=1, learning_rate=0.01, max_depth=100, min_child_weight=1, n_estimators=300; total time=  14.4s
[CV] END alpha=0.01, colsample_bytree=0.8, gamma=1, learning_rate=0.01, max_depth=100, min_child_weight=1, n_estimators=300; total time=  16.8s
[CV] END alpha=0.01, colsample_bytree=0.8, gamma=1, learning_rate=0.01, max_depth=100, min_child_weight=1, n_estimators=500; total time=  17.0s
[CV] END alpha=0.01, colsample_bytree=0.8, gamma=1, learning_rate=0.01

KeyboardInterrupt: 

In [None]:
# Instanciate XGBoost model with best hyperparameters
xgb_optimized = XGBClassifier(**grid_search.best_params_)

# Train model using best hyperparameters
xgb_optimized.fit(X_train, y_train)


# Run prediction on test set
y_pred = xgb_optimized.predict(X_test)

In [9]:
# evaluate predictions
rmse = mean_squared_error(y_test, y_pred, squared=False)
print(f"RMSE of the base model: {rmse:.3f}")

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

RMSE of the base model: 0.603


# Test model

In [11]:
ufc_stats = pd.read_csv('data/ufc_stats.csv')

def get_fights(fighter_name):
    all_fights = ufc_stats[ufc_stats['fighter'] == fighter_name]
    return all_fights

def clean_columns(data):
    # remove columns named "fighter", "Unnamed: 0", "event", "fight_date", "location", "attendance"
    cleaned_data = data.drop(columns=['fighter', 'Unnamed: 0', 'event', 'location', 'attendance', 'time', 'scheduled_rounds', 'weight_class'])
    # Convert the "fight_date" column to a datetime data type
    cleaned_data['fight_date'] = pd.to_datetime(cleaned_data['fight_date'])
    return cleaned_data

def get_last_fights(data, nb_fights):
    df = data.sort_values(by=['fight_date'], ascending=False)
    # Group the DataFrame by the unique fight ID
    grouped = df.groupby('id', sort=False)
    # Get the first nb_fights groups (corresponding to the first nb_fights fights)
    first_nb_fights = [group for _, group in grouped][:nb_fights]
    # Concatenate the DataFrames for the first nb_fights fights
    result_df = pd.concat(first_nb_fights)
    # Reset the index of the result DataFrame if needed
    result_df.reset_index(drop=True, inplace=True)
    return result_df

def get_fighter_data(fighter_name):
    all_fights = get_fights(fighter_name)
    all_fights = clean_columns(all_fights)
    last_fights = get_last_fights(all_fights, nb_fights=5)
    last_fights = last_fights.drop(columns=['round', 'last_round', 'id', 'result', 'winner', 'fight_date'])
    last_fights = last_fights.mean().to_frame().transpose()
    return last_fights

In [12]:
def predict_fight(fighter_1, fighter_2):
    data_1 = get_fighter_data(fighter_1)
    data_2 = get_fighter_data(fighter_2)
    data = [data_1.values[0].tolist() + data_2.values[0].tolist()]
    
    preds = bst.predict_proba(data)
    print(fighter_1, preds[0][0])
    print(fighter_2, preds[0][1])

predict_fight('Conor McGregor', 'Dustin Poirier')

Conor McGregor 0.29467595
Dustin Poirier 0.70532405
