In [223]:
import re
import numpy as np
import pandas as pd

In [224]:
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score

In [225]:
from catboost import CatBoostClassifier

In [226]:
data = pd.read_csv('data/data.csv')
data.head()

Unnamed: 0,R_fighter,B_fighter,Referee,date,location,Winner,title_bout,weight_class,B_avg_KD,B_avg_opp_KD,...,R_win_by_Decision_Unanimous,R_win_by_KO/TKO,R_win_by_Submission,R_win_by_TKO_Doctor_Stoppage,R_Stance,R_Height_cms,R_Reach_cms,R_Weight_lbs,B_age,R_age
0,Adrian Yanez,Gustavo Lopez,Chris Tognoni,2021-03-20,"Las Vegas, Nevada, USA",Red,False,Bantamweight,0.0,0.0,...,0,1,0,0,Orthodox,170.18,177.8,135.0,31.0,27.0
1,Trevin Giles,Roman Dolidze,Herb Dean,2021-03-20,"Las Vegas, Nevada, USA",Red,False,Middleweight,0.5,0.0,...,0,3,0,0,Orthodox,182.88,187.96,185.0,32.0,28.0
2,Tai Tuivasa,Harry Hunsucker,Herb Dean,2021-03-20,"Las Vegas, Nevada, USA",Red,False,Heavyweight,,,...,1,3,0,0,Southpaw,187.96,190.5,264.0,32.0,28.0
3,Cheyanne Buys,Montserrat Conejo,Mark Smith,2021-03-20,"Las Vegas, Nevada, USA",Blue,False,WomenStrawweight,,,...,0,0,0,0,Switch,160.02,160.02,115.0,28.0,25.0
4,Marion Reneau,Macy Chiasson,Mark Smith,2021-03-20,"Las Vegas, Nevada, USA",Blue,False,WomenBantamweight,0.125,0.0,...,1,2,2,0,Orthodox,167.64,172.72,135.0,29.0,43.0


Data Cleaning

Before April 2001, there were almost no rules in UFC (no judges, no time limits, no rounds, etc.). It's up to this precise date that UFC started to implement a set of rules known as "Unified Rules of Mixed Martial Arts". Therefore, we delete all fights before this major update in UFC's rules history.

In [227]:
limit_date = '2001-04-01'
#data = data[(data['date'] > limit_date)]
print(data.shape)

(6012, 144)


In [228]:
print("Total NaN in dataframe :" , data.isna().sum().sum())
print("Total NaN in each column of the dataframe")
na = []
for index, col in enumerate(data):
    na.append((index, data[col].isna().sum())) 
na_sorted = na.copy()
na_sorted.sort(key = lambda x: x[1], reverse = True) 

for i in range(len(data.columns)):
    print(data.columns[na_sorted[i][0]],":", na_sorted[i][1], "NaN")

Total NaN in dataframe : 106494
Total NaN in each column of the dataframe
B_avg_KD : 1427 NaN
B_avg_opp_KD : 1427 NaN
B_avg_SIG_STR_pct : 1427 NaN
B_avg_opp_SIG_STR_pct : 1427 NaN
B_avg_TD_pct : 1427 NaN
B_avg_opp_TD_pct : 1427 NaN
B_avg_SUB_ATT : 1427 NaN
B_avg_opp_SUB_ATT : 1427 NaN
B_avg_REV : 1427 NaN
B_avg_opp_REV : 1427 NaN
B_avg_SIG_STR_att : 1427 NaN
B_avg_SIG_STR_landed : 1427 NaN
B_avg_opp_SIG_STR_att : 1427 NaN
B_avg_opp_SIG_STR_landed : 1427 NaN
B_avg_TOTAL_STR_att : 1427 NaN
B_avg_TOTAL_STR_landed : 1427 NaN
B_avg_opp_TOTAL_STR_att : 1427 NaN
B_avg_opp_TOTAL_STR_landed : 1427 NaN
B_avg_TD_att : 1427 NaN
B_avg_TD_landed : 1427 NaN
B_avg_opp_TD_att : 1427 NaN
B_avg_opp_TD_landed : 1427 NaN
B_avg_HEAD_att : 1427 NaN
B_avg_HEAD_landed : 1427 NaN
B_avg_opp_HEAD_att : 1427 NaN
B_avg_opp_HEAD_landed : 1427 NaN
B_avg_BODY_att : 1427 NaN
B_avg_BODY_landed : 1427 NaN
B_avg_opp_BODY_att : 1427 NaN
B_avg_opp_BODY_landed : 1427 NaN
B_avg_LEG_att : 1427 NaN
B_avg_LEG_landed : 1427 NaN
B

In [229]:
print("Total NaN in dataframe :" , data.isna().sum().sum())
print("Total NaN in each column of the dataframe")
na = []
for index, col in enumerate(data):
    na.append((index, data[col].isna().sum())) 
na_sorted = na.copy()
na_sorted.sort(key = lambda x: x[1], reverse = True) 

for i in range(len(data.columns)):
    print(data.columns[na_sorted[i][0]],":", na_sorted[i][1], "NaN")

Total NaN in dataframe : 106494
Total NaN in each column of the dataframe
B_avg_KD : 1427 NaN
B_avg_opp_KD : 1427 NaN
B_avg_SIG_STR_pct : 1427 NaN
B_avg_opp_SIG_STR_pct : 1427 NaN
B_avg_TD_pct : 1427 NaN
B_avg_opp_TD_pct : 1427 NaN
B_avg_SUB_ATT : 1427 NaN
B_avg_opp_SUB_ATT : 1427 NaN
B_avg_REV : 1427 NaN
B_avg_opp_REV : 1427 NaN
B_avg_SIG_STR_att : 1427 NaN
B_avg_SIG_STR_landed : 1427 NaN
B_avg_opp_SIG_STR_att : 1427 NaN
B_avg_opp_SIG_STR_landed : 1427 NaN
B_avg_TOTAL_STR_att : 1427 NaN
B_avg_TOTAL_STR_landed : 1427 NaN
B_avg_opp_TOTAL_STR_att : 1427 NaN
B_avg_opp_TOTAL_STR_landed : 1427 NaN
B_avg_TD_att : 1427 NaN
B_avg_TD_landed : 1427 NaN
B_avg_opp_TD_att : 1427 NaN
B_avg_opp_TD_landed : 1427 NaN
B_avg_HEAD_att : 1427 NaN
B_avg_HEAD_landed : 1427 NaN
B_avg_opp_HEAD_att : 1427 NaN
B_avg_opp_HEAD_landed : 1427 NaN
B_avg_BODY_att : 1427 NaN
B_avg_BODY_landed : 1427 NaN
B_avg_opp_BODY_att : 1427 NaN
B_avg_opp_BODY_landed : 1427 NaN
B_avg_LEG_att : 1427 NaN
B_avg_LEG_landed : 1427 NaN
B

In [230]:
from sklearn.impute import SimpleImputer

imp_features = ['R_Weight_lbs', 'R_Height_cms', 'B_Height_cms', 'R_age', 'B_age', 'R_Reach_cms', 'B_Reach_cms']
imp_median = SimpleImputer(missing_values=np.nan, strategy='median')

for feature in imp_features:
    imp_feature = imp_median.fit_transform(data[feature].values.reshape(-1,1))
    data[feature] = imp_feature

imp_stance = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
imp_R_stance = imp_stance.fit_transform(data['R_Stance'].values.reshape(-1,1))
imp_B_stance = imp_stance.fit_transform(data['B_Stance'].values.reshape(-1,1))
data['R_Stance'] = imp_R_stance
data['B_Stance'] = imp_B_stance

In [231]:
print('Number of features with NaN values :', len([x[1] for x in na if x[1] > 0]))

Number of features with NaN values : 109


In [232]:
na_features = ['B_avg_BODY_att', 'R_avg_BODY_att']
data.dropna(subset = na_features, inplace = True)

data.drop(['Referee', 'location'], axis = 1, inplace = True)

In [233]:
print(data.shape)
print("Total NaN in dataframe :" , data.isna().sum().sum())

(4316, 142)
Total NaN in dataframe : 0



Feature Engineering

In [234]:
list(data.select_dtypes(include=['object', 'bool']))

['R_fighter',
 'B_fighter',
 'date',
 'Winner',
 'title_bout',
 'weight_class',
 'B_Stance',
 'R_Stance']

In [235]:
print(data['B_draw'].value_counts())
print(data['R_draw'].value_counts())
data.drop(['B_draw', 'R_draw'], axis=1, inplace=True)

0    4316
Name: B_draw, dtype: int64
0    4316
Name: R_draw, dtype: int64


In [236]:
data = data[data['Winner'] != 'Draw']
data = data[data['weight_class'] != 'Catch Weight']

Data Preprocessing

In [237]:
#  i = index of the fighter's fight, 0 means the last fight, -1 means first fight
def select_fight_row(data, name, i): 
    data_temp = data[(data['R_fighter'] == name) | (data['B_fighter'] == name)]  # filter data on fighter's name
    data_temp.reset_index(drop=True, inplace=True) #  as we created a new temporary dataframe, we have to reset indexes
    idx = max(data_temp.index)  #  get the index of the oldest fight
    if i > idx:  #  if we are looking for a fight that didn't exist, we return nothing
        return 
    arr = data_temp.iloc[i,:].values
    return arr

select_fight_row(data, 'Amanda Nunes', 0) #  we get the last fight of Amanda Nunes

array(['Amanda Nunes', 'Megan Anderson', '2021-03-06', 'Red', True,
       'WomenFeatherweight', 0.5, 0.0, 0.498125, 0.415, 0.125, 0.25875,
       0.25, 0.1875, 0.5625, 0.25, 16.4375, 8.1875, 17.9375, 8.875,
       34.375, 25.3125, 39.375, 27.25, 1.0, 0.25, 2.25, 0.875, 14.0,
       6.8125, 10.875, 4.3125, 2.4375, 1.375, 4.0, 2.75, 0.0, 0.0, 3.0625,
       1.8125, 10.5625, 4.125, 10.1875, 2.5, 2.125, 1.5625, 4.25, 3.5,
       3.75, 2.5, 3.5, 2.875, 58.0, 136.3125, 250.3125, 7, 0, 0, 1, 2, 3,
       2, 0, 0, 0, 2, 1, 0, 'Orthodox', 182.88, 182.88, 145.0,
       0.25146484375, 0.0, 0.5198291015625, 0.428233642578125,
       0.633231201171875, 0.005947265625, 1.0087890625, 0.250732421875,
       0.0, 0.001953125, 170.2108154296875, 90.3916015625,
       94.584716796875, 35.8212890625, 213.79296875, 125.3968505859375,
       136.8037109375, 72.43798828125, 6.637939453125, 5.1295166015625,
       3.853271484375, 0.033935546875, 136.6878662109375,
       65.3922119140625, 73.3140869140625, 1

In [238]:
# get all active UFC fighters (according to the limit_date parameter)
def list_fighters(data, limit_date):
    data_temp = data[data['date'] > limit_date]
    set_R = set(data_temp['R_fighter'])
    set_B = set(data_temp['B_fighter'])
    fighters = list(set_R.union(set_B))
    return fighters

In [239]:
fighters = list_fighters(data, '2017-01-01')
print(len(fighters))

850


In [240]:
def build_data(data, fighters, i):      
    arr = [select_fight_row(data, fighters[f], i) for f in range(len(fighters)) if select_fight_row(data, fighters[f], i) is not None]
    cols = [col for col in data] 
    data_fights = pd.DataFrame(data=arr, columns=cols)
    data_fights.drop_duplicates(inplace=True)
    data_fights['title_bout'] = data_fights['title_bout'].replace({True: 1, False: 0})
    data_fights.drop(['R_fighter', 'B_fighter', 'date'], axis=1, inplace=True)
    return data_fights

data_train = build_data(data, fighters, 0)
data_test = build_data(data, fighters, 1)

In [241]:
data_train.head(5)

Unnamed: 0,Winner,title_bout,weight_class,B_avg_KD,B_avg_opp_KD,B_avg_SIG_STR_pct,B_avg_opp_SIG_STR_pct,B_avg_TD_pct,B_avg_opp_TD_pct,B_avg_SUB_ATT,...,R_win_by_Decision_Unanimous,R_win_by_KO/TKO,R_win_by_Submission,R_win_by_TKO_Doctor_Stoppage,R_Stance,R_Height_cms,R_Reach_cms,R_Weight_lbs,B_age,R_age
0,Red,0,Featherweight,0.0,0.5,0.3,0.52,0.0,0.33,0.0,...,0,0,0,0,Southpaw,170.18,185.42,145.0,25.0,29.0
1,Red,0,Heavyweight,0.625,0.0,0.443594,0.375937,0.15125,0.193438,0.0625,...,3,2,0,1,Orthodox,193.04,203.2,265.0,38.0,28.0
2,Red,0,Featherweight,0.0,0.0,0.55,0.53,0.0,0.0,0.0,...,0,0,0,0,Orthodox,182.88,187.96,145.0,27.0,27.0
3,Red,1,Welterweight,1.410767,0.187584,0.67478,0.32424,0.567714,0.140651,0.00502,...,8,2,1,0,Switch,182.88,193.04,170.0,35.0,33.0
4,Red,0,Heavyweight,0.0,0.0,0.5725,0.4675,0.25,0.25,0.25,...,0,0,0,0,Southpaw,190.5,213.36,257.0,26.0,26.0


In [242]:
data_train['weight_class'].unique() 

array(['Featherweight', 'Heavyweight', 'Welterweight', 'WomenFlyweight',
       'Lightweight', 'Middleweight', 'Bantamweight', 'CatchWeight',
       'WomenBantamweight', 'Flyweight', 'LightHeavyweight',
       'WomenStrawweight', 'WomenFeatherweight'], dtype=object)

In [243]:
print(data_train.shape)
print(data_test.shape)

(648, 137)
(611, 137)


In [244]:
print(len(data_train[data_train['Winner'] == 'Blue']))
print(len(data_train[data_train['Winner'] == 'Red']))
print(len(data_test[data_test['Winner'] == 'Blue']))
print(len(data_test[data_test['Winner'] == 'Red']))

278
370
258
353


In [245]:
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder
from sklearn.compose import make_column_transformer

preprocessor = make_column_transformer((OrdinalEncoder(), ['weight_class', 'B_Stance', 'R_Stance']), remainder='passthrough')

# If the winner is from the Red corner, Winner label will be encoded as 1, otherwise it will be 0 (Blue corner)
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(data_train['Winner'])
y_test = label_encoder.transform(data_test['Winner'])

X_train, X_test = data_train.drop(['Winner'], axis=1), data_test.drop(['Winner'], axis=1)

In [246]:
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(648, 136)
(648,)
(611, 136)
(611,)


In [247]:
# from sklearn.model_selection import GridSearchCV
# parameters = [{'random_forest__n_estimators': [10, 50, 100, 500, 1000],
#               'random_forest__criterion': ['gini', 'entropy'],
#               'random_forest__max_depth': [5, 10, 50],
#               'random_forest__min_samples_split': [2, 3, 4],
#               'random_forest__min_samples_leaf': [1, 2, 3],
#              }]
# model = Pipeline([('encoding', preprocessor), ('random_forest', RandomForestClassifier())])

# grid_search = GridSearchCV(estimator=model, param_grid=parameters, scoring='accuracy', cv=5, n_jobs=-1)
# grid_search = grid_search.fit(X_train, y_train)
# best_accuracy = grid_search.best_score_

# best_params = grid_search.best_params_
# print('Best accuracy : ', best_accuracy)
# print('Best parameters : ', best_params)

Random Forest Model

In [248]:
random_forest = RandomForestClassifier(n_estimators=130, 
                                       criterion='entropy', 
                                       max_depth=10, 
                                       min_samples_split=2,
                                       min_samples_leaf=1, 
                                       random_state=0)

model = Pipeline([('encoding', preprocessor), ('random_forest', random_forest)])
model.fit(X_train, y_train)

# We use cross-validation with 7-folds to have a more precise accuracy (reduce variation)
accuracies = cross_val_score(estimator=model, X=X_train, y=y_train, cv=7)
print('Accuracy mean : ', accuracies.mean())
print('Accuracy standard deviation : ', accuracies.std())

y_pred = model.predict(X_test)
print('Testing accuracy : ', accuracy_score(y_test, y_pred), '\n')

target_names = ["Blue","Red"]
print(classification_report(y_test, y_pred, labels=[0,1], target_names=target_names))

Accuracy mean :  0.6171608896012823
Accuracy standard deviation :  0.046593766172628545
Testing accuracy :  0.7086743044189853 

              precision    recall  f1-score   support

        Blue       0.73      0.49      0.59       258
         Red       0.70      0.87      0.77       353

    accuracy                           0.71       611
   macro avg       0.72      0.68      0.68       611
weighted avg       0.71      0.71      0.70       611



Predictions

In [249]:
def predict(data, pipeline, blue_fighter, red_fighter, weightclass, rounds, title_bout=False): 
    
    #We build two dataframes, one for each figther 
    f1 = data[(data['R_fighter'] == blue_fighter) | (data['B_fighter'] == blue_fighter)].copy()
    f1.reset_index(drop=True, inplace=True)
    f1 = f1[:1]
    f2 = data[(data['R_fighter'] == red_fighter) | (data['B_fighter'] == red_fighter)].copy()
    f2.reset_index(drop=True, inplace=True)
    f2 = f2[:1]
    
    # if the fighter was red/blue corner on his last fight, we filter columns to only keep his statistics (and not the other fighter)
    # then we rename columns according to the color of  the corner in the parameters using re.sub()
    if (f1.loc[0, ['R_fighter']].values[0]) == blue_fighter:
        result1 = f1.filter(regex='^R', axis=1).copy() #here we keep the red corner stats
        result1.rename(columns = lambda x: re.sub('^R','B', x), inplace=True)  #we rename it with "B_" prefix because he's in the blue_corner
    else: 
        result1 = f1.filter(regex='^B', axis=1).copy()
    if (f2.loc[0, ['R_fighter']].values[0]) == red_fighter:
        result2 = f2.filter(regex='^R', axis=1).copy()
    else:
        result2 = f2.filter(regex='^B', axis=1).copy()
        result2.rename(columns = lambda x: re.sub('^B','R', x), inplace=True)
        
    fight = pd.concat([result1, result2], axis = 1) # we concatenate the red and blue fighter dataframes (in columns)
    fight.drop(['R_fighter','B_fighter'], axis = 1, inplace = True) # we remove fighter names
    fight.insert(0, 'title_bout', title_bout) # we add tittle_bout, weight class and number of rounds data to the dataframe
    fight.insert(1, 'weight_class', weightclass)
    fight.insert(2, 'no_of_rounds', rounds)
    fight['title_bout'] = fight['title_bout'].replace({True: 1, False: 0})
    
    pred = pipeline.predict(fight)
    proba = pipeline.predict_proba(fight)
    if (pred == 1.0): 
        print("The predicted winner is", red_fighter, 'with a probability of', round(proba[0][1] * 100, 2), "%")
    else:
        print("The predicted winner is", blue_fighter, 'with a probability of ', round(proba[0][0] * 100, 2), "%")
    return proba

In [42]:
predict(data, model, 'Kamaru Usman', 'Colby Covington', 'Welterweight', 5, True) 

The predicted winner is Colby Covington with a probability of 78.37 %


array([[0.21629572, 0.78370428]])

In [None]:
predict(data, model, 'Max Holloway', 'Alexander Volkanovski', 'Featherweight', 5, True) 

The predicted winner is Alexander Volkanovski with a probability of 80.55 %


array([[0.19447799, 0.80552201]])

In [250]:
predict(data, model, 'Amanda Nunes', 'Germaine de Randamie', "WomenBantamweight", 5, True)

The predicted winner is Germaine de Randamie with a probability of 58.1 %


array([[0.41895554, 0.58104446]])

In [251]:
predict(data, model, 'Jose Aldo', 'Marlon Moraes', 'Bantamweight', 3, False)

The predicted winner is Jose Aldo with a probability of  57.66 %


array([[0.57662889, 0.42337111]])

In [252]:
predict(data, model, 'Urijah Faber', 'Petr Yan', 'Bantamweight', 3, False)

The predicted winner is Petr Yan with a probability of 76.18 %


array([[0.23822623, 0.76177377]])