In [1]:
import pandas as pd

In [2]:
training_data = pd.read_csv('../../data/ufc-master.csv') #For training
prediction_data = pd.read_csv('../../data/for-predictions.csv') #Fights that models have to predict

Selecting features and preparing the training data

In [3]:
training_data = training_data[['B_fighter', 'R_fighter', 'title_bout', 'reach_dif', 'B_age', 'R_age', 
                               'B_current_lose_streak', 'R_current_lose_streak', 'B_current_win_streak',
                               'R_current_win_streak', 'better_rank', 'B_wins', 'R_wins', 'B_losses', 'R_losses',
                               'B_Stance', 'R_Stance', 'Winner']]

In [4]:
#In the blue fighter stance column we have to fix one data point, where 'Switch' is written as 'Switch ' with an extra space
#in the end
training_data['B_Stance'] = training_data['B_Stance'].replace({'Switch ': 'Switch'})
#Fixing values in the reach_dif columns
#We will fix outliers using the data available on the UFC website instead of removing the "broken" datapoints
filter1 = (training_data['reach_dif'] == -187.96) & (training_data['B_fighter'] == 'Parker Porter')
filter2 = (training_data['reach_dif'] == -187.96) & (training_data['B_fighter'] == 'Irwin Rivera')
filter3 = training_data['reach_dif'] == -160.02
training_data[filter1] = training_data[filter1].replace({-187.96: -2.54})
training_data[filter2] = training_data[filter2].replace({-187.96: -17.78 })
training_data[filter3] = training_data[filter3].replace({-160.02: 5.08})

In [5]:
#Now we will use columns wins and losses for both fighters to create a column that has a win ratio out of all wins and losses
B_ratio = training_data['B_wins'] / (training_data['B_wins'] + training_data['B_losses'])
R_ratio = training_data['R_wins'] / (training_data['R_wins'] + training_data['R_losses'])
training_data['B_wr'] = B_ratio
training_data['R_wr'] = R_ratio
#It is possible that in some of the rows that value is now NaN as the fighter has never fought before. In task 1 we found out
#that the fighters making debut usually win 43% of the time so we will replace NaN with 0.43 as giving them 0 would not 
#represent reality very well
training_data['B_wr'].fillna(0.43, inplace=True)
training_data['R_wr'].fillna(0.43, inplace=True)
#Now we will drop win and loss columns for both fighters because we have added the winrate column
training_data = training_data.drop(columns=['B_wins', 'B_losses', 'R_wins', 'R_losses'])

In [6]:
#Changing values into 1s and 0s where necessary and one-hot encoding stance and rank features
training_data['title_bout'] = (training_data['title_bout']).astype(int)
training_data['Winner'] = training_data['Winner'].map(dict(Blue=1, Red=0))
training_data = pd.get_dummies(training_data, columns=['B_Stance', 'R_Stance', 'better_rank'])

In [7]:
#Creating training and validation sets for choosing hyperparameters for models
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(training_data.drop(columns=['B_fighter', 'R_fighter', 'Winner']), training_data['Winner'], test_size = 0.15, random_state = 2)

In [8]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
knn = KNeighborsClassifier(n_neighbors = 71, p=1)
knn.fit(X_train, y_train)
acc = accuracy_score(y_val, knn.predict(X_val))
print(acc)

0.6229508196721312


In [9]:
#Note: in the KNN algorithm p=1 is Manhattan distance and p=2 is the Euclidean distance
best_acc = 0
best_comb = [0, 0]
for i in range(1, 301, 10):
    for j in range(1, 3, 1):
        model = KNeighborsClassifier(n_neighbors = i, p = j)
        model.fit(X_train, y_train)
        acc = accuracy_score(y_val, model.predict(X_val))
        if (acc > best_acc):
            best_acc = acc
            best_comb[0] = i
            best_comb[1] = j
print("The best achieved accuracy was: " + str(round(best_acc * 100, 2)) + "%.")
print("The neighbors value should be: " + str(best_comb[0]))
print("The value for p should be: " + str(best_comb[1]))

The best achieved accuracy was: 62.3%.
The neighbors value should be: 71
The value for p should be: 1


In [10]:
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
forest = RandomForestClassifier(n_estimators = 350, random_state=0, max_depth=11, max_features=9)
forest.fit(X_train, y_train)
accuracy = accuracy_score(y_val, forest.predict(X_val))
print(accuracy)

0.6050670640834576


Now that we have chosen 2 models we are going to traing them on the entire training dataset and then predict results on the
prediction dataset

Loading dataset for making predictions (in the predictions 1 means that "Blue" fighter won and 0 that "Red" fighter won)

In [11]:
pred = pd.read_csv('../../data/for-predictions.csv')
pred_df = pred[['title_bout', 'reach_dif', 'B_age', 'R_age', 
                               'B_current_lose_streak', 'R_current_lose_streak', 'B_current_win_streak',
                               'R_current_win_streak', 'better_rank', 'B_wins', 'R_wins', 'B_losses', 'R_losses',
                               'B_Stance', 'R_Stance']].copy(deep=True)

In [12]:
#Now we will use columns wins and losses for both fighters to create a column that has a win ratio out of all wins and losses
B_ratio = pred_df['B_wins'] / (pred_df['B_wins'] + pred_df['B_losses'])
R_ratio = pred_df['R_wins'] / (pred_df['R_wins'] + pred_df['R_losses'])
pred_df['B_wr'] = B_ratio
pred_df['R_wr'] = R_ratio
#It is possible that in some of the rows that value is now NaN as the fighter has never fought before. In task 1 we found out
#that the fighters making debut usually win 43% of the time so we will replace NaN with 0.43 as giving them 0 would not 
#represent reality very well
pred_df['B_wr'].fillna(0.43, inplace=True)
pred_df['R_wr'].fillna(0.43, inplace=True)
#Now we will drop win and loss columns for both fighters because we have added the winrate column
pred_df = pred_df.drop(columns=['B_wins', 'B_losses', 'R_wins', 'R_losses'])

In [13]:
#One-hot encoding as in the training dataset
pred_df = pd.get_dummies(pred_df, columns=['B_Stance', 'R_Stance', 'better_rank'])

In [14]:
#Adding missing columns (one-hot encoding does not create them when some values are not represented)
pred_df['better_rank_Blue'] = 0
pred_df['B_Stance_Open Stance'] = 0
pred_df['R_Stance_Open Stance'] = 0

Predictions with KNN algorithm

In [15]:
knn_final = KNeighborsClassifier(n_neighbors = 71, p=1)
knn_final.fit(training_data.drop(columns=['B_fighter', 'R_fighter', 'Winner']), training_data['Winner'])
pred['KNN'] = knn_final.predict(pred_df)

In [16]:
forest_final = RandomForestClassifier(n_estimators = 350, random_state=0, max_depth=11, max_features=9)
forest_final.fit(training_data.drop(columns=['B_fighter', 'R_fighter', 'Winner']), training_data['Winner'])
pred['Forest'] = forest_final.predict(pred_df)

Fights on index 10-30 have already happened on December 6 and November 29, but they were not included in our training dataset.
Fights 0-10 have not happened as of the time of making these predictions (December 13 according to Eastern European Time).
Because of that value of the winner of these fights is -1 as of now. Also for fights that were cancelled because of coronavirus or othe reasons.

In [25]:
#Reminder: 1 is Blue fighter and 0 is Red fighter
winners = [-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 0, -1, 0, -1, 0, 1, -1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1]
pred['Winner'] = winners
#Odds were not used when making these predictions, but are here for information purposes
results = pred[['B_fighter', 'R_fighter', 'KNN', 'Forest', 'Winner', 'B_odds', 'R_odds']].copy(deep=True)
results

Unnamed: 0,B_fighter,R_fighter,KNN,Forest,Winner,B_odds,R_odds
0,Brandon Moreno,Deiveson Figueiredo,0,1,-1,250,-330
1,Charles Oliveira,Tony Ferguson,1,1,-1,140,-175
2,Virna Jandiroba,Mackenzie Dern,0,0,-1,160,-200
3,Ronaldo Souza,Kevin Holland,0,0,-1,-118,-106
4,Ciryl Gane,Junior dos Santos,1,1,-1,-455,333
5,Daniel Pineda,Cub Swanson,1,1,-1,-159,127
6,Rafael Fiziev,Renato Moicano,0,1,-1,-159,130
7,Billy Quarantillo,Gavin Tucker,0,1,-1,-167,135
8,Sam Hughes,Tecia Torres,1,1,-1,355,-500
9,Peter Barrett,Chase Hooper,0,0,-1,255,-335


Accuracy results so far (without UFC 256 results)

In [27]:
KNN_acc = 0
Forest_acc = 0
fights_happened = 0
for i in range(len(results)):
    if (results['Winner'].iloc[i] != -1):
        fights_happened += 1
    if (results['Winner'].iloc[i] == results['KNN'].iloc[i]):
        KNN_acc += 1
    if (results['Winner'].iloc[i] == results['Forest'].iloc[i]):
        Forest_acc += 1
print("KNN prediction accuracy: " + str(round((KNN_acc / fights_happened) * 100, 2)) + "%.")
print("Forest prediction accuracy: " + str(round((Forest_acc / fights_happened) * 100, 2)) + "%.")

KNN prediction accuracy: 55.56%.
Forest prediction accuracy: 55.56%.
