Simulating betting with our models. KNN 71-1 was switched to 109-3 because it was more effective for predicting upsets.

In [162]:
import pandas as pd

In [163]:
training_data = pd.read_csv('../../data/ufc-master.csv') #For training
prediction_data = pd.read_csv('../../data/for-predictions.csv') #Fights that models have to predict

In [164]:
training_data = training_data[['B_fighter', 'R_fighter', 'B_odds', 'R_odds', 'title_bout', 'reach_dif', 'B_age', 'R_age', 
                               'B_current_lose_streak', 'R_current_lose_streak', 'B_current_win_streak',
                               'R_current_win_streak', 'better_rank', 'B_wins', 'R_wins', 'B_losses', 'R_losses',
                               'B_Stance', 'R_Stance', 'Winner']]

In [165]:
#In the blue fighter stance column we have to fix one data point, where 'Switch' is written as 'Switch ' with an extra space
#at the end
training_data['B_Stance'] = training_data['B_Stance'].replace({'Switch ': 'Switch'})
#Fixing values in the reach_dif columns
#We will fix outliers using the data available on the UFC website instead of removing the "broken" datapoints entirely
filter1 = (training_data['reach_dif'] == -187.96) & (training_data['B_fighter'] == 'Parker Porter')
filter2 = (training_data['reach_dif'] == -187.96) & (training_data['B_fighter'] == 'Irwin Rivera')
filter3 = training_data['reach_dif'] == -160.02
training_data[filter1] = training_data[filter1].replace({-187.96: -2.54})
training_data[filter2] = training_data[filter2].replace({-187.96: -17.78 })
training_data[filter3] = training_data[filter3].replace({-160.02: 5.08})

In [166]:
#Now we will use columns B_wins, B_losses, R_wins and R_losses to create a column for both fighters
#that contains the win rate (proportion of wins out of wins and losses combined)
B_ratio = training_data['B_wins'] / (training_data['B_wins'] + training_data['B_losses'])
R_ratio = training_data['R_wins'] / (training_data['R_wins'] + training_data['R_losses'])
training_data['B_wr'] = B_ratio
training_data['R_wr'] = R_ratio
#It is possible that in some of the rows that value is now NaN as the fighter has never fought before. In task 1 we found out
#that the fighters making debut usually win 43% of the time so we will replace NaN with 0.43 as giving them 0 would not be 
# "fair" and will hurt the prediction accuracy
training_data['B_wr'].fillna(0.43, inplace=True)
training_data['R_wr'].fillna(0.43, inplace=True)
#Now we will drop win and loss columns for both fighters because these features are not important for us anymore after
#creating the win rate column
training_data = training_data.drop(columns=['B_wins', 'B_losses', 'R_wins', 'R_losses'])

In [167]:
#Changing values into 1s and 0s where necessary and one-hot encoding stance and rank features
training_data['title_bout'] = (training_data['title_bout']).astype(int)
training_data['Winner'] = training_data['Winner'].map(dict(Blue=1, Red=0))
training_data = pd.get_dummies(training_data, columns=['B_Stance', 'R_Stance', 'better_rank'])

In [168]:
#Creating training and validation sets for choosing hyperparameters for models
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(training_data.drop(columns=['B_fighter', 'R_fighter', 'B_odds', 'R_odds', 'Winner']), training_data['Winner'], test_size = 0.15, random_state = 2)

In [169]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
#Note: in the KNN algorithm p=1 is Manhattan distance and p=2 is the Euclidean distance
best_acc = 0
best_comb = [0, 0]
for i in range(1, 301, 10):
    for j in range(1, 3, 1):
        model = KNeighborsClassifier(n_neighbors = i, p = j)
        model.fit(X_train, y_train)
        acc = accuracy_score(y_val, model.predict(X_val))
        if (acc > best_acc):
            best_acc = acc
            best_comb[0] = i
            best_comb[1] = j
print("The best achieved accuracy was: " + str(round(best_acc * 100, 2)) + "%.")
print("The neighbors value should be: " + str(best_comb[0]))
print("The value for p should be: " + str(best_comb[1]))

The best achieved accuracy was: 62.3%.
The neighbors value should be: 71
The value for p should be: 1


In [170]:
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
forest = RandomForestClassifier(n_estimators = 350, random_state=0, max_depth=11, max_features=9)
forest.fit(X_train, y_train)
accuracy = accuracy_score(y_val, forest.predict(X_val))
print("Accuracy of the forest classifier model: " + str(accuracy))

Accuracy of the forest classifier model: 0.6050670640834576


In [171]:
pred = pd.read_csv('../../data/for-predictions.csv')
pred_df = pred[['B_odds', 'R_odds', 'title_bout', 'reach_dif', 'B_age', 'R_age', 
                               'B_current_lose_streak', 'R_current_lose_streak', 'B_current_win_streak',
                               'R_current_win_streak', 'better_rank', 'B_wins', 'R_wins', 'B_losses', 'R_losses',
                               'B_Stance', 'R_Stance']].copy(deep=True)

In [172]:
#Now we will use columns wins and losses for both fighters to create a column that has a win ratio out of all wins and losses
B_ratio = pred_df['B_wins'] / (pred_df['B_wins'] + pred_df['B_losses'])
R_ratio = pred_df['R_wins'] / (pred_df['R_wins'] + pred_df['R_losses'])
pred_df['B_wr'] = B_ratio
pred_df['R_wr'] = R_ratio
#It is possible that in some of the rows that value is now NaN as the fighter has never fought before. In task 1 we found out
#that the fighters making debut usually win 43% of the time so we will replace NaN with 0.43 as giving them 0 would not 
#represent reality very well
pred_df['B_wr'].fillna(0.43, inplace=True)
pred_df['R_wr'].fillna(0.43, inplace=True)
#Now we will drop win and loss columns for both fighters because we have added the winrate column
pred_df = pred_df.drop(columns=['B_wins', 'B_losses', 'R_wins', 'R_losses'])

In [173]:
#One-hot encoding as in the training dataset
pred_df = pd.get_dummies(pred_df, columns=['B_Stance', 'R_Stance', 'better_rank'])

In [174]:
#Adding missing columns (one-hot encoding does not create them when some values are not represented)
pred_df['better_rank_Blue'] = 0
pred_df['B_Stance_Open Stance'] = 0
pred_df['R_Stance_Open Stance'] = 0

In [175]:
knn_final_1 = KNeighborsClassifier(n_neighbors = 109, p=3)
knn_final_1.fit(training_data.drop(columns=['B_fighter', 'R_fighter', 'B_odds', 'R_odds', 'Winner']), training_data['Winner'])
pred['KNN'] = knn_final_1.predict(pred_df.drop(columns=['B_odds', 'R_odds']))

In [176]:
knn_final_2 = KNeighborsClassifier(n_neighbors = 109, p=3)
knn_final_2.fit(training_data.drop(columns=['B_fighter', 'R_fighter', 'Winner']), training_data['Winner'])
pred['KNN-2'] = knn_final_2.predict(pred_df)

Predictions with random forest classifier

Model that does not use betting values that were available before the fight

In [177]:
forest_final_1 = RandomForestClassifier(n_estimators = 350, random_state=0, max_depth=11, max_features=9)
forest_final_1.fit(training_data.drop(columns=['B_fighter', 'R_fighter', 'B_odds', 'R_odds', 'Winner']), training_data['Winner'])
pred['Forest'] = forest_final_1.predict(pred_df.drop(columns=['B_odds', 'R_odds']))

Model that uses betting values that were available

In [178]:
forest_final_2 = RandomForestClassifier(n_estimators = 350, random_state=0, max_depth=11, max_features=9)
forest_final_2.fit(training_data.drop(columns=['B_fighter', 'R_fighter', 'Winner']), training_data['Winner'])
pred['Forest-2'] = forest_final_2.predict(pred_df)

Checking the prediction results of the models

To which events fights in the dataset for predictions belonged  
UFC 256 (December 12) - fights 0-9  
UFC Vegas 16 (December 5) - fights 10-20  
UFC Vegas 15 (November 28) - fights 20-30

In [179]:
#Reminder: in the model predictions 1 is the blue fighter and 0 is the red fighter
winners = [0.5, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, -1, 0, -1, 0, 1, -1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1]
pred['Winner'] = winners
results = pred[['B_fighter', 'R_fighter', 'KNN', 'Forest', 'KNN-2', 'Forest-2', 'Winner', 'B_odds', 'R_odds']].copy(deep=True)
#How the results table looks
results

Unnamed: 0,B_fighter,R_fighter,KNN,Forest,KNN-2,Forest-2,Winner,B_odds,R_odds
0,Brandon Moreno,Deiveson Figueiredo,1,1,0,0,0.5,250,-330
1,Charles Oliveira,Tony Ferguson,1,1,0,1,1.0,140,-175
2,Virna Jandiroba,Mackenzie Dern,0,0,0,0,0.0,160,-200
3,Ronaldo Souza,Kevin Holland,0,0,0,0,0.0,-118,-106
4,Ciryl Gane,Junior dos Santos,1,1,1,1,1.0,-455,333
5,Daniel Pineda,Cub Swanson,1,1,1,1,0.0,-159,127
6,Rafael Fiziev,Renato Moicano,1,1,1,1,1.0,-159,130
7,Billy Quarantillo,Gavin Tucker,0,1,1,0,0.0,-167,135
8,Sam Hughes,Tecia Torres,1,1,0,0,0.0,355,-500
9,Peter Barrett,Chase Hooper,0,0,0,0,0.0,255,-335


Accuracy results for each model

In [180]:
KNN_acc = 0
Forest_acc = 0
KNN_acc_2 = 0
Forest_acc_2 = 0
fights_happened = 0
for i in range(0, 31):
    if (results['Winner'].iloc[i] != -1):
        fights_happened += 1
    if (results['Winner'].iloc[i] == results['KNN'].iloc[i]):
        KNN_acc += 1
    if (results['Winner'].iloc[i] == results['Forest'].iloc[i]):
        Forest_acc += 1
    if (results['Winner'].iloc[i] == results['KNN-2'].iloc[i]):
        KNN_acc_2 += 1
    if (results['Winner'].iloc[i] == results['Forest-2'].iloc[i]):
        Forest_acc_2 += 1
        
print("KNN prediction accuracy: " + str(round((KNN_acc / fights_happened) * 100, 2)) + "%.")
print("Forest prediction accuracy: " + str(round((Forest_acc / fights_happened) * 100, 2)) + "%.")
print("KNN-2 (with odds) prediction accuracy: " + str(round((KNN_acc_2 / fights_happened) * 100, 2)) + "%.")
print("Forest prediction (with odds) accuracy: " + str(round((Forest_acc_2 / fights_happened) * 100, 2)) + "%.")

KNN prediction accuracy: 67.86%.
Forest prediction accuracy: 57.14%.
KNN-2 (with odds) prediction accuracy: 57.14%.
Forest prediction (with odds) accuracy: 67.86%.


Simulating betting with all 4 prediciton models.

In [181]:
import numpy as np
KNN_total = 0
Forest_total = 0
KNN_total_2 = 0
Forest_total_2 = 0
fights_happened = 0
b_payout = 0
r_payout = 0
winner_payout = 0
bet = 100


for i in range(0, 31):
    if results['B_odds'].iloc[i] < 0:
        b_payout = bet/(np.abs(results['B_odds'].iloc[i]/100))
    else:
        b_payout = results['B_odds'].iloc[i]

    if results['R_odds'].iloc[i] < 0:
        r_payout = bet/(np.abs(results['R_odds'].iloc[i]/100))
    else:
        r_payout = results['R_odds'].iloc[i]

    if results['Winner'].iloc[i] != -1:
        fights_happened += 1
        if results['Winner'].iloc[i] == 0:
            winner_payout = r_payout
        else:
            winner_payout = b_payout

    if results['Winner'].iloc[i] == results['KNN'].iloc[i]:
        KNN_total += winner_payout + bet
    if results['Winner'].iloc[i] == results['Forest'].iloc[i]:
        Forest_total += winner_payout + bet
    if results['Winner'].iloc[i] == results['KNN-2'].iloc[i]:
        KNN_total_2 += winner_payout + bet
    if results['Winner'].iloc[i] == results['Forest-2'].iloc[i]:
        Forest_total_2 += winner_payout + bet

print("Total spent:" + str(fights_happened*bet))
total_bet = fights_happened*bet
print("Betting "+str(bet)+" units on "+str(fights_happened)+" matches:")
print("KNN (no odds) made: "+str(KNN_total-total_bet) + ", that is " +str(((KNN_total-total_bet)/total_bet*100))+"%")
print("Forest (no odds) made: "+str(Forest_total-total_bet)+ ", that is " +str(((Forest_total-total_bet)/total_bet*100))+"%")
print("KNN made: "+str(KNN_total_2-total_bet)+ ", that is " +str(((KNN_total_2-total_bet)/total_bet*100))+"%")
print("Forest made: "+str(Forest_total_2-total_bet)+ ", that is " +str(((Forest_total_2-total_bet)/total_bet*100))+"%")


Total spent:2800
Betting 100 units on 28 matches:
KNN (no odds) made: 498.12193136841915, that is 17.79006897744354%
Forest (no odds) made: -238.37524377282352, that is -8.513401563315126%
KNN made: -426.8780686315804, that is -15.245645308270728%
Forest made: 358.12193136841915, that is 12.790068977443541%


We see that KNN 109 model earned the most since we knew it was good at prediciting upsets.


Interesingly the forest model that used odds for predicting earned a significant profit as well even though it was accurate
for different fights.

Betting only when our models predicted an upset:

In [182]:
KNN_total = 0
KNN_bet = 0
Forest_total = 0
Forest_bet = 0
KNN_total_2 = 0
KNN_bet_2 = 0
Forest_total_2 = 0
Forest_bet_2 = 0
fights_happened = 0
b_payout = 0
r_payout = 0
winner_payout = 0
bet = 100


for i in range(0, 31):
    if results['B_odds'].iloc[i] < 0:
        b_payout = bet/(np.abs(results['B_odds'].iloc[i]/100))
    else:
        b_payout = results['B_odds'].iloc[i]


    if results['R_odds'].iloc[i] < 0:
        r_payout = bet/(np.abs(results['R_odds'].iloc[i]/100))
    else:
        r_payout = results['R_odds'].iloc[i]
    upset = False
    if results['Winner'].iloc[i] != -1:
        fights_happened += 1

        if results['R_odds'].iloc[i] < results['B_odds'].iloc[i]:
            if results['KNN'].iloc[i] == 0:
                KNN_bet += bet
            if results['Forest'].iloc[i] == 0:
                Forest_bet += bet
            if results['KNN-2'].iloc[i] == 0:
                KNN_bet_2 += bet
            if results['Forest-2'].iloc[i] == 0:
                Forest_bet_2 += bet

        if results['R_odds'].iloc[i] > results['B_odds'].iloc[i]:
            if results['KNN'].iloc[i] == 1:
                KNN_bet += bet
            if results['Forest'].iloc[i] == 1:
                Forest_bet += bet
            if results['KNN-2'].iloc[i] == 1:
                KNN_bet_2 += bet
            if results['Forest-2'].iloc[i] == 1:
                Forest_bet_2 += bet

        if results['Winner'].iloc[i] == 0:
            if results['R_odds'].iloc[i] > results['B_odds'].iloc[i]:
                continue
            winner_payout = r_payout
        else:
            if results['R_odds'].iloc[i] < results['B_odds'].iloc[i]:
                continue
            winner_payout = b_payout


    if results['Winner'].iloc[i] == results['KNN'].iloc[i]:
        KNN_total += winner_payout + bet
    if results['Winner'].iloc[i] == results['Forest'].iloc[i]:

        Forest_total += winner_payout + bet
    if results['Winner'].iloc[i] == results['KNN-2'].iloc[i]:
        KNN_total_2 += winner_payout + bet
    if results['Winner'].iloc[i] == results['Forest-2'].iloc[i]:
        Forest_total_2 += winner_payout + bet

print("Total spent:" + str(KNN_bet))
print("Total spent:" + str(Forest_bet))
print("Total spent:" + str(KNN_bet_2))
print("Total spent:" + str(Forest_bet_2))

print("KNN (no odds) made: "+str(KNN_total-KNN_bet) + ", that is " +str(((KNN_total-KNN_bet)/KNN_bet*100))+"%")
print("Forest (no odds) made: "+str(Forest_total-Forest_bet)+ ", that is " +str(((Forest_total-Forest_bet)/Forest_bet*100))+"%")
print("KNN made: "+str(KNN_total_2-KNN_bet_2)+ ", that is " +str(((KNN_total_2-KNN_bet_2)/KNN_bet_2*100))+"%")
print("Forest made: "+str(Forest_total_2-KNN_bet_2)+ ", that is " +str(((Forest_total_2-Forest_bet_2)/Forest_bet_2*100))+"%")

Total spent:2000
Total spent:2100
Total spent:2600
Total spent:2100
KNN (no odds) made: 58.78230872690983, that is 2.9391154363454914%
Forest (no odds) made: -197.71486641433353, that is -9.414993638777787%
KNN made: -421.21769127309017, that is -16.20068043358039%
Forest made: -571.2176912730904, that is -3.3913186320519233%


We see that the method of betting only when the model predicts an upset is not as effective as it was with the random test dataset.