In [33]:
import pandas as pd



Conveniently, the dataset also includes the profit paid out on a winning 100-unit bet for each fighter.

We can use this to simulate betting and check whether following our model's predictions would actually earn money.

To do this we use the same KNN model (k = 109) as we did in the Odds Analysis.

In [34]:
#Columns needed for the betting simulation: the fight outcome, the profit on a winning 100 unit bet
#for each corner (R_ev / B_ev), the fighter names, the model features and the bookmaker odds
selected_columns = [
    'Winner', 'R_ev', 'B_ev', 'B_fighter', 'R_fighter', 'title_bout',
    'B_current_win_streak', 'B_current_lose_streak',
    'R_current_win_streak', 'R_current_lose_streak',
    'B_Stance', 'R_Stance', 'B_avg_TD_landed', 'R_avg_TD_landed',
    'B_wins', 'B_losses', 'R_wins', 'R_losses', 'B_age', 'R_age',
    'height_dif', 'reach_dif', 'better_rank', 'gender', 'R_odds', 'B_odds',
]

#Read the full master file and keep only the selected columns, in the order listed above
data = pd.read_csv('../../data/ufc-master.csv')[selected_columns]


In [35]:
#In the blue fighter stance column one data point has 'Switch' written as 'Switch ' (with a trailing space);
#normalise it so both spellings end up in the same category
data['B_Stance'] = data['B_Stance'].replace({'Switch ': 'Switch'})

#In the height dif there is one outlier where the difference between the two fighters is 187.96 cm, which is obviously a mistake.
#Instead of excluding this datapoint it is fixed manually using the height data available for both fighters on the
#UFC website (the fighters were Parker Porter and Kyle Daukaus)
data['height_dif'] = data['height_dif'].replace({-187.96: -7.62})

#In the reach difference there are two erroneous values, -187.96 and -160.02. These are fixed manually as well.
#-187.96 occurs in two fights: Parker Porter vs Kyle Daukaus and Irwin Rivera vs Giga Chikadze
#-160.02 occurs in one fight (according to the original note: Jinh Yu Frey vs Kay Hansen)
filter1 = (data['reach_dif'] == -187.96) & (data['B_fighter'] == 'Parker Porter')
filter2 = (data['reach_dif'] == -187.96) & (data['B_fighter'] == 'Irwin Rivera')
filter3 = data['reach_dif'] == -160.02

#Assign through .loc on the reach_dif column only, so that no other column of the matched rows
#can be rewritten by accident if it happens to contain the same number
data.loc[filter1, 'reach_dif'] = -2.54
data.loc[filter2, 'reach_dif'] = -17.78
data.loc[filter3, 'reach_dif'] = 5.08

In [36]:
#Missing takedown averages are replaced with 0.
#The result is assigned back to the column instead of calling fillna(inplace=True) on data[col]:
#the chained in-place form is deprecated and does not modify `data` under pandas Copy-on-Write.
data['B_avg_TD_landed'] = data['B_avg_TD_landed'].fillna(0)
data['R_avg_TD_landed'] = data['R_avg_TD_landed'].fillna(0)

In [37]:
#Win rate assumed for fighters without any recorded fight. In task 1 we found out that fighters making
#their debut usually win 43% of the time, which represents reality better than giving them 0
DEBUT_WIN_RATE = 0.43

#Use the wins and losses columns of both fighters to build a win ratio out of all wins and losses.
#For a fighter with no wins and no losses the division yields NaN.
B_ratio = data['B_wins'] / (data['B_wins'] + data['B_losses'])
R_ratio = data['R_wins'] / (data['R_wins'] + data['R_losses'])

#Fill the NaN ratios while assigning the new columns. Calling fillna(inplace=True) on data[col]
#afterwards is a chained in-place operation that is deprecated and has no effect on `data`
#under pandas Copy-on-Write.
data['B_wr'] = B_ratio.fillna(DEBUT_WIN_RATE)
data['R_wr'] = R_ratio.fillna(DEBUT_WIN_RATE)

#Now we drop the win and loss columns for both fighters because the winrate columns replace them
data = data.drop(columns=['B_wins', 'B_losses', 'R_wins', 'R_losses'])

#Now changing categorical variables into 1s and 0s where necessary
data['title_bout'] = (data['title_bout']).astype(int)
data['Winner'] = data['Winner'].map(dict(Blue=0, Red=1))
data['gender'] = data['gender'].map(dict(MALE=1, FEMALE=0))
#One indicator column per category for the stances and the ranking comparison
data = pd.get_dummies(data, columns=['B_Stance', 'R_Stance', 'better_rank'])

In [38]:
#The prediction is based on the stats only, so the two fighter name columns are removed
name_columns = ['B_fighter', 'R_fighter']
data = data.drop(name_columns, axis=1)

In [39]:
from sklearn.model_selection import train_test_split

#Columns that must not be used as features: the target itself plus the betting payouts and odds
non_feature_columns = ['R_ev', 'B_ev', 'Winner', 'R_odds', 'B_odds']

#Hold out 15% of the fights as the test set (fixed seed so the split is reproducible)
X_train, X_test, y_train, y_test = train_test_split(
    data,
    data['Winner'],
    test_size=0.15,
    random_state=0,
)

#Keep an untouched copy of the test rows (winner, payouts and odds included) for the betting
#simulation before the non-feature columns are removed
X_with_odds = X_test.copy()

X_train = X_train.drop(columns=non_feature_columns)
X_test = X_test.drop(columns=non_feature_columns)

In [40]:
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier

#Fit the KNN classifier with 109 neighbours (p=2) on the training features,
#predict the winner of every test fight and report the accuracy on the test set
knn = KNeighborsClassifier(n_neighbors=109, p=2).fit(X_train, y_train)
predictions = knn.predict(X_test)
acc = accuracy_score(y_test, predictions)
print(acc)

0.5827123695976155


In [41]:
#Put the model predictions next to the actual winner, payouts and odds of every test fight.
#The predictions get the index of the test rows, so a left join on the index lines them up.
y_pred_df = pd.DataFrame({'prediction': predictions}, index=X_test.index.copy())
df_out = y_pred_df.join(X_with_odds, how='left')


In [42]:
#Prediction, actual winner and the profit of a winning 100 unit bet on each corner
clean = df_out[['prediction', 'Winner', 'R_ev', 'B_ev']]

#Adding a second dataset to compare our profits to that of betting on high odds every time.
#An explicit copy is taken because a column is added below; modifying a slice of df_out
#raises pandas' SettingWithCopyWarning.
clean_high_odds = df_out[['prediction', 'Winner', 'R_ev', 'B_ev', 'R_odds', 'B_odds']].copy()

#Higher_odds is 1 when R_odds < B_odds (Red is treated as the favorite) and 0 otherwise,
#which includes the case of equal odds. It uses the same 0/1 coding as Winner and prediction.
red_is_favorite = (clean_high_odds['R_odds'] < clean_high_odds['B_odds']).astype(int)
clean_high_odds.insert(2, "Higher_odds", red_is_favorite)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clean_high_odds["Higher_odds"] = clean_high_odds["Higher_odds"].astype(int)


Now that we have a dataset with predictions, winners and payout profits, we can use it to simulate betting
100 units of money on each of the 671 matches (the test set, for convenience), always backing the fighter our model predicts.

In [43]:
#Simulate a 100 unit bet on the model's pick in every test fight.
#Every bet costs 100 units up front (losing bets included), so the investment is 100*len(clean).

#Only the bets where the prediction matches the actual winner pay anything back
correct = clean[clean['prediction'] == clean['Winner']]

#R_ev / B_ev hold the profit of a winning 100 unit bet on Red / Blue. For every winning bet pick
#the column of the corner that actually won (Winner == 1 is Red, otherwise Blue).
winning_profit = correct['R_ev'].where(correct['Winner'] == 1, correct['B_ev'])

#A winning bet returns the 100 unit stake plus its profit. skipna=False keeps a missing payout
#visible as NaN instead of silently counting it as 0.
total = float(-100*len(clean) + 100*len(correct) + winning_profit.sum(skipna=False))

print("Total earnings: "+str(total))
if len(clean) > 0:
    print("That is "+str((total/(100*len(clean)))*100)+"% of the total")
else:
    #Without any bet there is no investment to divide by
    print("No bets were placed, so there is no rate of return to report")


Total earnings: 1779.9777894589508
That is 2.652723978329286% of the total


From these results we see that betting according to our model earns a profit of about 2.65% of the total amount staked.

This is not a very high profit, but compared to most other forms of gambling it is at least a mathematically viable form of investment.

Let's compare these results with always betting on the favorite (the fighter with the higher odds of winning) instead:

In [44]:
#Baseline: simulate a 100 unit bet on the favorite (the Higher_odds column) in every test fight.
#Every bet costs 100 units up front, so the investment is 100*len(clean_high_odds).

#Only the bets where the favorite actually won pay anything back
correct = clean_high_odds[clean_high_odds['Higher_odds'] == clean_high_odds['Winner']]

#R_ev / B_ev hold the profit of a winning 100 unit bet on Red / Blue. For every winning bet pick
#the column of the corner that actually won (Winner == 1 is Red, otherwise Blue).
winning_profit = correct['R_ev'].where(correct['Winner'] == 1, correct['B_ev'])

#A winning bet returns the 100 unit stake plus its profit. skipna=False keeps a missing payout
#visible as NaN instead of silently counting it as 0.
total = float(-100*len(clean_high_odds) + 100*len(correct) + winning_profit.sum(skipna=False))

print("Total earnings: "+str(total))
if len(clean_high_odds) > 0:
    print("That is "+str((total/(100*len(clean_high_odds)))*100)+"% of the total")
else:
    #Without any bet there is no investment to divide by
    print("No bets were placed, so there is no rate of return to report")

Total earnings: -3647.363631301074
That is -5.4357133104337905% of the total


As we saw in the odds analysis, the favorite (the fighter with the higher odds of winning) wins 63.9% of the time, which is higher than our model's accuracy.
However, even when the favorite does win, the bet usually pays back less than double the stake, so the wins do not make up for the losses;
therefore we end up losing 5.43% of our money when betting with this method.


How about when we only bet when the model predicts an upset?

In [45]:
#Only bet 100 units when the model disagrees with the favorite, i.e. when it predicts an upset.
#NOTE: this narrows clean_high_odds itself down to the upset bets, as the original cell did.
clean_high_odds = clean_high_odds[clean_high_odds['prediction'] != clean_high_odds['Higher_odds']]

#Only the bets where the predicted upset actually happened pay anything back
correct = clean_high_odds[clean_high_odds['prediction'] == clean_high_odds['Winner']]

#R_ev / B_ev hold the profit of a winning 100 unit bet on Red / Blue. For every winning bet pick
#the column of the corner that actually won (Winner == 1 is Red, otherwise Blue).
winning_profit = correct['R_ev'].where(correct['Winner'] == 1, correct['B_ev'])

#Every bet costs 100 units up front; a winning bet returns the stake plus its profit.
#skipna=False keeps a missing payout visible as NaN instead of silently counting it as 0.
total = float(-100*len(clean_high_odds) + 100*len(correct) + winning_profit.sum(skipna=False))

if len(clean_high_odds) == 0:
    #The model never disagreed with the favorite: there is no bet count to divide by
    print("The model predicted no upsets, so no bets were placed")
else:
    print(str(len(clean_high_odds)/len(clean)*100) + "% of all bets")
    print(str(len(correct)/len(clean_high_odds)*100) + "% accuracy")
    print("Total earnings: "+str(total))
    print("That is "+str((total/(100*len(clean_high_odds)))*100)+"% of the total")

37.257824143070046% of all bets
42.4% accuracy
Total earnings: 2712.434074420004
That is 10.849736297680016% of the total


With this method we only end up betting about 37% of the time and only achieve 42.4% accuracy, but because betting
on an underdog has a much higher payout we end up earning significantly more money than if we bet on all model predictions.

This method's rate of return is 10.8%, which is quite significant. Keep in mind that it is measured on a single test split of 250 bets, so it should be read as an indication rather than a guaranteed edge.