In [94]:
import pandas as pd



Testing whether our highest-accuracy model (which also uses the betting odds as a feature) performs better.

In [95]:
# Columns kept for this experiment, in the order the rest of the notebook expects.
selected_columns = [
    'R_ev', 'B_ev', 'B_fighter', 'R_fighter', 'title_bout', 'reach_dif',
    'age_dif', 'avg_td_dif', 'Winner', 'B_odds', 'R_odds', 'gender',
    'B_wins', 'B_losses', 'R_wins', 'R_losses', 'better_rank',
]

# Read the master fight table and keep only the selected columns.
data = pd.read_csv('../../data/ufc-master.csv')[selected_columns]


In [96]:
#Fixing the wrong reach difference values
# All three masks are built before any value is changed, so they are evaluated
# against the original, uncorrected column.
filter1 = (data['reach_dif'] == -187.96) & (data['B_fighter'] == 'Parker Porter')
filter2 = (data['reach_dif'] == -187.96) & (data['B_fighter'] == 'Irwin Rivera')
filter3 = data['reach_dif'] == -160.02

# Write each correction straight into 'reach_dif' with .loc. The previous
# `data[mask] = data[mask].replace({...})` form rewrote whole rows and would have
# substituted the value in *any* column of the matching rows that happened to
# hold it; targeting the single column leaves every other column untouched.
data.loc[filter1, 'reach_dif'] = -2.54
data.loc[filter2, 'reach_dif'] = -17.78
data.loc[filter3, 'reach_dif'] = 5.08

In [97]:
# The fighter names were only needed to locate the rows corrected above.
fighter_name_columns = ['B_fighter', 'R_fighter']
data = data.drop(columns=fighter_name_columns)

In [98]:
# Turn the text / boolean columns into integer codes.
# Series.map leaves NaN for any value that is not a key of the mapping.
winner_codes = {'Blue': 0, 'Red': 1}
gender_codes = {'MALE': 1, 'FEMALE': 0}

data['Winner'] = data['Winner'].map(winner_codes)
data['gender'] = data['gender'].map(gender_codes)
data['title_bout'] = data['title_bout'].astype(int)

In [99]:
#Now we will use columns wins and losses for both fighters to create a column that has a win ratio out of all wins and losses
B_ratio = data['B_wins'] / (data['B_wins'] + data['B_losses'])
R_ratio = data['R_wins'] / (data['R_wins'] + data['R_losses'])

#It is possible that in some of the rows that value is now NaN as the fighter has never fought before. In task 1 we found out
#that the fighters making debut usually win 43% of the time so we will replace NaN with 0.43 as giving them 0 would not
#represent reality very well
DEBUT_WIN_RATE = 0.43

# Fill the ratio *before* storing it in the frame. The previous
# `data['B_wr'].fillna(0.43, inplace=True)` called an in-place method on a column
# selection (chained assignment): pandas 2.x warns about that pattern and, with
# Copy-on-Write enabled, it no longer updates `data` at all.
data['B_wr'] = B_ratio.fillna(DEBUT_WIN_RATE)
data['R_wr'] = R_ratio.fillna(DEBUT_WIN_RATE)

#Now we will drop win and loss columns for both fighters because we have added the winrate column
data = data.drop(columns=['B_wins', 'B_losses', 'R_wins', 'R_losses'])

In [100]:
# Replace the two odds columns with a single signed feature: blue odds minus red odds.
odds_dif = data['B_odds'].sub(data['R_odds'])
data = data.assign(odds_dif=odds_dif).drop(columns=['B_odds', 'R_odds'])

In [101]:
# One-hot encode the categorical 'better_rank' column.
rank_columns = ['better_rank']
data = pd.get_dummies(data, columns=rank_columns)

In [102]:
#Creating training and validation sets
from sklearn.model_selection import train_test_split

# The full frame (target included) is split so the held-out rows keep their
# 'Winner', 'R_ev' and 'B_ev' values for the betting comparison further down.
X_train, X_test, y_train, y_test = train_test_split(
    data, data['Winner'], test_size=0.15, random_state=0
)

# Snapshot of the held-out rows taken *before* the non-feature columns are removed.
X_with_odds = X_test.copy()

# The model must not see the target or the R_ev / B_ev columns (later cells use
# those as the profit of a winning bet). Note that 'odds_dif' stays in the features.
non_feature_columns = ['R_ev', 'B_ev', 'Winner']
X_train = X_train.drop(columns=non_feature_columns)
X_test = X_test.drop(columns=non_feature_columns)

In [103]:
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier

# Fit a fixed-seed random forest on the training rows and score it on the held-out rows.
forest = RandomForestClassifier(
    n_estimators=400,
    max_depth=7,
    random_state=0,
).fit(X_train, y_train)

predictions = forest.predict(X_test)
accuracy = accuracy_score(y_test, predictions)
print(accuracy)

0.6453055141579732


In [104]:
#Merging the results so the model prediction is easily comparable to the actual winner and odds.
# The predictions are labelled with the test-set index so they line up row-for-row
# with the snapshot of held-out rows when the two frames are joined on index.
y_pred_df = pd.DataFrame({'prediction': predictions}, index=X_test.index.copy())
df_out = y_pred_df.merge(
    X_with_odds,
    how='left',
    left_index=True,
    right_index=True,
)


In [105]:
# Sanity check: the share of rows where prediction and winner agree
# should reproduce the accuracy printed above.
is_hit = df_out['Winner'] == df_out['prediction']
correct = df_out[is_hit]
len(correct) / len(df_out)

0.6453055141579732

In [106]:
# Keep only what the betting simulation needs: the pick, the outcome and both profit columns.
betting_columns = ['prediction', 'Winner', 'R_ev', 'B_ev']
clean = df_out[betting_columns]

Now that we have a dataset with predictions, winners and profit, we can use it to simulate betting
100 units of money on each of the 671 matches (the test set, for convenience) based on our model's predictions.

In [107]:
#Simulating a flat bet of 100 units on the model's pick for every match in `clean`.
#Every stake is paid up front, so the losing bets are already accounted for in the
#starting balance and only the winning bets have to be visited afterwards.

invested = 100*len(clean)
total = -invested

correct = clean[clean['prediction'] == clean['Winner']]

for _match, bet in correct.iterrows():
    #A winning bet first returns its own stake...
    total += 100
    #...and then pays the profit listed for the corner that won.
    profit_column = 'R_ev' if bet['Winner'] == 1.0 else 'B_ev'
    total += bet[profit_column]

roi_percent = (total/invested)*100

print("Total earnings: "+str(total))
print("That is "+str(roi_percent)+"% of the total")

Total earnings: -1541.918191191086
That is -2.2979406724159253% of the total


We see that using a model that also predicts from the betting odds, while giving a higher accuracy score,
does not yield a better return on investment when it comes to betting.

The loss is about 2.3%, which is lower than the loss from just betting on the favorite, but it is still a loss.

