In [191]:
import pandas as pd



Our dataset also comes with odds from betting sites. These odds are in the American (moneyline) format.

For example, odds of +200 (a plus sign marks the underdog) mean that if you bet 100 dollars and the fighter wins, you earn 200 dollars in profit.

However, odds of -200 (a minus sign marks the favourite) mean that in order to make 100 dollars in profit you need to bet 200 dollars.




We are going to check whether the betting sites' predictions align with those of our prediction model.

In [192]:
# Columns kept for this analysis: the outcome, both moneyline odds, fighter names
# (needed for the manual data fixes; dropped before modelling) and the model features.
selected_columns = [
    'Winner', 'R_odds', 'B_odds', 'B_fighter', 'R_fighter', 'title_bout',
    'B_current_win_streak', 'B_current_lose_streak',
    'R_current_win_streak', 'R_current_lose_streak',
    'B_Stance', 'R_Stance',
    'B_avg_TD_landed', 'R_avg_TD_landed',
    'B_wins', 'B_losses', 'R_wins', 'R_losses',
    'B_age', 'R_age',
    'height_dif', 'reach_dif', 'better_rank', 'gender',
]

# Read the full CSV and immediately narrow it to the selected columns, in the order listed.
data = pd.read_csv('../../data/ufc-master.csv')[selected_columns]



In [193]:
# The blue fighter stance column has one data point where 'Switch' is written as 'Switch '
# (with a trailing space); normalise it so it does not become its own category later.
data['B_Stance'] = data['B_Stance'].replace({'Switch ': 'Switch'})

# height_dif has one outlier where the difference between the two fighters is 187.96 cm, which is
# obviously a mistake. Instead of excluding this data point it is fixed manually using the height
# data available for both fighters on the UFC website (the fighters were Parker Porter and Kyle Daukaus).
data['height_dif'] = data['height_dif'].replace({-187.96: -7.62})

# reach_dif contains two erroneous values, -187.96 and -160.02, which are fixed manually as well.
#   * -187.96 appears in two bouts: Parker Porter vs Kyle Daukaus and Irwin Rivera vs Giga Chikadze
#   * -160.02 appears in one bout (Jinh Yu Frey vs Kay Hansen, per the original note)
# All three masks are built before any value is overwritten, then only the reach_dif cell of the
# matching rows is assigned. Targeted .loc assignment avoids rewriting whole rows and cannot
# accidentally touch another column that happens to hold the same number.
filter1 = (data['reach_dif'] == -187.96) & (data['B_fighter'] == 'Parker Porter')
filter2 = (data['reach_dif'] == -187.96) & (data['B_fighter'] == 'Irwin Rivera')
filter3 = data['reach_dif'] == -160.02
data.loc[filter1, 'reach_dif'] = -2.54
data.loc[filter2, 'reach_dif'] = -17.78
data.loc[filter3, 'reach_dif'] = 5.08

In [194]:
# Missing average-takedown values are treated as 0 for both corners.
# Assign the filled columns back explicitly: calling fillna(..., inplace=True) on a column
# selected from the frame is chained assignment, which warns in recent pandas and does not
# update `data` at all once Copy-on-Write is enabled.
takedown_columns = ['B_avg_TD_landed', 'R_avg_TD_landed']
data[takedown_columns] = data[takedown_columns].fillna(0)

In [195]:
# Win rate assigned to fighters with no recorded wins or losses. In task 1 we found that fighters
# making their debut win about 43% of the time, so 0.43 represents them better than 0 would.
DEBUT_WIN_RATE = 0.43

# Use the wins and losses columns of both fighters to build a win ratio (wins / (wins + losses)).
# A fighter with 0 wins and 0 losses yields 0/0 = NaN here.
B_ratio = data['B_wins'] / (data['B_wins'] + data['B_losses'])
R_ratio = data['R_wins'] / (data['R_wins'] + data['R_losses'])

# Fill the NaN ratios while assigning, instead of calling fillna(..., inplace=True) on a selected
# column afterwards (chained inplace assignment is unreliable in recent pandas versions).
data['B_wr'] = B_ratio.fillna(DEBUT_WIN_RATE)
data['R_wr'] = R_ratio.fillna(DEBUT_WIN_RATE)

# The raw win and loss counts are no longer needed now that the win-rate columns exist.
data = data.drop(columns=['B_wins', 'B_losses', 'R_wins', 'R_losses'])

# Encode categorical variables as 1s and 0s where necessary:
#   title_bout -> integer, Winner -> Red=1 / Blue=0, gender -> MALE=1 / FEMALE=0,
#   stances and better_rank -> one-hot dummy columns.
data['title_bout'] = (data['title_bout']).astype(int)
data['Winner'] = data['Winner'].map(dict(Blue=0, Red=1))
data['gender'] = data['gender'].map(dict(MALE=1, FEMALE=0))
data = pd.get_dummies(data, columns=['B_Stance', 'R_Stance', 'better_rank'])

In [196]:
# The prediction is based on the stats only, so the fighter name columns are removed as well.
name_columns = ['B_fighter', 'R_fighter']
data = data.drop(name_columns, axis=1)

In [197]:
# Create the training and validation sets (15% held out, fixed seed for reproducibility).
from sklearn.model_selection import train_test_split

# The full frame (still containing Winner and the odds) is split so that the held-out rows
# keep their odds for the later comparison with the betting sites.
train_rows, test_rows, y_train, y_test = train_test_split(
    data, data['Winner'], test_size=0.15, random_state=0
)

# Columns the model must not see: the two betting odds and the target itself.
non_feature_columns = ['R_odds', 'B_odds', 'Winner']

# Keep an untouched copy of the held-out rows (odds and winner included) for easy comparison.
X_with_odds = test_rows.copy()
X_train = train_rows.drop(columns=non_feature_columns)
X_test = test_rows.drop(columns=non_feature_columns)

In [198]:
# Start with the KNN configuration found earlier (109 neighbours, Euclidean distance) to see
# whether it does better with more features or whether a different algorithm is needed.
# NOTE(review): the features are passed in unscaled, so wide-range columns weigh more in the
# distance than the 0/1 dummies; consider standardising them before fitting.
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

knn = KNeighborsClassifier(n_neighbors=109, p=2).fit(X_train, y_train)
predictions = knn.predict(X_test)

# Share of held-out fights whose winner the model predicted correctly.
acc = accuracy_score(y_test, predictions)
print(acc)

0.5827123695976155


In [199]:
# Attach the model predictions to the held-out rows so that the prediction is easily
# comparable to the actual winner and the odds. Both frames share the test-set index.
y_pred_df = pd.DataFrame({'prediction': predictions}, index=X_test.index.copy())
df_out = y_pred_df.merge(X_with_odds, how='left', left_index=True, right_index=True)

In [200]:
# Sanity check of the merge: the share of rows where prediction equals Winner
# should be exactly the same as the accuracy_score computed earlier.
prediction_is_right = df_out['prediction'] == df_out['Winner']
correct = df_out.loc[prediction_is_right]
len(correct) / len(df_out)


0.5827123695976155

In [201]:
# Work on an explicit copy so that adding a column does not raise SettingWithCopyWarning
# (the frame would otherwise be treated as a slice of df_out).
clean = df_out[['prediction', 'Winner', 'R_odds', 'B_odds']].copy()

# Higher_odds encodes the betting sites' favourite using the same coding as Winner
# (1 = Red, 0 = Blue): in moneyline format the numerically lower odds belong to the
# favourite, so R_odds < B_odds means Red is favoured. Equal odds fall to 0 (Blue).
clean.insert(2, "Higher_odds", (clean['R_odds'] < clean['B_odds']).astype(int))


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clean["Higher_odds"] = clean["Higher_odds"].astype(int)


Now we check how often the actual winner was the betting sites' favourite (the fighter flagged in `Higher_odds`). This is done on the test dataset only,
but because the test set was chosen randomly that is not an issue.

In [202]:
# Share of held-out fights won by the betting sites' favourite.
favourite_won = clean['Winner'] == clean['Higher_odds']
len(clean[favourite_won]) / len(clean)

0.639344262295082

Surprisingly, the betting sites' favourite won in only 63.9% of cases. This is still more accurate than our KNN model,
but if our model consistently predicts underdogs to win and they actually do win, our model could still be better for earning money.


Now we check whether our model predicted differently from the betting houses.

In [203]:
# Share of held-out fights where the model picked the same fighter as the betting sites.
same_result = len(clean[clean['prediction'] == clean['Higher_odds']]) / len(clean)
print(same_result)

# Keep only the fights where the model disagreed with the betting sites
# (i.e. the model predicted the underdog to win).
diff_result = clean[clean['prediction'] != clean['Higher_odds']]
# Of those, keep only the fights where the model's prediction was correct. The mask is built
# from diff_result itself so its index matches the frame being filtered; a mask built from the
# larger `clean` frame would have to be reindexed implicitly and makes pandas emit a warning.
diff_result = diff_result[diff_result['prediction'] == diff_result['Winner']]

# Share of ALL held-out fights where the model correctly backed the underdog.
underdog = len(diff_result) / len(clean)
print(underdog)



0.6274217585692996
0.15797317436661698


  diff_result = diff_result[clean['prediction'] == clean['Winner']]


We find that in 62.7% of cases our model predicted the same winner as the betting sites.

We also found that in 15.8% of all cases our model predicted differently from the betting sites and was correct in doing so.

This result might have great implications for betting payouts, since you earn significantly more money when you bet on the underdog and they win.

