In [11]:
import pandas as pd

#drop rows with n/a values
horses = pd.read_csv('Downloads/race-result-horse.csv')
horses=horses.dropna()

#Drop horses that did not finish or were disqualified, or for some reason did not have a legit finishing position
list1=['4 DH','3 DH','5 DH','2 DH','6 DH','1 DH','8 DH','PU','7 DH','10 DH','9 DH','UR','FE','11 DH','DNF','12 DH','DISQ']
horses = horses[~horses.finishing_position.isin(list1)]

#Drop running position from data, presumably you will be betting money before this information is known
horses = horses.drop(['running_position_1','running_position_2','running_position_3'], axis=1)

#Change Data types
horses = horses.astype({'finishing_position':'int32','actual_weight':'int32','declared_horse_weight':'int32','draw':'int32','win_odds':'float'})

In [12]:
#Treat categorical variables with mean encoding. There are way too many labels for dummy columns. 
horse_name_encode = horses.groupby('horse_name')['finishing_position'].mean()
horses.loc[:, 'horse_name_replacement'] = horses['horse_name'].map(horse_name_encode)

horse_name_encode = horses.groupby('jockey')['finishing_position'].mean()
horses.loc[:, 'jockey_replacement'] = horses['jockey'].map(horse_name_encode)

horse_name_encode = horses.groupby('trainer')['finishing_position'].mean()
horses.loc[:, 'trainer_replacement'] = horses['trainer'].map(horse_name_encode)

#Drop original columns
horses=horses.drop(['horse_name','jockey','trainer'], axis=1)

#Establish a benchmark for performance. The goal of this algorithm is to make profitable bets. 
horses.groupby('finishing_position')['win_odds'].mean()
#The above code shows the average betting odds of a winning horse is 9.108. This mean the model should be correctly predicting the result 1/10.108 times to 
#break even. If it can correctly predict better than that, the algorithm will make money. This translates to a percentage of 9.9% accuracy. 

finishing_position
1      9.108556
2     12.219456
3     15.503064
4     18.380556
5     21.955318
6     24.826172
7     28.454382
8     33.123529
9     36.784295
10    43.161327
11    46.303522
12    53.546634
13    58.836583
14    66.307279
Name: win_odds, dtype: float64

In [6]:
#Get train and test sets. 
from sklearn.model_selection import train_test_split

horses=horses.values
X = horses[:,1:6]
y = horses[:,0]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2)

#Scale the data
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler(feature_range=(0,1))
scale_X_train = scaler.fit_transform(X_train)
scale_X_test = scaler.fit_transform(X_test)

In [5]:
#First attempt at prediction, simple logistic regression with no frills
from sklearn.linear_model import LogisticRegression

lr= LogisticRegression(max_iter=1000)
lr.fit(scale_X_train, y_train)

lr.score(scale_X_test, y_test)

#This gives an accuracy score of ~14%. This is accurate enough to be profitable. 

0.14166524774391281

In [8]:
#Second attempt at prediction, with simple K Nearest neighbors, again no frills
from sklearn.neighbors import KNeighborsClassifier

knn= KNeighborsClassifier()
knn.fit(scale_X_train, y_train)

knn.score(scale_X_test, y_test)

#This gives an accuracy score of ~11%. This is accurate enough to be profitable. 

0.11169759918270049

In [12]:
#logisticregression with a grid search. 
from sklearn.model_selection import GridSearchCV

tol = [.005, .01, .015]
max_iter = [250,500,750]

param_grid= dict(tol=tol, max_iter=max_iter)

grid = GridSearchCV(estimator=lr, param_grid=param_grid, cv=5)

scaledX=scaler.fit_transform(X)

result=grid.fit(scaledX, y)

print(result.best_score_)
print(result.best_params_)

#This gave an accuracy score of 14.5%

0.14500747196216374
{'max_iter': 250, 'tol': 0.005}


In [9]:
#Grid Search with KNN
from sklearn.model_selection import GridSearchCV

n_neighbors = [15,20,25]

param_grid= dict(n_neighbors=n_neighbors)

grid = GridSearchCV(estimator=knn, param_grid=param_grid, cv=5)

scaledX=scaler.fit_transform(X)

result=grid.fit(scaledX, y)

print(result.best_score_)
print(result.best_params_)
#This offered a modest improvement, bumping it from 11% to about 12% accuracy

0.12293968356553404
{'n_neighbors': 20}


In [13]:
#transforming the 'finishing position' column into a binary variable. The two options will be winner and non-winner. 
horses['winner'] = horses['finishing_position'] == 1
horses=horses.drop(['finishing_position'], axis=1)
horses['winner'] = horses['winner'].astype('int32')

#Get train and test sets. 
from sklearn.model_selection import train_test_split

horses=horses.values
X = horses[:,0:7]
y = horses[:,7]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2)

#Scale the data
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler(feature_range=(0,1))
scale_X_train = scaler.fit_transform(X_train)
scale_X_test = scaler.fit_transform(X_test)

In [1]:
#Try logisticRegression again, this time with the new binary variables
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import confusion_matrix

lr= LogisticRegressionCV(cv=5,max_iter=1000)
lr.fit(scale_X_train, y_train)

print(lr.score(scale_X_test, y_test))
y_pred=lr.predict(scale_X_test)
print(confusion_matrix(y_test,y_pred))

#Accuracy jumps to 92%, however, on closer inspection, the model is simply predicting 'non-winner' every time. I need to change
#class_weight

NameError: name 'scale_X_train' is not defined

In [18]:
#Try a bunch of class weights
for a in [1.5,3.5,4.5]:
    lr= LogisticRegressionCV(cv=5,max_iter=1000, class_weight={1:a})
    lr.fit(scale_X_train, y_train)

    print(lr.score(scale_X_test, y_test))
    y_pred=lr.predict(scale_X_test)
    print(confusion_matrix(y_test,y_pred))
    
#As the class_weight increases, accuracy decreases. However, this is an acceptable trade-off. The model is finally starting
#to predict some winners as well. 

#At a class weight of 1.5, the model predicts 13 winners, 10 are actually correct. 
#This is an accuracy rate of 77% when guessing winner. At a class weight of 4.5,
#the model predicts 40 winners, of which 18 are correct. This is an accuracy rate
#of 45% when guessing winner. Going back to the original odds calculation, both these
#accuracy rates are high enough to be profitable. 

0.920313298144049
[[5395    3]
 [ 465   10]]
0.919121403030819
[[5398    0]
 [ 475    0]]
0.9184403201089733
[[5376   22]
 [ 457   18]]
