## Preliminary Logistic Regression Model

We're going to merge the features we've created in feature-engineering/ onto our greyhound data, and then restrict our sample to races with a full field of 8 in which we have feature information on every greyhound in the race (i.e. every greyhound must already have had at least one race).

----

Import libraries, packages, and greyhound data

In [73]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats as stats
import os
import decouple
import sys
# Build a python-decouple config reader used to look up project settings.
# NOTE(review): the search path passed to AutoConfig is a single space (' ') -
# confirm this is intentional and not a placeholder.
config = decouple.AutoConfig(' ')
# Work from the configured project root so the relative './data/...' paths
# used when reading CSVs resolve correctly.
os.chdir(config('ROOT_DIRECTORY'))
# '' on sys.path means "the current working directory"; presumably this lets
# project-local modules be imported after the chdir above - verify.
sys.path.insert(0, '')

from scipy.stats import zscore
from multielo import MultiElo, Player, Tracker
from multielo.multielo import defaults
from sklearn.linear_model import LogisticRegression

# Read in data
# Load the cleaned race results and keep the frame untouched as `df_raw`
# so later cells can always start again from the original data.
results_csv_path = './data/clean/dog_results.csv'
df_raw = pd.read_csv(results_csv_path)

# Show the raw frame for a quick visual sanity check
display(df_raw)

Unnamed: 0,FasttrackDogId,Place,DogName,Box,Rug,Weight,StartPrice,Margin1,Margin2,PIR,...,FasttrackRaceId,TrainerId,TrainerName,Distance,RaceGrade,Track,RaceNum,TrackDist,RaceDate,FieldSize
0,157500927,1,RAINE ALLEN,1,1,27.4,2.4,2.30,,Q/111,...,335811282,7683,C GRENFELL,500.0,Restricted Win,Bendigo,1.0,Bendigo500,2018-07-01,6
1,1820620018,2,SURF A LOT,2,2,32.8,6.3,2.30,2.30,M/332,...,335811282,137227,C TYLEY,500.0,Restricted Win,Bendigo,1.0,Bendigo500,2018-07-01,6
2,1950680026,3,PINGIN' BEE,6,6,25.5,9.3,3.84,1.54,S/443,...,335811282,132763,P DAPIRAN,500.0,Restricted Win,Bendigo,1.0,Bendigo500,2018-07-01,6
3,1524380048,4,LUCAS THE GREAT,7,7,32.2,9.1,5.27,1.43,M/655,...,335811282,116605,E HAMILTON,500.0,Restricted Win,Bendigo,1.0,Bendigo500,2018-07-01,6
4,124225458,5,QUAVO,4,4,28.9,3.4,5.56,0.29,M/766,...,335811282,132763,P DAPIRAN,500.0,Restricted Win,Bendigo,1.0,Bendigo500,2018-07-01,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
782997,491585906,3,GLORIOUS GUNN,8,8,27.1,3.8,3.75,2.43,6644,...,745616339,87891,G HORNE,520.0,Grade 5,Cannington,12.0,Cannington520,2021-12-31,7
782998,485659451,4,WOOD FIRE,3,3,32.1,4.1,3.75,0.14,3233,...,745616339,68549,C HALSE,520.0,Grade 5,Cannington,12.0,Cannington520,2021-12-31,7
782999,528381655,5,TRENDING QUARTER,6,6,31.8,16.2,5.25,1.43,4566,...,745616339,83581,J DAILLY,520.0,Grade 5,Cannington,12.0,Cannington520,2021-12-31,7
783000,537992387,6,ELITE WEAPON,1,1,26.7,2.9,5.25,0.00,1455,...,745616339,293372,S WILLIAMS,520.0,Grade 5,Cannington,12.0,Cannington520,2021-12-31,7


Take only columns of interest, and merge our features to the dataframe

In [74]:
# Work on an independent copy of the raw results, restricted to the columns
# that are needed for modelling.
col_list = ['FasttrackDogId', 'StartPrice', 'Place', 'Box', 'Weight', 'FasttrackRaceId',
            'TrainerId', 'TrackDist', 'RaceDate', 'FieldSize']
df = df_raw[col_list].copy()


def _merge_feature(frame, csv_path, keys, drop=None):
    """Left-merge the feature table stored at `csv_path` onto `frame`.

    `keys` is passed straight to `DataFrame.merge(on=...)`; `drop` optionally
    names a column of the feature table to discard before merging.
    """
    features = pd.read_csv(csv_path)
    if drop is not None:
        features = features.drop(columns=drop)
    return frame.merge(features, on=keys, how='left')


# --- Read in features and merge to main dataframe ---

# Trainer ELO Rating
df = _merge_feature(df, './data/features/trainer-elo.csv',
                    ['TrainerId', 'FasttrackRaceId', 'RaceDate'])

# Mean SpeedNorm by Weight: bucket weights into deciles numbered 1..10, which
# is the key the feature table is joined on.
df['WeightQuantile'] = pd.qcut(df['Weight'], 10, labels=False) + 1
df = _merge_feature(df, './data/features/mean-speednorm-by-weight.csv',
                    ['WeightQuantile'], drop='WeightQuantile_std')

# Mean SpeedNorm by Track Distance Box
df = _merge_feature(df, './data/features/mean-speednorm-by-trackdistbox.csv',
                    ['TrackDist', 'Box'], drop='TrackDistBox_std')

# EMA SpeedNorm by Greyhound
df = _merge_feature(df, './data/features/ema-speednorm-by-greyhound.csv',
                    ['FasttrackDogId', 'FasttrackRaceId', 'RaceDate'])

# EMA SplitMargin by Greyhound
df = _merge_feature(df, './data/features/ema-split-margin-by-greyhound.csv',
                    ['FasttrackDogId', 'FasttrackRaceId', 'TrackDist'])

# SplitMargin importance by (track, distance)
df = _merge_feature(df, './data/features/split-margin-importance-by-trackdist.csv',
                    'TrackDist')

# Interaction term: the greyhound's split-margin EMA multiplied by the
# split-margin importance of the (track, distance) being raced.
df['SplitMarginMultiply'] = df['SplitMarginQuantileEMA'] * df['TrackSplitMarginQuantile']

display(df)

Unnamed: 0,FasttrackDogId,StartPrice,Place,Box,Weight,FasttrackRaceId,TrainerId,TrackDist,RaceDate,FieldSize,TrainerRating,WeightQuantile,WeightQuantile_mean,TrackDistBox_mean,SpeedNorm_EMA,SpeedNorm_MSTD,SplitMarginQuantileEMA,TrackSplitMarginQuantile,SplitMarginMultiply
0,157500927,2.4,1,1,27.4,335811282,7683,Bendigo500,2018-07-01,6,1000.000000,3,-0.080413,,,,,2.0,
1,1820620018,6.3,2,2,32.8,335811282,137227,Bendigo500,2018-07-01,6,1000.000000,8,0.113288,,,,,2.0,
2,1950680026,9.3,3,6,25.5,335811282,132763,Bendigo500,2018-07-01,6,1000.000000,1,-0.163575,,,,,2.0,
3,1524380048,9.1,4,7,32.2,335811282,116605,Bendigo500,2018-07-01,6,1000.000000,7,0.066307,,,,,2.0,
4,124225458,3.4,5,4,28.9,335811282,132763,Bendigo500,2018-07-01,6,1000.000000,4,-0.057036,,,,,2.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
782997,491585906,3.8,3,8,27.1,745616339,87891,Cannington520,2021-12-31,7,-385.966583,3,-0.080413,-0.043643,0.712705,0.573589,5.165463,2.0,10.330926
782998,485659451,4.1,4,3,32.1,745616339,68549,Cannington520,2021-12-31,7,-337.539747,7,0.066307,-0.004835,0.605300,0.338167,7.604727,2.0,15.209454
782999,528381655,16.2,5,6,31.8,745616339,83581,Cannington520,2021-12-31,7,-536.354620,7,0.066307,-0.021015,0.167931,0.461219,4.866273,2.0,9.732546
783000,537992387,2.9,6,1,26.7,745616339,293372,Cannington520,2021-12-31,7,-300.109813,2,-0.104427,0.148961,0.497658,0.523239,8.269239,2.0,16.538479


Take only races with a full field (8 greyhounds), and take only races where we have information on every greyhound. We notice that this significantly reduces our sample size (both operations cause about 2/3 of the data to be removed).

In [75]:
# Take only races with a full field
df = df[df['FieldSize'] == 8]

# Train/test only races where we have feature information on every greyhound:
# find every row with a missing value in any of these columns, collect the
# races those rows belong to, and exclude those races entirely.
required_cols = ['FasttrackRaceId', 'TrainerRating', 'WeightQuantile_mean',
                 'TrackDistBox_mean', 'SpeedNorm_EMA', 'SplitMarginMultiply', 'SpeedNorm_MSTD']
has_missing = df[required_cols].isnull().any(axis=1)
list_of_races = df.loc[has_missing, 'FasttrackRaceId'].unique()

# .copy() makes `df` an independent frame rather than a slice of the merged
# data, so adding columns to it afterwards does not raise pandas'
# SettingWithCopyWarning.
df = df[~df['FasttrackRaceId'].isin(list_of_races)].copy()

display(df)

Unnamed: 0,FasttrackDogId,StartPrice,Place,Box,Weight,FasttrackRaceId,TrainerId,TrackDist,RaceDate,FieldSize,TrainerRating,WeightQuantile,WeightQuantile_mean,TrackDistBox_mean,SpeedNorm_EMA,SpeedNorm_MSTD,SplitMarginQuantileEMA,TrackSplitMarginQuantile,SplitMarginMultiply
15681,141317072,2.7,1,2,32.2,344185004,112239,Sandown Park515,2018-07-26,8,1119.016107,7,0.066307,0.174559,0.566093,0.014258,10.000000,2.0,20.000000
15682,2056920026,24.4,2,6,32.8,344185004,66928,Sandown Park515,2018-07-26,8,1060.269042,8,0.113288,-0.072968,0.544259,0.449794,5.165843,2.0,10.331685
15683,148732873,27.3,3,8,25.1,344185004,127874,Sandown Park515,2018-07-26,8,926.390496,1,-0.163575,-0.099260,0.246939,0.912738,8.143741,2.0,16.287482
15684,124886334,4.3,4,3,31.0,344185004,66993,Sandown Park515,2018-07-26,8,1003.422993,6,0.023263,0.050035,0.691340,0.954432,3.819672,2.0,7.639344
15685,2034470006,18.2,5,7,28.2,344185004,130058,Sandown Park515,2018-07-26,8,925.882385,4,-0.057036,-0.129545,0.767446,0.746452,8.281843,2.0,16.563686
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
782983,514248948,10.1,4,4,28.5,745616337,83581,Cannington520,2021-12-31,8,-536.354620,4,-0.057036,-0.025759,0.018879,0.691687,6.222584,2.0,12.445168
782984,491176598,5.9,5,6,26.6,745616337,87891,Cannington520,2021-12-31,8,-385.966583,2,-0.104427,-0.021015,0.034358,0.964976,4.671660,2.0,9.343319
782985,491176074,10.9,6,1,33.4,745616337,83951,Cannington520,2021-12-31,8,-499.940634,9,0.115418,0.148961,-0.768727,2.036605,4.669291,2.0,9.338583
782986,469847830,19.6,7,5,33.5,745616337,83516,Cannington520,2021-12-31,8,-465.977139,9,0.115418,-0.045962,-0.329491,0.774007,2.233334,2.0,4.466668


Normalise particular columns by race

In [76]:
# Standardise selected features within each race, so every runner is measured
# relative to the other runners in the same race. Results go to new '<col>_Z'
# columns; the raw columns are left as they are.
# scipy's zscore divides by the group's standard deviation, so a race in which
# a feature is identical for all runners produces NaN for that feature.
norm_list = ['TrainerRating', 'SpeedNorm_EMA', 'SpeedNorm_MSTD']
for col in norm_list:
    df[f'{col}_Z'] = df.groupby('FasttrackRaceId')[col].transform(zscore)

# SplitMarginMultiply is only centred on the race mean and divided by 100.
# NOTE: this overwrites the column in place, so executing this cell a second
# time would divide the values by 100 again.
split_margin_by_race = df.groupby('FasttrackRaceId')['SplitMarginMultiply']
df['SplitMarginMultiply'] = split_margin_by_race.transform(lambda x: (x - x.mean()) / 100.)
display(df)

Unnamed: 0,FasttrackDogId,StartPrice,Place,Box,Weight,FasttrackRaceId,TrainerId,TrackDist,RaceDate,FieldSize,...,WeightQuantile_mean,TrackDistBox_mean,SpeedNorm_EMA,SpeedNorm_MSTD,SplitMarginQuantileEMA,TrackSplitMarginQuantile,SplitMarginMultiply,TrainerRating_Z,SpeedNorm_EMA_Z,SpeedNorm_MSTD_Z
15681,141317072,2.7,1,2,32.2,344185004,112239,Sandown Park515,2018-07-26,8,...,0.066307,0.174559,0.566093,0.014258,10.000000,2.0,0.067538,0.907386,0.048760,-1.882528
15682,2056920026,24.4,2,6,32.8,344185004,66928,Sandown Park515,2018-07-26,8,...,0.113288,-0.072968,0.544259,0.449794,5.165843,2.0,-0.029145,0.172572,-0.002451,-0.432572
15683,148732873,27.3,3,8,25.1,344185004,127874,Sandown Park515,2018-07-26,8,...,-0.163575,-0.099260,0.246939,0.912738,8.143741,2.0,0.030413,-1.501994,-0.699774,1.108630
15684,124886334,4.3,4,3,31.0,344185004,66993,Sandown Park515,2018-07-26,8,...,0.023263,0.050035,0.691340,0.954432,3.819672,2.0,-0.056069,-0.538464,0.342507,1.247436
15685,2034470006,18.2,5,7,28.2,344185004,130058,Sandown Park515,2018-07-26,8,...,-0.057036,-0.129545,0.767446,0.746452,8.281843,2.0,0.033175,-1.508349,0.521005,0.555044
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
782983,514248948,10.1,4,4,28.5,745616337,83581,Cannington520,2021-12-31,8,...,-0.057036,-0.025759,0.018879,0.691687,6.222584,2.0,0.033573,-1.343757,0.243479,-0.334948
782984,491176598,5.9,5,6,26.6,745616337,87891,Cannington520,2021-12-31,8,...,-0.104427,-0.021015,0.034358,0.964976,4.671660,2.0,0.002554,0.730914,0.282581,0.217436
782985,491176074,10.9,6,1,33.4,745616337,83951,Cannington520,2021-12-31,8,...,0.115418,0.148961,-0.768727,2.036605,4.669291,2.0,0.002507,-0.841410,-1.746098,2.383452
782986,469847830,19.6,7,5,33.5,745616337,83516,Cannington520,2021-12-31,8,...,0.115418,-0.045962,-0.329491,0.774007,2.233334,2.0,-0.046212,-0.372868,-0.636540,-0.168559


Remove the small number of rows containing NaN values (~20 rows), split the data, and train a logistic regression model

In [77]:
# Create 'win' column for target (1 for the race winner, 0 otherwise)
df['Win'] = (df['Place'] == 1).astype(int)

# Remove NaN values
# NOTE(review): dropna() removes individual rows, so a race that loses a row
# here is left with fewer than 8 runners - confirm that is acceptable.
df = df.dropna()
df = df.sort_values(by=["RaceDate", "FasttrackRaceId"], ascending=True)

# Split into train and test sets by date: everything before 2021 is used for
# training, 2021 onwards is held out. RaceDate holds ISO 'YYYY-MM-DD' strings,
# so plain string comparison orders them chronologically.
train_data = df[df['RaceDate'] < '2021-01-01'].reset_index(drop=True)
test_data = df[df['RaceDate'] >= '2021-01-01'].reset_index(drop=True)

# Feature columns
feature_list = ['WeightQuantile_mean', 'TrackDistBox_mean',
                'TrainerRating_Z', 'SpeedNorm_EMA_Z', 'SplitMarginMultiply', 'SpeedNorm_MSTD_Z']

# split into features and labels
train_x, train_y = train_data[feature_list], train_data['Win']
test_x, test_y = test_data[feature_list], test_data['Win']

# Train model on training data
logit_model = LogisticRegression()
logit_model.fit(train_x, train_y)

# Column 1 of predict_proba is the probability of the positive class (Win == 1)
test_data['prob_unscaled'] = logit_model.predict_proba(test_x)[:, 1]

# Scale the raw model output so the probabilities in each race sum to unity.
# transform() returns a result aligned to test_data's own index; the previous
# groupby.apply() call adds the group key as an extra index level on
# pandas >= 2.0, which makes the column assignment fail.
race_prob_total = test_data.groupby('FasttrackRaceId')['prob_unscaled'].transform('sum')
test_data['prob_scaled'] = test_data['prob_unscaled'] / race_prob_total

display(test_data)

Unnamed: 0,FasttrackDogId,StartPrice,Place,Box,Weight,FasttrackRaceId,TrainerId,TrackDist,RaceDate,FieldSize,...,SpeedNorm_MSTD,SplitMarginQuantileEMA,TrackSplitMarginQuantile,SplitMarginMultiply,TrainerRating_Z,SpeedNorm_EMA_Z,SpeedNorm_MSTD_Z,Win,prob_unscaled,prob_scaled
0,434274825,4.2,1,6,31.8,623861932,112239,Ballarat450,2021-01-01,8,...,0.500868,9.471418,3.0,0.037147,0.640426,0.580081,-0.480946,1,0.145542,0.145240
1,380917315,5.9,2,5,32.8,623861932,129459,Ballarat450,2021-01-01,8,...,0.511453,8.916374,3.0,0.020496,1.117924,0.559387,-0.444868,0,0.166273,0.165929
2,465017181,3.2,3,4,31.6,623861932,118232,Ballarat450,2021-01-01,8,...,0.351607,10.000000,3.0,0.053005,1.570902,1.965983,-0.989675,0,0.356678,0.355940
3,318755759,16.1,4,2,28.2,623861932,101324,Ballarat450,2021-01-01,8,...,1.106616,5.139639,3.0,-0.092806,-1.045036,-0.760553,1.583630,0,0.046193,0.046097
4,387031520,32.1,5,7,27.6,623861932,101324,Ballarat450,2021-01-01,8,...,0.996652,8.071073,3.0,-0.004863,-1.045036,-0.660417,1.208839,0,0.049696,0.049593
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
55371,331345891,39.5,4,8,29.1,747048620,63454,Bendigo425,2021-12-31,8,...,0.356532,5.385881,4.0,-0.039361,-0.085641,-0.578115,-0.848525,0,0.061567,0.060671
55372,530060512,5.7,5,3,26.5,747048620,222983,Bendigo425,2021-12-31,8,...,0.616281,7.894103,4.0,0.060968,1.015585,0.796812,-0.285809,0,0.172932,0.170414
55373,359294661,42.5,6,6,28.4,747048620,130810,Bendigo425,2021-12-31,8,...,1.319232,3.232219,4.0,-0.125508,-0.922238,-0.961568,1.237055,0,0.028636,0.028219
55374,373611738,8.6,7,4,28.6,747048620,129328,Bendigo425,2021-12-31,8,...,1.708954,6.274972,4.0,-0.003798,0.750009,-0.607763,2.081343,0,0.112738,0.111097


Compare tipping results between prelim logistic regression model and SP

In [78]:
# Compare tipping results from model favourite and starting price favourite.
# The model's tip is the runner with the highest scaled probability in the
# race; the market's tip is the runner with the lowest starting price.
# transform() keeps the result aligned to test_data's index (groupby.apply()
# adds the group key as an index level on pandas >= 2.0 and breaks assignment).
best_model_prob = test_data.groupby('FasttrackRaceId')['prob_scaled'].transform('max')
shortest_price = test_data.groupby('FasttrackRaceId')['StartPrice'].transform('min')

# NOTE: runners tied for the best value in a race are all flagged as the tip.
test_data['model_win_prediction'] = test_data['prob_scaled'] == best_model_prob
test_data['odds_win_prediction'] = test_data['StartPrice'] == shortest_price

# Strike rate = number of tipped runners that won / number of races
n_races = test_data['FasttrackRaceId'].nunique()
is_winner = test_data['Win'] == 1

print("Model predicts the winner in {:.2%} of races".format(
    (test_data['model_win_prediction'] & is_winner).sum() / n_races
    ))
print("Starting Price Odds predicts the winner in {:.2%} of races".format(
    (test_data['odds_win_prediction'] & is_winner).sum() / n_races
    ))

Model predicts the winner in 32.33% of races
Starting Price Odds predicts the winner in 37.47% of races


Feature importance

In [79]:
# Fitted coefficients of the logistic regression, printed above the feature
# names so each coefficient can be read against the feature it belongs to
# (both follow the order of feature_list).
# NOTE(review): the features are on different scales, so comparing raw
# coefficient magnitudes as "importance" should be done with care.
feature_importance = logit_model.coef_[0]
print(feature_importance, feature_list, sep='\n')

[0.61608354 2.9205851  0.34953086 0.60048819 3.26663079 0.19296565]
['WeightQuantile_mean', 'TrackDistBox_mean', 'TrainerRating_Z', 'SpeedNorm_EMA_Z', 'SplitMarginMultiply', 'SpeedNorm_MSTD_Z']
