## Preliminary Logistic Regression Model

We're going to merge the features we've created in feature-engineering/ into our greyhound data, then restrict our sample to races with a full field of 8 for which we have information on every greyhound in the race (i.e. every greyhound must have had at least one previous race).

----

Import libraries, packages, and greyhound data

In [29]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats as stats
import os
import decouple
import sys
# Build a python-decouple config reader; ' ' is passed as its search path
config = decouple.AutoConfig(' ')
# Switch to the configured project root so the relative './data/...' paths used below resolve
os.chdir(config('ROOT_DIRECTORY'))
# Put '' (the current working directory, i.e. the project root after the chdir) first on the import path
sys.path.insert(0, '')

from scipy.stats import zscore
from multielo import MultiElo, Player, Tracker
from multielo.multielo import defaults
from sklearn.linear_model import LogisticRegression

# Read in data
# Path is relative to the working directory set via os.chdir above.
# Each row holds one greyhound's result in one race (see FasttrackDogId / FasttrackRaceId).
df_raw = pd.read_csv('./data/clean/dog_results.csv')

# Rich display of the loaded frame
display(df_raw)

Unnamed: 0,FasttrackDogId,Place,DogName,Box,Rug,Weight,StartPrice,Margin1,Margin2,PIR,...,FasttrackRaceId,TrainerId,TrainerName,Distance,RaceGrade,Track,RaceNum,TrackDist,RaceDate,FieldSize
0,157500927,1,RAINE ALLEN,1,1,27.4,2.4,2.30,,Q/111,...,335811282,7683,C GRENFELL,500.0,Restricted Win,Bendigo,1.0,Bendigo500,2018-07-01,6
1,1820620018,2,SURF A LOT,2,2,32.8,6.3,2.30,2.30,M/332,...,335811282,137227,C TYLEY,500.0,Restricted Win,Bendigo,1.0,Bendigo500,2018-07-01,6
2,1950680026,3,PINGIN' BEE,6,6,25.5,9.3,3.84,1.54,S/443,...,335811282,132763,P DAPIRAN,500.0,Restricted Win,Bendigo,1.0,Bendigo500,2018-07-01,6
3,1524380048,4,LUCAS THE GREAT,7,7,32.2,9.1,5.27,1.43,M/655,...,335811282,116605,E HAMILTON,500.0,Restricted Win,Bendigo,1.0,Bendigo500,2018-07-01,6
4,124225458,5,QUAVO,4,4,28.9,3.4,5.56,0.29,M/766,...,335811282,132763,P DAPIRAN,500.0,Restricted Win,Bendigo,1.0,Bendigo500,2018-07-01,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
782997,491585906,3,GLORIOUS GUNN,8,8,27.1,3.8,3.75,2.43,6644,...,745616339,87891,G HORNE,520.0,Grade 5,Cannington,12.0,Cannington520,2021-12-31,7
782998,485659451,4,WOOD FIRE,3,3,32.1,4.1,3.75,0.14,3233,...,745616339,68549,C HALSE,520.0,Grade 5,Cannington,12.0,Cannington520,2021-12-31,7
782999,528381655,5,TRENDING QUARTER,6,6,31.8,16.2,5.25,1.43,4566,...,745616339,83581,J DAILLY,520.0,Grade 5,Cannington,12.0,Cannington520,2021-12-31,7
783000,537992387,6,ELITE WEAPON,1,1,26.7,2.9,5.25,0.00,1455,...,745616339,293372,S WILLIAMS,520.0,Grade 5,Cannington,12.0,Cannington520,2021-12-31,7


Take only columns of interest, and merge our features to the dataframe

In [30]:
# Take only columns of interest, as an independent copy so df_raw stays untouched
col_list = ['FasttrackDogId', 'StartPrice', 'Place', 'Box', 'Weight', 'FasttrackRaceId',
            'TrainerId', 'TrackDist', 'RaceDate', 'FieldSize']
df = df_raw[col_list].copy()

# ---- Read in features and merge to main dataframe ----
# Every merge is a left join, so runners without a matching feature row are kept
# and simply get NaN for that feature.

# Trainer ELO Rating
df_temp = pd.read_csv('./data/features/trainer-elo.csv')
df = df.merge(df_temp, on=['TrainerId', 'FasttrackRaceId', 'RaceDate'], how='left')

# Mean SpeedNorm by Weight
# Bucket weights into deciles numbered 1..10 (qcut labels start at 0, hence the +1)
df['WeightQuantile'] = pd.qcut(df['Weight'], 10, labels=False)+1
df_temp = pd.read_csv('./data/features/mean-speednorm-by-weight.csv').drop('WeightQuantile_std', axis=1)
df = df.merge(df_temp, on=['WeightQuantile'], how='left')

# Mean SpeedNorm by Track Distance Box
df_temp = pd.read_csv('./data/features/mean-speednorm-by-trackdistbox.csv').drop('TrackDistBox_std', axis=1)
df = df.merge(df_temp, on=['TrackDist', 'Box'], how='left')

# EMA SpeedNorm by Greyhound
df_temp = pd.read_csv('./data/features/ema-speednorm-by-greyhound.csv')
df = df.merge(df_temp, on=['FasttrackDogId', 'FasttrackRaceId', 'RaceDate'], how='left')

# Split Margin by Track Distance
df_temp = pd.read_csv('./data/features/split-margin-by-trackdist.csv')
df = df.merge(df_temp, on=['FasttrackDogId', 'FasttrackRaceId', 'TrackDist'], how='left')

# A left join never drops rows, so any change in length means a feature file
# matched some runner more than once and duplicated it.
assert len(df) == len(df_raw), (
    "Feature merge changed the number of rows; check the feature files for duplicate merge keys"
)

display(df)

Unnamed: 0,FasttrackDogId,StartPrice,Place,Box,Weight,FasttrackRaceId,TrainerId,TrackDist,RaceDate,FieldSize,TrainerRating,WeightQuantile,WeightQuantile_mean,TrackDistBox_mean,SpeedNorm_EMA,SplitMarginMultiply
0,157500927,2.4,1,1,27.4,335811282,7683,Bendigo500,2018-07-01,6,1000.000000,3,-0.080413,,,
1,1820620018,6.3,2,2,32.8,335811282,137227,Bendigo500,2018-07-01,6,1000.000000,8,0.113288,,,
2,1950680026,9.3,3,6,25.5,335811282,132763,Bendigo500,2018-07-01,6,1000.000000,1,-0.163575,,,
3,1524380048,9.1,4,7,32.2,335811282,116605,Bendigo500,2018-07-01,6,1000.000000,7,0.066307,,,
4,124225458,3.4,5,4,28.9,335811282,132763,Bendigo500,2018-07-01,6,1000.000000,4,-0.057036,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
782997,491585906,3.8,3,8,27.1,745616339,87891,Cannington520,2021-12-31,7,-385.966583,3,-0.080413,-0.043643,0.712705,14.691609
782998,485659451,4.1,4,3,32.1,745616339,68549,Cannington520,2021-12-31,7,-337.539747,7,0.066307,-0.004835,0.605300,23.082324
782999,528381655,16.2,5,6,31.8,745616339,83581,Cannington520,2021-12-31,7,-536.354620,7,0.066307,-0.021015,0.346363,12.978245
783000,537992387,2.9,6,1,26.7,745616339,293372,Cannington520,2021-12-31,7,-300.109813,2,-0.104427,0.148961,0.497658,23.509648


Take only races with a full field (8 greyhounds), and then only races where we have information on every greyhound. Note that this significantly reduces our sample size (between them, the two operations remove about 2/3 of the data).

In [31]:
# Take only races with a full field
df = df[df['FieldSize'] == 8]

# Feature columns that must be populated for every greyhound in a race
required_cols = ['TrainerRating', 'WeightQuantile_mean',
                 'TrackDistBox_mean', 'SpeedNorm_EMA', 'SplitMarginMultiply']

# Races in which at least one greyhound is missing at least one feature
has_missing_feature = df[required_cols].isnull().any(axis=1)
list_of_races = df.loc[has_missing_feature, 'FasttrackRaceId'].unique()

# Drop those races entirely. The .copy() makes df an independent frame so that
# adding columns in later cells does not act on a slice of the unfiltered data.
df = df[~df['FasttrackRaceId'].isin(list_of_races)].copy()

display(df)

Unnamed: 0,FasttrackDogId,StartPrice,Place,Box,Weight,FasttrackRaceId,TrainerId,TrackDist,RaceDate,FieldSize,TrainerRating,WeightQuantile,WeightQuantile_mean,TrackDistBox_mean,SpeedNorm_EMA,SplitMarginMultiply
3308,1963750031,5.8,1,2,29.8,337082320,68549,Bendigo425,2018-07-06,8,1026.605267,5,-0.006573,0.067085,2.045300,50.000000
3309,149399734,8.2,2,6,33.3,337082320,108283,Bendigo425,2018-07-06,8,1028.168607,9,0.115418,-0.049565,2.206318,50.000000
3310,1729820019,3.6,3,7,34.4,337082320,115064,Bendigo425,2018-07-06,8,1163.701658,9,0.115418,-0.037729,2.530805,50.000000
3311,157500921,11.0,4,4,31.6,337082320,7683,Bendigo425,2018-07-06,8,994.665471,7,0.066307,-0.014121,2.395203,25.000000
3312,1486670016,6.1,5,8,29.4,337082320,112806,Bendigo425,2018-07-06,8,1028.152647,5,-0.006573,0.076176,2.045300,50.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
782983,514248948,10.1,4,4,28.5,745616337,83581,Cannington520,2021-12-31,8,-536.354620,4,-0.057036,-0.025759,0.082804,14.901889
782984,491176598,5.9,5,6,26.6,745616337,87891,Cannington520,2021-12-31,8,-385.966583,2,-0.104427,-0.021015,0.034358,14.061121
782985,491176074,10.9,6,1,33.4,745616337,83951,Cannington520,2021-12-31,8,-499.940634,9,0.115418,0.148961,-0.768727,16.224093
782986,469847830,19.6,7,5,33.5,745616337,83516,Cannington520,2021-12-31,8,-465.977139,9,0.115418,-0.045962,-0.329491,6.700002


Normalise particular columns by race

In [32]:
# normalise columns by race: z = (value - race mean) / race std
norm_list = ['TrainerRating', 'SpeedNorm_EMA', 'SplitMarginMultiply']
by_race = df.groupby('FasttrackRaceId')

z_cols = {}
for col in norm_list:
    race_mean = by_race[col].transform('mean')
    # ddof=0 (population std) matches the default of scipy.stats.zscore
    race_std = by_race[col].transform('std', ddof=0)
    # A race where every greyhound has the same value has zero spread, so its
    # z-score is undefined: leave it as NaN explicitly instead of dividing by zero.
    z_cols[col + '_Z'] = ((df[col] - race_mean) / race_std).where(race_std > 0)

# assign() returns a new frame, so this is safe even if df is a filtered slice
df = df.assign(**z_cols)

display(df)

  return (a - mns) / sstd


Unnamed: 0,FasttrackDogId,StartPrice,Place,Box,Weight,FasttrackRaceId,TrainerId,TrackDist,RaceDate,FieldSize,TrainerRating,WeightQuantile,WeightQuantile_mean,TrackDistBox_mean,SpeedNorm_EMA,SplitMarginMultiply,TrainerRating_Z,SpeedNorm_EMA_Z,SplitMarginMultiply_Z
3308,1963750031,5.8,1,2,29.8,337082320,68549,Bendigo425,2018-07-06,8,1026.605267,5,-0.006573,0.067085,2.045300,50.000000,-0.270397,-0.660397,0.554700
3309,149399734,8.2,2,6,33.3,337082320,108283,Bendigo425,2018-07-06,8,1028.168607,9,0.115418,-0.049565,2.206318,50.000000,-0.248303,0.109199,0.554700
3310,1729820019,3.6,3,7,34.4,337082320,115064,Bendigo425,2018-07-06,8,1163.701658,9,0.115418,-0.037729,2.530805,50.000000,1.667137,1.660112,0.554700
3311,157500921,11.0,4,4,31.6,337082320,7683,Bendigo425,2018-07-06,8,994.665471,7,0.066307,-0.014121,2.395203,25.000000,-0.721790,1.011989,-2.218801
3312,1486670016,6.1,5,8,29.4,337082320,112806,Bendigo425,2018-07-06,8,1028.152647,5,-0.006573,0.076176,2.045300,50.000000,-0.248528,-0.660397,0.554700
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
782983,514248948,10.1,4,4,28.5,745616337,83581,Cannington520,2021-12-31,8,-536.354620,4,-0.057036,-0.025759,0.082804,14.901889,-1.343757,0.382359,0.348984
782984,491176598,5.9,5,6,26.6,745616337,87891,Cannington520,2021-12-31,8,-385.966583,2,-0.104427,-0.021015,0.034358,14.061121,0.730914,0.260748,0.142567
782985,491176074,10.9,6,1,33.4,745616337,83951,Cannington520,2021-12-31,8,-499.940634,9,0.115418,0.148961,-0.768727,16.224093,-0.841410,-1.755190,0.673598
782986,469847830,19.6,7,5,33.5,745616337,83516,Cannington520,2021-12-31,8,-465.977139,9,0.115418,-0.045962,-0.329491,6.700002,-0.372868,-0.652601,-1.664658


Remove the small number of NaN values (~20 rows), split the data, and train a logistic regression model

In [33]:
# Create 'win' column for target (1 if the greyhound finished first, otherwise 0)
df = df.assign(win=(df['Place'] == 1).astype(int))

# Remove NaN values
df = df.dropna()
df = df.sort_values(by=["RaceDate", "FasttrackRaceId"], ascending=True)

# Split into train and test sets by date (train: before 2021, test: 2021 onwards).
# RaceDate is compared as text; that orders correctly for the 'YYYY-MM-DD' values in the data.
train_data = df[df['RaceDate'] < '2021-01-01'].reset_index(drop = True)
test_data = df[df['RaceDate'] >= '2021-01-01'].reset_index(drop = True)

# Feature columns
feature_list = ['WeightQuantile_mean', 'TrackDistBox_mean', 
                'TrainerRating_Z', 'SpeedNorm_EMA_Z', 'SplitMarginMultiply_Z']

# split into features and labels
train_x, train_y = train_data[feature_list], train_data['win']
test_x, test_y = test_data[feature_list], test_data['win']

# Train model on training data
logit_model = LogisticRegression()
logit_model.fit(train_x, train_y)

# Raw model output: column 1 of predict_proba is the probability of win == 1
test_data['prob_unscaled'] = logit_model.predict_proba(test_x)[:,1]

# Scale the raw model output so the probabilities within each race sum to unity.
# transform('sum') returns the race total aligned to test_data's own index, so the
# division and assignment line up row by row on any pandas version.
race_total = test_data.groupby('FasttrackRaceId')['prob_unscaled'].transform('sum')
test_data['prob_scaled'] = test_data['prob_unscaled'] / race_total

display(test_data)

Unnamed: 0,FasttrackDogId,StartPrice,Place,Box,Weight,FasttrackRaceId,TrainerId,TrackDist,RaceDate,FieldSize,...,WeightQuantile_mean,TrackDistBox_mean,SpeedNorm_EMA,SplitMarginMultiply,TrainerRating_Z,SpeedNorm_EMA_Z,SplitMarginMultiply_Z,win,prob_unscaled,prob_scaled
0,434274825,4.2,1,6,31.8,623861932,112239,Ballarat450,2021-01-01,8,...,0.066307,-0.047410,1.501255,38.805610,0.640426,0.502106,1.219828,1,0.192125,0.178631
1,380917315,5.9,2,5,32.8,623861932,129459,Ballarat450,2021-01-01,8,...,0.113288,-0.039966,1.577306,34.909954,1.117924,0.686541,0.477812,0,0.202084,0.187892
2,465017181,3.2,3,4,31.6,623861932,118232,Ballarat450,2021-01-01,8,...,0.066307,-0.023747,2.109246,36.507546,1.570902,1.976576,0.782110,0,0.413276,0.384251
3,318755759,16.1,4,2,28.2,623861932,101324,Ballarat450,2021-01-01,8,...,-0.057036,0.034294,0.990281,23.039407,-1.045036,-0.737082,-1.783205,0,0.025320,0.023542
4,387031520,32.1,5,7,27.6,623861932,101324,Ballarat450,2021-01-01,8,...,-0.080413,-0.028673,1.022982,30.202301,-1.045036,-0.657777,-0.418868,0,0.034830,0.032384
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52203,331345891,39.5,4,8,29.1,747048620,63454,Bendigo425,2021-12-31,8,...,-0.006573,0.076176,0.008747,25.303261,-0.085641,-0.709692,-0.439950,0,0.064678,0.062668
52204,530060512,5.7,5,3,26.5,747048620,222983,Bendigo425,2021-12-31,8,...,-0.104427,-0.070193,0.533514,10.000000,1.015585,0.552440,-2.187406,0,0.063491,0.061518
52205,359294661,42.5,6,6,28.4,747048620,130810,Bendigo425,2021-12-31,8,...,-0.057036,-0.049565,0.052270,23.360656,-0.922238,-0.605013,-0.661773,0,0.032957,0.031933
52206,373611738,8.6,7,4,28.6,747048620,129328,Bendigo425,2021-12-31,8,...,-0.057036,-0.014121,0.064785,35.114323,0.750009,-0.574912,0.680359,0,0.098286,0.095231


Compare tipping results between prelim logistic regression model and SP

In [34]:
# Compare tipping results from model favourite and starting price favourite
by_race = test_data.groupby('FasttrackRaceId')

# Best value per race, broadcast back onto every row of that race
race_best_prob = by_race['prob_scaled'].transform('max')
race_best_price = by_race['StartPrice'].transform('min')

# Flag each race's favourite(s): highest model probability / lowest starting price.
# Rows that tie for the best value in a race are all flagged.
test_data['model_win_prediction'] = test_data['prob_scaled'].eq(race_best_prob)
test_data['odds_win_prediction'] = test_data['StartPrice'].eq(race_best_price)

n_races = test_data['FasttrackRaceId'].nunique()
is_winner = test_data['win'] == 1

print("Model predicts the winner in {:.2%} of races".format(
    (test_data['model_win_prediction'] & is_winner).sum() / n_races
    ))
print("Starting Price Odds predicts the winner in {:.2%} of races".format(
    (test_data['odds_win_prediction'] & is_winner).sum() / n_races
    ))

Model predicts the winner in 31.29% of races
Starting Price Odds predicts the winner in 37.66% of races


Feature importance

In [35]:
# Fitted logistic-regression coefficients, printed above the feature names in the same order.
# Only the '_Z' features were normalised by race, so the raw coefficient sizes of the
# two '_mean' features are not on the same scale as the others.
feature_importance = logit_model.coef_[0]
print(feature_importance, feature_list, sep='\n')

[0.77738236 2.9281024  0.33735369 0.58208176 0.35554137]
['WeightQuantile_mean', 'TrackDistBox_mean', 'TrainerRating_Z', 'SpeedNorm_EMA_Z', 'SplitMarginMultiply_Z']
