## Preliminary Logistic Regression Model

We're going to merge the features we've created in feature-engineering/ into our greyhound data, then restrict our sample to races with a full field of 8 in which we have feature information on every greyhound (all greyhounds must have had at least one race).

----

Import libraries, packages, and greyhound data

In [136]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats as stats
import os
import decouple
import sys
# Load project settings via python-decouple.
# NOTE(review): the search path handed to AutoConfig is a single space — confirm this is intended.
config = decouple.AutoConfig(' ')
# Work from the configured project root so the relative ./data/... paths used below resolve.
os.chdir(config('ROOT_DIRECTORY'))
# Put '' (interpreted by Python as the current working directory) first on the import path.
sys.path.insert(0, '')

from scipy.stats import zscore
from multielo import MultiElo, Player, Tracker
from multielo.multielo import defaults
from sklearn.linear_model import LogisticRegression

# Load the cleaned per-greyhound race results from disk
results_csv = './data/clean/dog_results.csv'
df_raw = pd.read_csv(results_csv)

# Show the raw frame for a quick visual check
display(df_raw)

Unnamed: 0,FasttrackDogId,Place,DogName,Box,Rug,Weight,StartPrice,Margin1,Margin2,PIR,...,FasttrackRaceId,TrainerId,TrainerName,Distance,RaceGrade,Track,RaceNum,TrackDist,RaceDate,FieldSize
0,157500927,1,RAINE ALLEN,1,1,27.4,2.4,2.30,,Q/111,...,335811282,7683,C GRENFELL,500.0,Restricted Win,Bendigo,1.0,Bendigo500,2018-07-01,6
1,1820620018,2,SURF A LOT,2,2,32.8,6.3,2.30,2.30,M/332,...,335811282,137227,C TYLEY,500.0,Restricted Win,Bendigo,1.0,Bendigo500,2018-07-01,6
2,1950680026,3,PINGIN' BEE,6,6,25.5,9.3,3.84,1.54,S/443,...,335811282,132763,P DAPIRAN,500.0,Restricted Win,Bendigo,1.0,Bendigo500,2018-07-01,6
3,1524380048,4,LUCAS THE GREAT,7,7,32.2,9.1,5.27,1.43,M/655,...,335811282,116605,E HAMILTON,500.0,Restricted Win,Bendigo,1.0,Bendigo500,2018-07-01,6
4,124225458,5,QUAVO,4,4,28.9,3.4,5.56,0.29,M/766,...,335811282,132763,P DAPIRAN,500.0,Restricted Win,Bendigo,1.0,Bendigo500,2018-07-01,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
782997,491585906,3,GLORIOUS GUNN,8,8,27.1,3.8,3.75,2.43,6644,...,745616339,87891,G HORNE,520.0,Grade 5,Cannington,12.0,Cannington520,2021-12-31,7
782998,485659451,4,WOOD FIRE,3,3,32.1,4.1,3.75,0.14,3233,...,745616339,68549,C HALSE,520.0,Grade 5,Cannington,12.0,Cannington520,2021-12-31,7
782999,528381655,5,TRENDING QUARTER,6,6,31.8,16.2,5.25,1.43,4566,...,745616339,83581,J DAILLY,520.0,Grade 5,Cannington,12.0,Cannington520,2021-12-31,7
783000,537992387,6,ELITE WEAPON,1,1,26.7,2.9,5.25,0.00,1455,...,745616339,293372,S WILLIAMS,520.0,Grade 5,Cannington,12.0,Cannington520,2021-12-31,7


Take only columns of interest, and merge our features to the dataframe

In [137]:
# Columns carried forward from the raw results
col_list = ['FasttrackDogId', 'StartPrice', 'Place', 'Box', 'Weight', 'FasttrackRaceId',
            'TrainerId', 'TrackDist', 'RaceDate', 'FieldSize']

# Work on an independent copy of the raw data, restricted to those columns
df = df_raw.copy()
df = df[col_list]

# --- Read in features and left-merge each onto the main dataframe ---

# Trainer ELO rating, keyed per trainer / race / date
trainer_elo = pd.read_csv('./data/features/trainer-elo.csv')
df = df.merge(trainer_elo, how='left', on=['TrainerId', 'FasttrackRaceId', 'RaceDate'])

# Mean SpeedNorm by weight: bucket weights into deciles numbered 1..10, then join on the bucket
df['WeightQuantile'] = pd.qcut(df['Weight'], q=10, labels=False) + 1
weight_speed = pd.read_csv('./data/features/mean-speednorm-by-weight.csv')
weight_speed = weight_speed.drop(columns='WeightQuantile_std')
df = df.merge(weight_speed, how='left', on='WeightQuantile')

# Mean SpeedNorm by track-distance and box
box_speed = pd.read_csv('./data/features/mean-speednorm-by-trackdistbox.csv')
box_speed = box_speed.drop(columns='TrackDistBox_std')
df = df.merge(box_speed, how='left', on=['TrackDist', 'Box'])

# EMA of SpeedNorm per greyhound, keyed per dog / race / date
dog_ema = pd.read_csv('./data/features/ema-speednorm-by-greyhound.csv')
df = df.merge(dog_ema, how='left', on=['FasttrackDogId', 'FasttrackRaceId', 'RaceDate'])

display(df)

Unnamed: 0,FasttrackDogId,StartPrice,Place,Box,Weight,FasttrackRaceId,TrainerId,TrackDist,RaceDate,FieldSize,TrainerRating,WeightQuantile,WeightQuantile_mean,TrackDistBox_mean,SpeedNorm_EMA
0,157500927,2.4,1,1,27.4,335811282,7683,Bendigo500,2018-07-01,6,1000.000000,3,-0.080413,,
1,1820620018,6.3,2,2,32.8,335811282,137227,Bendigo500,2018-07-01,6,1000.000000,8,0.113288,,
2,1950680026,9.3,3,6,25.5,335811282,132763,Bendigo500,2018-07-01,6,1000.000000,1,-0.163575,,
3,1524380048,9.1,4,7,32.2,335811282,116605,Bendigo500,2018-07-01,6,1000.000000,7,0.066307,,
4,124225458,3.4,5,4,28.9,335811282,132763,Bendigo500,2018-07-01,6,1000.000000,4,-0.057036,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
782997,491585906,3.8,3,8,27.1,745616339,87891,Cannington520,2021-12-31,7,-385.966583,3,-0.080413,-0.043643,0.712705
782998,485659451,4.1,4,3,32.1,745616339,68549,Cannington520,2021-12-31,7,-337.539747,7,0.066307,-0.004835,0.605300
782999,528381655,16.2,5,6,31.8,745616339,83581,Cannington520,2021-12-31,7,-536.354620,7,0.066307,-0.021015,0.167931
783000,537992387,2.9,6,1,26.7,745616339,293372,Cannington520,2021-12-31,7,-300.109813,2,-0.104427,0.148961,0.497658


Take only races with a full field (8 greyhounds), and take only races where we have information on every greyhound. We notice that this significantly reduces our sample size (together, the two filters remove roughly two-thirds of the data).

In [138]:
# Keep only rows belonging to races with a full field of 8
is_full_field = df['FieldSize'] == 8
df = df[is_full_field]

# Identify every race in which at least one runner is missing a feature value
feature_cols = ['TrainerRating', 'WeightQuantile_mean', 'TrackDistBox_mean', 'SpeedNorm_EMA']
row_has_gap = df[feature_cols].isnull().any(axis=1)
list_of_races = df.loc[row_has_gap, 'FasttrackRaceId'].unique()

# Drop those races entirely so each remaining race has features for all runners
in_incomplete_race = df['FasttrackRaceId'].isin(list_of_races)
df = df[~in_incomplete_race]

display(df)

Unnamed: 0,FasttrackDogId,StartPrice,Place,Box,Weight,FasttrackRaceId,TrainerId,TrackDist,RaceDate,FieldSize,TrainerRating,WeightQuantile,WeightQuantile_mean,TrackDistBox_mean,SpeedNorm_EMA
3308,1963750031,5.8,1,2,29.8,337082320,68549,Bendigo425,2018-07-06,8,1026.605267,5,-0.006573,0.067085,2.045300
3309,149399734,8.2,2,6,33.3,337082320,108283,Bendigo425,2018-07-06,8,1028.168607,9,0.115418,-0.049565,2.206318
3310,1729820019,3.6,3,7,34.4,337082320,115064,Bendigo425,2018-07-06,8,1163.701658,9,0.115418,-0.037729,2.530805
3311,157500921,11.0,4,4,31.6,337082320,7683,Bendigo425,2018-07-06,8,994.665471,7,0.066307,-0.014121,2.395203
3312,1486670016,6.1,5,8,29.4,337082320,112806,Bendigo425,2018-07-06,8,1028.152647,5,-0.006573,0.076176,2.045300
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
782983,514248948,10.1,4,4,28.5,745616337,83581,Cannington520,2021-12-31,8,-536.354620,4,-0.057036,-0.025759,0.018879
782984,491176598,5.9,5,6,26.6,745616337,87891,Cannington520,2021-12-31,8,-385.966583,2,-0.104427,-0.021015,0.034358
782985,491176074,10.9,6,1,33.4,745616337,83951,Cannington520,2021-12-31,8,-499.940634,9,0.115418,0.148961,-0.768727
782986,469847830,19.6,7,5,33.5,745616337,83516,Cannington520,2021-12-31,8,-465.977139,9,0.115418,-0.045962,-0.329491


Normalise particular columns by race

In [139]:
# Standardise selected columns within each race, storing the result as <column>_Z
norm_list = ['TrainerRating', 'SpeedNorm_EMA']
for col in norm_list:
    per_race = df.groupby('FasttrackRaceId')[col]
    df[f'{col}_Z'] = per_race.transform(zscore)

display(df)

  return (a - mns) / sstd


Unnamed: 0,FasttrackDogId,StartPrice,Place,Box,Weight,FasttrackRaceId,TrainerId,TrackDist,RaceDate,FieldSize,TrainerRating,WeightQuantile,WeightQuantile_mean,TrackDistBox_mean,SpeedNorm_EMA,TrainerRating_Z,SpeedNorm_EMA_Z
3308,1963750031,5.8,1,2,29.8,337082320,68549,Bendigo425,2018-07-06,8,1026.605267,5,-0.006573,0.067085,2.045300,-0.270397,-0.660397
3309,149399734,8.2,2,6,33.3,337082320,108283,Bendigo425,2018-07-06,8,1028.168607,9,0.115418,-0.049565,2.206318,-0.248303,0.109199
3310,1729820019,3.6,3,7,34.4,337082320,115064,Bendigo425,2018-07-06,8,1163.701658,9,0.115418,-0.037729,2.530805,1.667137,1.660112
3311,157500921,11.0,4,4,31.6,337082320,7683,Bendigo425,2018-07-06,8,994.665471,7,0.066307,-0.014121,2.395203,-0.721790,1.011989
3312,1486670016,6.1,5,8,29.4,337082320,112806,Bendigo425,2018-07-06,8,1028.152647,5,-0.006573,0.076176,2.045300,-0.248528,-0.660397
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
782983,514248948,10.1,4,4,28.5,745616337,83581,Cannington520,2021-12-31,8,-536.354620,4,-0.057036,-0.025759,0.018879,-1.343757,0.243479
782984,491176598,5.9,5,6,26.6,745616337,87891,Cannington520,2021-12-31,8,-385.966583,2,-0.104427,-0.021015,0.034358,0.730914,0.282581
782985,491176074,10.9,6,1,33.4,745616337,83951,Cannington520,2021-12-31,8,-499.940634,9,0.115418,0.148961,-0.768727,-0.841410,-1.746098
782986,469847830,19.6,7,5,33.5,745616337,83516,Cannington520,2021-12-31,8,-465.977139,9,0.115418,-0.045962,-0.329491,-0.372868,-0.636540


Remove the small number of NaN values (~20 rows), split the data, and train a logistic regression model

In [141]:
# Remove NaN values and order rows chronologically (then by race)
df = df.dropna()
df = df.sort_values(by=["RaceDate", "FasttrackRaceId"], ascending=True)

# Create 'win' column for target.
# This has to be added to df BEFORE the train/test split: the split frames are
# new objects, so a column added to df afterwards never reaches them and the
# label lookup below would raise KeyError on a clean top-to-bottom run.
df['win'] = (df['Place'] == 1).astype(int)

# Split into train and test sets by date.
# RaceDate is held as a 'YYYY-MM-DD' string, so string comparison orders chronologically.
train_data = df[df['RaceDate'] < '2021-06-01'].reset_index(drop = True)
test_data = df[df['RaceDate'] >= '2021-06-01'].reset_index(drop = True)

# Feature columns
feature_list = ['WeightQuantile_mean', 'TrackDistBox_mean', 'TrainerRating_Z', 'SpeedNorm_EMA_Z']

# split into features and labels
train_x, train_y = train_data[feature_list], train_data['win']
test_x, test_y = test_data[feature_list], test_data['win']

# Train model on training data
logit_model = LogisticRegression()
logit_model.fit(train_x, train_y)

# Scale the raw model output so the win probabilities within each race sum to unity.
# transform('sum') broadcasts each race's total back onto its rows with the
# original index, so the division aligns row-for-row (groupby.apply may return
# a group-key-prefixed index that cannot be assigned back as a column).
test_data['prob_unscaled'] = logit_model.predict_proba(test_x)[:,1]
race_prob_total = test_data.groupby('FasttrackRaceId')['prob_unscaled'].transform('sum')
test_data['prob_scaled'] = test_data['prob_unscaled'] / race_prob_total

display(test_data)

Unnamed: 0,FasttrackDogId,StartPrice,Place,Box,Weight,FasttrackRaceId,TrainerId,TrackDist,RaceDate,FieldSize,TrainerRating,WeightQuantile,WeightQuantile_mean,TrackDistBox_mean,SpeedNorm_EMA,TrainerRating_Z,SpeedNorm_EMA_Z,win,prob_unscaled,prob_scaled
0,460033737,3.4,1,2,31.9,674476381,120532,Devonport452,2021-06-01,8,163.683649,7,0.066307,0.022645,0.520569,-0.459104,1.201630,1,0.181868,0.178155
1,351419976,61.0,2,7,32.6,674476381,88389,Devonport452,2021-06-01,8,56.683905,8,0.113288,-0.080116,-0.141007,-1.222270,-1.388497,0,0.021531,0.021092
2,383604324,4.4,3,8,28.7,674476381,64552,Devonport452,2021-06-01,8,358.143217,4,-0.057036,0.060450,-0.043645,0.927861,-1.007314,0,0.080831,0.079181
3,324020361,20.0,4,6,29.6,674476381,120532,Devonport452,2021-06-01,8,163.683649,5,-0.006573,-0.018195,-0.063304,-0.459104,-1.084283,0,0.039094,0.038296
4,451767738,4.2,5,5,32.5,674476381,123131,Devonport452,2021-06-01,8,255.076923,8,0.113288,-0.046116,0.496112,0.192750,1.105880,0,0.182276,0.178555
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
46690,331345891,39.5,4,8,29.1,747048620,63454,Bendigo425,2021-12-31,8,762.398479,5,-0.006573,0.076176,-0.153072,-0.085641,-0.578115,0,0.079372,0.073656
46691,530060512,5.7,5,3,26.5,747048620,222983,Bendigo425,2021-12-31,8,903.042250,2,-0.104427,-0.070193,0.481113,1.015585,0.796812,0,0.172567,0.160139
46692,359294661,42.5,6,6,28.4,747048620,130810,Bendigo425,2021-12-31,8,655.551855,4,-0.057036,-0.049565,-0.329940,-0.922238,-0.961568,0,0.032216,0.029895
46693,373611738,8.6,7,4,28.6,747048620,129328,Bendigo425,2021-12-31,8,869.124053,4,-0.057036,-0.014121,-0.166748,0.750009,-0.607763,0,0.080270,0.074489


Compare tipping results between prelim logistic regression model and SP

In [142]:
# Compare tipping results from model favourite and starting price favourite.
# The per-race extreme is broadcast back onto every row with transform, which
# keeps the original row index (groupby.apply may return a group-key-prefixed
# index that cannot be assigned back as a column).
race_best_prob = test_data.groupby('FasttrackRaceId')['prob_scaled'].transform('max')
race_min_price = test_data.groupby('FasttrackRaceId')['StartPrice'].transform('min')

# Model favourite: highest scaled probability in the race.
# Market favourite: lowest starting price in the race.
# NOTE: when runners tie for the extreme value, every tied runner is flagged.
test_data['model_win_prediction'] = test_data['prob_scaled'] == race_best_prob
test_data['odds_win_prediction'] = test_data['StartPrice'] == race_min_price

# Share of races in which a flagged favourite actually won
n_races = test_data['FasttrackRaceId'].nunique()
is_winner = test_data['win'] == 1
model_hits = (test_data['model_win_prediction'] & is_winner).sum()
odds_hits = (test_data['odds_win_prediction'] & is_winner).sum()

print("Model predicts the winner in {:.2%} of races".format(model_hits / n_races))
print("Starting Price Odds predicts the winner in {:.2%} of races".format(odds_hits / n_races))

Model predicts the winner in 32.09% of races
Starting Price Odds predicts the winner in 39.23% of races
