## Preliminary Logistic Regression Model

We merge the features created in feature-engineering/ into our greyhound data, then restrict the sample to full fields of 8 runners in which we have information on every greyhound in the race (i.e. every greyhound must have had at least one previous race).

----

Import libraries, packages, and greyhound data

In [8]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats as stats
import os
import decouple
import sys
# Build a python-decouple settings reader. The ' ' string is passed as
# AutoConfig's first positional argument (presumably the search path for the
# settings file -- TODO confirm that a single space is intended).
config = decouple.AutoConfig(' ')
# Change the working directory to the location stored under the ROOT_DIRECTORY
# setting, so the relative './data/...' paths used below resolve from there.
os.chdir(config('ROOT_DIRECTORY'))
# Put '' (the current working directory) at the front of the import search path.
sys.path.insert(0, '')

from scipy.stats import zscore
from multielo import MultiElo, Player, Tracker
from multielo.multielo import defaults
from sklearn.linear_model import LogisticRegression

# Load the cleaned results table (one row per greyhound per race) and preview it
df_raw = pd.read_csv(filepath_or_buffer='./data/clean/dog_results.csv')
display(df_raw)

Unnamed: 0,FasttrackDogId,Place,DogName,Box,Rug,Weight,StartPrice,Margin1,Margin2,PIR,...,FasttrackRaceId,TrainerId,TrainerName,Distance,RaceGrade,Track,RaceNum,TrackDist,RaceDate,FieldSize
0,157500927,1,RAINE ALLEN,1,1,27.4,2.4,2.30,,Q/111,...,335811282,7683,C GRENFELL,500.0,Restricted Win,Bendigo,1.0,Bendigo500,2018-07-01,6
1,1820620018,2,SURF A LOT,2,2,32.8,6.3,2.30,2.30,M/332,...,335811282,137227,C TYLEY,500.0,Restricted Win,Bendigo,1.0,Bendigo500,2018-07-01,6
2,1950680026,3,PINGIN' BEE,6,6,25.5,9.3,3.84,1.54,S/443,...,335811282,132763,P DAPIRAN,500.0,Restricted Win,Bendigo,1.0,Bendigo500,2018-07-01,6
3,1524380048,4,LUCAS THE GREAT,7,7,32.2,9.1,5.27,1.43,M/655,...,335811282,116605,E HAMILTON,500.0,Restricted Win,Bendigo,1.0,Bendigo500,2018-07-01,6
4,124225458,5,QUAVO,4,4,28.9,3.4,5.56,0.29,M/766,...,335811282,132763,P DAPIRAN,500.0,Restricted Win,Bendigo,1.0,Bendigo500,2018-07-01,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
782997,491585906,3,GLORIOUS GUNN,8,8,27.1,3.8,3.75,2.43,6644,...,745616339,87891,G HORNE,520.0,Grade 5,Cannington,12.0,Cannington520,2021-12-31,7
782998,485659451,4,WOOD FIRE,3,3,32.1,4.1,3.75,0.14,3233,...,745616339,68549,C HALSE,520.0,Grade 5,Cannington,12.0,Cannington520,2021-12-31,7
782999,528381655,5,TRENDING QUARTER,6,6,31.8,16.2,5.25,1.43,4566,...,745616339,83581,J DAILLY,520.0,Grade 5,Cannington,12.0,Cannington520,2021-12-31,7
783000,537992387,6,ELITE WEAPON,1,1,26.7,2.9,5.25,0.00,1455,...,745616339,293372,S WILLIAMS,520.0,Grade 5,Cannington,12.0,Cannington520,2021-12-31,7


Take only columns of interest, and merge our features to the dataframe

In [9]:
def merge_feature(frame, csv_path, keys, drop_cols=None):
    """Left-join one pre-computed feature table onto ``frame``.

    Parameters
    ----------
    frame : pd.DataFrame
        Table to enrich; every one of its rows is kept.
    csv_path : str
        Location of the feature CSV.
    keys : str or list of str
        Column(s) present in both tables to join on.
    drop_cols : list of str, optional
        Feature-file columns to discard before joining.

    Returns
    -------
    pd.DataFrame
        ``frame`` with the feature columns appended (NaN where no key matches).

    Raises
    ------
    ValueError
        If the join changes the row count. A left join never drops rows, so a
        change means the feature file holds several rows for one key and
        runners would be silently duplicated.
    """
    feature = pd.read_csv(csv_path)
    if drop_cols:
        feature = feature.drop(columns=drop_cols)
    merged = frame.merge(feature, on=keys, how='left')
    if len(merged) != len(frame):
        raise ValueError(
            "Merging {} on {} changed the row count from {} to {}".format(
                csv_path, keys, len(frame), len(merged)))
    return merged


# Work on a copy restricted to the columns of interest, leaving df_raw untouched
col_list = ['FasttrackDogId', 'StartPrice', 'Place', 'Box', 'Weight', 'FasttrackRaceId',
            'TrainerId', 'TrackDist', 'RaceDate', 'FieldSize']
df = df_raw[col_list].copy()

# Read in features and merge to main dataframe

# Trainer ELO Rating
df = merge_feature(df, './data/features/trainer-elo.csv',
                   ['TrainerId', 'FasttrackRaceId', 'RaceDate'])

# Mean SpeedNorm by Weight: bucket Weight into deciles numbered 1..10, then
# look up the mean for each bucket
df['WeightQuantile'] = pd.qcut(df['Weight'], 10, labels=False)+1
df = merge_feature(df, './data/features/mean-speednorm-by-weight.csv',
                   ['WeightQuantile'], drop_cols=['WeightQuantile_std'])

# Mean SpeedNorm by Track Distance Box
df = merge_feature(df, './data/features/mean-speednorm-by-trackdistbox.csv',
                   ['TrackDist', 'Box'], drop_cols=['TrackDistBox_std'])

# EMA SpeedNorm by Greyhound
df = merge_feature(df, './data/features/ema-speednorm-by-greyhound.csv',
                   ['FasttrackDogId', 'FasttrackRaceId', 'RaceDate'])

# EMA SplitMargin by Greyhound
df = merge_feature(df, './data/features/ema-split-margin-by-greyhound.csv',
                   ['FasttrackDogId', 'FasttrackRaceId', 'TrackDist'])

# SplitMargin importance by (track, distance)
df = merge_feature(df, './data/features/split-margin-importance-by-trackdist.csv',
                   'TrackDist')

# Weight each greyhound's split-margin EMA by the track/distance importance
df['SplitMarginMultiply'] = df['SplitMarginQuantileEMA']*df['TrackSplitMarginQuantile']

display(df)

Unnamed: 0,FasttrackDogId,StartPrice,Place,Box,Weight,FasttrackRaceId,TrainerId,TrackDist,RaceDate,FieldSize,TrainerRating,WeightQuantile,WeightQuantile_mean,TrackDistBox_mean,SpeedNorm_EMA,SplitMarginQuantileEMA,TrackSplitMarginQuantile,SplitMarginMultiply
0,157500927,2.4,1,1,27.4,335811282,7683,Bendigo500,2018-07-01,6,1000.000000,3,-0.080413,,,,2.0,
1,1820620018,6.3,2,2,32.8,335811282,137227,Bendigo500,2018-07-01,6,1000.000000,8,0.113288,,,,2.0,
2,1950680026,9.3,3,6,25.5,335811282,132763,Bendigo500,2018-07-01,6,1000.000000,1,-0.163575,,,,2.0,
3,1524380048,9.1,4,7,32.2,335811282,116605,Bendigo500,2018-07-01,6,1000.000000,7,0.066307,,,,2.0,
4,124225458,3.4,5,4,28.9,335811282,132763,Bendigo500,2018-07-01,6,1000.000000,4,-0.057036,,,,2.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
782997,491585906,3.8,3,8,27.1,745616339,87891,Cannington520,2021-12-31,7,-385.966583,3,-0.080413,-0.043643,0.712705,5.165463,2.0,10.330926
782998,485659451,4.1,4,3,32.1,745616339,68549,Cannington520,2021-12-31,7,-337.539747,7,0.066307,-0.004835,0.605300,7.604727,2.0,15.209454
782999,528381655,16.2,5,6,31.8,745616339,83581,Cannington520,2021-12-31,7,-536.354620,7,0.066307,-0.021015,0.167931,4.866273,2.0,9.732546
783000,537992387,2.9,6,1,26.7,745616339,293372,Cannington520,2021-12-31,7,-300.109813,2,-0.104427,0.148961,0.497658,8.269239,2.0,16.538479


Take only races with a full field (8 greyhounds), and of those keep only races where we have information on every greyhound. Note that this significantly reduces our sample size (both operations cause about 2/3 of the data to be removed).

In [10]:
# Keep full fields only: races recorded with exactly 8 runners
is_full_field = df['FieldSize'].eq(8)
df = df.loc[is_full_field]

# A race is unusable if any of its runners is missing any of these values
required_cols = ['FasttrackRaceId', 'TrainerRating', 'WeightQuantile_mean',
                 'TrackDistBox_mean', 'SpeedNorm_EMA', 'SplitMarginMultiply']
row_has_gap = df[required_cols].isna().any(axis=1)

# Ids of every race containing at least one incomplete runner
list_of_races = df.loc[row_has_gap, 'FasttrackRaceId'].unique()

# Drop those races in their entirety, so the remaining races keep all runners
in_incomplete_race = df['FasttrackRaceId'].isin(list_of_races)
df = df.loc[~in_incomplete_race]

display(df)

Unnamed: 0,FasttrackDogId,StartPrice,Place,Box,Weight,FasttrackRaceId,TrainerId,TrackDist,RaceDate,FieldSize,TrainerRating,WeightQuantile,WeightQuantile_mean,TrackDistBox_mean,SpeedNorm_EMA,SplitMarginQuantileEMA,TrackSplitMarginQuantile,SplitMarginMultiply
3308,1963750031,5.8,1,2,29.8,337082320,68549,Bendigo425,2018-07-06,8,1026.605267,5,-0.006573,0.067085,2.045300,10.000000,4.0,40.000000
3309,149399734,8.2,2,6,33.3,337082320,108283,Bendigo425,2018-07-06,8,1028.168607,9,0.115418,-0.049565,2.206318,10.000000,4.0,40.000000
3310,1729820019,3.6,3,7,34.4,337082320,115064,Bendigo425,2018-07-06,8,1163.701658,9,0.115418,-0.037729,2.530805,10.000000,4.0,40.000000
3311,157500921,11.0,4,4,31.6,337082320,7683,Bendigo425,2018-07-06,8,994.665471,7,0.066307,-0.014121,2.395203,5.000000,4.0,20.000000
3312,1486670016,6.1,5,8,29.4,337082320,112806,Bendigo425,2018-07-06,8,1028.152647,5,-0.006573,0.076176,2.045300,10.000000,4.0,40.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
782983,514248948,10.1,4,4,28.5,745616337,83581,Cannington520,2021-12-31,8,-536.354620,4,-0.057036,-0.025759,0.018879,6.222584,2.0,12.445168
782984,491176598,5.9,5,6,26.6,745616337,87891,Cannington520,2021-12-31,8,-385.966583,2,-0.104427,-0.021015,0.034358,4.671660,2.0,9.343319
782985,491176074,10.9,6,1,33.4,745616337,83951,Cannington520,2021-12-31,8,-499.940634,9,0.115418,0.148961,-0.768727,4.669291,2.0,9.338583
782986,469847830,19.6,7,5,33.5,745616337,83516,Cannington520,2021-12-31,8,-465.977139,9,0.115418,-0.045962,-0.329491,2.233334,2.0,4.466668


Normalise particular columns by race

In [11]:
# df is a filtered selection of an earlier frame; take an explicit copy so that
# adding columns below cannot trigger pandas' SettingWithCopyWarning
df = df.copy()

# Divisor applied to the mean-centred SplitMarginMultiply values
SPLIT_MARGIN_SCALE = 100.

# normalise columns by race: z-score each runner against the others in its race
norm_list = ['TrainerRating', 'SpeedNorm_EMA']
# When every runner in a race shares the same value the spread is zero, so
# zscore divides 0 by 0 and returns NaN for that race. The NaNs are kept as-is;
# only the resulting numpy RuntimeWarning is silenced, and only for this block.
with np.errstate(divide='ignore', invalid='ignore'):
    for col in norm_list:
        df[col + '_Z'] = df.groupby('FasttrackRaceId')[col].transform(zscore)

# Despite the _Z suffix this is not a z-score: values are centred on the race
# mean and divided by a fixed constant rather than by the race's std
df['SplitMarginMultiply_Z'] = df.groupby('FasttrackRaceId')['SplitMarginMultiply'].transform(
    lambda x: (x - x.mean())/SPLIT_MARGIN_SCALE)
display(df)

  return (a - mns) / sstd


Unnamed: 0,FasttrackDogId,StartPrice,Place,Box,Weight,FasttrackRaceId,TrainerId,TrackDist,RaceDate,FieldSize,...,WeightQuantile,WeightQuantile_mean,TrackDistBox_mean,SpeedNorm_EMA,SplitMarginQuantileEMA,TrackSplitMarginQuantile,SplitMarginMultiply,TrainerRating_Z,SpeedNorm_EMA_Z,SplitMarginMultiply_Z
3308,1963750031,5.8,1,2,29.8,337082320,68549,Bendigo425,2018-07-06,8,...,5,-0.006573,0.067085,2.045300,10.000000,4.0,40.000000,-0.270397,-0.660397,0.040000
3309,149399734,8.2,2,6,33.3,337082320,108283,Bendigo425,2018-07-06,8,...,9,0.115418,-0.049565,2.206318,10.000000,4.0,40.000000,-0.248303,0.109199,0.040000
3310,1729820019,3.6,3,7,34.4,337082320,115064,Bendigo425,2018-07-06,8,...,9,0.115418,-0.037729,2.530805,10.000000,4.0,40.000000,1.667137,1.660112,0.040000
3311,157500921,11.0,4,4,31.6,337082320,7683,Bendigo425,2018-07-06,8,...,7,0.066307,-0.014121,2.395203,5.000000,4.0,20.000000,-0.721790,1.011989,-0.160000
3312,1486670016,6.1,5,8,29.4,337082320,112806,Bendigo425,2018-07-06,8,...,5,-0.006573,0.076176,2.045300,10.000000,4.0,40.000000,-0.248528,-0.660397,0.040000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
782983,514248948,10.1,4,4,28.5,745616337,83581,Cannington520,2021-12-31,8,...,4,-0.057036,-0.025759,0.018879,6.222584,2.0,12.445168,-1.343757,0.243479,0.033573
782984,491176598,5.9,5,6,26.6,745616337,87891,Cannington520,2021-12-31,8,...,2,-0.104427,-0.021015,0.034358,4.671660,2.0,9.343319,0.730914,0.282581,0.002554
782985,491176074,10.9,6,1,33.4,745616337,83951,Cannington520,2021-12-31,8,...,9,0.115418,0.148961,-0.768727,4.669291,2.0,9.338583,-0.841410,-1.746098,0.002507
782986,469847830,19.6,7,5,33.5,745616337,83516,Cannington520,2021-12-31,8,...,9,0.115418,-0.045962,-0.329491,2.233334,2.0,4.466668,-0.372868,-0.636540,-0.046212


Remove the small number of rows containing NaN values (~20 rows), split the data into train and test sets, and train a logistic regression model

In [12]:
# First race date that belongs to the test set. RaceDate is compared as text,
# which orders chronologically for the 'YYYY-MM-DD' values shown above.
TEST_START_DATE = '2021-03-01'

# Create 'win' column for target: 1 for a first-place finish, else 0.
# assign() returns a new frame, so no column is written onto a filtered selection.
df = df.assign(Win=(df['Place'] == 1).astype(int))

# Remove rows with a NaN in any column, then order chronologically
df = df.dropna()
df = df.sort_values(by=["RaceDate", "FasttrackRaceId"], ascending=True)

# Split into train and test sets by date, so the model is scored on later races
train_data = df[df['RaceDate'] < TEST_START_DATE].reset_index(drop = True)
test_data = df[df['RaceDate'] >= TEST_START_DATE].reset_index(drop = True)

# Feature columns
feature_list = ['WeightQuantile_mean', 'TrackDistBox_mean', 
                'TrainerRating_Z', 'SpeedNorm_EMA_Z', 'SplitMarginMultiply_Z']

# split into features and labels
train_x, train_y = train_data[feature_list], train_data['Win']
test_x, test_y = test_data[feature_list], test_data['Win']

# Train model on training data
logit_model = LogisticRegression()
logit_model.fit(train_x, train_y)

# Column 1 of predict_proba is the probability of the positive class (Win == 1)
test_data['prob_unscaled'] = logit_model.predict_proba(test_x)[:,1]

# Scale the raw model output so the probabilities in each race sum to unity.
# transform() returns a result indexed like test_data, so the column assignment
# aligns row-for-row; groupby.apply() prepends the group key to the index on
# pandas >= 2.0 and no longer aligns.
race_totals = test_data.groupby('FasttrackRaceId')['prob_unscaled'].transform('sum')
test_data['prob_scaled'] = test_data['prob_unscaled'] / race_totals

display(test_data)

Unnamed: 0,FasttrackDogId,StartPrice,Place,Box,Weight,FasttrackRaceId,TrainerId,TrackDist,RaceDate,FieldSize,...,SpeedNorm_EMA,SplitMarginQuantileEMA,TrackSplitMarginQuantile,SplitMarginMultiply,TrainerRating_Z,SpeedNorm_EMA_Z,SplitMarginMultiply_Z,Win,prob_unscaled,prob_scaled
0,414325010,1.9,1,5,28.8,643554741,68481,Cannington275,2021-03-01,8,...,0.580074,8.240245,8.0,65.921963,0.999805,2.265631,0.234516,1,0.479793,0.473434
1,221780686,12.6,2,2,24.8,643554741,69407,Cannington275,2021-03-01,8,...,-0.203930,3.186345,8.0,25.490764,-1.287367,-0.098751,-0.169796,0,0.037122,0.036630
2,282125669,9.0,3,6,30.5,643554741,99115,Cannington275,2021-03-01,8,...,0.045358,4.787651,8.0,38.301209,-0.042611,0.653046,-0.041691,0,0.095004,0.093745
3,289290525,15.0,4,8,30.6,643554741,102996,Cannington275,2021-03-01,8,...,-0.359230,5.597088,8.0,44.776703,0.691113,-0.567101,0.023063,0,0.100276,0.098947
4,315938138,9.5,5,4,32.8,643554741,97945,Cannington275,2021-03-01,8,...,-0.336849,4.149843,8.0,33.198744,1.013074,-0.499604,-0.092716,0,0.091693,0.090478
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56107,331345891,39.5,4,8,29.1,747048620,63454,Bendigo425,2021-12-31,8,...,-0.153072,5.385881,4.0,21.543525,-0.085641,-0.578115,-0.039361,0,0.075724,0.072758
56108,530060512,5.7,5,3,26.5,747048620,222983,Bendigo425,2021-12-31,8,...,0.481113,7.894103,4.0,31.576413,1.015585,0.796812,0.060968,0,0.177571,0.170615
56109,359294661,42.5,6,6,28.4,747048620,130810,Bendigo425,2021-12-31,8,...,-0.329940,3.232219,4.0,12.928875,-0.922238,-0.961568,-0.125508,0,0.024628,0.023663
56110,373611738,8.6,7,4,28.6,747048620,129328,Bendigo425,2021-12-31,8,...,-0.166748,6.274972,4.0,25.099889,0.750009,-0.607763,-0.003798,0,0.083096,0.079841


Compare tipping results between prelim logistic regression model and SP

In [13]:
# Compare tipping results from model favourite and starting price favourite.
# transform() broadcasts each race's best value back onto its own rows, so the
# comparison stays aligned with test_data; assigning the output of
# groupby.apply() fails to align on pandas >= 2.0.
best_prob = test_data.groupby('FasttrackRaceId')['prob_scaled'].transform('max')
shortest_price = test_data.groupby('FasttrackRaceId')['StartPrice'].transform('min')

# Model tip: the runner with the highest scaled probability in its race.
# Odds tip: the runner with the lowest starting price in its race.
# Ties flag every tied runner, so such a race counts as correct if any of the
# tied runners wins.
test_data['model_win_prediction'] = test_data['prob_scaled'] == best_prob
test_data['odds_win_prediction'] = test_data['StartPrice'] == shortest_price

n_races = test_data['FasttrackRaceId'].nunique()
is_winner = test_data['Win'] == 1

print("Model predicts the winner in {:.2%} of races".format(
    (test_data['model_win_prediction'] & is_winner).sum() / n_races
    ))
print("Starting Price Odds predicts the winner in {:.2%} of races".format(
    (test_data['odds_win_prediction'] & is_winner).sum() / n_races
    ))

Model predicts the winner in 31.51% of races
Starting Price Odds predicts the winner in 37.70% of races


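Feature importance (raw logistic regression coefficients; the features are on different scales, so the magnitudes are not directly comparable)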

In [14]:
# First row of the fitted coefficient matrix, printed above the feature names
# the model was trained on
feature_importance = logit_model.coef_[0]
print(feature_importance, feature_list, sep='\n')

[0.68907497 2.91024744 0.37021877 0.55078879 2.96461149]
['WeightQuantile_mean', 'TrackDistBox_mean', 'TrainerRating_Z', 'SpeedNorm_EMA_Z', 'SplitMarginMultiply_Z']
