## Split Margin Feature by Track, Distance

Here we'll create a feature that combines both the split margin importance by (track, distance), and the predicted initial acceleration of each greyhound (using their previous SplitMargin values).

We do perform some crude estimates here, but these can be adjusted for in the future.

----

Import libraries, packages, and greyhound data

In [4]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats as stats
import os
import decouple
import sys
config = decouple.AutoConfig(' ')
os.chdir(config('ROOT_DIRECTORY'))
sys.path.insert(0, '')

from scipy.stats import zscore
from multielo import MultiElo, Player, Tracker
from multielo.multielo import defaults

# Read in data
df_raw = pd.read_csv('./data/clean/dog_results.csv')

display(df_raw)

Unnamed: 0,FasttrackDogId,Place,DogName,Box,Rug,Weight,StartPrice,Margin1,Margin2,PIR,...,FasttrackRaceId,TrainerId,TrainerName,Distance,RaceGrade,Track,RaceNum,TrackDist,RaceDate,FieldSize
0,157500927,1,RAINE ALLEN,1,1,27.4,2.4,2.30,,Q/111,...,335811282,7683,C GRENFELL,500.0,Restricted Win,Bendigo,1.0,Bendigo500,2018-07-01,6
1,1820620018,2,SURF A LOT,2,2,32.8,6.3,2.30,2.30,M/332,...,335811282,137227,C TYLEY,500.0,Restricted Win,Bendigo,1.0,Bendigo500,2018-07-01,6
2,1950680026,3,PINGIN' BEE,6,6,25.5,9.3,3.84,1.54,S/443,...,335811282,132763,P DAPIRAN,500.0,Restricted Win,Bendigo,1.0,Bendigo500,2018-07-01,6
3,1524380048,4,LUCAS THE GREAT,7,7,32.2,9.1,5.27,1.43,M/655,...,335811282,116605,E HAMILTON,500.0,Restricted Win,Bendigo,1.0,Bendigo500,2018-07-01,6
4,124225458,5,QUAVO,4,4,28.9,3.4,5.56,0.29,M/766,...,335811282,132763,P DAPIRAN,500.0,Restricted Win,Bendigo,1.0,Bendigo500,2018-07-01,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
782997,491585906,3,GLORIOUS GUNN,8,8,27.1,3.8,3.75,2.43,6644,...,745616339,87891,G HORNE,520.0,Grade 5,Cannington,12.0,Cannington520,2021-12-31,7
782998,485659451,4,WOOD FIRE,3,3,32.1,4.1,3.75,0.14,3233,...,745616339,68549,C HALSE,520.0,Grade 5,Cannington,12.0,Cannington520,2021-12-31,7
782999,528381655,5,TRENDING QUARTER,6,6,31.8,16.2,5.25,1.43,4566,...,745616339,83581,J DAILLY,520.0,Grade 5,Cannington,12.0,Cannington520,2021-12-31,7
783000,537992387,6,ELITE WEAPON,1,1,26.7,2.9,5.25,0.00,1455,...,745616339,293372,S WILLIAMS,520.0,Grade 5,Cannington,12.0,Cannington520,2021-12-31,7


First, we will determine the SplitMargin importance by track and distance, split them into 10 quantiles, then merge back to the original dataframe.

In [16]:
# Copy raw dataframe
df = df_raw.copy()

# Remove NaN SplitMargin's for now
df = df[~df['SplitMargin'].isna()]

'''
Create a temporary dataframe that we will merge back to the original dataframe.
'''

df_temp = df.copy()

# Remove outlier values
upper_limit = df_temp.SplitMargin.median() + 3*df_temp.SplitMargin.mad()
lower_limit = df_temp.SplitMargin.median() - 3*df_temp.SplitMargin.mad()
df_temp = df_temp[df_temp['SplitMargin'].between(lower_limit, upper_limit)]

# Create a SplitMarkerPlace column
df_temp = df_temp.sort_values(by=['FasttrackRaceId', 'SplitMargin'], ascending=True)
df_temp['SplitMarginPlace'] = df_temp.groupby('FasttrackRaceId').cumcount()+1

# Create a SplitMarginWin column (who reach the split marker first)
df_temp['SplitMarginWin'] = (df_temp['SplitMarginPlace'] == 1).astype(int)

# Take only Greyhounds who reached split marker first, and calculate the win rate by track and distance
df_temp = df_temp[df_temp['SplitMarginWin'] == 1]
df_temp['Win'] = (df_temp['Place'] == 1).astype(int)
df_temp = df_temp.groupby('TrackDist', as_index=False).agg(NumberOfWins = ('Win', 'sum'),
                                                           SampleSize = ('Win', 'count'))
df_temp['WinRate'] = round(100*df_temp['NumberOfWins']/df_temp['SampleSize'], 2)

# Take only TrackDist with at least a sample size of 1000
df_temp = df_temp[df_temp['SampleSize'] >= 1000]
df_temp = df_temp.sort_values(by='WinRate')

# Break into Quantiles and Merge to original dataframe
df_temp['TrackSplitMarginQuantile'] = pd.qcut(df_temp['WinRate'], 10, labels=False)+1
df_temp = df_temp[['TrackDist', 'TrackSplitMarginQuantile']]
df = df.merge(df_temp, on=['TrackDist'], how='inner')

display(df)

Unnamed: 0,FasttrackDogId,Place,DogName,Box,Rug,Weight,StartPrice,Margin1,Margin2,PIR,...,TrainerId,TrainerName,Distance,RaceGrade,Track,RaceNum,TrackDist,RaceDate,FieldSize,TrackSplitMarginQuantile
0,157500927,1,RAINE ALLEN,1,1,27.4,2.4,2.30,,Q/111,...,7683,C GRENFELL,500.0,Restricted Win,Bendigo,1.0,Bendigo500,2018-07-01,6,3
1,1820620018,2,SURF A LOT,2,2,32.8,6.3,2.30,2.30,M/332,...,137227,C TYLEY,500.0,Restricted Win,Bendigo,1.0,Bendigo500,2018-07-01,6,3
2,1950680026,3,PINGIN' BEE,6,6,25.5,9.3,3.84,1.54,S/443,...,132763,P DAPIRAN,500.0,Restricted Win,Bendigo,1.0,Bendigo500,2018-07-01,6,3
3,1524380048,4,LUCAS THE GREAT,7,7,32.2,9.1,5.27,1.43,M/655,...,116605,E HAMILTON,500.0,Restricted Win,Bendigo,1.0,Bendigo500,2018-07-01,6,3
4,124225458,5,QUAVO,4,4,28.9,3.4,5.56,0.29,M/766,...,132763,P DAPIRAN,500.0,Restricted Win,Bendigo,1.0,Bendigo500,2018-07-01,6,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
562664,280075556,3,FLYING BELLA,8,8,31.4,7.2,1.00,0.01,M/33,...,54920,J BUCKLEY,385.0,Grade 5,Shepparton,12.0,Shepparton385,2021-12-30,7,7
562665,280075554,4,SWEET HARRIET,3,3,27.8,16.8,7.40,6.40,M/54,...,54920,J BUCKLEY,385.0,Grade 5,Shepparton,12.0,Shepparton385,2021-12-30,7,7
562666,465422502,5,ZIEMS PARK GIRL,6,6,30.4,5.9,7.51,0.11,M/65,...,121397,G KANTZIDIS,385.0,Grade 5,Shepparton,12.0,Shepparton385,2021-12-30,7,7
562667,276161759,6,RHYMES VALLEY,1,1,29.9,20.5,9.31,1.80,M/46,...,299690,M HETHERTON,385.0,Grade 5,Shepparton,12.0,Shepparton385,2021-12-30,7,7


We will now break each Speed and SplitMargin (we will invert it first, so that smaller SplitMargin values are large) into quantiles.

In [17]:
# Create a Speed column
df['Speed'] = df['Distance']/df['RunTime']

# Break Speed's and Split Margins into quantiles for each (Track, Distance)
df['SpeedQuantile'] = df.groupby('TrackDist')['Speed'].transform(lambda x: pd.qcut(x, 10, labels=False)+1)
df['SplitMargin'] = 1/df['SplitMargin']
df['SplitMarginQuantile'] = df.groupby('TrackDist')['SplitMargin'].transform(lambda x: pd.qcut(x, 10, labels=False)+1)

# Take EMA of SplitMarginQuantile and shift by one (prevent data leakage)
alpha_ = 0.2
df['SplitMarginQuantile'] = df.groupby('FasttrackDogId')['SplitMarginQuantile'].transform(lambda x: x.ewm(alpha=alpha_).mean().shift(1))

# Multiply the two quantiles
df['SplitMarginMultiply'] = df['TrackSplitMarginQuantile']*df['SplitMarginQuantile']

# Take only columns of interest
df = df[['FasttrackDogId', 'FasttrackRaceId', 'TrackDist', 'SplitMarginMultiply']]

display(df)

Unnamed: 0,FasttrackDogId,FasttrackRaceId,TrackDist,SplitMarginMultiply
0,157500927,335811282,Bendigo500,
1,1820620018,335811282,Bendigo500,
2,1950680026,335811282,Bendigo500,
3,1524380048,335811282,Bendigo500,
4,124225458,335811282,Bendigo500,
...,...,...,...,...
562664,280075556,746724990,Shepparton385,25.687341
562665,280075554,746724990,Shepparton385,38.676283
562666,465422502,746724990,Shepparton385,46.115955
562667,276161759,746724990,Shepparton385,28.339808


Save to ./data/features as a .csv

In [18]:
df.to_csv('./data/features/split-margin-by-trackdist.csv', index=False)