## Trainer Elo Feature

We apply a multi-elo model to our greyhound racing data, and create a corresponding ELO rating feature for each trainer. The package can be installed from here: https://github.com/djcunningham0/multielo

When a model is being trained, it may be of interest to tune the hyper-parameters used in the multi-elo model.

----

Import libraries, packages, and raw greyhound data

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats as stats
import os
import decouple
import sys
config = decouple.AutoConfig(' ')
os.chdir(config('ROOT_DIRECTORY'))
sys.path.insert(0, '')

from scipy.stats import zscore
from multielo import MultiElo, Player, Tracker
from multielo.multielo import defaults

# Read in data
df_raw = pd.read_csv('./data/clean/dog_results.csv', dtype={"TrainerId": str, "Place": str})

# Take only data up to the end of 2018 (for efficiency)
df_raw = df_raw[pd.to_datetime(df_raw['RaceDate'], format='%Y-%m-%d').dt.year <= 2018]

display(df_raw)

Unnamed: 0,FasttrackDogId,Place,DogName,Box,Rug,Weight,StartPrice,Margin1,Margin2,PIR,...,Prizemoney,FasttrackRaceId,TrainerId,TrainerName,Distance,RaceGrade,Track,RaceNum,TrackDist,RaceDate
0,157500927,1,RAINE ALLEN,1,1,27.4,2.4,2.30,,Q/111,...,0.0,335811282,7683,C GRENFELL,500,Restricted Win,Bendigo,1,Bendigo500,2018-07-01
1,1820620018,2,SURF A LOT,2,2,32.8,6.3,2.30,2.30,M/332,...,0.0,335811282,137227,C TYLEY,500,Restricted Win,Bendigo,1,Bendigo500,2018-07-01
2,1950680026,3,PINGIN' BEE,6,6,25.5,9.3,3.84,1.54,S/443,...,0.0,335811282,132763,P DAPIRAN,500,Restricted Win,Bendigo,1,Bendigo500,2018-07-01
3,1524380048,4,LUCAS THE GREAT,7,7,32.2,9.1,5.27,1.43,M/655,...,0.0,335811282,116605,E HAMILTON,500,Restricted Win,Bendigo,1,Bendigo500,2018-07-01
4,124225458,5,QUAVO,4,4,28.9,3.4,5.56,0.29,M/766,...,0.0,335811282,132763,P DAPIRAN,500,Restricted Win,Bendigo,1,Bendigo500,2018-07-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
109664,196371033,3,OBLIVINATOR,4,4,27.8,6.5,2.25,1.00,2,...,250.0,397584624,84310,C WILLS,395,Grade 5,Albion Park,10,Albion Park395,2018-12-31
109665,2052170012,4,STRUDEL,8,8,24.4,10.0,4.25,1.71,6,...,125.0,397584624,59065,J LINNAN,395,Grade 5,Albion Park,10,Albion Park395,2018-12-31
109666,2052170009,5,STROGANOFF,6,6,29.7,26.0,4.75,0.71,7,...,0.0,397584624,59065,J LINNAN,395,Grade 5,Albion Park,10,Albion Park395,2018-12-31
109667,222091369,6,GYPSY WILD,5,5,26.7,7.0,5.50,0.71,5,...,0.0,397584624,124111,R HAZELGROVE,395,Grade 5,Albion Park,10,Albion Park395,2018-12-31


Create any additional columns that will be of use to measure against Trainer ELO feature

In [7]:
# Copy raw data
df = df_raw.copy()

# Determine field size for each race
df["FieldSize"] = df.groupby(["FasttrackRaceId"])["FasttrackDogId"].transform('nunique')

# Take only fields > 1
df = df[df["FieldSize"] > 1]

# Calculate average speed
df["Speed"] = df["Distance"]/df["RunTime"]

# Remove NaN tracks
df = df[~df.TrackDist.isna()]

# Normalise speed by track
df["SpeedNorm"] = df.groupby("TrackDist")["Speed"].transform(lambda x: zscore(x, nan_policy = 'omit'))

display(df.head())

Unnamed: 0,FasttrackDogId,Place,DogName,Box,Rug,Weight,StartPrice,Margin1,Margin2,PIR,...,TrainerName,Distance,RaceGrade,Track,RaceNum,TrackDist,RaceDate,FieldSize,Speed,SpeedNorm
0,157500927,1,RAINE ALLEN,1,1,27.4,2.4,2.3,,Q/111,...,C GRENFELL,500,Restricted Win,Bendigo,1,Bendigo500,2018-07-01,6,17.445918,0.708076
1,1820620018,2,SURF A LOT,2,2,32.8,6.3,2.3,2.3,M/332,...,C TYLEY,500,Restricted Win,Bendigo,1,Bendigo500,2018-07-01,6,17.349063,0.284863
2,1950680026,3,PINGIN' BEE,6,6,25.5,9.3,3.84,1.54,S/443,...,P DAPIRAN,500,Restricted Win,Bendigo,1,Bendigo500,2018-07-01,6,17.283097,-0.003381
3,1524380048,4,LUCAS THE GREAT,7,7,32.2,9.1,5.27,1.43,M/655,...,E HAMILTON,500,Restricted Win,Bendigo,1,Bendigo500,2018-07-01,6,17.223562,-0.263526
4,124225458,5,QUAVO,4,4,28.9,3.4,5.56,0.29,M/766,...,P DAPIRAN,500,Restricted Win,Bendigo,1,Bendigo500,2018-07-01,6,17.211704,-0.315339


Create a temporary dataframe that is in the appropriate format for ELO rating package

In [8]:
# Take a temp frame with date, race id, dog id, and place
df_temp = df.copy()[["RaceDate", "FasttrackRaceId", "TrainerId", "Place"]]
df_temp = df_temp.sort_values(by=["RaceDate", "FasttrackRaceId"], ascending=True)

# Remove place values of 0 (did not finish?)
df_temp = df_temp[df_temp["Place"] != 0]

# Rename 'Place' column values
df_temp["Place"] = df_temp["Place"].apply(lambda x: "Place"+str(int(x)))

# Pivot the dataframe such that columns are places for each race, and the values are the trainer id's
df_temp = df_temp.reset_index().pivot(index=["index", "RaceDate", "FasttrackRaceId"], columns="Place", values="TrainerId")

# Fill NaN values with empty strings ('') so we can squash each race together
df_temp = df_temp.reset_index().rename_axis(None, axis=1).drop("index", axis=1).fillna('')

# Group by date and race id and squash
col_list = df_temp.columns[2:]
for col in col_list:
    df_temp[col] = df_temp.groupby(["RaceDate", "FasttrackRaceId"],as_index=False)[col].transform(lambda x: ''.join(x))

# Take only one row for each race
df_temp = df_temp.drop_duplicates(subset="FasttrackRaceId")

# Replace empty place's with None - recognised as empty by multi elo library
for col in col_list:
    df_temp[col] = df_temp[col].apply(lambda x: None if x == '' else x)

# Change index to race id and sort values by time
df_temp = df_temp.set_index('FasttrackRaceId').rename(columns={"RaceDate": "date"})
df_temp = df_temp.sort_values(by="date", ascending=True)

display(df_temp)

Unnamed: 0_level_0,date,Place1,Place2,Place3,Place4,Place5,Place6,Place7,Place8
FasttrackRaceId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
335811282,2018-07-01,7683,137227,132763,116605,132763,121129,,
334309951,2018-07-01,76940,97373,85046,187428,97376,64611,61107,97869
334309952,2018-07-01,92323,92323,106474,67225,9125,9125,92923,273541
334309953,2018-07-01,92923,35220,102299,97678,279950,28394,81875,72727
334309954,2018-07-01,97376,108359,82782,62089,38914,79104,67225,136733
...,...,...,...,...,...,...,...,...,...
397073100,2018-12-31,137143,10897,266799,212582,131763,129145,6905,185810
397073099,2018-12-31,118529,68532,100313,130148,129725,67709,108358,117228
397073102,2018-12-31,248478,68044,130427,108219,66732,111351,129899,108840
392592906,2018-12-31,100284,88389,100284,88389,64195,106495,88389,88389


Gather ELO ratings for each trainer (over time) and merge back into original dataframe

In [9]:
# Calculate historical ELO ratings of each trainer
tracker = Tracker()
tracker.process_data(df_temp)
df_trainers = tracker.get_history_df().copy().rename(columns={"player_id":"TrainerId", "date": "RaceDate"})

# Take first ELO rating of trainer for that date (if trainer has had more than one race for a given date)
df_trainers = df_trainers.sort_values(by="RaceDate", ascending=True).drop_duplicates(["TrainerId", "RaceDate"], keep='first')

# Merge to original dataframe
df["TrainerId"] = df["TrainerId"].astype(str)
df = df.merge(df_trainers, on=["TrainerId", "RaceDate"], how="left").rename(columns={'rating': 'TrainerRating'})

# Drop NaN
df = df[~df["TrainerRating"].isna()]

# Shift Trainer ELO Rating (prevent data leakage)
df["TrainerRating"] = df.groupby("TrainerId")["TrainerRating"].transform(lambda x: x.shift(1))

display(df.sort_values(by='RaceDate', ascending=True))

Unnamed: 0,FasttrackDogId,Place,DogName,Box,Rug,Weight,StartPrice,Margin1,Margin2,PIR,...,Distance,RaceGrade,Track,RaceNum,TrackDist,RaceDate,FieldSize,Speed,SpeedNorm,TrainerRating
0,157500927,1,RAINE ALLEN,1,1,27.4,2.4,2.30,,Q/111,...,500,Restricted Win,Bendigo,1,Bendigo500,2018-07-01,6,17.445918,0.708076,
337,1550690057,4,VINAKA OSCAR,5,5,32.1,8.2,2.00,0.86,4,...,331,GRADE 5 PATHWAY NON-PENALTY,Albion Park,12,Albion Park331,2018-07-01,8,16.930946,0.250284,1000.536652
336,1277000126,3,SPICE BUCKET,7,7,26.7,9.5,1.00,0.86,5,...,331,GRADE 5 PATHWAY NON-PENALTY,Albion Park,12,Albion Park331,2018-07-01,8,16.983068,0.404356,
335,1182140039,2,SAGE OF OMAHA,1,1,32.3,2.0,0.25,0.29,2,...,331,GRADE 5 PATHWAY NON-PENALTY,Albion Park,12,Albion Park331,2018-07-01,8,17.035512,0.559380,
334,2014680029,1,DI CANIO,8,8,32.2,5.8,0.25,,3,...,331,GRADE 5 PATHWAY NON-PENALTY,Albion Park,12,Albion Park331,2018-07-01,8,17.053065,0.611268,988.827900
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
109262,231591759,3,LUCA'S ENTITY,8,8,31.7,2.6,5.20,2.01,M/44,...,390,Maiden,Shepparton,2,Shepparton390,2018-12-31,8,17.105263,-0.043924,1057.149204
109263,235407876,4,BACK YA SELF,3,3,33.2,2.6,5.99,0.79,M/33,...,390,Maiden,Shepparton,2,Shepparton390,2018-12-31,8,17.060367,-0.193888,993.503788
109264,212327818,5,ASTON HAZARD,5,5,31.5,13.9,6.53,0.54,M/55,...,390,Maiden,Shepparton,2,Shepparton390,2018-12-31,8,17.030568,-0.293428,947.644465
109284,1316500023,3,MOLLY'S MONEY,6,6,27.0,7.6,7.29,4.66,M/333,...,450,Grade 7,Shepparton,5,Shepparton450,2018-12-31,7,17.214996,-0.320529,987.658882


In [11]:
display(df.TrainerRating.corr(df.SpeedNorm))

0.2572296390011069