## Trainer Elo Feature

We apply a multi-elo model to our greyhound racing data, and create a corresponding ELO rating feature for each trainer. The package can be installed from here: https://github.com/djcunningham0/multielo

When a model is being trained, it may be of interest to tune the hyper-parameters used in the multi-elo model.

----

Import libraries, packages, and raw greyhound data

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats as stats
import os
import decouple
import sys
config = decouple.AutoConfig(' ')
os.chdir(config('ROOT_DIRECTORY'))
sys.path.insert(0, '')

from scipy.stats import zscore
from multielo import MultiElo, Player, Tracker
from multielo.multielo import defaults

# Read in data
df_raw = pd.read_csv('./data/clean/dog_results.csv', dtype={"TrainerId": str, "Place": str})

display(df_raw)

Unnamed: 0,FasttrackDogId,Place,DogName,Box,Rug,Weight,StartPrice,Margin1,Margin2,PIR,...,FasttrackRaceId,TrainerId,TrainerName,Distance,RaceGrade,Track,RaceNum,TrackDist,RaceDate,FieldSize
0,157500927,1,RAINE ALLEN,1,1,27.4,2.4,2.30,,Q/111,...,335811282,7683,C GRENFELL,500.0,Restricted Win,Bendigo,1.0,Bendigo500,2018-07-01,6
1,1820620018,2,SURF A LOT,2,2,32.8,6.3,2.30,2.30,M/332,...,335811282,137227,C TYLEY,500.0,Restricted Win,Bendigo,1.0,Bendigo500,2018-07-01,6
2,1950680026,3,PINGIN' BEE,6,6,25.5,9.3,3.84,1.54,S/443,...,335811282,132763,P DAPIRAN,500.0,Restricted Win,Bendigo,1.0,Bendigo500,2018-07-01,6
3,1524380048,4,LUCAS THE GREAT,7,7,32.2,9.1,5.27,1.43,M/655,...,335811282,116605,E HAMILTON,500.0,Restricted Win,Bendigo,1.0,Bendigo500,2018-07-01,6
4,124225458,5,QUAVO,4,4,28.9,3.4,5.56,0.29,M/766,...,335811282,132763,P DAPIRAN,500.0,Restricted Win,Bendigo,1.0,Bendigo500,2018-07-01,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
782997,491585906,3,GLORIOUS GUNN,8,8,27.1,3.8,3.75,2.43,6644,...,745616339,87891,G HORNE,520.0,Grade 5,Cannington,12.0,Cannington520,2021-12-31,7
782998,485659451,4,WOOD FIRE,3,3,32.1,4.1,3.75,0.14,3233,...,745616339,68549,C HALSE,520.0,Grade 5,Cannington,12.0,Cannington520,2021-12-31,7
782999,528381655,5,TRENDING QUARTER,6,6,31.8,16.2,5.25,1.43,4566,...,745616339,83581,J DAILLY,520.0,Grade 5,Cannington,12.0,Cannington520,2021-12-31,7
783000,537992387,6,ELITE WEAPON,1,1,26.7,2.9,5.25,0.00,1455,...,745616339,293372,S WILLIAMS,520.0,Grade 5,Cannington,12.0,Cannington520,2021-12-31,7


Create any additional columns that will be of use to measure against Trainer ELO feature

In [4]:
# Copy raw data
df = df_raw.copy()

# Calculate average speed
df["Speed"] = df["Distance"]/df["RunTime"]

# Normalise speed by track
df["SpeedNorm"] = df.groupby("TrackDist")["Speed"].transform(lambda x: zscore(x, nan_policy = 'omit'))

display(df)

Unnamed: 0,FasttrackDogId,Place,DogName,Box,Rug,Weight,StartPrice,Margin1,Margin2,PIR,...,TrainerName,Distance,RaceGrade,Track,RaceNum,TrackDist,RaceDate,FieldSize,Speed,SpeedNorm
0,157500927,1,RAINE ALLEN,1,1,27.4,2.4,2.30,,Q/111,...,C GRENFELL,500.0,Restricted Win,Bendigo,1.0,Bendigo500,2018-07-01,6,17.445918,0.467375
1,1820620018,2,SURF A LOT,2,2,32.8,6.3,2.30,2.30,M/332,...,C TYLEY,500.0,Restricted Win,Bendigo,1.0,Bendigo500,2018-07-01,6,17.349063,0.052453
2,1950680026,3,PINGIN' BEE,6,6,25.5,9.3,3.84,1.54,S/443,...,P DAPIRAN,500.0,Restricted Win,Bendigo,1.0,Bendigo500,2018-07-01,6,17.283097,-0.230144
3,1524380048,4,LUCAS THE GREAT,7,7,32.2,9.1,5.27,1.43,M/655,...,E HAMILTON,500.0,Restricted Win,Bendigo,1.0,Bendigo500,2018-07-01,6,17.223562,-0.485192
4,124225458,5,QUAVO,4,4,28.9,3.4,5.56,0.29,M/766,...,P DAPIRAN,500.0,Restricted Win,Bendigo,1.0,Bendigo500,2018-07-01,6,17.211704,-0.535991
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
782997,491585906,3,GLORIOUS GUNN,8,8,27.1,3.8,3.75,2.43,6644,...,G HORNE,520.0,Grade 5,Cannington,12.0,Cannington520,2021-12-31,7,17.077176,0.274907
782998,485659451,4,WOOD FIRE,3,3,32.1,4.1,3.75,0.14,3233,...,C HALSE,520.0,Grade 5,Cannington,12.0,Cannington520,2021-12-31,7,17.071569,0.254759
782999,528381655,5,TRENDING QUARTER,6,6,31.8,16.2,5.25,1.43,4566,...,J DAILLY,520.0,Grade 5,Cannington,12.0,Cannington520,2021-12-31,7,17.015707,0.054004
783000,537992387,6,ELITE WEAPON,1,1,26.7,2.9,5.25,0.00,1455,...,S WILLIAMS,520.0,Grade 5,Cannington,12.0,Cannington520,2021-12-31,7,17.015707,0.054004


Create a temporary dataframe that is in the appropriate format for ELO rating package

In [5]:
# Take a temp frame with date, race id, dog id, and place
df_temp = df.copy()[["RaceDate", "FasttrackRaceId", "TrainerId", "Place"]]
df_temp = df_temp.sort_values(by=["RaceDate", "FasttrackRaceId"], ascending=True)

# Remove place values of 0 (did not finish?)
df_temp = df_temp[df_temp["Place"] != 0]

# Rename 'Place' column values
df_temp["Place"] = df_temp["Place"].apply(lambda x: "Place"+str(int(x)))

# Pivot the dataframe such that columns are places for each race, and the values are the trainer id's
df_temp = df_temp.reset_index().pivot(index=["index", "RaceDate", "FasttrackRaceId"], columns="Place", values="TrainerId")

# Fill NaN values with empty strings ('') so we can squash each race together
df_temp = df_temp.reset_index().rename_axis(None, axis=1).drop("index", axis=1).fillna('')

# Group by date and race id and squash
col_list = df_temp.columns[2:]
for col in col_list:
    df_temp[col] = df_temp.groupby(["RaceDate", "FasttrackRaceId"],as_index=False)[col].transform(lambda x: ''.join(x))

# Take only one row for each race
df_temp = df_temp.drop_duplicates(subset="FasttrackRaceId")

# Replace empty place's with None - recognised as empty by multi elo library
for col in col_list:
    df_temp[col] = df_temp[col].apply(lambda x: None if x == '' else x)

# Change index to race id and sort values by time
df_temp = df_temp.set_index('FasttrackRaceId').rename(columns={"RaceDate": "date"})
df_temp = df_temp.sort_values(by="date", ascending=True)

display(df_temp)

Unnamed: 0_level_0,date,Place1,Place2,Place3,Place4,Place5,Place6,Place7,Place8
FasttrackRaceId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
335811282,2018-07-01,7683,137227,132763,116605,132763,121129,,
334309951,2018-07-01,76940,97373,85046,187428,97376,64611,61107,97869
334309952,2018-07-01,92323,92323,106474,67225,9125,9125,92923,273541
334309953,2018-07-01,92923,35220,102299,97678,279950,28394,81875,72727
334309954,2018-07-01,97376,108359,82782,62089,38914,79104,67225,136733
...,...,...,...,...,...,...,...,...,...
747017057,2021-12-31,111979,130010,17228,136744,137791,,,
747017054,2021-12-31,11461,108266,107980,11461,116984,,,
747017052,2021-12-31,137791,11461,113393,109140,107925,138221,,
747018068,2021-12-31,101324,289343,16216,101324,100210,101324,,


Gather ELO ratings for each trainer (over time) and merge back into original dataframe

In [6]:
# Calculate historical ELO ratings of each trainer
tracker = Tracker()
tracker.process_data(df_temp)
df_trainers = tracker.get_history_df().copy().rename(columns={"player_id":"TrainerId", "date": "RaceDate"})

# Take first ELO rating of trainer for that date (if trainer has had more than one race for a given date)
df_trainers = df_trainers.sort_values(by="RaceDate", ascending=True).drop_duplicates(["TrainerId", "RaceDate"], keep='first')

# Merge to original dataframe
df["TrainerId"] = df["TrainerId"].astype(str)
df = df.merge(df_trainers, on=["TrainerId", "RaceDate"], how="left").rename(columns={'rating': 'TrainerRating'})

# Drop duplicate values
df = df.drop_duplicates(subset=['TrainerId', 'FasttrackRaceId', 'RaceDate'], keep='first')

# Shift Trainer ELO Rating (prevent data leakage)
df["TrainerRating"] = df.groupby("TrainerId")["TrainerRating"].transform(lambda x: x.shift(1))

# Take only columns of interest
df = df[['TrainerId', 'FasttrackRaceId', 'RaceDate', 'TrainerRating']]

# Fill missing ELO ratings with 1000 (starting ELO)
df['TrainerRating'] = df['TrainerRating'].fillna(float(1000))

# Sort values and reset index
df = df.sort_values(by=['RaceDate', 'FasttrackRaceId'], ascending=True).reset_index(drop=True)

display(df)

Unnamed: 0,TrainerId,FasttrackRaceId,RaceDate,TrainerRating
0,105986,334309830,2018-07-01,1000.000000
1,6934,334309830,2018-07-01,1000.000000
2,58389,334309830,2018-07-01,1000.000000
3,110650,334309830,2018-07-01,1000.000000
4,67940,334309830,2018-07-01,1000.000000
...,...,...,...,...
715316,135148,747048625,2021-12-31,964.443978
715317,112522,747048625,2021-12-31,816.049854
715318,130297,747048625,2021-12-31,756.506157
715319,130795,747048625,2021-12-31,805.502553


Save to ./data/features as a .csv

In [7]:
df.to_csv('./data/features/trainer-elo.csv', index=False)