In [None]:
%pip install --quiet --upgrade pip 
%pip install numpy --quiet
%pip install Pandas --quiet
%pip install sklearn --quiet
%pip install ipywidgets --quiet

# Horse Racing Results Predictor using Linear Regression

Use linear regression (on standardised, degree-2 polynomial features) to determine whether we can reliably predict the **speed** of a horse in a given race from its previous performance, its jockey's record and the race conditions.

Data used here is derived from the features extracted by the [Feature Analysis](https://github.com/LeeSanderson/RacingData/blob/main/Data/FeatureAnalysis.ipynb) notebook.

In [None]:
import numpy as np
import pandas as pd
import math
from abc import ABC, abstractmethod
from datetime import datetime, date

In [None]:
# Load the per-runner feature table and parse the 'Off' column into datetimes.
OFF_FORMAT = '%Y-%m-%d %H:%M:%S'
races = pd.read_csv("Race_Features.csv")
races = races.assign(Off=pd.to_datetime(races['Off'], format=OFF_FORMAT))
races.columns

In [None]:
# Let wide / long frames display in full (up to 500 rows and columns).
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, 500)

# Identifying / outcome columns carried along for evaluation (not model inputs).
race_info = [
    'RaceId', 'CourseId', 'Off',
    'HorseId', 'HorseName', 'HorseCount',
    'JockeyId', 'JockeyName',
    'DecimalOdds', 'Wins', 'FinishingPosition',
]

# Features describing the race being predicted.
current_race_predictors = [
    'DistanceInMeters',
    'WeightInPounds',
    'Surface_AllWeather', 'Surface_Dirt', 'Surface_Turf',
    'Going_Firm', 'Going_Good', 'Going_Good_To_Firm', 'Going_Good_To_Soft', 'Going_Heavy', 'Going_Soft',
    'RaceType_Flat', 'RaceType_Hurdle', 'RaceType_Other', 'RaceType_SteepleChase',
]

# Features describing the horse's previous race.
last_race_predictors = [
    'LastRaceDistanceInMeters',
    'LastRaceWeightInPounds',
    'LastRaceSpeed',
    'DaysRested',
    'LastRaceAvgRelFinishingPosition',
    'LastRaceSurface_AllWeather', 'LastRaceSurface_Dirt', 'LastRaceSurface_Turf',
    'LastRaceGoing_Good', 'LastRaceGoing_Good_To_Soft', 'LastRaceGoing_Soft', 'LastRaceGoing_Good_To_Firm', 'LastRaceGoing_Firm', 'LastRaceGoing_Heavy',
    'LastRaceRaceType_Other', 'LastRaceRaceType_Hurdle', 'LastRaceRaceType_SteepleChase', 'LastRaceRaceType_Flat',
]

# Features describing the jockey's record.
jockey_predictors = [
    'JockeyNumberOfPriorRaces',
    'DaysSinceJockeyLastRaced',
    'JockeyWinPercentage',
    'JockeyTop3Percentage',
    'JockeyAvgRelFinishingPosition',
]

# Model inputs, in the same order as the three groups above.
predictors = current_race_predictors + last_race_predictors + jockey_predictors
prediction = ["Speed"]

# Keep only the columns we need and drop every runner with any missing value;
# .copy() gives an independent frame that later cells can modify safely.
selected_columns = race_info + predictors + prediction
train = races[selected_columns].dropna().copy()

In [None]:
# Cap both "days since ..." features at 10 so long lay-offs do not dominate.
for capped_column in ("DaysRested", "DaysSinceJockeyLastRaced"):
    train[capped_column] = train[capped_column].clip(upper=10)

# Capping prior races was tried and made results worse, so it stays disabled:
# train.loc[train["JockeyNumberOfPriorRaces"] > 400] = 400 (negative effect)

train[predictors].describe()

In [None]:
# Rows with missing values have been dropped, so count how many runners remain
# per race; a race is fully predictable only if every horse that ran is still present.
#
# groupby(...).transform("count") broadcasts the per-race count back onto each
# row. It replaces groupby.apply + merge, which read the grouping column inside
# the applied function (deprecated since pandas 2.2).
train = train.drop(columns="PredictableHorseCount", errors="ignore")  # keeps the cell re-runnable
train = train.assign(
    PredictableHorseCount=train.groupby("RaceId")["RaceId"].transform("count")
).reset_index(drop=True)  # fresh RangeIndex, as the previous merge produced

# 50% of races have 11 horses or fewer, 25% have 8 or fewer, 14% have 6 or fewer,
# 9% have 5 or fewer. Fewer horses should be more predictable.
max_horses_per_race = 5

all_races_count = races["RaceId"].nunique(dropna=False)

has_every_runner = train["HorseCount"] == train["PredictableHorseCount"]
is_small_field = train["HorseCount"] <= max_horses_per_race
all_predictable = train.loc[has_every_runner & is_small_field, "RaceId"].unique().tolist()
all_predictable_count = len(all_predictable)

# Guard the percentage so an empty input file reports 0% instead of raising.
predictable_percentage = all_predictable_count * 100 / all_races_count if all_races_count else 0.0
print(f"Possible predictable races = {all_predictable_count} out of {all_races_count} ({predictable_percentage}%)")

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the fully predictable races (fixed seed for repeatability).
# The "train" part of the split is discarded: every race that is not held out,
# predictable or not, stays in the training frame.
_, test_race_ids = train_test_split(all_predictable, test_size=0.2, random_state=42)

# NOTE: this cell rebinds `train`; re-running it without re-running the cells
# above yields an empty `test`, because the held-out rows are already gone.
in_held_out_race = train["RaceId"].isin(test_race_ids)
test = train[in_held_out_race]
train = train[~in_held_out_race]

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures, StandardScaler

# Pipeline: standardise the features, expand them with all degree-2 terms
# (squares and pairwise interactions), then fit an ordinary linear regression.
feature_scaler = StandardScaler()
quadratic_expansion = PolynomialFeatures(degree=2, interaction_only=False)
speed_regressor = LinearRegression()

model = make_pipeline(feature_scaler, quadratic_expansion, speed_regressor)

In [None]:
#help(model.fit)

In [None]:
# Training design matrix (DataFrame) and target vector (1-D array of the
# single column named in `prediction`).
target_column = prediction[0]
inputs = train[predictors]
targets = train[target_column].values

# Optional sanity check of the shapes:
# print(inputs.shape)
# print(targets.shape)

In [None]:
# Fit on the training races, then predict a speed for every held-out runner.
test_inputs = test[predictors]
model.fit(inputs, targets)
predictions = model.predict(test_inputs)

In [None]:
# Work on a copy of the held-out rows with the model output attached
# (predictions are positional, one per row of `test`).
test_with_predictions = test.assign(PredictedSpeed=predictions)

In [None]:
# Rank runners within each race by predicted speed: rank 1 = fastest predicted.
# "dense" ranking gives tied predictions the same rank with no gaps.
predicted_speed_by_race = test_with_predictions.groupby("RaceId")["PredictedSpeed"]
test_with_predictions["PredictedRank"] = predicted_speed_by_race.rank(method="dense", ascending=False)

In [None]:
# Eyeball actual vs predicted outcome for the first few held-out runners.
preview_columns = [
    "RaceId", "HorseId", "HorseName",
    "FinishingPosition", "PredictedRank",
    "Speed", "PredictedSpeed",
]
test_with_predictions[preview_columns].head()

In [None]:
# Evaluate the strategy "back the horse with the fastest predicted speed" on
# the held-out races, looking only at the rows of the actual winners.
is_actual_winner = test_with_predictions["FinishingPosition"] == 1
is_predicted_winner = test_with_predictions["PredictedRank"] == 1

correct_winners = test_with_predictions[is_actual_winner & is_predicted_winner]

correct_winners_count = len(correct_winners)
incorrect_winners_count = len(test_with_predictions[is_actual_winner & ~is_predicted_winner])

# Hit rate = correct picks as a share of ALL winners, not the correct:incorrect
# ratio (which overstates the rate and divides by zero when nothing is missed).
total_winners_count = correct_winners_count + incorrect_winners_count
win_percentage = 100.0 * correct_winners_count / total_winners_count if total_winners_count else 0.0

# NOTE(review): this treats DecimalOdds of each correct pick as the winnings and
# counts one unit lost per missed winner. Confirm whether DecimalOdds already
# includes the returned stake, in which case profit per win is odds - 1.
winnings = correct_winners["DecimalOdds"].sum()

print(f"Correct winners {correct_winners_count}, Incorrect winners {incorrect_winners_count}")
print(f"Win percentage {win_percentage}")
print(f"Winnings {winnings}, losses {incorrect_winners_count}, diff  {winnings - incorrect_winners_count}")

In [None]:
# Results log: "Win percentage" value printed by the evaluation cell for each experiment
# (the bracketed £ figure is presumably the printed winnings/losses diff - confirm).
# Base = 23.3630494927759
# PolynomialFeatures(degree=2, interaction_only=True) = 25.01557632398754
# With PolynomialFeatures(degree=2, interaction_only=False) = 25.13252260679763
# With rest days capped = 25.210608424336975
# With max 11 horses in race = 28.612597776862906 (£467)
# With max 8 horses in race = 31.598513011152416 (£336)
# With max 6 horses in race = 38.113207547169814 (£343)
# With max 5 horses in race = 47.77777777777778 (£269)