In [1]:
#%pip install numpy
#%pip install Pandas

# Horse Racing Results Predictor #

The American professional gambler [Bill Benter](https://en.wikipedia.org/wiki/Bill_Benter) is said to have made earned nearly $1 billion through the development of one of the most successful analysis computer software programs in the horse racing market.

Bill published his techniques in the paper [Computer-Based Horse Race Handicapping and Wagering Systems](https://www.gwern.net/docs/statistics/decision/1994-benter.pdf). 

The [YouTube Video by Ken Jee](https://www.youtube.com/watch?v=KEeUR8UDy-s) outlines how he did it, how difficult it was, and discusses whether it is likely to be able to replicate this feat today (hint: Ken thinks it highly unlikely for a number of reasons).

Inspired by video, this notebook examines the possibility of replicating Bill's success using data from modern day UK races.

NOTE: This is a fun examination of the technique the can be used in predicting races. It is not intended to be accurate or valid. The author accepts no responsibility for the correctness, completeness or quality of the information provided. Please do not use this information to place any real-world bets. Gambling odds are always skewed in favour of the bookmaker and you will lose in the long run.  

In [2]:
import numpy as np
import pandas as pd
from abc import ABC, abstractmethod

Step 1: Load in the historic race data and ignore any horse that didn't complete the race

In [3]:
import glob
from datetime import timedelta
results_files = glob.glob('Results_*.csv')
results_files

history = pd.concat([pd.read_csv(f) for f in results_files])
history = history[history['ResultStatus'] == 'CompletedRace']
history['Off'] =  pd.to_datetime(history['Off'], format='%m/%d/%Y %H:%M:%S')

Step 2: Define interface for processing the historic data and function to process data in a consistent way. 

In [18]:
class RaceDataProcessor(ABC):
    def initialize(self, history : pd.DataFrame) -> None:
        # Initialise the processor with all historic data
        self.history = history

    def update(self, daily_results : pd.DataFrame) -> None:
        # Update the processor with data
        pass

    def post_update(self, daily_results : pd.DataFrame) -> None:
        # Merge daily_results with history ready for next day's data
        self.history = pd.concat([self.history, daily_results])

    def after_process_data(self) -> None:
        # Allow processor to output results after processing completes
        pass

    def process_race_data(self, history : pd.DataFrame, days_to_process : int = 30):
        history_end = history['Off'].max().date()
        process_start = history_end - timedelta(days=days_to_process)
        initial_history =  history[history['Off'].dt.date < process_start]   
        self.initialize(initial_history)

        while process_start < history_end:
            process_step_end = process_start + timedelta(days=1)
            daily_slice = history[(history['Off'].dt.date >= process_start) & (history['Off'].dt.date < process_step_end)]
            self.update(daily_slice)
            self.post_update(daily_slice)
            process_start = process_step_end

        self.after_process_data()

Step 3: Check to see if we have a closed data set i.e. give all the previous history we know, how many races include horses that we have never seen race before? And how many times did those horses win races?

In [19]:
class PreviousRunnerAnalysisRaceDataProcessor(RaceDataProcessor):
    def initialize(self, history : pd.DataFrame) -> None:
        super().initialize(history)
        self.__update_runner_stats()
        self.__total_days = 0
        self.__total_processed_races = 0
        self.__total_known_runners = 0
        self.__total_unknown_runners = 0
        self.__total_winning_known_runners = 0
        self.__total_winning_unknown_runners = 0
        self.__total_races_with_unknown_runners = 0

    def update(self, daily_results : pd.DataFrame) -> None:
        self.__total_days += 1
        self.__total_processed_races += daily_results['RaceId'].nunique()
        unique_horse = daily_results['HorseId'].nunique()
        known_unique_horse = daily_results[daily_results['HorseId'].isin(self.__known_runners)]['HorseId'].nunique()
        self.__total_known_runners += known_unique_horse
        self.__total_unknown_runners += unique_horse - known_unique_horse
        winners = daily_results[daily_results['FinishingPosition'] == 1]
        known_winners = winners[winners['HorseId'].isin(self.__known_runners)]['HorseId'].nunique()
        self.__total_winning_known_runners += known_winners
        self.__total_winning_unknown_runners += len(winners) - known_winners

        races_with_known_runner_counts = daily_results.groupby('RaceId').apply(lambda df: self.__calculate_counts_for_race_group(df))
        races_with_any_unknown_runners = races_with_known_runner_counts[races_with_known_runner_counts["HorseCount"] != races_with_known_runner_counts["KnownHorseCount"]]
        self.__total_races_with_unknown_runners += races_with_any_unknown_runners.reset_index()['RaceId'].count()

    def post_update(self, daily_results : pd.DataFrame) -> None:
        super().post_update(daily_results)
        self.__update_runner_stats()

    def after_process_data(self) -> None:
        percentage_unknown_runners = 100.0 * self.__total_unknown_runners / (self.__total_unknown_runners + self.__total_known_runners)
        percentage_unknown_winners = 100.0 * self.__total_winning_unknown_runners / (self.__total_winning_unknown_runners + self.__total_winning_known_runners)
        percentage_races_with_unknown_runners = 100.0 * self.__total_races_with_unknown_runners / (self.__total_races_with_unknown_runners + self.__total_processed_races)
        print(
            f'Previous runner data for last {self.__total_days} days / {self.__total_processed_races} races:\n'
            f'  Total known runners: {self.__total_known_runners}\n'
            f'  Total unknown runners: {self.__total_unknown_runners} ({percentage_unknown_runners:.2f} %)\n'
            f'  Total known winners: {self.__total_winning_known_runners}\n'
            f'  Total unknown winners: {self.__total_winning_unknown_runners} ({percentage_unknown_winners:.2f} %)\n'
            f'  Races with unknown runners: {self.__total_races_with_unknown_runners} ({percentage_races_with_unknown_runners:.2f} %)\n'            
            )
        pass        

    def __update_runner_stats(self) -> None:
        self.__known_runners = self.history['HorseId'].unique().tolist()

    def __calculate_counts_for_race_group(self, race_group) -> pd.Series:
        new_columns = {}
        new_columns['HorseCount'] = race_group['HorseId'].count()
        new_columns['KnownHorseCount'] = race_group[race_group['HorseId'].isin(self.__known_runners)]['HorseId'].count()
        return pd.Series(new_columns, index=['HorseCount', 'KnownHorseCount']) 


In [20]:
PreviousRunnerAnalysisRaceDataProcessor().process_race_data(history)

Previous runner data for last 30 days / 1651 races:
  Total known runners: 14388
  Total unknown runners: 1818 (11.22 %)
  Total known winners: 1514
  Total unknown winners: 445 (22.72 %)
  Races with unknown runners: 593 (26.43 %)



**Conclusions from Step 3**: 26.43 % of races include runners that have not previously run and a significant proportion of those are won by horses we have no prior data about. It is unlikely that we can predict with any accuracy these races given the lack of data.

However, this also means that 73.57% of races *do* form a closed data set where we have prior information about races that we can use to inform predictions

Step 4: heck to see if we have a closed data set with respect to jockies i.e. give all the previous history we know, how many races include jockies that we have never seen race before? And how many times did those jockies win races?

In [21]:
class PreviousJockiesAnalysisRaceDataProcessor(RaceDataProcessor):
    def initialize(self, history : pd.DataFrame) -> None:
        super().initialize(history)
        self.__update_jockey_stats()
        self.__total_days = 0
        self.__total_processed_races = 0
        self.__total_known_jockies = 0
        self.__total_unknown_jockies = 0
        self.__total_winning_known_jockies = 0
        self.__total_winning_unknown_jockies = 0
        self.__total_races_with_unknown_jockies = 0

    def update(self, daily_results : pd.DataFrame) -> None:
        self.__total_days += 1
        self.__total_processed_races += daily_results['RaceId'].nunique()
        unique_jockies = daily_results['JockeyId'].nunique()
        known_unique_jockies = daily_results[daily_results['JockeyId'].isin(self.__known_jockies)]['JockeyId'].nunique()
        self.__total_known_jockies += known_unique_jockies
        self.__total_unknown_jockies += unique_jockies - known_unique_jockies
        winners = daily_results[daily_results['FinishingPosition'] == 1]
        known_winners = winners[winners['JockeyId'].isin(self.__known_jockies)]['JockeyId'].nunique()
        self.__total_winning_known_jockies += known_winners
        self.__total_winning_unknown_jockies += len(winners) - known_winners

        races_with_known_jockey_counts = daily_results.groupby('RaceId').apply(lambda df: self.__calculate_counts_for_race_group(df))
        races_with_any_unknown_jockies = races_with_known_jockey_counts[races_with_known_jockey_counts["JockeyCount"] != races_with_known_jockey_counts["KnownJockeyCount"]]
        self.__total_races_with_unknown_jockies += races_with_any_unknown_jockies.reset_index()['RaceId'].count()

    def post_update(self, daily_results : pd.DataFrame) -> None:
        super().post_update(daily_results)
        self.__update_jockey_stats()

    def after_process_data(self) -> None:
        percentage_unknown_jockies = 100.0 * self.__total_unknown_jockies / (self.__total_unknown_jockies + self.__total_known_jockies)
        percentage_unknown_winners = 100.0 * self.__total_winning_unknown_jockies / (self.__total_winning_unknown_jockies + self.__total_winning_known_jockies)
        percentage_races_with_unknown_jockies = 100.0 * self.__total_races_with_unknown_jockies / (self.__total_races_with_unknown_jockies + self.__total_processed_races)
        print(
            f'Previous jockey data for last {self.__total_days} days / {self.__total_processed_races} races:\n'
            f'  Total known jockies: {self.__total_known_jockies}\n'
            f'  Total unknown jockies: {self.__total_unknown_jockies} ({percentage_unknown_jockies:.2f} %)\n'
            f'  Total known winners: {self.__total_winning_known_jockies}\n'
            f'  Total unknown winners: {self.__total_winning_unknown_jockies} ({percentage_unknown_winners:.2f} %)\n'
            f'  Races with unknown jockies: {self.__total_races_with_unknown_jockies} ({percentage_races_with_unknown_jockies:.2f} %)\n'            
            )
        pass        

    def __update_jockey_stats(self) -> None:
        self.__known_jockies = self.history['JockeyId'].unique().tolist()

    def __calculate_counts_for_race_group(self, race_group) -> pd.Series:
        new_columns = {}
        new_columns['JockeyCount'] = race_group['JockeyId'].count()
        new_columns['KnownJockeyCount'] = race_group[race_group['JockeyId'].isin(self.__known_jockies)]['JockeyId'].count()
        return pd.Series(new_columns, index=['JockeyCount', 'KnownJockeyCount']) 

In [22]:
PreviousJockiesAnalysisRaceDataProcessor().process_race_data(history)

Previous jockey data for last 30 days / 1651 races:
  Total known jockies: 7723
  Total unknown jockies: 57 (0.73 %)
  Total known winners: 1422
  Total unknown winners: 537 (27.41 %)
  Races with unknown jockies: 43 (2.54 %)



**Conclusions from Step 4**: A very small number of races include unknown jockies (2.54%). These races should be excluded from analysis for the same reasons outlined above for races with unknown horses.

In [23]:
class RaceDataProcessorIgnoringUnknownRunnersOrJockies(RaceDataProcessor):
    def initialize(self, history : pd.DataFrame) -> None:
        super().initialize(history)
        self.__update_jockey_and_horse_stats()
        self._total_races = 0
        self._total_processed_races = 0
        self._total_ignored_races = 0

    def update(self, daily_results : pd.DataFrame) -> None:
        self.process_filtered_results(self.__remove_races_with_unknown_horses_or_jockies(daily_results))
        pass

    def process_filtered_results(self, daily_results : pd.DataFrame) -> None:
        pass

    def post_update(self, daily_results : pd.DataFrame) -> None:
        super().post_update(daily_results)
        self.__update_jockey_and_horse_stats()

    def __remove_races_with_unknown_horses_or_jockies(self, daily_results : pd.DataFrame) -> pd.DataFrame:
        df = daily_results.groupby('RaceId').apply(lambda g: self.__calculate_counts_for_race_group(g))
        races = daily_results['RaceId'].nunique()
        self._total_races += races
        df = df[(df['JockeyCount'] == df['KnownJockeyCount']) & (df['HorseCount'] == df['KnownHorseCount'])]
        races_to_process = df.reset_index()['RaceId'].unique().tolist()
        count_races_to_process = len(races_to_process)
        self._total_processed_races += count_races_to_process
        self._total_ignored_races += races - count_races_to_process
        return daily_results[daily_results['RaceId'].isin(races_to_process)]

    def __update_jockey_and_horse_stats(self) -> None:
        self.__known_jockies = self.history['JockeyId'].unique().tolist()
        self.__known_runners = self.history['HorseId'].unique().tolist()

    def __calculate_counts_for_race_group(self, race_group) -> pd.Series:
        new_columns = {}
        new_columns['HorseCount'] = race_group['HorseId'].count()
        new_columns['KnownHorseCount'] = race_group[race_group['HorseId'].isin(self.__known_runners)]['HorseId'].count()
        new_columns['JockeyCount'] = race_group['JockeyId'].count()
        new_columns['KnownJockeyCount'] = race_group[race_group['JockeyId'].isin(self.__known_jockies)]['JockeyId'].count()
        return pd.Series(new_columns, index=['HorseCount', 'KnownHorseCount', 'JockeyCount', 'KnownJockeyCount']) 
    
    def after_process_data(self) -> None:
        print(
            f'Processed {self._total_processed_races} of {self._total_races} races:\n'
            f'  Total ignored races: {self._total_ignored_races} (with unknown runner or jockey)'
            )

In [24]:
RaceDataProcessorIgnoringUnknownRunnersOrJockies().process_race_data(history)

Processed 1040 of 1651 races:
  Total ignored races: 611 (with unknown runner or jockey)


Step 5: Baseline predictions by choosing the first horse on each race card. 

Should be fairly random and allow us to score more real predictions against dumb luck.

In [43]:
class RacePredictor(RaceDataProcessorIgnoringUnknownRunnersOrJockies):
    def initialize(self, history: pd.DataFrame) -> None:
        super().initialize(history)
        self.__prediction_summary = pd.DataFrame(columns=['Date', 'Races', 'Predicted', 'Wins', 'Losses', 'Gains'])
        self.__predictions = None

    @property
    def prediction_summary(self) -> pd.DataFrame:
        return self.__prediction_summary 

    @property
    def predictions(self) -> pd.DataFrame:
        return self.__predictions

    def _update_predictions(self, predictions: pd.DataFrame) -> None:
        predicted = len(predictions)
        if predicted > 0:
            self.__predictions = predictions if self.__predictions is None else pd.concat([self.__predictions, predictions])
            prediction_start = predictions.loc[predictions.index[0], 'Off']
            staked = predicted # £1 stake per prediction
            winners = predictions[predictions['PredictedPosition'] == predictions['FinishingPosition']]
            wins = len(winners)
            losses = predicted - wins
            percentageWins = (wins / predicted) * 100.0;
            winnings = winners['DecimalOdds'].sum()
            percentageGains = ((winnings - losses) / staked) * 100.0;
            # print(f'Scored: {predicted}, Won: {wins}, Winnings (with £1 stake): {winnings}, Lost: {losses}, %gains/loss: {percentageGains}')

            row = pd.DataFrame([
                {
                    'Date': prediction_start, 
                    'Predicted': predicted, 
                    'Wins': wins, 
                    'Winnings': winnings,
                    'Losses': losses,
                    'PercentageWins': percentageWins,
                    'GainLoss': winnings - staked, 
                    'PercentGainLoss': percentageGains
                }])
            self.__prediction_summary = pd.concat([self.__prediction_summary, row], axis=0, ignore_index=True)

    def aggregate_prediction_summary(self) -> pd.DataFrame:
        return self.__prediction_summary.agg(
            {
                "Predicted" : ["average", "sum"],
                "Wins" : ["average", "sum"],
                "PercentageWins" : ["average", "std"],
                "GainLoss": ["min", "max", "average", "skew", "std", "sum"],
                "Winnings": ["min", "max", "average", "skew", "std", "sum"],
            }
        )


In [44]:
class FirstStallWinnerPredictor(RacePredictor):
    def process_filtered_results(self, daily_results : pd.DataFrame) -> None:
        predictions = daily_results.sort_values('RaceCardNumber', ascending=True).groupby('RaceId').first().copy()
        predictions['PredictedPosition'] = 1
        self._update_predictions(predictions)

    def after_process_data(self) -> None:
        super().after_process_data()
        print(self.aggregate_prediction_summary())

In [None]:
predictor = FirstStallWinnerPredictor()
predictor.process_race_data(history)
predictor.predictions.head()

**Conclusions from Step 5**: Betting £1 randomly on 1040 races results in an overall loss of £156 :(

Step 6: Analyse factors to understand if they have influence on the outcome of races. Bill Benter suggested the following attributes:

Current condition:
- performance in recent races
- time since last race
- recent workout data
- age of horse

Past performance:
- finishing position in past races
- lengths behind winner in past races
- normalized times of past races

Adjustments to past performance:
- strength of competition in past races
- weight carried in past races
- jockey's contribution to past performances
- compensation for bad luck in past races
- compensation for advantageous or disadvantageous post position in past races

Present race situational factors:
- weight to be carried
- today's jockey's ability
- advantages or disadvantages of the assigned post position

Preferences which could influence the horse's performance in today's race:
- distance preference
- surface preference (turf vs dirt)
- condition of surface preference (wet vs dry)
- specific track preference

Step 6.1: Define abstract feature factory for feature to the daily results that will be used to inform predictions

In [46]:
class FeatureFactory:
    @abstractmethod
    def add_features(self, history: pd.DataFrame, daily_results : pd.DataFrame) -> None:
        pass


class FeaturePredictor(RacePredictor):
    def __init__(self, feature_factory: FeatureFactory) -> None:
        super().__init__()
        self.__feature_factory = feature_factory

    def process_filtered_results(self, daily_results : pd.DataFrame) -> None:
        daily_results_with_new_features = daily_results.copy()
        self.__feature_factory.add_features(self.history, daily_results_with_new_features)
        predictions = self.calculate_predictions(daily_results_with_new_features)
        self._update_predictions(predictions)
    
    @abstractmethod
    def calculate_predictions(self, daily_results : pd.DataFrame) -> pd.DataFrame:
        pass

    def after_process_data(self) -> None:
        super().after_process_data()
        print(self.aggregate_prediction_summary())


class LowestValueFeaturePredictor(FeaturePredictor):
    def __init__(self, feature_factory: FeatureFactory, prediction_feature_name: str) -> None:
        super().__init__(feature_factory)
        self.__prediction_feature_name = prediction_feature_name

    def calculate_predictions(self, daily_results: pd.DataFrame) -> pd.DataFrame:
        predictable = daily_results.dropna(axis=0, subset=[self.__prediction_feature_name])
        predictions = predictable.sort_values(self.__prediction_feature_name, ascending=True).groupby('RaceId').first().copy()
        predictions['PredictedPosition'] = 1
        return predictions


class HighestValueFeaturePredictor(FeaturePredictor):
    def __init__(self, feature_factory: FeatureFactory, prediction_feature_name: str) -> None:
        super().__init__(feature_factory)
        self.__prediction_feature_name = prediction_feature_name

    def calculate_predictions(self, daily_results: pd.DataFrame) -> pd.DataFrame:
        predictable = daily_results.dropna(axis=0, subset=[self.__prediction_feature_name])
        predictions = predictable.sort_values(self.__prediction_feature_name, ascending=False).groupby('RaceId').first().copy()
        predictions['PredictedPosition'] = 1
        return predictions



Step 6.2 Predict using performance in recent races. 

1. For each race in the last x months, assign a score that is equal to the finishing position divided by the number of horses in the race (this assumes races with more runners are harder to win). 
2. Sum the per race performance score and divide by the number of races (to average the performance over the period - since some horses may have ran more races than others in the given time frame)
3. Predict that the horse with be best/lowers past performance will win

In [49]:
class RecentRacePerformanceFeatureFactory(FeatureFactory):
    def __init__(self, days_to_process : int = 30) -> None:
        super().__init__()
        self.__days_to_process = days_to_process


    def add_features(self, history: pd.DataFrame, daily_results: pd.DataFrame) -> None:
        history_end = history['Off'].max().date()
        recent_history_start = history_end - timedelta(days=self.__days_to_process)
        recent_history = history[history['Off'].dt.date >= recent_history_start]
        
        horses_per_race = recent_history.groupby('RaceId').apply(lambda g: self.__calculate_horse_counts_for_race_group(g))
        recent_history = pd.merge(recent_history, horses_per_race, how='left', on=['RaceId'])
        recent_history['RacePerformance'] = recent_history.apply(lambda row: self.__calculate_horse_race_performance(row), axis=1)
        horse_performance = recent_history.groupby('HorseId').apply(lambda g: self.__calculate_overall_horse_performance(g))

        daily_results = pd.merge(daily_results, horse_performance, how='left', on=['HorseId'])


    def __calculate_horse_counts_for_race_group(self, race_group : pd.DataFrame) -> pd.Series:
        new_columns = {}
        new_columns['HorseCount'] = race_group['HorseId'].count()
        return pd.Series(new_columns, index=['HorseCount']) 


    def __calculate_horse_race_performance(self, race_row : pd.DataFrame) -> float:
        return race_row['FinishingPosition'] / race_row['HorseCount']


    def __calculate_overall_horse_performance(self, race_group: pd.DataFrame) -> pd.Series:
        new_columns = {}
        new_columns['RaceCount'] = race_group['RaceId'].count()
        new_columns['RecentPerformance'] = race_group['RacePerformance'].sum() / race_group['RaceId'].count()
        return pd.Series(new_columns, index=['RaceCount', 'RecentPerformance']) 


In [52]:
predictor = LowestValueFeaturePredictor(RecentRacePerformanceFeatureFactory(), 'RecentPerformance')
predictor.process_race_data(history)

Processed 1040 of 1651 races:
  Total ignored races: 611 (with unknown runner or jockey)
           Predicted        Wins  PercentageWins    GainLoss    Winnings
average    34.666667    5.433333       16.058376   -5.319584   29.347083
sum      1040.000000  163.000000             NaN -159.587518  880.412482
std              NaN         NaN        6.733791   16.912677   17.139859
min              NaN         NaN             NaN  -27.639394    5.500000
max              NaN         NaN             NaN   33.000000   63.000000
skew             NaN         NaN             NaN    0.604752    0.706429


In [58]:
predictor.predictions[['HorseId', 'FinishingPosition', 'RecentPerformance']].head()

Unnamed: 0_level_0,HorseId,FinishingPosition,RecentPerformance
RaceId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
816969,4175288,3,1
817906,2654569,2,1
819625,1956567,24,1
819626,2870506,6,1
819627,4352816,12,1
