In [2]:
#%pip install numpy
#%pip install Pandas

# Horse Racing Results Predictor #

The American professional gambler [Bill Benter](https://en.wikipedia.org/wiki/Bill_Benter) is said to have made earned nearly $1 billion through the development of one of the most successful analysis computer software programs in the horse racing market.

Bill published his techniques in the paper [Computer-Based Horse Race Handicapping and Wagering Systems](https://www.gwern.net/docs/statistics/decision/1994-benter.pdf). 

The [YouTube Video by Ken Jee](https://www.youtube.com/watch?v=KEeUR8UDy-s) outlines how he did it, how difficult it was, and discusses whether it is likely to be able to replicate this feat today (hint: Ken thinks it highly unlikely for a number of reasons).

Inspired by video, this notebook examines the possibility of replicating Bill's success using data from modern day UK races.

NOTE: This is a fun examination of the technique the can be used in predicting races. It is not intended to be accurate or valid. The author accepts no responsibility for the correctness, completeness or quality of the information provided. Please do not use this information to place any real-world bets. Gambling odds are always skewed in favour of the bookmaker and you will lose in the long run.  

In [1]:
import numpy as np
import pandas as pd
from abc import ABC, abstractmethod

Step 1: Load in the historic race data and ignore any horse that didn't complete the race

In [2]:
import glob
from datetime import timedelta
results_files = glob.glob('Results_*.csv')
results_files

history = pd.concat([pd.read_csv(f) for f in results_files])
history = history[history['ResultStatus'] == 'CompletedRace']
history['Off'] =  pd.to_datetime(history['Off'], format='%m/%d/%Y %H:%M:%S')

Step 2: Define interface for processing the historic data and function to process data in a consistent way. 

In [3]:
class RaceDataProcessor(ABC):
    def initialize(self, history : pd.DataFrame) -> None:
        # Initialise the processor with all historic data
        self.history = history

    def update(self, daily_results : pd.DataFrame) -> None:
        # Update the processor with data
        pass

    def post_update(self, daily_results : pd.DataFrame) -> None:
        # Merge daily_results with history ready for next day's data
        self.history = pd.concat([self.history, daily_results])

    def after_process_data(self) -> None:
        # Allow processor to output results after processing completes
        pass

In [8]:
from typing import Callable

def process_race_data(processor: RaceDataProcessor, days_to_process : int = 30):
    history_end = history['Off'].max().date()
    process_start = history_end - timedelta(days=days_to_process)
    initial_history =  history[history['Off'].dt.date < process_start]   
    processor.initialize(initial_history)

    while process_start < history_end:
        process_step_end = process_start + timedelta(days=1)
        daily_slice = history[(history['Off'].dt.date >= process_start) & (history['Off'].dt.date < process_step_end)]
        processor.update(daily_slice)
        processor.post_update(daily_slice)
        process_start = process_step_end

    processor.after_process_data()

Step 3: Check to see if we have a closed data set i.e. give all the previous history we know, how many races include horses that we have never seen race before? And how many times did those horses win races?

In [26]:
class PreviousRunnerAnalysisRaceDataProcessor(RaceDataProcessor):
    def initialize(self, history : pd.DataFrame) -> None:
        super().initialize(history)
        self.__update_runner_stats()
        self.__total_days = 0
        self.__total_processed_races = 0
        self.__total_known_runners = 0
        self.__total_unknown_runners = 0
        self.__total_winning_known_runners = 0
        self.__total_winning_unknown_runners = 0
        self.__total_races_with_unknown_runners = 0

    def update(self, daily_results : pd.DataFrame) -> None:
        self.__total_days += 1
        self.__total_processed_races += daily_results['RaceId'].nunique()
        unique_horse = daily_results['HorseId'].nunique()
        known_unique_horse = daily_results[daily_results['HorseId'].isin(self.__known_runners)]['HorseId'].nunique()
        self.__total_known_runners += known_unique_horse
        self.__total_unknown_runners += unique_horse - known_unique_horse
        winners = daily_results[daily_results['FinishingPosition'] == 1]
        known_winners = winners[winners['HorseId'].isin(self.__known_runners)]['HorseId'].nunique()
        self.__total_winning_known_runners += known_winners
        self.__total_winning_unknown_runners += len(winners) - known_winners

        races_with_known_runner_counts = daily_results.groupby('RaceId').apply(lambda df: self.__calculate_counts_for_race_group(df))
        races_with_any_unknown_runners = races_with_known_runner_counts[races_with_known_runner_counts["HorseCount"] != races_with_known_runner_counts["KnownHorseCount"]]
        self.__total_races_with_unknown_runners += races_with_any_unknown_runners.reset_index()['RaceId'].count()

    def post_update(self, daily_results : pd.DataFrame) -> None:
        super().post_update(daily_results)
        self.__update_runner_stats()

    def after_process_data(self) -> None:
        percentage_unknown_runners = 100.0 * self.__total_unknown_runners / (self.__total_unknown_runners + self.__total_known_runners)
        percentage_unknown_winners = 100.0 * self.__total_winning_unknown_runners / (self.__total_winning_unknown_runners + self.__total_winning_known_runners)
        percentage_races_with_unknown_runners = 100.0 * self.__total_races_with_unknown_runners / (self.__total_races_with_unknown_runners + self.__total_processed_races)
        print(
            f'Previous runner data for last {self.__total_days} days / {self.__total_processed_races} races:\n'
            f'  Total known runners: {self.__total_known_runners}\n'
            f'  Total unknown runners: {self.__total_unknown_runners} ({percentage_unknown_runners:.2f} %)\n'
            f'  Total known winners: {self.__total_winning_known_runners}\n'
            f'  Total unknown winners: {self.__total_winning_unknown_runners} ({percentage_unknown_winners:.2f} %)\n'
            f'  Races with unknown runners: {self.__total_races_with_unknown_runners} ({percentage_races_with_unknown_runners:.2f} %)\n'            
            )
        pass        

    def __update_runner_stats(self) -> None:
        self.__known_runners = self.history['HorseId'].unique().tolist()

    def __calculate_counts_for_race_group(self, race_group) -> pd.Series:
        new_columns = {}
        new_columns['HorseCount'] = race_group['HorseId'].count()
        new_columns['KnownHorseCount'] = race_group[race_group['HorseId'].isin(self.__known_runners)]['HorseId'].count()
        return pd.Series(new_columns, index=['HorseCount', 'KnownHorseCount']) 

process_race_data(PreviousRunnerAnalysisRaceDataProcessor())

Previous runner data for last 30 days / 1651 races:
  Total known runners: 14388
  Total unknown runners: 1818 (11.22 %)
  Total known winners: 1514
  Total unknown winners: 445 (22.72 %)
  Races with unknown runners: 593 (26.43 %)



**Conclusions from Step 3**: 26.43 % of races include runners that have not previously run and a significant proportion of those are won by horses we have no prior data about. It is unlikely that we can predict with any accuracy these races given the lack of data.

However, this also means that 73.57% of races *do* form a closed data set where we have prior information about races that we can use to inform predictions

Step 4: heck to see if we have a closed data set with respect to jockies i.e. give all the previous history we know, how many races include jockies that we have never seen race before? And how many times did those jockies win races?

In [31]:
class PreviousJockiesAnalysisRaceDataProcessor(RaceDataProcessor):
    def initialize(self, history : pd.DataFrame) -> None:
        super().initialize(history)
        self.__update_jockey_stats()
        self.__total_days = 0
        self.__total_processed_races = 0
        self.__total_known_jockies = 0
        self.__total_unknown_jockies = 0
        self.__total_winning_known_jockies = 0
        self.__total_winning_unknown_jockies = 0
        self.__total_races_with_unknown_jockies = 0

    def update(self, daily_results : pd.DataFrame) -> None:
        self.__total_days += 1
        self.__total_processed_races += daily_results['RaceId'].nunique()
        unique_jockies = daily_results['JockeyId'].nunique()
        known_unique_jockies = daily_results[daily_results['JockeyId'].isin(self.__known_runners)]['JockeyId'].nunique()
        self.__total_known_jockies += known_unique_jockies
        self.__total_unknown_jockies += unique_jockies - known_unique_jockies
        winners = daily_results[daily_results['FinishingPosition'] == 1]
        known_winners = winners[winners['JockeyId'].isin(self.__known_runners)]['JockeyId'].nunique()
        self.__total_winning_known_jockies += known_winners
        self.__total_winning_unknown_jockies += len(winners) - known_winners

        races_with_known_jockey_counts = daily_results.groupby('RaceId').apply(lambda df: self.__calculate_counts_for_race_group(df))
        races_with_any_unknown_jockies = races_with_known_jockey_counts[races_with_known_jockey_counts["JockeyCount"] != races_with_known_jockey_counts["KnownJockeyCount"]]
        self.__total_races_with_unknown_jockies += races_with_any_unknown_jockies.reset_index()['RaceId'].count()

    def post_update(self, daily_results : pd.DataFrame) -> None:
        super().post_update(daily_results)
        self.__update_jockey_stats()

    def after_process_data(self) -> None:
        percentage_unknown_jockies = 100.0 * self.__total_unknown_jockies / (self.__total_unknown_jockies + self.__total_known_jockies)
        percentage_unknown_winners = 100.0 * self.__total_winning_unknown_jockies / (self.__total_winning_unknown_jockies + self.__total_winning_known_jockies)
        percentage_races_with_unknown_jockies = 100.0 * self.__total_races_with_unknown_jockies / (self.__total_races_with_unknown_jockies + self.__total_processed_races)
        print(
            f'Previous runner data for last {self.__total_days} days / {self.__total_processed_races} races:\n'
            f'  Total known jockies: {self.__total_known_jockies}\n'
            f'  Total unknown jockies: {self.__total_unknown_jockies} ({percentage_unknown_jockies:.2f} %)\n'
            f'  Total known winners: {self.__total_winning_known_jockies}\n'
            f'  Total unknown winners: {self.__total_winning_unknown_jockies} ({percentage_unknown_winners:.2f} %)\n'
            f'  Races with unknown jockies: {self.__total_races_with_unknown_jockies} ({percentage_races_with_unknown_jockies:.2f} %)\n'            
            )
        pass        

    def __update_jockey_stats(self) -> None:
        self.__known_runners = self.history['JockeyId'].unique().tolist()

    def __calculate_counts_for_race_group(self, race_group) -> pd.Series:
        new_columns = {}
        new_columns['JockeyCount'] = race_group['JockeyId'].count()
        new_columns['KnownJockeyCount'] = race_group[race_group['JockeyId'].isin(self.__known_runners)]['JockeyId'].count()
        return pd.Series(new_columns, index=['JockeyCount', 'KnownJockeyCount']) 

process_race_data(PreviousJockiesAnalysisRaceDataProcessor())

Previous runner data for last 30 days / 1651 races:
  Total known jockies: 7723
  Total unknown jockies: 57 (0.73 %)
  Total known winners: 1422
  Total unknown winners: 537 (27.41 %)
  Races with unknown jockies: 43 (2.54 %)



**Conclusions from Step 4**: A very small number of races include unknown jockies (2.54%). These races should be excluded from analysis for the same reasons outlined above for races with unknown horses.