In [147]:
#%pip install numpy
#%pip install Pandas
#%pip install sklearn

Collecting sklearn
  Downloading sklearn-0.0.tar.gz (1.1 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting scikit-learn
  Downloading scikit_learn-1.1.3-cp39-cp39-win_amd64.whl (7.6 MB)
     ---------------------------------------- 7.6/7.6 MB 5.5 MB/s eta 0:00:00
Collecting threadpoolctl>=2.0.0
  Downloading threadpoolctl-3.1.0-py3-none-any.whl (14 kB)
Collecting joblib>=1.0.0
  Downloading joblib-1.2.0-py3-none-any.whl (297 kB)
     -------------------------------------- 298.0/298.0 KB 4.6 MB/s eta 0:00:00
Using legacy 'setup.py install' for sklearn, since package 'wheel' is not installed.
Installing collected packages: threadpoolctl, joblib, scikit-learn, sklearn
  Running setup.py install for sklearn: started
  Running setup.py install for sklearn: finished with status 'done'
Successfully installed joblib-1.2.0 scikit-learn-1.1.3 sklearn-0.0 threadpoolctl-3.1.0
Note: you may need to restart the kernel to use updated 

You should consider upgrading via the 'C:\Users\leeco\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip' command.


# Horse Racing Results Predictor #

The American professional gambler [Bill Benter](https://en.wikipedia.org/wiki/Bill_Benter) is said to have made earned nearly $1 billion through the development of one of the most successful analysis computer software programs in the horse racing market.

Bill published his techniques in the paper [Computer-Based Horse Race Handicapping and Wagering Systems](https://www.gwern.net/docs/statistics/decision/1994-benter.pdf). 

The [YouTube Video by Ken Jee](https://www.youtube.com/watch?v=KEeUR8UDy-s) outlines how he did it, how difficult it was, and discusses whether it is likely to be able to replicate this feat today (hint: Ken thinks it highly unlikely for a number of reasons).

Inspired by video, this notebook examines the possibility of replicating Bill's success using data from modern day UK races.

NOTE: This is a fun examination of the technique the can be used in predicting races. It is not intended to be accurate or valid. The author accepts no responsibility for the correctness, completeness or quality of the information provided. Please do not use this information to place any real-world bets. Gambling odds are always skewed in favour of the bookmaker and you will lose in the long run.  

In [1]:
import numpy as np
import pandas as pd
from abc import ABC, abstractmethod

Step 1: Load in the historic race data and ignore any horse that didn't complete the race

In [169]:
import glob
from datetime import timedelta
results_files = glob.glob('Results_*.csv')
results_files

history = pd.concat([pd.read_csv(f) for f in results_files])
history = history[history['ResultStatus'] == 'CompletedRace']
history['Off'] =  pd.to_datetime(history['Off'], format='%m/%d/%Y %H:%M:%S')
history['Wins'] = history.apply(lambda r: 1 if r['FinishingPosition'] == 1 else 0, axis=1)

Step 2: Define interface for processing the historic data and function to process data in a consistent way. 

In [3]:
class RaceDataProcessor(ABC):
    def initialize(self, history : pd.DataFrame) -> None:
        # Initialise the processor with all historic data
        self.history = history

    def update(self, daily_results : pd.DataFrame) -> None:
        # Update the processor with data
        pass

    def post_update(self, daily_results : pd.DataFrame) -> None:
        # Merge daily_results with history ready for next day's data
        self.history = pd.concat([self.history, daily_results])

    def after_process_data(self) -> None:
        # Allow processor to output results after processing completes
        pass

    def process_race_data(self, history : pd.DataFrame, days_to_process : int = 30):
        history_end = history['Off'].max().date()
        process_start = history_end - timedelta(days=days_to_process)
        initial_history =  history[history['Off'].dt.date < process_start]   
        self.initialize(initial_history)

        while process_start < history_end:
            process_step_end = process_start + timedelta(days=1)
            daily_slice = history[(history['Off'].dt.date >= process_start) & (history['Off'].dt.date < process_step_end)]
            self.update(daily_slice)
            self.post_update(daily_slice)
            process_start = process_step_end

        self.after_process_data()

Step 3: Check to see if we have a closed data set i.e. give all the previous history we know, how many races include horses that we have never seen race before? And how many times did those horses win races?

In [4]:
class PreviousRunnerAnalysisRaceDataProcessor(RaceDataProcessor):
    def initialize(self, history : pd.DataFrame) -> None:
        super().initialize(history)
        self.__update_runner_stats()
        self.__total_days = 0
        self.__total_processed_races = 0
        self.__total_known_runners = 0
        self.__total_unknown_runners = 0
        self.__total_winning_known_runners = 0
        self.__total_winning_unknown_runners = 0
        self.__total_races_with_unknown_runners = 0

    def update(self, daily_results : pd.DataFrame) -> None:
        self.__total_days += 1
        self.__total_processed_races += daily_results['RaceId'].nunique()
        unique_horse = daily_results['HorseId'].nunique()
        known_unique_horse = daily_results[daily_results['HorseId'].isin(self.__known_runners)]['HorseId'].nunique()
        self.__total_known_runners += known_unique_horse
        self.__total_unknown_runners += unique_horse - known_unique_horse
        winners = daily_results[daily_results['FinishingPosition'] == 1]
        known_winners = winners[winners['HorseId'].isin(self.__known_runners)]['HorseId'].nunique()
        self.__total_winning_known_runners += known_winners
        self.__total_winning_unknown_runners += len(winners) - known_winners

        races_with_known_runner_counts = daily_results.groupby('RaceId').apply(lambda df: self.__calculate_counts_for_race_group(df))
        races_with_any_unknown_runners = races_with_known_runner_counts[races_with_known_runner_counts["HorseCount"] != races_with_known_runner_counts["KnownHorseCount"]]
        self.__total_races_with_unknown_runners += races_with_any_unknown_runners.reset_index()['RaceId'].count()

    def post_update(self, daily_results : pd.DataFrame) -> None:
        super().post_update(daily_results)
        self.__update_runner_stats()

    def after_process_data(self) -> None:
        percentage_unknown_runners = 100.0 * self.__total_unknown_runners / (self.__total_unknown_runners + self.__total_known_runners)
        percentage_unknown_winners = 100.0 * self.__total_winning_unknown_runners / (self.__total_winning_unknown_runners + self.__total_winning_known_runners)
        percentage_races_with_unknown_runners = 100.0 * self.__total_races_with_unknown_runners / (self.__total_races_with_unknown_runners + self.__total_processed_races)
        print(
            f'Previous runner data for last {self.__total_days} days / {self.__total_processed_races} races:\n'
            f'  Total known runners: {self.__total_known_runners}\n'
            f'  Total unknown runners: {self.__total_unknown_runners} ({percentage_unknown_runners:.2f} %)\n'
            f'  Total known winners: {self.__total_winning_known_runners}\n'
            f'  Total unknown winners: {self.__total_winning_unknown_runners} ({percentage_unknown_winners:.2f} %)\n'
            f'  Races with unknown runners: {self.__total_races_with_unknown_runners} ({percentage_races_with_unknown_runners:.2f} %)\n'            
            )
        pass        

    def __update_runner_stats(self) -> None:
        self.__known_runners = self.history['HorseId'].unique().tolist()

    def __calculate_counts_for_race_group(self, race_group) -> pd.Series:
        new_columns = {}
        new_columns['HorseCount'] = race_group['HorseId'].count()
        new_columns['KnownHorseCount'] = race_group[race_group['HorseId'].isin(self.__known_runners)]['HorseId'].count()
        return pd.Series(new_columns, index=['HorseCount', 'KnownHorseCount']) 


In [5]:
PreviousRunnerAnalysisRaceDataProcessor().process_race_data(history)

Previous runner data for last 30 days / 1651 races:
  Total known runners: 14388
  Total unknown runners: 1818 (11.22 %)
  Total known winners: 1514
  Total unknown winners: 445 (22.72 %)
  Races with unknown runners: 593 (26.43 %)



**Conclusions from Step 3**: 26.43 % of races include runners that have not previously run and a significant proportion of those are won by horses we have no prior data about. It is unlikely that we can predict with any accuracy these races given the lack of data.

However, this also means that 73.57% of races *do* form a closed data set where we have prior information about races that we can use to inform predictions

Step 4: heck to see if we have a closed data set with respect to jockies i.e. give all the previous history we know, how many races include jockies that we have never seen race before? And how many times did those jockies win races?

In [6]:
class PreviousJockiesAnalysisRaceDataProcessor(RaceDataProcessor):
    def initialize(self, history : pd.DataFrame) -> None:
        super().initialize(history)
        self.__update_jockey_stats()
        self.__total_days = 0
        self.__total_processed_races = 0
        self.__total_known_jockies = 0
        self.__total_unknown_jockies = 0
        self.__total_winning_known_jockies = 0
        self.__total_winning_unknown_jockies = 0
        self.__total_races_with_unknown_jockies = 0

    def update(self, daily_results : pd.DataFrame) -> None:
        self.__total_days += 1
        self.__total_processed_races += daily_results['RaceId'].nunique()
        unique_jockies = daily_results['JockeyId'].nunique()
        known_unique_jockies = daily_results[daily_results['JockeyId'].isin(self.__known_jockies)]['JockeyId'].nunique()
        self.__total_known_jockies += known_unique_jockies
        self.__total_unknown_jockies += unique_jockies - known_unique_jockies
        winners = daily_results[daily_results['FinishingPosition'] == 1]
        known_winners = winners[winners['JockeyId'].isin(self.__known_jockies)]['JockeyId'].nunique()
        self.__total_winning_known_jockies += known_winners
        self.__total_winning_unknown_jockies += len(winners) - known_winners

        races_with_known_jockey_counts = daily_results.groupby('RaceId').apply(lambda df: self.__calculate_counts_for_race_group(df))
        races_with_any_unknown_jockies = races_with_known_jockey_counts[races_with_known_jockey_counts["JockeyCount"] != races_with_known_jockey_counts["KnownJockeyCount"]]
        self.__total_races_with_unknown_jockies += races_with_any_unknown_jockies.reset_index()['RaceId'].count()

    def post_update(self, daily_results : pd.DataFrame) -> None:
        super().post_update(daily_results)
        self.__update_jockey_stats()

    def after_process_data(self) -> None:
        percentage_unknown_jockies = 100.0 * self.__total_unknown_jockies / (self.__total_unknown_jockies + self.__total_known_jockies)
        percentage_unknown_winners = 100.0 * self.__total_winning_unknown_jockies / (self.__total_winning_unknown_jockies + self.__total_winning_known_jockies)
        percentage_races_with_unknown_jockies = 100.0 * self.__total_races_with_unknown_jockies / (self.__total_races_with_unknown_jockies + self.__total_processed_races)
        print(
            f'Previous jockey data for last {self.__total_days} days / {self.__total_processed_races} races:\n'
            f'  Total known jockies: {self.__total_known_jockies}\n'
            f'  Total unknown jockies: {self.__total_unknown_jockies} ({percentage_unknown_jockies:.2f} %)\n'
            f'  Total known winners: {self.__total_winning_known_jockies}\n'
            f'  Total unknown winners: {self.__total_winning_unknown_jockies} ({percentage_unknown_winners:.2f} %)\n'
            f'  Races with unknown jockies: {self.__total_races_with_unknown_jockies} ({percentage_races_with_unknown_jockies:.2f} %)\n'            
            )
        pass        

    def __update_jockey_stats(self) -> None:
        self.__known_jockies = self.history['JockeyId'].unique().tolist()

    def __calculate_counts_for_race_group(self, race_group) -> pd.Series:
        new_columns = {}
        new_columns['JockeyCount'] = race_group['JockeyId'].count()
        new_columns['KnownJockeyCount'] = race_group[race_group['JockeyId'].isin(self.__known_jockies)]['JockeyId'].count()
        return pd.Series(new_columns, index=['JockeyCount', 'KnownJockeyCount']) 

In [22]:
PreviousJockiesAnalysisRaceDataProcessor().process_race_data(history)

Previous jockey data for last 30 days / 1651 races:
  Total known jockies: 7723
  Total unknown jockies: 57 (0.73 %)
  Total known winners: 1422
  Total unknown winners: 537 (27.41 %)
  Races with unknown jockies: 43 (2.54 %)



**Conclusions from Step 4**: A very small number of races include unknown jockies (2.54%). These races should be excluded from analysis for the same reasons outlined above for races with unknown horses.

In [7]:
class RaceDataProcessorIgnoringUnknownRunnersOrJockies(RaceDataProcessor):
    def initialize(self, history : pd.DataFrame) -> None:
        super().initialize(history)
        self.__update_jockey_and_horse_stats()
        self._total_races = 0
        self._total_processed_races = 0
        self._total_ignored_races = 0

    def update(self, daily_results : pd.DataFrame) -> None:
        self.process_filtered_results(self.__remove_races_with_unknown_horses_or_jockies(daily_results))
        pass

    def process_filtered_results(self, daily_results : pd.DataFrame) -> None:
        pass

    def post_update(self, daily_results : pd.DataFrame) -> None:
        super().post_update(daily_results)
        self.__update_jockey_and_horse_stats()

    def __remove_races_with_unknown_horses_or_jockies(self, daily_results : pd.DataFrame) -> pd.DataFrame:
        df = daily_results.groupby('RaceId').apply(lambda g: self.__calculate_counts_for_race_group(g))
        races = daily_results['RaceId'].nunique()
        self._total_races += races
        df = df[(df['JockeyCount'] == df['KnownJockeyCount']) & (df['HorseCount'] == df['KnownHorseCount'])]
        races_to_process = df.reset_index()['RaceId'].unique().tolist()
        count_races_to_process = len(races_to_process)
        self._total_processed_races += count_races_to_process
        self._total_ignored_races += races - count_races_to_process
        return daily_results[daily_results['RaceId'].isin(races_to_process)]

    def __update_jockey_and_horse_stats(self) -> None:
        self.__known_jockies = self.history['JockeyId'].unique().tolist()
        self.__known_runners = self.history['HorseId'].unique().tolist()

    def __calculate_counts_for_race_group(self, race_group) -> pd.Series:
        new_columns = {}
        new_columns['HorseCount'] = race_group['HorseId'].count()
        new_columns['KnownHorseCount'] = race_group[race_group['HorseId'].isin(self.__known_runners)]['HorseId'].count()
        new_columns['JockeyCount'] = race_group['JockeyId'].count()
        new_columns['KnownJockeyCount'] = race_group[race_group['JockeyId'].isin(self.__known_jockies)]['JockeyId'].count()
        return pd.Series(new_columns, index=['HorseCount', 'KnownHorseCount', 'JockeyCount', 'KnownJockeyCount']) 
    
    def after_process_data(self) -> None:
        print(
            f'Processed {self._total_processed_races} of {self._total_races} races:\n'
            f'  Total ignored races: {self._total_ignored_races} (with unknown runner or jockey)'
            )

In [24]:
RaceDataProcessorIgnoringUnknownRunnersOrJockies().process_race_data(history)

Processed 1040 of 1651 races:
  Total ignored races: 611 (with unknown runner or jockey)


Step 5: Baseline predictions by choosing the first horse on each race card. 

Should be fairly random and allow us to score more real predictions against dumb luck.

In [99]:
class RacePredictor(RaceDataProcessorIgnoringUnknownRunnersOrJockies):
    def initialize(self, history: pd.DataFrame) -> None:
        super().initialize(history)
        self.__prediction_summary = pd.DataFrame(columns=['Date', 'Races', 'Predicted', 'Wins', 'Losses', 'Gains'])
        self.__predictions = None

    @property
    def prediction_summary(self) -> pd.DataFrame:
        return self.__prediction_summary 

    @property
    def predictions(self) -> pd.DataFrame:
        return self.__predictions

    def _update_predictions(self, predictions: pd.DataFrame) -> None:
        predicted = len(predictions)
        if predicted > 0:
            self.__predictions = predictions if self.__predictions is None else pd.concat([self.__predictions, predictions])
            prediction_start = predictions.loc[predictions.index[0], 'Off']
            staked = predicted # £1 stake per prediction
            winners = predictions[predictions['PredictedPosition'] == predictions['FinishingPosition']]
            wins = len(winners)
            losses = predicted - wins
            percentageWins = (wins / predicted) * 100.0;
            winnings = winners['DecimalOdds'].sum()
            percentageGains = ((winnings - losses) / staked) * 100.0;
            # print(f'Scored: {predicted}, Won: {wins}, Winnings (with £1 stake): {winnings}, Lost: {losses}, %gains/loss: {percentageGains}')

            row = pd.DataFrame([
                {
                    'Date': prediction_start, 
                    'Predicted': predicted, 
                    'Wins': wins, 
                    'Winnings': winnings,
                    'Losses': losses,
                    'PercentageWins': percentageWins,
                    'GainLoss': winnings - staked, 
                    'PercentGainLoss': percentageGains
                }])
            self.__prediction_summary = pd.concat([self.__prediction_summary, row], axis=0, ignore_index=True)

    def aggregate_prediction_summary(self) -> pd.DataFrame:
        if len(self.__prediction_summary) == 0:
            return pd.DataFrame(
                data = {
                    'Predicted': [0, 0, np.NAN, np.NAN, np.NAN, np.NAN], 
                    'Wins': [0, 0, np.NAN, np.NAN, np.NAN, np.NAN], 
                    'PercentageWins': [np.NAN, np.NAN, np.NAN, np.NAN, np.NAN, np.NAN],
                    'GainLoss': [0, 0, 0, 0, 0, 0],
                    'WinningsLoss': [0, 0, 0, 0, 0, 0]
                }, 
                index=['average', 'sum', 'std', 'min', 'max', 'skew'])

        return self.__prediction_summary.agg(
            {
                "Predicted" : ["average", "sum"],
                "Wins" : ["average", "sum"],
                "PercentageWins" : ["average", "std"],
                "GainLoss": ["min", "max", "average", "skew", "std", "sum"],
                "Winnings": ["min", "max", "average", "skew", "std", "sum"],
            }
        )


In [97]:
class FirstStallWinnerPredictor(RacePredictor):
    def process_filtered_results(self, daily_results : pd.DataFrame) -> None:
        predictions = daily_results.sort_values('RaceCardNumber', ascending=True).groupby('RaceId').first().copy()
        predictions['PredictedPosition'] = 1
        self._update_predictions(predictions)

    def after_process_data(self) -> None:
        super().after_process_data()
        print(self.aggregate_prediction_summary())

In [None]:
predictor = FirstStallWinnerPredictor()
predictor.process_race_data(history)
predictor.predictions.head()

**Conclusions from Step 5**: Betting £1 randomly on 1040 races results in an overall loss of £156 :(

Step 6: Analyse factors to understand if they have influence on the outcome of races. Bill Benter suggested the following attributes:

Current condition:
- performance in recent races
- time since last race
- recent workout data
- age of horse

Past performance:
- finishing position in past races
- lengths behind winner in past races
- normalized times of past races

Adjustments to past performance:
- strength of competition in past races
- weight carried in past races
- jockey's contribution to past performances
- compensation for bad luck in past races
- compensation for advantageous or disadvantageous post position in past races

Present race situational factors:
- weight to be carried
- today's jockey's ability
- advantages or disadvantages of the assigned post position

Preferences which could influence the horse's performance in today's race:
- distance preference
- surface preference (turf vs dirt)
- condition of surface preference (wet vs dry)
- specific track preference

Step 6.1: Define abstract feature factory for feature to the daily results that will be used to inform predictions

In [150]:
from typing import List

class FeatureFactory:
    @abstractmethod
    def add_features(self, history: pd.DataFrame, daily_results : pd.DataFrame) -> pd.DataFrame:
        pass


class FeaturePredictor(RacePredictor):
    def __init__(self, feature_factory: FeatureFactory) -> None:
        super().__init__()
        self.__feature_factory = feature_factory
        self.__features = None

    @property
    def features(self):
        return self.__features

    def process_filtered_results(self, daily_results : pd.DataFrame) -> None:
        daily_results_with_new_features = self.__feature_factory.add_features(self.history, daily_results.copy())
        self.__features = daily_results_with_new_features if self.__features is None else pd.concat([self.__features, daily_results_with_new_features])
        predictions = self.calculate_predictions(daily_results_with_new_features)
        self._update_predictions(predictions)
    
    @abstractmethod
    def calculate_predictions(self, daily_results : pd.DataFrame) -> pd.DataFrame:
        pass

    def after_process_data(self) -> None:
        super().after_process_data()
        print(self.aggregate_prediction_summary())


class FeatureBuilder(RaceDataProcessorIgnoringUnknownRunnersOrJockies):
    def __init__(self, feature_factories: List[FeatureFactory]) -> None:
        super().__init__()
        self.__feature_factories = feature_factories
        self.__features = None

    @property
    def features(self):
        return self.__features

    def process_filtered_results(self, daily_results : pd.DataFrame) -> None:
        daily_results_with_new_features = daily_results.copy()
        for feature_factory in self.__feature_factories:
            daily_results_with_new_features = feature_factory.add_features(self.history, daily_results_with_new_features)
        self.__features = daily_results_with_new_features if self.__features is None else pd.concat([self.__features, daily_results_with_new_features])



class LowestValueFeaturePredictor(FeaturePredictor):
    def __init__(self, feature_factory: FeatureFactory, prediction_feature_name: str, drop_races_with_missing_feature: bool = False) -> None:
        super().__init__(feature_factory)
        self.__prediction_feature_name = prediction_feature_name
        self.__drop_races_with_missing_features = drop_races_with_missing_feature

    def calculate_predictions(self, daily_results: pd.DataFrame) -> pd.DataFrame:
        if self.__drop_races_with_missing_features:
            predictable = daily_results.groupby('RaceId').filter(lambda g: g.isnull().values.sum() == 0)
        else:
            predictable = daily_results.dropna(axis=0, subset=[self.__prediction_feature_name])

        predictions = predictable.sort_values(self.__prediction_feature_name, ascending=True).groupby('RaceId').first().copy()
        predictions['PredictedPosition'] = 1
        return predictions


class HighestValueFeaturePredictor(FeaturePredictor):
    def __init__(self, feature_factory: FeatureFactory, prediction_feature_name: str, drop_races_with_missing_feature: bool = False) -> None:
        super().__init__(feature_factory)
        self.__prediction_feature_name = prediction_feature_name
        self.__drop_races_with_missing_features = drop_races_with_missing_feature

    def calculate_predictions(self, daily_results: pd.DataFrame) -> pd.DataFrame:
        if self.__drop_races_with_missing_features:
            predictable = daily_results.groupby('RaceId').filter(lambda g: g.isnull().values.sum() == 0)
        else:
            predictable = daily_results.dropna(axis=0, subset=[self.__prediction_feature_name])

        predictions = predictable.sort_values(self.__prediction_feature_name, ascending=False).groupby('RaceId').first().copy()
        predictions['PredictedPosition'] = 1
        return predictions



Step 6.2 Predict using performance in recent races. 

1. For each race in the last x months, sum the overall beaten distance and divide by the number of races (to average the performance over the period - since some horses may have ran more races than others in the given time frame)
1. Predict that the horse with be best/lowers past performance will win

In [103]:
class RecentRacePerformanceFeatureFactory(FeatureFactory):
    def __init__(self, days_to_process : int = 15) -> None:
        super().__init__()
        self.__days_to_process = days_to_process


    def add_features(self, history: pd.DataFrame, daily_results: pd.DataFrame) -> pd.DataFrame:
        history_end = history['Off'].max().date()
        recent_history_start = history_end - timedelta(days=self.__days_to_process)
        recent_history = history[history['Off'].dt.date >= recent_history_start]

        horse_performance = recent_history.groupby('HorseId').apply(lambda g: self.__calculate_overall_horse_performance(g))

        daily_results_with_new_features = pd.merge(daily_results, horse_performance, how='left', on=['HorseId'])
        return daily_results_with_new_features

    def __calculate_overall_horse_performance(self, race_group: pd.DataFrame) -> pd.Series:
        new_columns = {}
        new_columns['RaceCount'] = race_group['RaceId'].count()
        new_columns['RecentPerformance'] = race_group['OverallBeatenDistance'].sum() / race_group['RaceId'].count()
        return pd.Series(new_columns, index=['RaceCount', 'RecentPerformance']) 


In [143]:
for i in range(30, 31):
    predictor = LowestValueFeaturePredictor(RecentRacePerformanceFeatureFactory(days_to_process=i), 'RecentPerformance')
    print(f'Predicting results using {i} days of recent history')
    predictor.process_race_data(history)
    correlation = predictor.features[['Wins', 'RecentPerformance']].corr(method='spearman')['Wins']['RecentPerformance']
    print(f'  correlation = {correlation}')
    print()

Predicting results using 30 days of recent history
Processed 1040 of 1651 races:
  Total ignored races: 611 (with unknown runner or jockey)
           Predicted        Wins  PercentageWins    GainLoss    Winnings
average    33.366667    5.866667       17.343934   -5.910401   27.456266
sum      1001.000000  176.000000             NaN -177.312032  823.687968
std              NaN         NaN        7.746484   13.360445   16.117111
min              NaN         NaN             NaN  -29.000000    3.750000
max              NaN         NaN             NaN   17.950000   66.900000
skew             NaN         NaN             NaN    0.232433    0.776615
  correlation = -0.11712525433480271



Step 6.3 Predict using jockey performance in recent races. 

1. For each race in the last x months, sum the overall beaten distance for a given jockey and divide by the number of races (to average the performance over the period - since some horses may have ran more races than others in the given time frame)
1. Predict that the jockey with be best/lowers past performance will win

In [144]:
class RecentJockeyRacePerformanceFeatureFactory(FeatureFactory):
    def __init__(self, days_to_process : int = 15) -> None:
        super().__init__()
        self.__days_to_process = days_to_process


    def add_features(self, history: pd.DataFrame, daily_results: pd.DataFrame) -> pd.DataFrame:
        history_end = history['Off'].max().date()
        recent_history_start = history_end - timedelta(days=self.__days_to_process)
        recent_history = history[history['Off'].dt.date >= recent_history_start]

        jockey_performance = recent_history.groupby('JockeyId').apply(lambda g: self.__calculate_jockey_performance(g))

        daily_results_with_new_features = pd.merge(daily_results, jockey_performance, how='left', on=['JockeyId'])
        return daily_results_with_new_features

    def __calculate_jockey_performance(self, race_group: pd.DataFrame) -> pd.Series:
        new_columns = {}
        new_columns['RaceCount'] = race_group['RaceId'].count()
        new_columns['RecentJockeyPerformance'] = race_group['OverallBeatenDistance'].sum() / race_group['RaceId'].count()
        return pd.Series(new_columns, index=['RaceCount', 'RecentJockeyPerformance']) 

In [146]:
for i in range(30, 31):
    predictor = LowestValueFeaturePredictor(RecentJockeyRacePerformanceFeatureFactory(days_to_process=i), 'RecentJockeyPerformance')
    print(f'Predicting results using {i} days of recent jockey history')
    predictor.process_race_data(history)
    correlation = predictor.features[['Wins', 'RecentJockeyPerformance']].corr(method='spearman')['Wins']['RecentJockeyPerformance']
    print(f'  correlation = {correlation}')
    print()

Predicting results using 30 days of recent jockey history
Processed 1040 of 1651 races:
  Total ignored races: 611 (with unknown runner or jockey)
           Predicted        Wins  PercentageWins    GainLoss    Winnings
average    34.666667    5.633333       16.324256   -5.227456   29.439211
sum      1040.000000  169.000000             NaN -156.823665  883.176335
std              NaN         NaN        7.787383   15.359460   17.800923
min              NaN         NaN             NaN  -29.875000    0.000000
max              NaN         NaN             NaN   45.000000   83.000000
skew             NaN         NaN             NaN    0.989620    0.981410
  correlation = -0.007379161185814985



In [151]:
builder = FeatureBuilder([RecentJockeyRacePerformanceFeatureFactory(), RecentRacePerformanceFeatureFactory()])
builder.process_race_data(history)

Processed 1040 of 1651 races:
  Total ignored races: 611 (with unknown runner or jockey)


In [187]:
features = builder.features[['Wins', 'RecentJockeyPerformance', 'RecentPerformance', 'Age', 'WeightInPounds']]

In [188]:
features.corr(method='spearman')['Wins']


Wins                       1.000000
RecentJockeyPerformance   -0.000493
RecentPerformance         -0.131942
Age                        0.001105
WeightInPounds             0.073938
Name: Wins, dtype: float64

In [None]:
top_horses = history[['HorseId', 'RaceId']].groupby('HorseId').count().reset_index().sort_values('RaceId', ascending=False)['HorseId'].head(10).to_list()
top_horse_races = history[history['HorseId'].isin(top_horses)]['RaceId'].unique()

In [234]:
df = history[history['HorseId'].isin(top_horses)][['RaceType', 'WeightInPounds', 'HorseId', 'RaceId', 'Surface', 'Going', 'Wins', 'RaceTimeInSeconds', 'DistanceInMeters']]
df['Speed'] = df['DistanceInMeters'] / df['RaceTimeInSeconds']
df.sort_values(["HorseId", "Surface", "Going", "DistanceInMeters", "WeightInPounds"])
# TODO: calculate youngest horse in a given race, least weight, days since last race 
# speed on race type, surface, and going and adjust for weight carried
# - how does these features correlate with wins?


Unnamed: 0,RaceType,WeightInPounds,HorseId,RaceId,Surface,Going,Wins,RaceTimeInSeconds,DistanceInMeters,Speed
6401,Other,130,1431174,798414,AllWeather,Standard,0,61.4700,1005,16.349439
12802,Other,130,1431174,802356,AllWeather,Standard,0,61.5200,1005,16.336151
6002,Other,132,1431174,810161,AllWeather,Standard,0,61.5200,1005,16.336151
6631,Other,135,1431174,819781,AllWeather,Standard,0,62.0300,1005,16.201838
10850,Other,139,1431174,801573,AllWeather,Standard,0,61.0100,1005,16.472709
...,...,...,...,...,...,...,...,...,...,...
1523,Other,139,3564056,790519,Turf,Soft,1,86.4700,1407,16.271539
10875,Other,139,3564056,791991,Turf,Soft,0,85.0425,1407,16.544669
11165,Other,139,3564056,791991,Turf,Soft,0,85.0425,1407,16.544669
15255,Other,138,3564056,796464,Turf,Very Soft,0,105.9000,1608,15.184136


In [236]:
df[['WeightInPounds', 'Speed']].corr()

Unnamed: 0,WeightInPounds,Speed
WeightInPounds,1.0,-0.165697
Speed,-0.165697,1.0
