In [1]:
#%pip install numpy
#%pip install Pandas

In [2]:
import numpy as np
import pandas as pd

In [3]:
import glob
from datetime import timedelta
results_files = glob.glob('Results_*.csv')
results_files

history = pd.concat([pd.read_csv(f) for f in results_files])
history = history[history['ResultStatus'] == 'CompletedRace']
history['Off'] =  pd.to_datetime(history['Off'], format='%m/%d/%Y %H:%M:%S')
history_start = history['Off'].min().date()
history_end = history['Off'].max().date() - timedelta(days=1)
predict_window_size = 90

In [9]:
from typing import Callable

def predict_races_using(prediction_func: Callable[[pd.DataFrame, pd.DataFrame], pd.DataFrame], max_days: int = -1) -> pd.DataFrame:
    window_start = history_start
    loop_end = history_end - timedelta(days=predict_window_size)   
    daily_predictions_made = 1
    prediction_summary = pd.DataFrame(columns=['Date', 'Races', 'Predicted', 'Wins', 'Losses', 'Gains'])
    while window_start < loop_end:
        window_end = window_start + timedelta(days=predict_window_size)
        window = history[(history['Off'].dt.date >= window_start) & (history['Off'].dt.date < window_end)].copy()
        prediction_start = window_end + timedelta(days=1)
        prediction_end = prediction_start + timedelta(days=1)
        race_cards = history[(history['Off'].dt.date >= prediction_start) & (history['Off'].dt.date < prediction_end)].copy()
        race_count = race_cards["RaceId"].nunique()
        
        predictions = prediction_func(race_cards, window)

        predicted = len(predictions)
        staked = predicted # £1 stake per prediction
        winners = predictions[predictions['PredictedPosition'] == predictions['FinishingPosition']]
        wins = len(winners)
        losses = predicted - wins
        percentageWins = (wins / predicted) * 100.0;
        winnings = winners['DecimalOdds'].sum()
        percentageGains = ((winnings - losses) / staked) * 100.0;
        print(f'Scored: {predicted}, Won: {wins}, Winnings (with £1 stake): {winnings}, Lost: {losses}, %gains/loss: {percentageGains}')

        row = pd.DataFrame([
            {
                'Date': prediction_start, 
                'Races': race_count, 
                'Predicted': predicted, 
                'Wins': wins, 
                'Winnings': winnings,
                'Losses': losses,
                'PercentageWins': percentageWins,
                'GainLoss': winnings - staked, 
                'PercentGainLoss': percentageGains
            }])
        prediction_summary = pd.concat([prediction_summary, row], axis=0, ignore_index=True)

        window_start = window_start + timedelta(days=1)
        daily_predictions_made = daily_predictions_made + 1
        if max_days != -1 and daily_predictions_made > max_days:
            break
    return prediction_summary

## First Runner Predictor
Baseline predictions by choosing the first horse on each race card.

Should be fairly random and allow us to score more **real** predictions against dumb luck.

In [5]:
def first_runner_predictor(race_cards : pd.DataFrame, prior_race_results: pd.DataFrame) -> pd.DataFrame:
    results = race_cards.sort_values('RaceCardNumber').groupby('RaceId').first().copy()
    results['PredictedPosition'] = 1
    return results

In [10]:
first_runner_predictions = predict_races_using(first_runner_predictor, max_days=10)
first_runner_predictions 

Scored: 52, Won: 13, Winnings (with £1 stake): 95.78205128205128, Lost: 39, %gains/loss: 109.19625246548324
Scored: 30, Won: 2, Winnings (with £1 stake): 5.7, Lost: 28, %gains/loss: -74.33333333333334
Scored: 58, Won: 9, Winnings (with £1 stake): 46.875757575757575, Lost: 49, %gains/loss: -3.6624869383490086
Scored: 51, Won: 8, Winnings (with £1 stake): 33.919444444444444, Lost: 43, %gains/loss: -17.805010893246187
Scored: 60, Won: 16, Winnings (with £1 stake): 90.93560606060606, Lost: 44, %gains/loss: 78.2260101010101
Scored: 90, Won: 16, Winnings (with £1 stake): 80.12575757575758, Lost: 74, %gains/loss: 6.806397306397306
Scored: 57, Won: 10, Winnings (with £1 stake): 31.749999999999996, Lost: 47, %gains/loss: -26.754385964912288
Scored: 38, Won: 6, Winnings (with £1 stake): 31.333333333333336, Lost: 32, %gains/loss: -1.7543859649122744
Scored: 42, Won: 9, Winnings (with £1 stake): 31.144444444444442, Lost: 33, %gains/loss: -4.417989417989424
Scored: 42, Won: 11, Winnings (with £1 st

Unnamed: 0,Date,Races,Predicted,Wins,Losses,Gains,Winnings,PercentageWins,GainLoss,PercentGainLoss
0,2022-06-06,52,52,13,39,,95.782051,25.0,43.782051,109.196252
1,2022-06-07,30,30,2,28,,5.7,6.666667,-24.3,-74.333333
2,2022-06-08,58,58,9,49,,46.875758,15.517241,-11.124242,-3.662487
3,2022-06-09,51,51,8,43,,33.919444,15.686275,-17.080556,-17.805011
4,2022-06-10,60,60,16,44,,90.935606,26.666667,30.935606,78.22601
5,2022-06-11,90,90,16,74,,80.125758,17.777778,-9.874242,6.806397
6,2022-06-12,57,57,10,47,,31.75,17.54386,-25.25,-26.754386
7,2022-06-13,38,38,6,32,,31.333333,15.789474,-6.666667,-1.754386
8,2022-06-14,42,42,9,33,,31.144444,21.428571,-10.855556,-4.417989
9,2022-06-15,42,42,11,31,,83.948718,26.190476,41.948718,126.068376


In [11]:
first_runner_predictions.agg(
    {
        "PercentageWins" : ["average", "std"],
        "GainLoss": ["min", "max", "average", "skew", "std", "sum"],
        "Winnings": ["min", "max", "average", "skew", "std", "sum"],
    }
)

Unnamed: 0,PercentageWins,GainLoss,Winnings
average,18.826701,1.151511,53.151511
std,6.157394,26.923038,31.623393
min,,-25.25,5.7
max,,43.782051,95.782051
skew,,0.892826,0.154603
sum,,11.515113,531.515113


## Best rating predictor

Predict the winner of a race using the racing post rating

In [42]:
def best_rating_runner_predictor(race_cards : pd.DataFrame, prior_race_results: pd.DataFrame) -> pd.DataFrame:
    results = race_cards.sort_values('RacingPostRating', ascending=False).groupby('RaceId').first().copy()
    results['PredictedPosition'] = 1
    return results

In [43]:
best_rating_runner_predictions = predict_races_using(best_rating_runner_predictor, max_days=10)
best_rating_runner_predictions

Scored: 52, Won: 36, Winnings (with £1 stake): 242.18443223443225, Lost: 16, %gains/loss: 434.97006198929273
Scored: 30, Won: 20, Winnings (with £1 stake): 107.05833333333335, Lost: 10, %gains/loss: 323.5277777777779
Scored: 58, Won: 37, Winnings (with £1 stake): 236.09794372294374, Lost: 21, %gains/loss: 370.85852366024784
Scored: 51, Won: 34, Winnings (with £1 stake): 225.27222222222218, Lost: 17, %gains/loss: 408.3769063180827
Scored: 60, Won: 40, Winnings (with £1 stake): 225.205303030303, Lost: 20, %gains/loss: 342.00883838383834
Scored: 90, Won: 59, Winnings (with £1 stake): 371.6848484848485, Lost: 31, %gains/loss: 378.53872053872055
Scored: 57, Won: 40, Winnings (with £1 stake): 194.7138888888889, Lost: 17, %gains/loss: 311.7787524366472
Scored: 38, Won: 30, Winnings (with £1 stake): 160.60075757575757, Lost: 8, %gains/loss: 401.5809409888357
Scored: 42, Won: 28, Winnings (with £1 stake): 181.23055555555553, Lost: 14, %gains/loss: 398.16798941798936
Scored: 42, Won: 30, Winning

Unnamed: 0,Date,Races,Predicted,Wins,Losses,Gains,Winnings,PercentageWins,GainLoss,PercentGainLoss
0,2022-06-06,52,52,36,16,,242.184432,69.230769,190.184432,434.970062
1,2022-06-07,30,30,20,10,,107.058333,66.666667,77.058333,323.527778
2,2022-06-08,58,58,37,21,,236.097944,63.793103,178.097944,370.858524
3,2022-06-09,51,51,34,17,,225.272222,66.666667,174.272222,408.376906
4,2022-06-10,60,60,40,20,,225.205303,66.666667,165.205303,342.008838
5,2022-06-11,90,90,59,31,,371.684848,65.555556,281.684848,378.538721
6,2022-06-12,57,57,40,17,,194.713889,70.175439,137.713889,311.778752
7,2022-06-13,38,38,30,8,,160.600758,78.947368,122.600758,401.580941
8,2022-06-14,42,42,28,14,,181.230556,66.666667,139.230556,398.167989
9,2022-06-15,42,42,30,12,,240.3419,71.428571,198.3419,543.67119


In [46]:
best_rating_runner_predictions.agg(
    {
        "PercentageWins" : ["average", "std"],
        "GainLoss": ["min", "max", "average", "skew", "std", "sum"],
        "Winnings": ["min", "max", "average", "skew", "std", "sum"],
    }
)

Unnamed: 0,PercentageWins,GainLoss,Winnings
average,68.579747,166.439018,218.439018
std,4.285254,54.301816,68.82381
min,,77.058333,107.058333
max,,281.684848,371.684848
skew,,0.638783,0.847309
sum,,1664.390185,2184.390185


## Fastest horse predictor

Predict the winner of a race by calculating the fastest horse in the race using the historic date on previous runs over the same race type (flat, hurdles, etc.), distance type (short, medium, long distance) and going (firm, soft, good etc.)

#### Classify distances by type

In [14]:
def calculate_distance_type(row):
    if row['DistanceInMeters'] < 1300:
        return 'VeryShort'
    elif row['DistanceInMeters'] < 1700:
        return 'Short'
    elif row['DistanceInMeters'] < 3000:
        return 'Medium'
    elif row['DistanceInMeters'] < 4000:        
        return 'Long'
    else:
        return 'VeryLong'

#### Calculate average speed over previous races

Sum the total distance of previous races and divide by total race time. This function will be used to aggregate results that have already been grouped by race type, distance type, and going. 

In [15]:
def calculate_average_speed_over_previous_races(x):
    d = {}
    d['RacesRan'] = x['HorseId'].count()
    d['TotalDistanceInMeters'] = x['DistanceInMeters'].sum()
    d['TotalTimeInSeconds'] = x['RaceTimeInSeconds'].sum()
    d['AverageSpeed'] = d['TotalDistanceInMeters'] / d['TotalTimeInSeconds']
    return pd.Series(d, index=['RacesRan', 'TotalDistanceInMeters', 'TotalTimeInSeconds', 'AverageSpeed'])

In [16]:
def calculate_speed_race_aggregates(x):
    d = {}
    d['HorseCount'] = x['HorseId'].count()
    d['PreviouslyRanOnSimilarCourseCount'] = x[x['AverageSpeed'] > 0]['HorseId'].count()
    return pd.Series(d, index=['HorseCount', 'PreviouslyRanOnSimilarCourseCount'])

In [44]:
def average_speed_predictor(race_cards : pd.DataFrame, prior_race_results: pd.DataFrame) -> pd.DataFrame:
    print('Calculating distance types...')
    prior_race_results['DistanceType'] = prior_race_results.apply(calculate_distance_type, axis=1)
    print(f'Calculated distance types for {len(prior_race_results)} history rows...')
    race_cards['DistanceType'] = race_cards.apply(calculate_distance_type, axis=1)
    print(f'Calculated distance types for {len(race_cards)} race cards rows...')
    
    print('Calculating average speeds...')    
    average_speeds = prior_race_results.groupby(['HorseId', 'RaceType', 'DistanceType', 'Going']).apply(calculate_average_speed_over_previous_races)
    print(f'Calculated average speeds for {len(average_speeds)} horses...')
    results_with_speeds = pd.merge(race_cards, average_speeds, how='left', on=['HorseId', 'RaceType', 'DistanceType', 'Going'])
    results_with_speeds['AverageSpeed'] = results_with_speeds['AverageSpeed'].fillna(0)

    # Filter out races that don't have "enough" horses with average speeds
    minimum_previous_runners = 1
    races_with_speed_counts = results_with_speeds.groupby('RaceId').apply(calculate_speed_race_aggregates)
    races_ids_to_predict = races_with_speed_counts[races_with_speed_counts['PreviouslyRanOnSimilarCourseCount'] > minimum_previous_runners].reset_index()['RaceId'].to_list()
    races_to_predict = results_with_speeds[results_with_speeds['RaceId'].isin(races_ids_to_predict) & results_with_speeds['AverageSpeed'] > 0]

    results = races_to_predict.sort_values('AverageSpeed', ascending=False).groupby('RaceId').first().copy()
    results['PredictedPosition'] = 1  
    return results    


In [45]:
average_speed_predictions = predict_races_using(average_speed_predictor, max_days=1)
average_speed_predictions

Calculating distance types...
Calculated distance types for 49776 history rows...
Calculated distance types for 478 race cards rows...
Calculating average speeds...
Calculated average speeds for 35756 horses...
Scored: 18, Won: 4, Winnings (with £1 stake): 44.83333333333333, Lost: 14, %gains/loss: 171.29629629629628


Unnamed: 0,Date,Races,Predicted,Wins,Losses,Gains,Winnings,PercentageWins,GainLoss,PercentGainLoss
0,2022-06-06,52,18,4,14,,44.833333,22.222222,26.833333,171.296296
