# Horse Racing Dataset Models

In [1]:
import pandas as pd
import numpy as np
import random as rand

# Suppress scientific notation: render floats with two decimal places
pd.set_option('display.float_format', '{:.2f}'.format)

import warnings

# Install a blanket 'ignore' filter covering every warning category.
warnings.simplefilter('ignore')

# Show the pandas version this notebook is running on.
print(pd.__version__)

1.4.2


In [2]:
# Input: CleanedHorses.csv, the file written by the HorseRacingDataCleaning notebook
df_cleaned = pd.read_csv(
    'CleanedHorses.csv',
    low_memory=False,
    skipinitialspace=True,
)

df_cleaned.head()

Unnamed: 0,date,venue_name,race_number,market_name,previous_margin,position_again,bf_odds,bf_odds_two_mins_out,vic_tote,vic_tote_two_mins_out,...,track_win_percent_norm,track_place_percent_norm,distance_win_percent_norm,distance_place_percent_norm,condition_win_percent_norm,condition_starts_norm,condition_place_percent_norm,prize_money_per_start_norm,bf_odds_place,target
0,2016-06-27,Echuca,3,R3 1200m Mdn,6.8,1.0,2.88,3.0,2.3,3.5,...,,,,1.0,,0.0,,0.2,1.63,2.88
1,2016-06-27,Echuca,3,R3 1200m Mdn,20.8,2.0,15.0,18.0,11.2,11.7,...,,,,1.0,,0.33,,0.47,4.2,4.2
2,2016-06-27,Echuca,3,R3 1200m Mdn,,3.0,95.0,100.0,46.4,37.1,...,,,,0.0,,1.0,,0.29,21.2,21.2
3,2016-06-27,Echuca,3,R3 1200m Mdn,4.0,,20.0,17.0,13.2,14.2,...,,,,0.0,,0.0,,0.11,5.27,0.0
4,2016-06-27,Echuca,3,R3 1200m Mdn,,,2.74,2.68,2.6,3.2,...,,,,0.0,,0.0,,0.08,1.6,0.0


In [3]:
df_cleaned = df_cleaned.drop(
    columns=[
        'position_again',
        # Columns that cannot be easily converted into dummy variables
        'name',
        'runner_name_uuid',
        'sire',
        'dam',
        'jockey',
        'trainer',
    ]
)

In [4]:
# Per-column NaN counts, keeping only columns where more than half of the rows are missing
(
    df_cleaned
    .isna()
    .sum()
    .where(lambda na_counts: na_counts > len(df_cleaned) * .5)
    .dropna()
)

penalty_norm               238331.00
runs_since_spell_norm      238331.00
firm_starts_norm           128042.00
firm_wins_norm             198389.00
firm_places_norm           176606.00
heavy_wins_norm            128625.00
class_same_wins_norm       120453.00
class_stronger_wins_norm   134189.00
track_distance_wins_norm   128373.00
distance_norm              238331.00
track_win_percent_norm     146614.00
track_place_percent_norm   122103.00
dtype: float64

In [5]:
# Remove the three columns with the highest NaN count (238331) in the summary above
df_cleaned = df_cleaned.drop(
    columns=['penalty_norm', 'runs_since_spell_norm', 'distance_norm']
)

### Creating dummy variables:

In [6]:
# Move the race-identifying columns into the index, then order rows by date, venue and race number
df_cleaned = (
    df_cleaned
    .set_index(['date', 'venue_name', 'race_number', 'market_name'])
    .sort_index(level=['date', 'venue_name', 'race_number'])
)

In [7]:
# Number the rows 0..n-1 in their current order and append that counter as an extra index level
df_cleaned = (
    df_cleaned
    .assign(uniq_idx=range(len(df_cleaned)))
    .set_index('uniq_idx', append=True)
)

In [8]:
# Names of all object-dtype columns; these are the ones turned into dummy variables
categorical_list = df_cleaned.select_dtypes(include='object').columns.tolist()

In [9]:
# One-hot encode the categorical columns; dummy_na=True also adds a NaN indicator column for each
df_cleaned = pd.get_dummies(
    df_cleaned,
    columns=categorical_list,
    dummy_na=True,
)

### Replacing NaNs:

In [10]:
# Names of all numeric-dtype columns in the frame at this point
numerical_list = df_cleaned.select_dtypes(include='number').columns.tolist()

In [11]:
# For every numeric column with at least one NaN, add a 0/1 companion column "<col>_nan"
for col in numerical_list:
    is_missing = df_cleaned[col].isna()
    if not is_missing.any():
        continue
    df_cleaned[f'{col}_nan'] = np.where(is_missing, 1, 0)

In [12]:
# Replace every remaining NaN with 0
df_cleaned = df_cleaned.fillna(0)

### Creating a Train-Test Split:

In [13]:
from math import floor

In [14]:
# One group per (date, venue_name, race_number, market_name) combination.
df_grouped = df_cleaned.groupby(['date','venue_name','race_number','market_name'])

# Iterate the GroupBy directly rather than calling get_group() once per key:
# it yields the same sub-frames in the same sorted-key order without first
# materialising the full `.groups` mapping and repeating a lookup per group.
df_grouped_list = [group_frame for _, group_frame in df_grouped]
len_sgl = len(df_grouped_list)

In [15]:
# Discard the first four index levels (the race identifiers), leaving `uniq_idx` as the row index
df_cleaned = df_cleaned.reset_index(
    level=['date', 'venue_name', 'race_number', 'market_name'],
    drop=True,
)

In [16]:
# The first 90% of the groups go to training; the remaining groups are held out for testing
split_point = floor(.9*len_sgl)
train_grouped = df_grouped_list[:split_point]
test_grouped = df_grouped_list[split_point:]

In [17]:
# Collect the last index level (the per-row `uniq_idx`) of every row in each subset of groups
train_indices = [
    int(row_id)
    for group_frame in train_grouped
    for row_id in group_frame.index.get_level_values(-1)
]
test_indices = [
    int(row_id)
    for group_frame in test_grouped
    for row_id in group_frame.index.get_level_values(-1)
]

In [18]:
# Label-based row selection for each subset
train, test = (
    df_cleaned.loc[row_ids]
    for row_ids in (train_indices, test_indices)
)

### Oversampling:

In [19]:
# Training rows whose target is exactly 0 versus rows with any other target value
zero_count = train.loc[train['target'].eq(0), 'target'].count()
non_zero_count = train.loc[train['target'].ne(0), 'target'].count()

print("Non-Winning/Non-Placing:", zero_count)
print("Winning/Placing:", non_zero_count)

Non-Winning/Non-Placing: 153399
Winning/Placing: 61735


In [20]:
# Index labels of the training rows with a non-zero target (the minority class).
non_zero_idx_list = train['target'][train['target'] != 0].index

# How many extra minority rows are needed to match the zero-target count.
# Clamped at zero so an already balanced (or reversed) split adds nothing.
oversample_count = max(0, int(zero_count - non_zero_count))

# A dedicated, seeded generator makes the resampled training set identical on
# every Restart & Run All; the module-level `rand` state is left untouched.
oversample_rng = rand.Random(0)

# Sample with replacement in one call instead of one rand.choice() per row.
idx_list = (
    oversample_rng.choices(list(non_zero_idx_list), k=oversample_count)
    if oversample_count
    else []
)

In [21]:
# Stack the rows selected by idx_list underneath the original training rows
df_cleaned_new = pd.concat(
    [train, train.loc[idx_list]],
    axis=0,
)
df_cleaned_new.shape

(306798, 459)

In [22]:
# Same class tally as before, now on the oversampled training frame
zero_count_new = df_cleaned_new.loc[df_cleaned_new['target'].eq(0), 'target'].count()
non_zero_count_new = df_cleaned_new.loc[df_cleaned_new['target'].ne(0), 'target'].count()

print("Non-Winning/Non-Placing:", zero_count_new)
print("Winning/Placing:", non_zero_count_new)

Non-Winning/Non-Placing: 153399
Winning/Placing: 153399


In [23]:
# Replace the index with a fresh 0..n-1 range, discarding the old labels
df_cleaned_new = df_cleaned_new.reset_index(drop=True)

In [24]:
# Preview the first five rows of the oversampled training frame
df_cleaned_new.head(n=5)

Unnamed: 0,previous_margin,bf_odds,bf_odds_two_mins_out,vic_tote,vic_tote_two_mins_out,nsw_tote,nsw_tote_two_mins_out,nsw_odds,betfair_slope,vic_tote_slope,...,overall_win_percent_norm_nan,overall_place_percent_norm_nan,track_win_percent_norm_nan,track_place_percent_norm_nan,distance_win_percent_norm_nan,distance_place_percent_norm_nan,condition_win_percent_norm_nan,condition_starts_norm_nan,condition_place_percent_norm_nan,prize_money_per_start_norm_nan
0,6.8,2.88,3.0,2.3,3.5,2.4,3.3,2.6,-0.0,-0.0,...,1,0,1,1,1,0,1,0,1,0
1,20.8,15.0,18.0,11.2,11.7,12.0,12.3,11.0,-0.0,-0.0,...,1,0,1,1,1,0,1,0,1,0
2,0.0,95.0,100.0,46.4,37.1,40.4,35.0,51.0,0.21,0.04,...,1,0,1,1,1,0,1,0,1,0
3,4.0,20.0,17.0,13.2,14.2,11.8,13.6,12.0,0.01,-0.01,...,1,0,1,1,1,0,1,0,1,0
4,0.0,2.74,2.68,2.6,3.2,3.4,3.6,2.6,0.0,-0.0,...,1,0,1,1,1,0,1,0,1,0


In [25]:
# Features are every column except 'target'; targets are kept as single-column DataFrames
X_train = df_cleaned_new.drop(columns='target')
y_train = df_cleaned_new[['target']]

X_test = test.drop(columns='target')
y_test = test[['target']]

### Creating the models:

In [26]:
from sklearn.ensemble import RandomForestRegressor

from time import time
from sklearn.metrics import mean_absolute_error, r2_score, precision_score
from sklearn.model_selection import RandomizedSearchCV

In [27]:
from sklearn.linear_model import Ridge

In [28]:
# Ridge regression with a light L2 penalty (alpha=0.1).
# `normalize` is intentionally not passed: it was deprecated in scikit-learn 1.0
# and removed in 1.2, where passing it raises a TypeError. The previously passed
# value (False) was the default, so the fitted model is unchanged.
clf_ridge = Ridge(alpha=0.1, copy_X=True, fit_intercept=True, max_iter=None, random_state=0,
                  solver='auto', tol=0.001)

In [29]:
# Search space for the random-forest hyperparameter search.
# max_features: 1.0 (use all features) replaces the string 'auto', which meant
# the same thing for RandomForestRegressor but is deprecated and rejected by
# newer scikit-learn releases.
params = {'bootstrap': [True, False],
         'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
         'max_features': [1.0, 'sqrt'],
         'min_samples_leaf': [1, 2, 4],
         'min_samples_split': [2, 5, 10],
         'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]}


# Random forest with fixed hyperparameters.
# `criterion` is left at its default, which is squared error under either
# spelling: the old name 'mse' was deprecated in scikit-learn 1.0 and removed
# in 1.2, while the new name 'squared_error' does not exist before 1.0.
clf_rfr = RandomForestRegressor(bootstrap=False, max_depth=30, max_features='sqrt', max_leaf_nodes=None,
                                min_impurity_decrease=0.0, min_samples_leaf=1, min_samples_split=5, 
                                min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=None, oob_score=False, random_state=0,
                                verbose=0, warm_start=False)

In [30]:
# Randomized hyperparameter search over `params` for the random forest.
# random_state pins which candidate combinations are sampled, so the search is
# repeatable across kernel restarts.
grid_rfr = RandomizedSearchCV(estimator=clf_rfr, param_distributions=params, random_state=0, verbose=2)

#### A function to fit, predict, and score models:

In [31]:
def model_function(model,X_train,y_train,X_test,y_test):
    """Fit ``model``, then print regression metrics and a pick-based summary.

    The model is fitted on the training data (only the fit is timed) and then
    used to predict both the training and the test targets. MAE and R2 are
    printed for both sets, followed by the fit time in seconds.

    The "Test Results" section treats a test row as *picked* when its
    predicted target is greater than 1, and as a *correct pick* when it is
    picked and its actual target is also greater than 1. "Total Return" is the
    sum of the actual targets of the correct picks minus the number of picks
    (presumably a one-unit stake per pick -- confirm against how ``target``
    is built).

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``.
    X_train, y_train : training features and target.
    X_test, y_test : held-out features and target. ``y_test`` is expected to
        hold a single target value per row.

    Returns
    -------
    The same ``model`` object, fitted.

    Notes
    -----
    Ratios whose denominator is zero (for example precision when nothing is
    picked) are reported as NaN instead of raising ``ZeroDivisionError``.
    """
    start = time()
    model.fit(X_train,y_train)
    end = time()

    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)

    print(f"Train MAE: {mean_absolute_error(y_train, y_pred_train)}")
    print(f"Test MAE: {mean_absolute_error(y_test, y_pred_test)}")
    print()
    print(f"Train R2: {r2_score(y_train, y_pred_train)}")
    print(f"Test R2: {r2_score(y_test, y_pred_test)}") 
    print()
    print(f"Time: {end-start}")
    print()

    # Flatten to 1-D so a single-column DataFrame and (n,) or (n, 1)
    # predictions line up element-wise.
    actual = np.asarray(y_test, dtype=float).ravel()
    predicted = np.asarray(y_pred_test, dtype=float).ravel()

    picked = predicted > 1
    correct = picked & (actual > 1)

    total_horses = len(y_test)
    total_picks = int(picked.sum())
    winning_targets = actual[correct]
    correct_count = int(winning_targets.size)

    # The mean of an empty selection is undefined; report NaN without a warning.
    average_win = float(winning_targets.mean()) if correct_count else float('nan')
    # Summing directly keeps the return finite (-total_picks) when no pick is correct.
    total_return = float(winning_targets.sum()) - total_picks

    print("Test Results")
    print("Total Horses:",total_horses)
    print("Total Picks:",total_picks)
    print("Percent of Horses Picked:",_ratio_or_nan(total_picks, total_horses)*100)
    print("Correct Picks:",correct_count)
    print("Precision:", _ratio_or_nan(correct_count, total_picks)*100)
    print("Average Win Odds:",average_win)
    print("Total Return:",total_return)
    print("Average Expected Return:",_ratio_or_nan(total_return, total_picks))
    print()

    return model


def _ratio_or_nan(numerator, denominator):
    """Return ``numerator / denominator``, or NaN when the denominator is zero."""
    return numerator / denominator if denominator else float('nan')

### Ridge Regression and Random Forest

In [32]:
model_ridge = model_function(model=clf_ridge, 
                                X_train=X_train, 
                                y_train=y_train, 
                                X_test=X_test,
                                y_test=y_test)

Train MAE: 3.340814306935625
Test MAE: 3.329286813441329

Train R2: 0.02997591522889409
Test R2: -0.09171493242227102

Time: 9.601893186569214

Test
Total Horses: 23197
Total Picks: 22122
Percent of Horses Picked: 95.36578005776609
Correct Picks: 6399
Precision: 28.925956061838892
Average Win Odds: 6.173478668541961
Total Return: 17382.090000000004
Average Expected Return: 0.7857377271494441



In [33]:
model_rfr = model_function(model=clf_rfr, 
                                X_train=X_train, 
                                y_train=y_train, 
                                X_test=X_test,
                                y_test=y_test)

Train MAE: 0.5962784422227704
Test MAE: 2.770083519598643

Train R2: 0.9680129394740036
Test R2: -0.021802956921210814

Time: 996.3682701587677

Test
Total Horses: 23197
Total Picks: 22490
Percent of Horses Picked: 96.95219209380524
Correct Picks: 6784
Precision: 30.164517563361493
Average Win Odds: 5.957169811320755
Total Return: 17923.440000000002
Average Expected Return: 0.796951534015118

