# Predictive Analytics in Horse Racing using Gradient Boosting and AdaBoost

### *Group Members*

- Harrye Fredericksen (23020603)
- Kobe Spring (24004428)
- Lagi Rabo (04225368)
- Jason Wing (16339768)

## Introduction


Horse racing 

### ***Imports***

In [105]:
import pandas as pd
import numpy as np
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, f1_score, accuracy_score
from sklearn.model_selection import RandomizedSearchCV

import matplotlib.pyplot as plt


### ***Dataset***



In [None]:
#Harrye please show the code that was used to obtain this data and place in any cleaning code that was performed - make sure to have it documented down here as EXTENSIVELY as possible please.

In [11]:
cleaned_df = pd.read_csv("cleaned_data_v1.csv")
cleaned_df.head(10)

Unnamed: 0,course_id,race_id,dist_m,going,ran,pos,draw,ovr_btn,btn,horse_id,...,has_blinkers,has_tongue_tie,has_visor,has_cheekpieces,has_eye_hood,has_eyecover,has_eyeshield,has_hood,no_headgear,first_time_equipment
0,297,308856,3200,5.0,22,1,13.0,0.0,0.0,548204,...,0,0,0,0,0,0,0,0,0,0
1,297,308856,3200,5.0,22,2,15.0,0.75,0.75,518856,...,0,0,0,0,0,0,0,0,0,0
2,297,308856,3200,5.0,22,3,11.0,6.75,6.0,447498,...,0,0,0,0,0,0,0,0,0,0
3,297,308856,3200,5.0,22,4,19.0,7.5,0.75,458259,...,1,0,0,0,0,0,0,0,0,0
4,297,308856,3200,5.0,22,5,10.0,7.75,0.3,554624,...,0,0,0,0,0,0,0,0,0,0
5,297,308856,3200,5.0,22,6,23.0,7.75,0.05,554625,...,0,0,0,0,0,0,0,0,0,0
6,297,308856,3200,5.0,22,7,7.0,8.5,0.75,519815,...,1,0,0,0,0,0,0,0,0,0
7,297,308856,3200,5.0,22,8,21.0,8.75,0.2,504666,...,1,0,0,0,0,0,0,0,0,0
8,297,308856,3200,5.0,22,9,17.0,10.25,1.5,529388,...,0,0,0,0,0,0,0,0,0,0
9,297,308856,3200,5.0,22,10,3.0,12.0,1.75,514914,...,0,0,0,0,0,0,0,0,0,0


### Harrye

In [None]:
#Under your name enter in the ML code that you have created - ADDITIONALLY PLEASE write a relatively extensive paragraph AT minimum 5-6 sentences long that outlines what your ML question is looking at and why etc. I recommend everyone mnake their own initial copy of the df please.

### Kobe


In [None]:
#notes: discovered that there we a few very prolific sires so therefore had to log transform. one sire had 1000< foals - whilst alot have less <10

In [50]:
kns_df = cleaned_df.copy()
kns_df['top3'] = (kns_df['pos'] <= 3).astype(int)


train_df, test_df = train_test_split(
    kns_df, test_size=0.2, stratify=kns_df['top3'], random_state=42
)

min_runners = 3  

sire_stats = (
    train_df.groupby('sire_id')['top3']
    .agg(['mean', 'count'])
    .rename(columns={'mean': 'sire_top3_rate', 'count': 'sire_runners'})
)
sire_stats = sire_stats[sire_stats['sire_runners'] >= min_runners]

dam_stats = (
    train_df.groupby('dam_id')['top3']
    .agg(['mean', 'count'])
    .rename(columns={'mean': 'dam_top3_rate', 'count': 'dam_runners'})
)
dam_stats = dam_stats[dam_stats['dam_runners'] >= min_runners]

train_df = (
    train_df
    .merge(sire_stats[['sire_top3_rate','sire_runners']], on='sire_id', how='left')
    .merge(dam_stats[['dam_top3_rate','dam_runners']], on='dam_id', how='left')
)

test_df = (
    test_df
    .merge(sire_stats[['sire_top3_rate','sire_runners']], on='sire_id', how='left')
    .merge(dam_stats[['dam_top3_rate','dam_runners']], on='dam_id', how='left')
)

for col in ['sire_top3_rate','dam_top3_rate','sire_runners','dam_runners']:
    train_df[col] = train_df[col].fillna(0)
    test_df[col] = test_df[col].fillna(0)

In [106]:
for df in [train_df, test_df]:
    df['sire_runners_log'] = np.log1p(df['sire_runners'])

base_feats = ['dist_m','going','surface_Turf','ran','age','lbs','has_blinkers','has_tongue_tie']

lineage_feats = base_feats + ['sire_top3_rate','dam_top3_rate','sire_runners_log', 'dam_runners']

categorical_feats = ['surface_Turf', 'has_blinkers', 'has_tongue_tie']

base_feats = [f for f in base_feats if f in train_df.columns]
lineage_feats = [f for f in lineage_feats if f in train_df.columns]

X_train_b, y_train_b = train_df[base_feats], train_df['top3']
X_test_b,  y_test_b  = test_df[base_feats],  test_df['top3']

X_train_l, y_train_l = train_df[lineage_feats], train_df['top3']
X_test_l,  y_test_l  = test_df[lineage_feats],  test_df['top3']

In [107]:
pos_weight = len(y_train_b[y_train_b==0]) / len(y_train_b[y_train_b==1])

params = {
    'n_estimators': 400,
    'learning_rate': 0.05,
    'max_depth': 5,
    'objective': 'binary',
    'scale_pos_weight': [pos_weight],
    'random_state': 42
}

model_base = LGBMClassifier(**params)
model_line = LGBMClassifier(**params)

model_base.fit(X_train_b, y_train_b, categorical_feature=categorical_feats)
model_line.fit(X_train_l, y_train_l, categorical_feature=categorical_feats)

def evaluate(model, X_test, y_test, label):
    y_pred = model.predict(X_test)
    y_prob = model.predict_proba(X_test)[:, 1]
    auc = roc_auc_score(y_test, y_prob)
    f1  = f1_score(y_test, y_pred)
    acc = accuracy_score(y_test, y_pred)
    print(f"\n*** {label} ***")
    print(f"AUC: {auc:.3f}")
    print(f"F1 : {f1:.3f}")
    print(f"ACC: {acc:.3f}")
    return auc, f1, acc

base = evaluate(model_base, X_test_b, y_test_b, "Baseline (no lineage)")
lineage = evaluate(model_line, X_test_l, y_test_l, "Lineage features added")


*** Baseline (no lineage) ***
AUC: 0.613
F1 : 0.418
ACC: 0.606

*** Lineage features added ***
AUC: 0.658
F1 : 0.460
ACC: 0.647


In [104]:
from sklearn.model_selection import GridSearchCV
from lightgbm import LGBMClassifier

pos_weight = len(y_train_b[y_train_b==0]) / len(y_train_b[y_train_b==1])

param_grid = {
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 5, 7],
    'n_estimators': [200, 400, 800],
    ''
    'scale_pos_weight': [pos_weight]
}

lgbm = LGBMClassifier(
    objective='binary',
    random_state=42,
    n_jobs=-1
)

grid = GridSearchCV(
    estimator=lgbm,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=3,                # 3-fold CV
    verbose=3
)

grid.fit(X_train_b, y_train_b, categorical_feature=categorical_feats)

Fitting 3 folds for each of 972 candidates, totalling 2916 fits
[CV 1/3] END colsample_bytree=0.8, learning_rate=0.01, max_depth=3, n_estimators=200, num_leaves=15, reg_lambda=0, scale_pos_weight=2.841237903612149, subsample=0.8;, score=0.611 total time=   0.0s
[CV 2/3] END colsample_bytree=0.8, learning_rate=0.01, max_depth=3, n_estimators=200, num_leaves=15, reg_lambda=0, scale_pos_weight=2.841237903612149, subsample=0.8;, score=0.613 total time=   0.0s
[CV 3/3] END colsample_bytree=0.8, learning_rate=0.01, max_depth=3, n_estimators=200, num_leaves=15, reg_lambda=0, scale_pos_weight=2.841237903612149, subsample=0.8;, score=0.625 total time=   0.0s
[CV 1/3] END colsample_bytree=0.8, learning_rate=0.01, max_depth=3, n_estimators=200, num_leaves=15, reg_lambda=0, scale_pos_weight=2.841237903612149, subsample=1.0;, score=0.611 total time=   0.0s
[CV 2/3] END colsample_bytree=0.8, learning_rate=0.01, max_depth=3, n_estimators=200, num_leaves=15, reg_lambda=0, scale_pos_weight=2.8412379036

KeyboardInterrupt: 

### Lagi

In [None]:
#Under your name enter in the ML code that you have created - ADDITIONALLY PLEASE write a relatively extensive paragraph AT minimum 5-6 sentences long that outlines what your ML question is looking at and why etc.  I recommend everyone mnake their own initial copy of the df please.

### Jason

In [None]:
#Under your name enter in the ML code that you have created - ADDITIONALLY PLEASE write a relatively extensive paragraph AT minimum 5-6 sentences long that outlines what your ML question is looking at and why etc.  I recommend everyone mnake their own initial copy of the df please.