## Colab settings

In [None]:
# Mount Google Drive at /content/drive; a later cell changes directory into a
# folder under this mount point, so this cell has to run first.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Show the output of `nvidia-smi` in the notebook (which GPU, if any, is attached).
!nvidia-smi

In [None]:
# NOTE(review): this upgrades torch to whatever release is newest at run time, so the
# notebook is not tied to a known version; consider pinning (`torch==<version>`).
!pip install --upgrade torch

## Load data

In [None]:
import os
import pandas as pd
import torch as tc
import numpy as np
# NOTE(review): this binds the top-level `matplotlib` package to the name `plt`; the
# conventional target of that alias is `matplotlib.pyplot`. No visible cell uses `plt`,
# so confirm the intent before relying on `plt.plot(...)` and similar calls.
import matplotlib as plt

# Widen the notebook display limits: up to 500 columns and 200 rows per frame.
pd.set_option('display.max_columns',500)
pd.set_option('display.max_rows',200)

In [None]:
# Work from the project folder on the mounted Drive; the relative paths used in the
# following cells ("./data/...", "data/set", "test.csv") resolve against this directory.
%cd /content/drive/MyDrive/pred/

In [None]:
# Load the raw CSV into `data`, failing early with an explicit message when the
# file is not where the notebook expects it.
target_csv = "./data/2017-2023.csv"
if not os.path.isfile(target_csv):
    # Report the resolved path: the relative path depends on the current working directory.
    raise FileNotFoundError(f"Input CSV not found: {os.path.abspath(target_csv)}")
data = pd.read_csv(target_csv)

In [None]:
# Make sure the `data/set` output directory exists.
split_loc = "data/set"
# exist_ok=True keeps the cell re-runnable and avoids the check-then-create race.
os.makedirs(split_loc, exist_ok=True)

In [None]:
# Inspect the dtype pandas inferred for every column before any cleaning is applied.
data.dtypes

### Clean up data

In [None]:
# Clean the results table held in `data`:
#   1. drop unused columns and order the rows by date,
#   2. remove rows whose numeric fields hold a placeholder and cast those fields,
#   3. fill missing class / pattern / headgear labels,
#   4. add integer codes for age band, distance and going.
data = data.drop(columns=["region","comment","num"], errors="ignore")
# NOTE(review): `date` is only parsed with pd.to_datetime further down, so this ordering
# is chronological only if the unparsed values already sort that way (e.g. ISO strings).
data = data.sort_values(by="date")

# Each filter is followed by .copy() so the column assignment that comes next writes to
# an independent frame instead of a slice of the previous one (SettingWithCopyWarning).
# NOTE(review): `secs` is compared with an ASCII hyphen while `or` and `rpr` are compared
# with an en dash — confirm both placeholders really occur in the CSV.
data = data[data["secs"] != "-"].copy()
data["secs"] = data["secs"].astype(float)
data = data[data["or"] != "–"].copy()
data["or"] = data["or"].astype(int)
data = data[data["rpr"] != "–"].copy()
data["rpr"] = data["rpr"].astype(int)
# Keep only rows whose position is purely digits.
data = data[data["pos"].str.isdigit()].copy()
data["pos"] = data["pos"].astype(int)
data["ovr_btn"] = data["ovr_btn"].astype(float)

# Rows that carry a pattern get class "g".
data["class"] = np.where(data["pattern"].notna(), "g", data["class"])
# Rows with neither class nor pattern whose race name matches one of these titles get
# pattern "l". na=False treats a missing race name as "no match".
classic_names = '|'.join(["Classic Mile", "Classic Cup", "Derby"])
is_classic = data["race_name"].str.contains(classic_names, na=False)
data["pattern"] = np.where(data["class"].isna() & is_classic, "l", data["pattern"])
# Assign the filled column back: an in-place fillna on `data["class"]` is a chained
# write that does not reach `data` once pandas Copy-on-Write is active.
data["class"] = data["class"].fillna(value="NC")
data["date"] = pd.to_datetime(data["date"])
data["pattern"] = data["pattern"].fillna(value="NG")

# Integer codes. `replace` leaves any value missing from a mapping untouched.
data["age_band_code"] = data["age_band"].replace({"2yo+":0,"3yo":1,"3yo+":2,"4yo":3,"4yo+":4})
# Distances are coded by their rank among the distinct distances present.
data["dist_code"] = data["dist_m"].replace({distance: i for i, distance in enumerate(sorted(data["dist_m"].unique()))})
# Two going vocabularies share one 0-7 scale.
mapGO = {"Firm": 0, "Fast": 0,
"Good To Firm": 1, "Standard To Fast": 1,
"Good": 2, "Standard": 2,
"Good To Yielding": 3, "Standard To Slow": 3,
"Yielding": 4, "Slow": 4,
"Yielding To Soft": 5, "Soft": 6, "Heavy": 7}
data["going_code"] = data["going"].replace(mapGO)
data = data.fillna(value={"hg":"None"})
data["draw"] = data["draw"].astype(int)

In [None]:
# List the distinct class labels left after cleaning (missing values were filled with "NC").
data["class"].unique()

### Create new features

In [None]:
def _prior_rate(frame, key, outcome, window=None):
    """Success rate of `outcome` over the earlier rows that share the same `key` value.

    For every row only the rows that precede it (in `frame`'s current row order)
    inside its `key` group are counted, so the row's own result is excluded.

    Parameters
    ----------
    frame : pd.DataFrame
        Table holding `key` and a 0/1 `outcome` column.
    key : str
        Grouping column, e.g. "horse_id".
    outcome : str
        Indicator column to average, e.g. "win".
    window : int, optional
        When given, only the `window` most recent earlier rows are used;
        otherwise the whole earlier history is used.

    Returns
    -------
    pd.Series
        Rate aligned with `frame.index`; 0 where there is no earlier row.
    """
    seen_before = frame.groupby(key)[key].transform("cumcount")
    if window is None:
        successes = frame.groupby(key)[outcome].transform(lambda s: s.shift().cumsum().fillna(0))
        trials = seen_before
    else:
        successes = frame.groupby(key, group_keys=False)[outcome].apply(
            lambda s: s.shift().fillna(0).rolling(window, min_periods=1).sum()
        )
        # Fewer than `window` earlier rows: divide by how many there actually are.
        trials = seen_before.clip(upper=window)
    # 0 / 0 on a group's first row yields NaN; report that as a rate of 0.
    return (successes / trials).fillna(0)


# Build the feature table from a copy so the cleaned `data` frame stays untouched.
train = data.copy()
train["month"] = train["date"].dt.month
train["quarter"] = (train["date"].dt.month+2)//3
# if inner draw
train["in_draw"] = np.where(train["draw"] <= 4, 1, 0)
# if outer draw
train["out_draw"] = np.where(train["draw"] >= 10, 1, 0)
# if top3
train["in_place"] = np.where(train["pos"] <= 3,1,0)
# if win
train["win"] = np.where(train["pos"] == 1, 1, 0)
# log of dec
train["decLog"] = np.log1p(train["dec"])
# NOTE(review): the column is named `age_u3` but the flag is set for age <= 4 —
# confirm which threshold is intended.
train["age_u3"] = np.where(train["age"] <= 4, 1, 0)
# rest time between race in days; a horse's first row has no previous race and is
# filled with the most common rest time.
train["rest_time"] = (train["date"]-(train.groupby("horse_id")["date"].shift())).dt.days
train["rest_time"] = train["rest_time"].fillna(train["rest_time"].mode()[0])
train["rest_less14"] = np.where(train["rest_time"] < 14, 1, 0)
train["rest_ovr32"] = (train["rest_time"]>32)*1
# top odds: 1 for the runner(s) with the lowest `dec` in the race
train["topDec"] = np.where(train["dec"] == (train.groupby("race_id", group_keys = False)["dec"].transform("min")), 1, 0)
# number of recent matches to count
rct = 3
# historic / recent win rate and in-place rate for horse (h_), jockey (j_) and trainer (t_)
# NOTE(review): "earlier" follows the current row order, which comes from a sort on date
# alone; several rows of one jockey/trainer on the same date are ordered arbitrarily.
for prefix, key in (("h", "horse_id"), ("j", "jockey_id"), ("t", "trainer_id")):
    train[f"{prefix}_hwin"] = _prior_rate(train, key, "win")
    train[f"{prefix}_rwin"] = _prior_rate(train, key, "win", window=rct)
    train[f"{prefix}_hplace"] = _prior_rate(train, key, "in_place")
    train[f"{prefix}_rplace"] = _prior_rate(train, key, "in_place", window=rct)



In [None]:
# Put `date` first, then order the rows by date and position and renumber the index.
leading = ['date']
trailing = [name for name in train.columns if name != 'date']
train = (
    train[leading + trailing]
    .sort_values(by=["date","pos"])
    .reset_index(drop=True)
)

In [None]:
# Columns removed before modelling; names that are absent are skipped silently.
unused_columns = [
    'date', 'off', 'going', 'age_band', 'dist_m', 'surface', 'horse_id', 'horse', 'dec',
    'jockey', 'trainer', 'dam', 'sire', 'damsire',
    # deliberately kept: 'jockey_id', 'trainer_id', 'sire_id', 'dam_id', 'damsire_id'
]
train = train.drop(columns=unused_columns, errors='ignore')

In [None]:
# optional, can test later
# optional, can test later: the race identifier and race name are left out for now
train = train.drop(columns=["race_id", "race_name"], errors='ignore')

In [None]:
# One-hot encode the categorical columns; drop_first omits the first level of each.
categorical_cols = ["course","class","pattern","pos","draw","sex","hg"]
train = pd.get_dummies(train, columns=categorical_cols, drop_first=True)

In [None]:
# Count the missing values remaining in each column of the engineered table.
train.isna().sum()

In [None]:
# create embeds
# ID columns treated as nominal categories: each one is fed through its own
# embedding table in the model rather than being one-hot encoded.
nominal = ['jockey_id', 'trainer_id', 'sire_id', 'dam_id', 'damsire_id']

In [None]:
# Split the engineered table into numeric inputs, ID inputs and the `win` target,
# hold out the last rows for testing, and integer-encode the ID columns.

# Number of trailing rows (the latest ones, given the date ordering) held out for testing.
n_test = 9

# `win` is defined as pos == 1, so the position itself and everything derived from it
# (`in_place`, the one-hot `pos_*` columns) reveal the target and must not be inputs.
# NOTE(review): other columns still in `train` (e.g. secs, ovr_btn) may also be
# post-race measurements — confirm and add them here if so.
leak_cols = [c for c in train.columns if c in ("pos", "in_place") or str(c).startswith("pos_")]
inp = train.drop(columns=["win"]+nominal).drop(columns=leak_cols)
inp_nominal = train[nominal]
inp_train, inp_test = inp.iloc[:-n_test], inp.iloc[-n_test:]
# Copies, because the ID columns are overwritten with integer codes below.
inp_nominal_train = inp_nominal.iloc[:-n_test].copy()
inp_nominal_test = inp_nominal.iloc[-n_test:].copy()
inp_var = len(inp.columns)
print(f"input dimension(exclude nominal):{inp_var}")
out = train["win"]
out_train, out_test = out.iloc[:-n_test], out.iloc[-n_test:]
cnt = 0
for col in nominal:
    # Codes follow the sorted order of the IDs present in the training rows.
    codes = {cid: i for i, cid in enumerate(sorted(inp_nominal_train[col].unique()))}
    inp_nominal_train[col] = inp_nominal_train[col].map(codes).astype("int64")
    # Held-out rows reuse the training codes. An ID never seen in training gets the
    # index just past the last training code; the model allocates one embedding row
    # more than the number of distinct IDs, so that index is always in range.
    inp_nominal_test[col] = inp_nominal_test[col].map(codes).fillna(len(codes)).astype("int64")
    print(f"{col}: {len(codes)}")
    cnt += len(codes)
print(f"total category: {cnt}")

#### For outputting feature-engineered data (optional)

In [None]:
# Write the engineered table to "test.csv" in the current directory for inspection;
# with pandas' default settings the frame's index is written as the first column.
train.to_csv("test.csv")

## Prep for training

In [None]:
import torch.nn as nn
import torch.optim as opt
import math
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = tc.device("cuda") if tc.cuda.is_available() else tc.device("cpu")
print(f"using {device}")

Train on target = win, winrate = true

In [None]:
# Move the training data onto `device` as tensors.
# to_numpy(dtype=float32) converts every column explicitly: since pandas 2.0
# get_dummies emits bool columns, and a frame mixing bool and numeric columns exposes
# an object-dtype `.values` array that tc.tensor cannot convert.
X = tc.tensor(inp_train.to_numpy(dtype=np.float32), dtype=tc.float32, device=device)
# One integer tensor per ID column, keyed by column name.
X_nominal = {
    col: tc.tensor(inp_nominal_train[col].to_numpy(), dtype=tc.long, device=device)
    for col in nominal
}
y = tc.tensor(out_train.to_numpy(), dtype=tc.float32, device=device)

In [None]:
# Sanity check: print the device index that each ID tensor reports.
for nominal_tensor in X_nominal.values():
    print(nominal_tensor.get_device())

#### Model structure creation

In [None]:
class PredictWin(nn.Module):
    """Feed-forward win-probability model over numeric features plus ID embeddings.

    Every nominal ID column gets its own embedding table. The embedded vectors are
    concatenated with the numeric features and passed through a three-layer MLP
    that ends in a sigmoid, so the output lies between 0 and 1.

    Parameters
    ----------
    cardinalities : dict[str, int], optional
        Number of distinct values per nominal column, in the order in which the
        embeddings are concatenated. Defaults to the notebook globals:
        ``len(train[col].unique())`` for every ``col`` in ``nominal``.
    n_numeric : int, optional
        Width of the numeric input. Defaults to the notebook global ``inp_var``.

    Attributes
    ----------
    embedding : nn.ModuleDict
        One ``nn.Embedding`` per nominal column, keyed by column name.
    embLen : int
        Total width of all embeddings.
    iv : int
        Width of the concatenated MLP input (numeric width + ``embLen``).
    """

    def __init__(self, cardinalities=None, n_numeric=None):
        super().__init__()
        # Fall back to the notebook globals so `PredictWin()` keeps working unchanged.
        if cardinalities is None:
            cardinalities = {col: len(train[col].unique()) for col in nominal}
        if n_numeric is None:
            n_numeric = inp_var
        self.embedding = nn.ModuleDict()
        self.embLen = 0
        for col, cnt in cardinalities.items():
            # Half the cardinality for vocabularies of up to 1000 values, a
            # logarithmic rule for larger ones.
            emb_dim = min(500,cnt//2) if cnt <= 1000 else int(75.6496 * math.log(cnt + 176.623) - 41.4457)
            self.embLen += emb_dim
            # One row more than the number of distinct values.
            self.embedding[col] = nn.Embedding(num_embeddings=cnt+1, embedding_dim=emb_dim)
        self.iv = n_numeric + self.embLen
        hidden = self.iv+self.iv//3+self.iv%3
        self.linear = nn.Sequential(
            nn.Linear(self.iv, hidden),
            nn.ReLU(),
            nn.Linear(hidden, self.iv),
            nn.ReLU(),
            nn.Linear(self.iv, 1),
            nn.Sigmoid(),
        )
        # NOTE(review): this dropout layer is created but never applied in forward().
        self.drop = nn.Dropout(0.4)

    def forward(self, x, x_nominal):
        """Return the model output with shape (batch, 1).

        Parameters
        ----------
        x : Tensor
            Numeric features, shape (batch, n_numeric).
        x_nominal : dict[str, Tensor]
            Integer ID tensors of shape (batch,), one per embedding key.
        """
        # Iterate the model's own tables (insertion order) so the forward pass cannot
        # drift out of sync with a global column list changed after construction.
        embedded = [table(x_nominal[col]) for col, table in self.embedding.items()]
        x = tc.cat([x, tc.cat(embedded, dim = 1)], dim = 1)
        return self.linear(x)

# Build the network with its default arguments and move it to the selected device;
# the trailing expression displays the layer summary.
model = PredictWin().to(device)
model

In [None]:
# Binary cross-entropy on the model's sigmoid output, optimised with Adam.
optim = opt.Adam(params=model.parameters(), lr=1e-2)
loss_fn = nn.BCELoss()
loss_fn = loss_fn.to(device)

In [None]:
# Training schedule: three epochs per unit of model input width, mini-batches of 128 rows.
batch_size = 128
n_epoch = 3 * model.iv
n_epoch

In [None]:
from tqdm.notebook import tqdm

# Mini-batch training: every epoch walks over all rows of X once, in consecutive
# batches of `batch_size` rows, taking one optimiser step per batch.
# NOTE(review): batches are taken in stored row order (no shuffling between epochs).
model.train()
for epoch in tqdm(range(n_epoch),position=0, leave=True):
    for start in tqdm(range(0,len(X),batch_size),position=0, leave=True):
        # Slice by the batch start offset so each batch covers new rows.
        stop = start + batch_size
        Xbatch = X[start:stop]
        XnBatch = {col: X_nominal[col][start:stop] for col in nominal}
        Ybatch = y[start:stop]
        # Squeeze only the output axis: a final batch of a single row must keep
        # shape (1,) to match its target.
        y_pred = model(Xbatch, XnBatch).squeeze(1)
        loss = loss_fn(y_pred,Ybatch)
        optim.zero_grad()
        loss.backward()
        optim.step()
    # Loss of the last batch of the epoch, as a plain Python float.
    print(f"Epoch {epoch} loss: {loss.item()}")