In [1]:
import os
import pandas as pd
import torch as tc
import numpy as np
import matplotlib as plt
import dateutil.parser as dtpr
import torch

# Widen pandas' display limits so wide/long frames are not truncated when shown.
pd.options.display.max_columns = 500
pd.options.display.max_rows = 100

In [2]:
# Raw results file, relative to the notebook's working directory.
# os.path.join keeps the path valid on POSIX as well as on Windows.
target_csv = os.path.join("data", "2023.csv")
if not os.path.isfile(target_csv):
    # Fail early with the offending path instead of an empty exception.
    raise FileNotFoundError(f"Input CSV not found: {target_csv!r}")
data = pd.read_csv(target_csv)

In [None]:
# Directory created up front for later output (nothing in the cells shown here
# writes to it yet). exist_ok=True makes the call safe to re-run and avoids the
# check-then-create race of a separate os.path.exists test.
split_loc = os.path.join("data", "set")
os.makedirs(split_loc, exist_ok=True)

In [None]:
# Display the dtype of every column currently in `data`.
data.dtypes

### Clean up data

In [3]:
# --- Drop unused columns and order rows by date ------------------------------
# `date` is still the raw CSV value here (it is converted to datetime further
# down) — presumably ISO-formatted so this ordering is chronological; TODO confirm.
data = data.drop(columns=["region", "comment", "num"], errors="ignore")
data = data.sort_values(by="date")

# --- Remove rows with placeholder / non-numeric values -----------------------
# NOTE(review): `secs` is filtered on an ASCII hyphen while `or` and `rpr` are
# filtered on an en dash — confirm both placeholders against the raw CSV.
data = data[data["secs"] != "-"]
data = data[data["or"] != "–"]
data = data[data["rpr"] != "–"]
data = data[data["pos"].str.isdigit()]

# Convert in one step; astype returns a new frame, so the column assignments
# below do not write into a filtered slice (no SettingWithCopyWarning).
data = data.astype({"secs": float, "or": int, "rpr": int, "pos": int, "ovr_btn": float})

# --- Race class ---------------------------------------------------------------
# Any race with a `pattern` value is labelled "g"; races with no class whose name
# mentions one of the listed titles are labelled "l"; what is left becomes "NaN".
data["class"] = np.where(data["pattern"].notna(), "g", data["class"])
titled_race = data["race_name"].str.contains(
    "|".join(["Classic Mile", "Classic Cup", "Derby"]),
    na=False,  # a missing race name is treated as "no match" explicitly
)
data["class"] = np.where(data["class"].isna() & titled_race, "l", data["class"])
# Assign the result back: an in-place fillna on data["class"] is a chained call
# that may not update `data` when pandas Copy-on-Write is active.
data["class"] = data["class"].fillna("NaN")

# --- Fixed integer encodings --------------------------------------------------
# NOTE(review): Series.replace leaves values that are absent from the mapping
# unchanged, so an unexpected category would remain a string in the *_code column.
data["class_code"] = data["class"].replace(
    {"Class 5": 0, "Class 4": 0, "Class 3": 1, "Class 2": 2, "Class 1": 3, "l": 4, "g": 5, "NaN": 0}
)
data["course_code"] = data["course"].replace({"Happy Valley (HK)": 0, "Sha Tin (HK)": 1})

# Build the combined timestamp before `date` itself is converted from text.
data["date_time"] = pd.to_datetime(data["date"] + " " + data["off"])
data["date"] = pd.to_datetime(data["date"])

data["pattern_code"] = data["pattern"].fillna("N").replace(
    {"N": 0, "Group 3": 1, "Group 2": 2, "Group 1": 3}
)
data["age_band_code"] = data["age_band"].replace(
    {"2yo+": 0, "3yo": 1, "3yo+": 2, "4yo": 3, "4yo+": 4}
)

# Distances are ranked in ascending order so the code preserves their ordering.
data["dist_code"] = data["dist_m"].replace(
    {distance: rank for rank, distance in enumerate(sorted(data["dist_m"].unique()))}
)

# --- Order-of-appearance encodings -------------------------------------------
# Missing headgear becomes the literal category "None" before encoding.
data["hg"] = data["hg"].fillna("None")
for source_col in ("going", "surface", "sex", "hg"):
    # Codes follow the order in which values first appear in the column.
    codes = {value: code for code, value in enumerate(data[source_col].unique())}
    data[f"{source_col}_code"] = data[source_col].replace(codes)

### Create new features

In [None]:
RECENT_WINDOW = 4  # number of most recent rows used for the "recent" rates


def add_rate_features(frame, key, prefix, window=RECENT_WINDOW):
    """Add overall and recent win / place rates per `key` to `frame` in place.

    For each outcome column ("win", "in_place") two columns are written:
      * ``{prefix}_c{win|place}`` — mean of the outcome over every row of the group;
      * ``{prefix}_r{win|place}`` — mean of the outcome over the last `window` rows
        of the group up to and including the current row (fewer rows early on).

    The recent rate follows the current row order of `frame`.

    NOTE(review): both rates include the current row's own result, and the
    overall rate also includes later rows, so these columns carry information
    about the outcome of the row they describe.

    Returns the same `frame` object for convenience.
    """
    for outcome, suffix in (("win", "win"), ("in_place", "place")):
        grouped = frame.groupby(key)[outcome]
        # Mean of a 0/1 flag == sum / count, i.e. the rate over the whole group.
        overall = grouped.transform("mean")
        # Rolling mean with min_periods=1 == rolling sum / min(rows so far, window).
        recent = grouped.transform(lambda s: s.rolling(window, min_periods=1).mean())
        frame[f"{prefix}_c{suffix}"] = overall
        frame[f"{prefix}_r{suffix}"] = recent
    return frame


# --- Calendar features --------------------------------------------------------
data["month"] = data["date"].dt.month
data["quarter"] = data["date"].dt.quarter

# --- Binary flags (1/0) -------------------------------------------------------
# draw of 4 or lower
data["in_stall"] = (data["draw"] <= 4).astype(int)
# finished in the first three
data["in_place"] = (data["pos"] <= 3).astype(int)
# finished first
data["win"] = (data["pos"] == 1).astype(int)
# log(1 + dec)
data["decLog"] = np.log1p(data["dec"])
# NOTE(review): the column name (and the original comment, "age under 3") suggest
# a threshold of 3, but the test is age <= 4 — confirm which one is intended.
data["age_u3"] = (data["age"] <= 4).astype(int)

# --- Rest time in days since the same horse's previous row --------------------
# Relies on `data` already being ordered by date within each horse.
previous_run = data.groupby("horse_id")["date"].shift()
data["rest_time"] = (data["date"] - previous_run).dt.days
# A horse's first row has no previous run; fill it with the most common rest time.
data["rest_time"] = data["rest_time"].fillna(data["rest_time"].mode()[0])
data["rest_less14"] = (data["rest_time"] < 14).astype(int)
data["rest_ovr32"] = (data["rest_time"] > 32).astype(int)

# --- Overall / recent win and place rates for horse, jockey and trainer -------
for id_col, prefix in (("horse_id", "h"), ("jockey_id", "j"), ("trainer_id", "t")):
    add_rate_features(data, id_col, prefix)

In [None]:
# Put date_time first and order rows by race time, then finishing position.
# This operates on `data`: `train` is only created in the next cell (and is built
# without the date_time column), so reading `train` here cannot work in either
# execution order.
leading_cols = ["date_time"]
data = data[leading_cols + [col for col in data.columns if col != "date_time"]]
data = data.sort_values(by=["date_time", "pos"])

In [None]:
# Columns left out of the modelling frame: the timestamp/date fields, the raw
# categorical columns that already have a *_code counterpart, and the name/text
# columns. errors="ignore" skips any name that is not present in `data`.
excluded_columns = [
    "date_time", "dist_m", "sex", "course", "race_name", "date", "off",
    "class", "pattern", "age_band", "hg", "going", "surface",
    "horse", "jockey", "trainer", "dam", "sire", "damsire",
]
train = data.drop(columns=excluded_columns, errors='ignore')

In [None]:
# One-hot encode the categorical code columns.
# dtype=int is requested explicitly: pandas >= 2.0 emits bool dummy columns by
# default, and a frame mixing bool with numeric columns yields an object array
# from `.values`, which torch cannot turn into a tensor.
categorical_columns = [
    "ran", "class_code", "pattern_code", "age_band_code", "dist_code",
    "going_code", "surface_code", "sex_code", "hg_code", "quarter",
]
train = pd.get_dummies(train, columns=categorical_columns, dtype=int)

In [None]:
# Display the dtype of every column in `train` after one-hot encoding.
train.dtypes

In [None]:
from torch import tensor

In [None]:
# Dependent variable (the target to predict): the 0/1 `win` flag.
t_dep = torch.tensor(train["win"].to_numpy())

# Independent variables (the predictors): every remaining column as float32.
# Converting through to_numpy(dtype=float32) gives torch a plain numeric array
# even when the frame mixes bool / int / float columns.
# NOTE(review): only `win` is excluded; in the cells visible here `pos` and
# `in_place` are still part of `train`, and `win` is computed directly from `pos`.
feature_frame = train.drop(columns=["win"])
t_indep = torch.tensor(feature_frame.to_numpy(dtype=np.float32))

In [None]:
# Display the shape of the predictor tensor as (rows, feature columns).
t_indep.shape

In [None]:
# Fix the RNG seed so the random starting coefficients are reproducible.
torch.manual_seed(369)
n_features = t_indep.shape[1]
# One coefficient per feature column, drawn uniformly from [-0.5, 0.5).
coeffs = torch.rand(n_features) - 0.5
coeffs.shape

In [None]:
# Scale every feature column by its maximum value.
vals, indicies = t_indep.max(dim=0)
# A column whose maximum is 0 would give 0/0 = NaN; divide those columns by 1.
safe_vals = torch.where(vals == 0, torch.ones_like(vals), vals)
t_indep = t_indep / safe_vals

In [None]:
# Linear score per row: weight each feature by its coefficient, then add them up.
weighted = t_indep * coeffs
predict = weighted.sum(dim=1)
predict[:10]  # first ten scores produced by the random coefficients