In [None]:
from datetime import datetime

import h2o
import mlflow
import mlflow.h2o
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
from h2o.automl import H2OAutoML
from mlflow.tracking import MlflowClient

In [None]:
# Low-level MLflow tracking client, constructed with no arguments so it relies
# on whatever tracking configuration MLflow resolves by default.
# NOTE(review): `client` is not referenced by any cell visible here — confirm
# it is still needed.
client = MlflowClient()

In [None]:
# Initialise the H2O session; must run before any H2OFrame / H2OAutoML call.
# NOTE(review): called with defaults — presumably attaches to, or starts, a
# local H2O cluster; confirm for the target environment.
h2o.init()

#### Goals Probability

In [None]:
experiment_name = "Probability Matrix"

# set_experiment() activates the named experiment and creates it first when it
# does not exist yet, so this cell is safe to re-run. A separate
# create_experiment() call raises as soon as the experiment already exists,
# which broke every run after the first one.
experiment = mlflow.set_experiment(experiment_name)

In [None]:
# Columns kept for the goals model: both team names and both goal counts.
col = ["home", "away", "hg", "ag"]

# Lookup from one side of a fixture to the other side.
opponent = dict(away="home", home="away")

# Load the cleansed matches, keep seasons before 2024 and only the columns above.
j1 = (
    pd.read_parquet("../data/cleansed/cleansed_data.parquet")
    .query("season < 2024")[col]
    .copy()
)

In [None]:
# Reshape each match into two rows, one per side: the side's team, its
# opponent, the goals that side scored, and a `stadium` flag taken from the
# enumerate position (0 for "away", 1 for "home").
dfs = [
    j1[[side, opponent[side], f"{side[0]}g"]]
    .rename(
        columns={
            side: "team",
            opponent[side]: "opponent",
            f"{side[0]}g": "goals",
        }
    )
    .assign(stadium=flag)
    for flag, side in enumerate(["away", "home"])
]

# Stack the two per-side views into a single long table with a fresh index.
xgs = pd.concat(dfs, ignore_index=True)

In [None]:
# Minute-resolution timestamp used as the run-name suffix.
dt = f"{datetime.now():%Y%m%d%H%M}"

with mlflow.start_run(run_name=f"gp_{dt}"):
    # Autologging is switched on inside the run, before the model is fitted.
    mlflow.statsmodels.autolog()

    # Poisson regression of goals on the stadium flag, team and opponent.
    poisson_spec = smf.glm(
        formula="goals ~ stadium + team + opponent",
        data=xgs,
        family=sm.families.Poisson(),
    )
    glm = poisson_spec.fit()

#### Handicap Results

In [None]:
experiment_name = "Handicap Prediction"

# set_experiment() activates the named experiment and creates it first when it
# does not exist yet, so this cell is safe to re-run. A separate
# create_experiment() call raises as soon as the experiment already exists,
# which broke every run after the first one.
experiment = mlflow.set_experiment(experiment_name)

In [None]:
# Feature table for the handicap model (replaces the earlier `j1` frame).
j1 = pd.read_parquet("../data/featured/j1_league.parquet")

# Season-based split: everything up to 2022 trains, 2023 is held out as dev.
# `season` is only the split key, so it is dropped from both parts.
train, dev = (
    j1.query(season_filter).drop(columns="season")
    for season_filter in ("season <= 2022", "season == 2023")
)

# Response column, and every remaining column as a predictor.
col_y = "res"
col_x = [name for name in train.columns if name != col_y]

In [None]:
# Upload both splits to H2O.
train_h2o = h2o.H2OFrame(train)
dev_h2o = h2o.H2OFrame(dev)

# Cast the response to a factor in BOTH frames. Previously only the training
# frame was converted, leaving the dev response's type to H2O's parser; casting
# both keeps the response type identical wherever dev_h2o is used against the
# trained classifier.
train_h2o[col_y] = train_h2o[col_y].asfactor()
dev_h2o[col_y] = dev_h2o[col_y].asfactor()

In [None]:
# Minute-resolution timestamp used as the run-name suffix.
dt = datetime.now().strftime("%Y%m%d%H%M")

with mlflow.start_run(run_name=f"hc_{dt}"):
    # AutoML search capped at 32 models, ranked by logloss, without deep learning.
    clf = H2OAutoML(
        max_models=32,
        seed=42,
        balance_classes=True,
        sort_metric="logloss",
        exclude_algos=["DeepLearning"],
    )

    clf.train(
        x=col_x, y=col_y,
        training_frame=train_h2o,
        validation_frame=dev_h2o,
    )

    leader = clf.leader

    # Metric accessors called without arguments report TRAINING metrics.
    # These keys are kept unchanged so existing runs stay comparable.
    metrics = {
        "logloss": leader.logloss(),
        "auc": leader.auc(),
        "rmse": leader.rmse(),
        "mse": leader.mse()
    }

    # Also score the leader on the held-out dev season, so the run records
    # out-of-sample performance and not only the training fit.
    dev_perf = leader.model_performance(dev_h2o)
    metrics.update(
        {
            "dev_logloss": dev_perf.logloss(),
            "dev_auc": dev_perf.auc(),
            "dev_rmse": dev_perf.rmse(),
            "dev_mse": dev_perf.mse(),
        }
    )
    mlflow.log_metrics(metrics)

    # Persist the best model as a run artifact.
    mlflow.h2o.log_model(leader, artifact_path="model")

#### Bet Decision

In [None]:
experiment_name = "Bet Decision"

# set_experiment() activates the named experiment and creates it first when it
# does not exist yet, so this cell is safe to re-run. A separate
# create_experiment() call raises as soon as the experiment already exists,
# which broke every run after the first one.
experiment = mlflow.set_experiment(experiment_name)

In [None]:
# Score the dev season with the handicap AutoML object.
dev_pred = clf.predict(dev_h2o)

# Build the bet-decision training table from those predictions:
#   probability - the larger of the "A" and "H" prediction columns
#   actual      - the true result from `dev`, re-indexed from 0 to line up
#   correct     - 1 when the predicted label equals the actual one, else 0
# NOTE: this rebinds `train`, replacing the handicap training frame.
train = dev_pred.as_data_frame().assign(
    probability=lambda df: df[["A", "H"]].max(axis=1),
    actual=dev["res"].reset_index(drop=True),
    correct=lambda df: (df["predict"] == df["actual"]).astype(int),
)

In [None]:
# Upload the bet-decision table to H2O (rebinds `train_h2o`).
train_h2o = h2o.H2OFrame(train)

# The 0/1 `correct` flag is the classification target, so store it as a factor.
correct_factor = train_h2o["correct"].asfactor()
train_h2o["correct"] = correct_factor

In [None]:
# Minute-resolution timestamp used as the run-name suffix.
dt = datetime.now().strftime("%Y%m%d%H%M")

with mlflow.start_run(run_name=f"bd_{dt}"):
    # Named `bet_automl` rather than `sm`: the notebook imports
    # `statsmodels.api as sm`, and rebinding `sm` here made the Poisson GLM
    # cell fail on `sm.families` whenever it was re-run after this cell.
    bet_automl = H2OAutoML(
        max_models=32,
        seed=42,
        balance_classes=True,
        sort_metric="logloss",
        exclude_algos=["DeepLearning"],
    )

    # Learn whether a handicap prediction turns out correct from the predicted
    # label and its probability.
    bet_automl.train(
        x=["predict", "probability"], y="correct",
        training_frame=train_h2o,
    )

    # Metric accessors called without arguments report training metrics.
    metrics = {
        "logloss": bet_automl.leader.logloss(),
        "auc": bet_automl.leader.auc(),
        "rmse": bet_automl.leader.rmse(),
        "mse": bet_automl.leader.mse()
    }
    mlflow.log_metrics(metrics)

    # Persist the best model as a run artifact.
    mlflow.h2o.log_model(bet_automl.leader, artifact_path="model")