## Model Evaluation
---

In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier

# Notebook-wide settings.
# Show every column when a wide DataFrame is rendered (None removes the cap).
pd.set_option("display.max_columns", None)
# Seed NumPy's global random generator so code drawing from it is repeatable.
np.random.seed(seed=1834633)

### Data Import
---

In [2]:
# Load the featured J1 league table and keep only rows whose `date` falls in
# 2022 or later; .copy() gives an independent frame to work on downstream.
j1 = (
    pd.read_parquet("../data/j1_league_featured.parquet")
    .query("date.dt.year >= 2022")
    .copy()
)

### Production Simulation
---

In [3]:
# Target column, and feature columns: every column of `j1` except the
# identifier / outcome fields listed below.
dc_y = ["results"]
non_feature_cols = {"date", "home", "away", "results", "net_goals"}
dc_X = [col for col in j1.columns if col not in non_feature_cols]

# Chronological hold-out: rows dated before the cutoff are used for fitting,
# rows on or after it are held out for evaluation.
train_rows = j1.query("date < '20220901'")
test_rows = j1.query("date >= '20220901'")

# Plain NumPy arrays; the single-column target is flattened to 1-D.
X_test, y_test = test_rows[dc_X].values, test_rows[dc_y].values.ravel()
X_train, y_train = train_rows[dc_X].values, train_rows[dc_y].values.ravel()

In [4]:
# Learn the min-max scaling from the training rows only, then apply that same
# fitted scaler to both splits so no test-set statistics are used for fitting.
# NOTE: X_train / X_test are overwritten here, so this cell must run exactly
# once after the split cell; re-running it alone would rescale scaled data.
scaler = MinMaxScaler().fit(X_train)

X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

### Model Restore
---

In [5]:
# Rebuild the random-forest classifier with fixed hyperparameters and fit it
# on the scaled training split.
np.random.seed(42)

clf = RandomForestClassifier(
    n_estimators=237,
    criterion="gini",
    max_depth=10,
    max_features="log2",
    min_samples_leaf=33,
    bootstrap=False,
    # Pin the estimator's own RNG instead of relying solely on the global
    # NumPy seed above: refitting `clf` later (or after other code has drawn
    # from np.random) then still produces the same forest.
    random_state=42,
)
clf.fit(X_train, y_train)

### Expected Return
---

In [6]:
# Hold-out games with the model's predicted label attached as `y_hat`.
# The date filter is the same one used to build X_test, so rows line up
# one-to-one with the predictions.
df = j1.query("date >= '20220901'").assign(y_hat=clf.predict(X_test))

In [7]:
# Flat-stake betting simulation over the hold-out games in `df`.
STAKE = 200  # amount wagered per game (the summary table labels it HKD)
# Winnings per unit stake for a correct pick. Presumably a full win vs. a
# reduced (half) win under the handicap rules -- TODO confirm with the odds source.
PAYOUT_NONZERO_MARGIN = 0.8  # correct pick and handicap + net_goals != 0
PAYOUT_ZERO_MARGIN = 0.4  # correct pick and handicap + net_goals == 0

n = df.shape[0]
n_wrong = df.query("results != y_hat").shape[0]
n_right_nonzero = df.query("results == y_hat and handicap + net_goals != 0").shape[0]
n_right_zero = df.query("results == y_hat and handicap + net_goals == 0").shape[0]

# `cost` is the stake lost on wrong picks; `profit` is the winnings on right ones.
cost = STAKE * n_wrong
profit = STAKE * (PAYOUT_NONZERO_MARGIN * n_right_nonzero + PAYOUT_ZERO_MARGIN * n_right_zero)

# NOTE(review): the ratio below is taken against stakes lost on wrong picks,
# not against the total amount staked (STAKE * n) -- confirm that this is the
# intended definition of "return on investment".
# With no wrong pick `cost` is 0 and the ratio is undefined; report NaN instead
# of raising ZeroDivisionError.
roi = (profit / cost - 1) * 100 if cost else float("nan")

In [8]:
# One-row summary of the betting simulation. The DataFrame is left as the
# cell's final expression so the notebook renders it as a table.
summary = {
    "Number of bets": [f"{n} games"],
    "Actual return": [f"HKD {profit - cost:,.0f}"],
    "Return on investment": [f"{roi:.2f}%"],
}
pd.DataFrame(summary, index=["Product Environment"])

Unnamed: 0,Number of bets,Actual return,Return on investment
Product Environment,73 games,"HKD 1,560",28.89%


### Outsider Game
---

In [9]:
# Columns shown when inspecting individual games below.
dc = [
    "date", "home", "away",
    "handicap", "results", "net_goals",
    "y_hat",
]

In [10]:
# Look up a single fixture by date and teams, showing only the `dc` columns.
df.query(
    "date == '20221012' "
    "and home == 'Yokohama F-Marinos' "
    "and away == 'Jubilo Iwata'"
)[dc]

Unnamed: 0,date,home,away,handicap,results,net_goals,y_hat
2432,2022-10-12,Yokohama F-Marinos,Jubilo Iwata,-2.0,0,-1,1


In [11]:
# Look up a single fixture by date and teams, showing only the `dc` columns.
df.query(
    "date == '20221008' "
    "and home == 'Yokohama F-Marinos' "
    "and away == 'Gamba Osaka'"
)[dc]

Unnamed: 0,date,home,away,handicap,results,net_goals,y_hat
2423,2022-10-08,Yokohama F-Marinos,Gamba Osaka,-1.5,0,-2,1
