## Machine Learning
---

In [1]:
import os
import warnings

os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"
warnings.filterwarnings("ignore")

In [2]:
import numpy as np
import pandas as pd
from scipy import stats
from math import comb

np.random.seed(1834633)
pd.set_option("display.max_columns", None)

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import MinMaxScaler

In [4]:
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score

In [5]:
from sklearn.neighbors import KNeighborsClassifier

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

from tensorflow.keras import Sequential
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.callbacks import EarlyStopping
from scikeras.wrappers import KerasClassifier

In [6]:
from sklearn.model_selection import RandomizedSearchCV

from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer

import optuna
from optuna.samplers import TPESampler

from hyperopt import tpe, hp, fmin, STATUS_OK, Trials, space_eval

optuna.logging.set_verbosity(optuna.logging.WARNING)

### Data Import
---

In [7]:
j1 = pd.read_parquet("../data/j1_league_featured.parquet") \
    .query("date.dt.year >= 2022").copy()

### Data Splitting
---

In [8]:
dc_y = ["results"]
dc_X = [col for col in j1.columns if col not in ["date", "home", "away", "results", "net_goals"]]

y_test = j1.query("date >= '20220901'")[dc_y].values.ravel()
X_test = j1.query("date >= '20220901'")[dc_X].values

y_train_dev = j1.query("date < '20220901'")[dc_y].values.ravel()
X_train_dev = j1.query("date < '20220901'")[dc_X].values

X_train, X_dev, y_train, y_dev = train_test_split(
    X_train_dev, y_train_dev, test_size=0.2,
    stratify=y_train_dev , random_state=24997828
)

### Feature Scaling
---

In [9]:
scaler = MinMaxScaler()

X_train_dev = scaler.fit_transform(X_train_dev)
X_test = scaler.transform(X_test)

X_train = scaler.fit_transform(X_train)
X_dev = scaler.transform(X_dev)

### Model Training
---

In [10]:
data = {
    "Dev": {
        "Train": (X_train, y_train),
        "Dev": (X_dev, y_dev),
    },
    "Prod": {
        "Train-Dev": (X_train_dev, y_train_dev),
        "Test": (X_test, y_test)
    },
}

metrics_log = {}

#### Evaluation Functions

In [11]:
def evaluate_score(model, X, y, k_fold=5):
    scores = {}
    y_hat = model.predict(X)
    
    scores["accuracy"] = accuracy_score(y, y_hat)
    scores["auc"] = roc_auc_score(y, y_hat)
    
    return scores

def state_evaluation(portion, scores):
    label = f"{portion} set"
    accuracy = f"Accuracy: {scores['accuracy']:.4f}"
    auc = f"AUC: {scores['auc']:.4f}"
    
    return f"{label} \t| {accuracy} \t| {auc}"

def evaluate_model(model, opt_params=None):
    msg = []
    
    for tier in ["Dev", "Prod"]:
        portions = list(data[tier].keys())
        clf.fit(*data[tier][portions[0]])
        for portion in portions:
            scores = evaluate_score(clf, *data[tier][portion])
            msg.append(state_evaluation(portion, scores))
            if portion == "Test":
                metrics_log[model] = scores
                metrics_log[model]["params"] = opt_params
    
    print(f"Model: {model}" + [f" {opt_params}", ""][opt_params is None])
    print("=" * 125)
    print(f"{msg[0]} \t\t┆\t {msg[2]}")
    print(f"{msg[1]} \t\t┆\t {msg[3]}")
    print("=" * 125)
    
    return None

#### Baseline Models

In [12]:
def DNN(n_layers=1, neurons_min=8, activation="relu", patience=3, batch_size=32, epochs=20):
    
    model = Sequential([
        Input(shape=X_train.shape[1]),
        Dense(units=comb(neurons_min, 1), activation=activation),
    ])
    for layer in range(1, n_layers+1):
        model.add(Dense(units=comb(neurons_min, layer+1), activation=activation))
    model.add(Dense(units=1, activation="sigmoid"))
    
    model.compile(optimizer=Adam(learning_rate=0.001), loss=BinaryCrossentropy())
    
    early_stopping = EarlyStopping(monitor="loss", patience=patience)
    history = model.fit(
        X_train, y_train,
        batch_size=batch_size,
        epochs=epochs,
        callbacks = [early_stopping],
        verbose=0
    )
    
    return model

##### 01 | k-Nearest Neighbors

In [13]:
%%time
np.random.seed(42)

clf = KNeighborsClassifier()

evaluate_model("k-Nearest Neighbors")

Model: k-Nearest Neighbors
Train set 	| Accuracy: 0.6927 	| AUC: 0.6866 		┆	 Train-Dev set 	| Accuracy: 0.6830 	| AUC: 0.6790
Dev set 	| Accuracy: 0.5333 	| AUC: 0.5238 		┆	 Test set 	| Accuracy: 0.5479 	| AUC: 0.5522
CPU times: user 26.4 ms, sys: 3.06 ms, total: 29.5 ms
Wall time: 32.6 ms


##### 02 | Logistic Regression

In [14]:
%%time
np.random.seed(42)

clf = LogisticRegression()

evaluate_model("Logistic Regression")

Model: Logistic Regression
Train set 	| Accuracy: 0.5587 	| AUC: 0.5443 		┆	 Train-Dev set 	| Accuracy: 0.5982 	| AUC: 0.5836
Dev set 	| Accuracy: 0.5111 	| AUC: 0.4940 		┆	 Test set 	| Accuracy: 0.4658 	| AUC: 0.5146
CPU times: user 12.2 ms, sys: 2.27 ms, total: 14.4 ms
Wall time: 13.3 ms


##### 03 | Support Vector Machine

In [15]:
%%time
np.random.seed(42)

clf = SVC()

evaluate_model("Support Vector Machine")

Model: Support Vector Machine
Train set 	| Accuracy: 0.6760 	| AUC: 0.6712 		┆	 Train-Dev set 	| Accuracy: 0.6741 	| AUC: 0.6648
Dev set 	| Accuracy: 0.4889 	| AUC: 0.4792 		┆	 Test set 	| Accuracy: 0.5616 	| AUC: 0.5810
CPU times: user 16.8 ms, sys: 1.24 ms, total: 18 ms
Wall time: 16.6 ms


##### 04 | Random Forest

In [16]:
%%time
np.random.seed(42)

clf = RandomForestClassifier()

evaluate_model("Random Forest")

Model: Random Forest
Train set 	| Accuracy: 1.0000 	| AUC: 1.0000 		┆	 Train-Dev set 	| Accuracy: 1.0000 	| AUC: 1.0000
Dev set 	| Accuracy: 0.5778 	| AUC: 0.5774 		┆	 Test set 	| Accuracy: 0.6164 	| AUC: 0.6033
CPU times: user 252 ms, sys: 5.07 ms, total: 257 ms
Wall time: 258 ms


##### 05 | eXtreme Gradient Boosting

In [17]:
%%time
np.random.seed(42)

clf = XGBClassifier()

evaluate_model("eXtreme Gradient Boosting")

Model: eXtreme Gradient Boosting
Train set 	| Accuracy: 1.0000 	| AUC: 1.0000 		┆	 Train-Dev set 	| Accuracy: 1.0000 	| AUC: 1.0000
Dev set 	| Accuracy: 0.5111 	| AUC: 0.5060 		┆	 Test set 	| Accuracy: 0.5205 	| AUC: 0.5031
CPU times: user 1.03 s, sys: 50.3 ms, total: 1.08 s
Wall time: 102 ms


##### 06 | Deep Neural Network

In [18]:
%%time
np.random.seed(42)

clf = KerasClassifier(model=DNN, verbose=0)

evaluate_model("Deep Neural Network")

Model: Deep Neural Network
Train set 	| Accuracy: 0.5866 	| AUC: 0.5720 		┆	 Train-Dev set 	| Accuracy: 0.5446 	| AUC: 0.5313
Dev set 	| Accuracy: 0.5556 	| AUC: 0.5387 		┆	 Test set 	| Accuracy: 0.5342 	| AUC: 0.5319
CPU times: user 4.6 s, sys: 288 ms, total: 4.88 s
Wall time: 2.75 s


#### RandomisedSearchCV Models

##### 01 | k-Nearest Neighbors

In [19]:
%%time
np.random.seed(42)

grid_params = {
    "n_neighbors": stats.randint(2, 50),
}
clf = RandomizedSearchCV(
    KNeighborsClassifier(), grid_params,
    n_iter=20, cv=5, scoring="accuracy"
)
clf.fit(*data["Dev"]["Train"])

params = clf.best_params_
clf = clf.best_estimator_

evaluate_model("k-Nearest Neighbors (RandomisedSearchCV)", params)

Model: k-Nearest Neighbors (RandomisedSearchCV) {'n_neighbors': 20}
Train set 	| Accuracy: 0.6592 	| AUC: 0.6502 		┆	 Train-Dev set 	| Accuracy: 0.6473 	| AUC: 0.6413
Dev set 	| Accuracy: 0.5778 	| AUC: 0.5655 		┆	 Test set 	| Accuracy: 0.5479 	| AUC: 0.5649
CPU times: user 250 ms, sys: 4.88 ms, total: 255 ms
Wall time: 256 ms


##### 02 | Logistic Regression

In [20]:
%%time
np.random.seed(42)

grid_params = {
    "C": 0.001 * 10 ** np.arange(0, 6),
    "class_weight": ["balanced", "auto"],
    "max_iter": [10000],
    "penalty": ["elasticnet"],
    "solver": ["saga"],
    "l1_ratio": stats.uniform(0, 1),
}
clf = RandomizedSearchCV(
    LogisticRegression(), grid_params,
    n_iter=20, cv=5, scoring="accuracy"
)
clf.fit(*data["Dev"]["Train"])

params = clf.best_params_
clf = clf.best_estimator_

evaluate_model("Logistic Regression (RandomisedSearchCV)", params)

Model: Logistic Regression (RandomisedSearchCV) {'C': 0.01, 'class_weight': 'auto', 'l1_ratio': 0.007066305219717406, 'max_iter': 10000, 'penalty': 'elasticnet', 'solver': 'saga'}
Train set 	| Accuracy: 0.5251 	| AUC: 0.5000 		┆	 Train-Dev set 	| Accuracy: 0.5268 	| AUC: 0.5000
Dev set 	| Accuracy: 0.5333 	| AUC: 0.5000 		┆	 Test set 	| Accuracy: 0.4247 	| AUC: 0.5000
CPU times: user 268 ms, sys: 3.12 ms, total: 272 ms
Wall time: 271 ms


##### 03 | Support Vector Machine

In [21]:
%%time
np.random.seed(42)

grid_params = {
    "C": stats.randint(1, 100),
    "kernel": ["linear", "poly", "rbf", "sigmoid"],
    "gamma": ["scale"],
}
clf = RandomizedSearchCV(
    SVC(), grid_params,
    n_iter=20, cv=5, scoring="accuracy"
)
clf.fit(*data["Dev"]["Train"])

params = clf.best_params_
clf = clf.best_estimator_

evaluate_model("Support Vector Machine (RandomisedSearchCV)", params)

Model: Support Vector Machine (RandomisedSearchCV) {'C': 24, 'gamma': 'scale', 'kernel': 'rbf'}
Train set 	| Accuracy: 0.7542 	| AUC: 0.7502 		┆	 Train-Dev set 	| Accuracy: 0.7500 	| AUC: 0.7474
Dev set 	| Accuracy: 0.5778 	| AUC: 0.5655 		┆	 Test set 	| Accuracy: 0.5616 	| AUC: 0.5599
CPU times: user 5.18 s, sys: 21.5 ms, total: 5.2 s
Wall time: 5.22 s


##### 04 | Random Forest

In [22]:
%%time
np.random.seed(42)

grid_params = {
    "bootstrap": [True,False],
    "criterion": ["gini", "entropy"],
    "max_depth": stats.randint(5, 15),
    "max_features": ["sqrt", "log2"],
    "min_samples_leaf": stats.randint(10, 50),
    "n_estimators": stats.randint(200, 500),
}
clf = RandomizedSearchCV(
    RandomForestClassifier(), grid_params,
    n_iter=20, cv=5, scoring="accuracy"
)
clf.fit(*data["Dev"]["Train"])

params = clf.best_params_
clf = clf.best_estimator_

evaluate_model("Random Forest (RandomisedSearchCV)", params)

Model: Random Forest (RandomisedSearchCV) {'bootstrap': False, 'criterion': 'entropy', 'max_depth': 6, 'max_features': 'log2', 'min_samples_leaf': 18, 'n_estimators': 252}
Train set 	| Accuracy: 0.6704 	| AUC: 0.6653 		┆	 Train-Dev set 	| Accuracy: 0.7054 	| AUC: 0.7031
Dev set 	| Accuracy: 0.5556 	| AUC: 0.5506 		┆	 Test set 	| Accuracy: 0.5890 	| AUC: 0.5541
CPU times: user 32.2 s, sys: 562 ms, total: 32.7 s
Wall time: 33.3 s


##### 05 | eXtreme Gradient Boosting

In [23]:
%%time
np.random.seed(42)

grid_params = {
    "colsample_bytree": stats.uniform(0, 1),
    "eval_metric": ["error"],
    "gamma": stats.uniform(0.1, 2.9),
    "learning_rate": stats.uniform(0.01, 0.29),
    "max_depth": stats.randint(3, 10),
    "min_child_weight": stats.randint(0, 5),
    "n_estimators": stats.randint(500, 1500),
    "objective": ["binary:logistic"],
    "reg_alpha": stats.expon(0, 50),
    "reg_lambda": stats.expon(0, 50),
    "scale_pos_weight": stats.uniform(1, 2),
    "subsample": stats.uniform(0.3, 0.7),
}
clf = RandomizedSearchCV(
    XGBClassifier(), grid_params,
    n_iter=20, cv=5, scoring="accuracy"
)
clf.fit(*data["Dev"]["Train"])

params = clf.best_params_
clf = clf.best_estimator_

evaluate_model("eXtreme Gradient Boosting (RandomisedSearchCV)", params)

Model: eXtreme Gradient Boosting (RandomisedSearchCV) {'colsample_bytree': 0.14286681792194078, 'eval_metric': 'error', 'gamma': 1.9875765715516733, 'learning_rate': 0.026359357917859073, 'max_depth': 6, 'min_child_weight': 1, 'n_estimators': 691, 'objective': 'binary:logistic', 'reg_alpha': 242.7557302444265, 'reg_lambda': 48.048914302067814, 'scale_pos_weight': 2.223306320976562, 'subsample': 0.30494641365380215}
Train set 	| Accuracy: 0.5251 	| AUC: 0.5000 		┆	 Train-Dev set 	| Accuracy: 0.5268 	| AUC: 0.5000
Dev set 	| Accuracy: 0.5333 	| AUC: 0.5000 		┆	 Test set 	| Accuracy: 0.4247 	| AUC: 0.5000
CPU times: user 3min 38s, sys: 11.3 s, total: 3min 50s
Wall time: 22.4 s


##### 06 | Deep Neural Network

In [24]:
%%time
np.random.seed(42)

grid_params = {
    "optimizer__n_layers": stats.randint(1, 10),
    "optimizer__neurons_min": stats.randint(4, 8),
    "optimizer__activation": ["relu", "tanh"],
    "optimizer__patience": stats.randint(2, 5),
    "batch_size": stats.randint(16, 32),
    "epochs": stats.randint(20, 50),
}
clf = RandomizedSearchCV(
    KerasClassifier(model=DNN, verbose=0), grid_params,
    n_iter=20, cv=5, scoring="accuracy"
)
clf.fit(*data["Dev"]["Train"])

params = clf.best_params_
clf = clf.best_estimator_

evaluate_model("Deep Neural Network (RandomisedSearchCV)", params)

Model: Deep Neural Network (RandomisedSearchCV) {'batch_size': 21, 'epochs': 40, 'optimizer__activation': 'tanh', 'optimizer__n_layers': 8, 'optimizer__neurons_min': 7, 'optimizer__patience': 3}
Train set 	| Accuracy: 0.5866 	| AUC: 0.5743 		┆	 Train-Dev set 	| Accuracy: 0.5938 	| AUC: 0.5818
Dev set 	| Accuracy: 0.5333 	| AUC: 0.5149 		┆	 Test set 	| Accuracy: 0.3425 	| AUC: 0.3737
CPU times: user 2min 29s, sys: 12.6 s, total: 2min 41s
Wall time: 2min 13s


#### BayesSearchCV Models

##### 01 | k-Nearest Neighbors

In [25]:
%%time
np.random.seed(42)

grid_params = {
    "n_neighbors": Integer(2, 50),
}
clf = BayesSearchCV(
    KNeighborsClassifier(), grid_params,
    optimizer_kwargs={"base_estimator": "ET"},
    n_iter=20, cv=StratifiedKFold(n_splits=5),
    scoring="accuracy", return_train_score=False,
)
clf.fit(X_train, y_train)

params = clf.best_params_
clf = KNeighborsClassifier(**params)

evaluate_model("k-Nearest Neighbors (BayesSearchCV)", params)

Model: k-Nearest Neighbors (BayesSearchCV) OrderedDict([('n_neighbors', 20)])
Train set 	| Accuracy: 0.6592 	| AUC: 0.6502 		┆	 Train-Dev set 	| Accuracy: 0.6473 	| AUC: 0.6413
Dev set 	| Accuracy: 0.5778 	| AUC: 0.5655 		┆	 Test set 	| Accuracy: 0.5479 	| AUC: 0.5649
CPU times: user 8.76 s, sys: 137 ms, total: 8.9 s
Wall time: 8.95 s


##### 02 | Logistic Regression

In [26]:
%%time
np.random.seed(42)

grid_params = {
    "C": Real(0.001, 1e+2),
    "class_weight": Categorical(["balanced", "auto"]),
    "max_iter": Categorical([10000]),
    "penalty": Categorical(["elasticnet"]),
    "solver": Categorical(["saga"]),
    "l1_ratio": Real(0, 1),
}
clf = BayesSearchCV(
    LogisticRegression(), grid_params,
    optimizer_kwargs={"base_estimator": "ET"},
    n_iter=20, cv=StratifiedKFold(n_splits=5),
    scoring="accuracy", return_train_score=False,
)
clf.fit(X_train, y_train)

params = clf.best_params_
clf = LogisticRegression(**params)

evaluate_model("Logistic Regression (BayesSearchCV)", params)

Model: Logistic Regression (BayesSearchCV) OrderedDict([('C', 41.010985781372526), ('class_weight', 'auto'), ('l1_ratio', 0.9328679988478339), ('max_iter', 10000), ('penalty', 'elasticnet'), ('solver', 'saga')])
Train set 	| Accuracy: 0.5698 	| AUC: 0.5640 		┆	 Train-Dev set 	| Accuracy: 0.5893 	| AUC: 0.5819
Dev set 	| Accuracy: 0.5111 	| AUC: 0.4970 		┆	 Test set 	| Accuracy: 0.5342 	| AUC: 0.5699
CPU times: user 12.1 s, sys: 117 ms, total: 12.2 s
Wall time: 12.2 s


##### 03 | Support Vector Machine

In [27]:
%%time
np.random.seed(42)

grid_params = {
    "C": Integer(1, 100),
    "kernel": Categorical(["linear", "poly", "rbf", "sigmoid"]),
    "gamma": Categorical(["scale"]),
}
clf = BayesSearchCV(
    SVC(), grid_params,
    optimizer_kwargs={"base_estimator": "ET"},
    n_iter=20, cv=StratifiedKFold(n_splits=5),
    scoring="accuracy", return_train_score=False,
)
clf.fit(X_train, y_train)

params = clf.best_params_
clf = SVC(**params)

evaluate_model("Support Vector Machine (BayesSearchCV)", params)

Model: Support Vector Machine (BayesSearchCV) OrderedDict([('C', 19), ('gamma', 'scale'), ('kernel', 'rbf')])
Train set 	| Accuracy: 0.7654 	| AUC: 0.7620 		┆	 Train-Dev set 	| Accuracy: 0.7500 	| AUC: 0.7474
Dev set 	| Accuracy: 0.5778 	| AUC: 0.5655 		┆	 Test set 	| Accuracy: 0.5616 	| AUC: 0.5641
CPU times: user 20.5 s, sys: 161 ms, total: 20.6 s
Wall time: 20.7 s


##### 04 | Random Forest

In [28]:
%%time
np.random.seed(42)

grid_params = {
    "bootstrap": Categorical([True,False]),
    "criterion": Categorical(["gini", "entropy"]),
    "max_depth": Integer(5, 15),
    "max_features": Categorical(["sqrt", "log2"]),
    "min_samples_leaf": Integer(10, 50),
    "n_estimators": Integer(200, 500),
}
clf = BayesSearchCV(
    RandomForestClassifier(), grid_params,
    optimizer_kwargs={"base_estimator": "ET"},
    n_iter=20, cv=StratifiedKFold(n_splits=5),
    scoring="accuracy", return_train_score=False,
)
clf.fit(X_train, y_train)

params = clf.best_params_
clf = RandomForestClassifier(**params)

evaluate_model("Random Forest (BayesSearchCV)", params)

Model: Random Forest (BayesSearchCV) OrderedDict([('bootstrap', False), ('criterion', 'entropy'), ('max_depth', 13), ('max_features', 'log2'), ('min_samples_leaf', 35), ('n_estimators', 327)])
Train set 	| Accuracy: 0.6425 	| AUC: 0.6365 		┆	 Train-Dev set 	| Accuracy: 0.6250 	| AUC: 0.6210
Dev set 	| Accuracy: 0.5778 	| AUC: 0.5625 		┆	 Test set 	| Accuracy: 0.6438 	| AUC: 0.6271
CPU times: user 38.8 s, sys: 470 ms, total: 39.3 s
Wall time: 39.5 s


##### 05 | eXtreme Gradient Boosting

In [29]:
%%time
np.random.seed(42)

grid_params = {
    "colsample_bytree": Real(0, 1),
    "eval_metric": Categorical(["error"]),
    "gamma": Real(0.1, 2.9),
    "learning_rate": Real(0.01, 0.29),
    "max_depth": Integer(3, 10),
    "min_child_weight": Integer(0, 5),
    "n_estimators": Integer(500, 1500),
    "objective": Categorical(["binary:logistic"]),
    "reg_alpha": Real(0, 50),
    "reg_lambda": Real(0, 50),
    "scale_pos_weight": Real(1, 2),
    "subsample": Real(0.3, 0.7),
}
clf = BayesSearchCV(
    XGBClassifier(), grid_params,
    optimizer_kwargs={"base_estimator": "ET"},
    n_iter=20, cv=StratifiedKFold(n_splits=5),
    scoring="accuracy", return_train_score=False,
)
clf.fit(X_train, y_train)

params = clf.best_params_
clf = XGBClassifier(**params)

evaluate_model("eXtreme Gradient Boosting (BayesSearchCV)", params)

Model: eXtreme Gradient Boosting (BayesSearchCV) OrderedDict([('colsample_bytree', 0.8373883555532844), ('eval_metric', 'error'), ('gamma', 0.9495483076756899), ('learning_rate', 0.2763426938461322), ('max_depth', 7), ('min_child_weight', 5), ('n_estimators', 940), ('objective', 'binary:logistic'), ('reg_alpha', 38.5375694757653), ('reg_lambda', 7.955943284827878), ('scale_pos_weight', 1.677702769290508), ('subsample', 0.4806291449961098)])
Train set 	| Accuracy: 0.5251 	| AUC: 0.5000 		┆	 Train-Dev set 	| Accuracy: 0.5268 	| AUC: 0.5000
Dev set 	| Accuracy: 0.5333 	| AUC: 0.5000 		┆	 Test set 	| Accuracy: 0.4247 	| AUC: 0.5000
CPU times: user 4min 17s, sys: 14.3 s, total: 4min 31s
Wall time: 37.3 s


##### 06 | Deep Neural Network

In [30]:
%%time
np.random.seed(42)

grid_params = {
    "optimizer__n_layers": Integer(1, 10),
    "optimizer__neurons_min": Integer(4, 8),
    "optimizer__activation": ["relu", "tanh"],
    "optimizer__patience": Integer(2, 5),
    "batch_size": Integer(16, 32),
    "epochs": Integer(20, 50),
}
clf = BayesSearchCV(
    KerasClassifier(model=DNN, verbose=0), grid_params,
    optimizer_kwargs={"base_estimator": "ET"},
    n_iter=20, cv=StratifiedKFold(n_splits=5),
    scoring="accuracy", return_train_score=False,
)
clf.fit(X_train, y_train)

params = clf.best_params_
clf = KerasClassifier(model=DNN, verbose=0, **params)

evaluate_model("Deep Neural Network (BayesSearchCV)", params)

Model: Deep Neural Network (BayesSearchCV) OrderedDict([('batch_size', 32), ('epochs', 49), ('optimizer__activation', 'tanh'), ('optimizer__n_layers', 6), ('optimizer__neurons_min', 7), ('optimizer__patience', 4)])
Train set 	| Accuracy: 0.5642 	| AUC: 0.5491 		┆	 Train-Dev set 	| Accuracy: 0.5893 	| AUC: 0.5776
Dev set 	| Accuracy: 0.6000 	| AUC: 0.5833 		┆	 Test set 	| Accuracy: 0.4110 	| AUC: 0.4416
CPU times: user 2min 33s, sys: 11.6 s, total: 2min 45s
Wall time: 2min 16s


#### Optuna Models

##### 01 | k-Nearest Neighbors

In [31]:
%%time
np.random.seed(42)

def objective(trial):
    grid_params = {
        "n_neighbors": trial.suggest_int("n_neighbors", 2, 50),
    }
    
    clf = KNeighborsClassifier(**grid_params)
    clf.fit(*data["Dev"]["Train"])
    score = clf.score(*data["Dev"]["Dev"])
    
    return score

study = optuna.create_study(direction="maximize", sampler=TPESampler(seed=42))
study.optimize(objective, n_trials=20)

params = study.best_params
clf = KNeighborsClassifier(**params)

evaluate_model("k-Nearest Neighbors (Optuna)", params)

Model: k-Nearest Neighbors (Optuna) {'n_neighbors': 31}
Train set 	| Accuracy: 0.6257 	| AUC: 0.6200 		┆	 Train-Dev set 	| Accuracy: 0.6562 	| AUC: 0.6517
Dev set 	| Accuracy: 0.6444 	| AUC: 0.6369 		┆	 Test set 	| Accuracy: 0.5479 	| AUC: 0.5353
CPU times: user 104 ms, sys: 2.38 ms, total: 106 ms
Wall time: 105 ms


##### 02 | Logistic Regression

In [32]:
%%time
np.random.seed(42)

def objective(trial):
    grid_params = {
        "C": trial.suggest_float("C", 0.001, 1e+2),
        "class_weight": trial.suggest_categorical("class_weight", ["balanced", "auto"]),
        "max_iter": trial.suggest_categorical("max_iter", [10000]),
        "penalty": trial.suggest_categorical("penalty", ["elasticnet"]),
        "solver": trial.suggest_categorical("solver", ["saga"]),
        "l1_ratio": trial.suggest_float("l1_ratio", 0, 1),
    }
    
    clf = LogisticRegression(**grid_params)
    clf.fit(*data["Dev"]["Train"])
    score = clf.score(*data["Dev"]["Dev"])
    
    return score

study = optuna.create_study(direction="maximize", sampler=TPESampler(seed=42))
study.optimize(objective, n_trials=20)

params = study.best_params
clf = LogisticRegression(**params)

evaluate_model("Logistic Regression (Optuna)", params)

Model: Logistic Regression (Optuna) {'C': 61.18567761934322, 'class_weight': 'auto', 'max_iter': 10000, 'penalty': 'elasticnet', 'solver': 'saga', 'l1_ratio': 0.3663618432936917}
Train set 	| Accuracy: 0.5642 	| AUC: 0.5581 		┆	 Train-Dev set 	| Accuracy: 0.5848 	| AUC: 0.5776
Dev set 	| Accuracy: 0.5111 	| AUC: 0.4970 		┆	 Test set 	| Accuracy: 0.5479 	| AUC: 0.5818
CPU times: user 351 ms, sys: 2.33 ms, total: 353 ms
Wall time: 352 ms


##### 03 | Support Vector Machine

In [33]:
%%time
np.random.seed(42)

def objective(trial):
    grid_params = {
        "C": trial.suggest_int("C", 1, 100),
        "kernel": trial.suggest_categorical("kernel", ["linear", "poly", "rbf", "sigmoid"]),
        "gamma": trial.suggest_categorical("gamma", ["scale"]),
    }
    
    clf = SVC(**grid_params)
    clf.fit(*data["Dev"]["Train"])
    score = clf.score(*data["Dev"]["Dev"])
    
    return score

study = optuna.create_study(direction="maximize", sampler=TPESampler(seed=42))
study.optimize(objective, n_trials=20)

params = study.best_params
clf = SVC(**params)

evaluate_model("Support Vector Machine (Optuna)", params)

Model: Support Vector Machine (Optuna) {'C': 62, 'kernel': 'sigmoid', 'gamma': 'scale'}
Train set 	| Accuracy: 0.5698 	| AUC: 0.5679 		┆	 Train-Dev set 	| Accuracy: 0.4330 	| AUC: 0.4331
Dev set 	| Accuracy: 0.7333 	| AUC: 0.7321 		┆	 Test set 	| Accuracy: 0.3836 	| AUC: 0.3882
CPU times: user 2.02 s, sys: 3.59 ms, total: 2.02 s
Wall time: 2.02 s


##### 04 | Random Forest

In [34]:
%%time
np.random.seed(42)

def objective(trial):
    grid_params = {
        "bootstrap": trial.suggest_categorical("bootstrap", [True,False]),
        "criterion": trial.suggest_categorical("criterion", ["gini", "entropy"]),
        "max_depth": trial.suggest_int("max_depth", 5, 15),
        "max_features": trial.suggest_categorical("max_features", ["sqrt", "log2"]),
        "min_samples_leaf": trial.suggest_int("min_samples_leaf", 10, 50),
        "n_estimators": trial.suggest_int("n_estimators", 200, 500),
    }
    
    clf = RandomForestClassifier(**grid_params)
    clf.fit(*data["Dev"]["Train"])
    score = clf.score(*data["Dev"]["Dev"])
    
    return score

study = optuna.create_study(direction="maximize", sampler=TPESampler(seed=42))
study.optimize(objective, n_trials=20)

params = study.best_params
clf = RandomForestClassifier(**params)

evaluate_model("Random Forest (Optuna)", params)

Model: Random Forest (Optuna) {'bootstrap': False, 'criterion': 'entropy', 'max_depth': 15, 'max_features': 'log2', 'min_samples_leaf': 10, 'n_estimators': 404}
Train set 	| Accuracy: 0.7654 	| AUC: 0.7631 		┆	 Train-Dev set 	| Accuracy: 0.7634 	| AUC: 0.7601
Dev set 	| Accuracy: 0.6222 	| AUC: 0.6161 		┆	 Test set 	| Accuracy: 0.6301 	| AUC: 0.6110
CPU times: user 7.13 s, sys: 58.8 ms, total: 7.19 s
Wall time: 7.2 s


##### 05 | eXtreme Gradient Boosting

In [35]:
%%time
np.random.seed(42)

def objective(trial):
    grid_params = {
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0, 1),
        "eval_metric": trial.suggest_categorical("eval_metric", ["error"]),
        "gamma": trial.suggest_float("gamma", 0.1, 2.9),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.29),
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        "min_child_weight": trial.suggest_int("min_child_weight", 0, 5),
        "n_estimators": trial.suggest_int("n_estimators", 500, 1500),
        "objective": trial.suggest_categorical("objective", ["binary:logistic"]),
        "reg_alpha": trial.suggest_float("reg_alpha", 0, 50),
        "reg_lambda": trial.suggest_float("reg_lambda", 0, 50),
        "scale_pos_weight": trial.suggest_float("scale_pos_weight", 1, 2),
        "subsample": trial.suggest_float("subsample", 0.3, 0.7),
    }
    
    clf = XGBClassifier(**grid_params)
    clf.fit(*data["Dev"]["Train"])
    score = clf.score(*data["Dev"]["Dev"])
    
    return score

study = optuna.create_study(direction="maximize", sampler=TPESampler(seed=42))
study.optimize(objective, n_trials=20)

params = study.best_params
clf = XGBClassifier(**params)

evaluate_model("eXtreme Gradient Boosting (Optuna)", params)

Model: eXtreme Gradient Boosting (Optuna) {'colsample_bytree': 0.38691249885012907, 'eval_metric': 'error', 'gamma': 1.2402301959453919, 'learning_rate': 0.1638244271216156, 'max_depth': 5, 'min_child_weight': 2, 'n_estimators': 1042, 'objective': 'binary:logistic', 'reg_alpha': 0.3835177757870811, 'reg_lambda': 49.89176955365552, 'scale_pos_weight': 1.004727219400187, 'subsample': 0.6795082150508247}
Train set 	| Accuracy: 0.6760 	| AUC: 0.6757 		┆	 Train-Dev set 	| Accuracy: 0.6518 	| AUC: 0.6498
Dev set 	| Accuracy: 0.6667 	| AUC: 0.6696 		┆	 Test set 	| Accuracy: 0.6164 	| AUC: 0.5991
CPU times: user 42.8 s, sys: 605 ms, total: 43.4 s
Wall time: 3.7 s


##### 06 | Deep Neural Network

In [36]:
%%time
np.random.seed(42)

def objective(trial):
    grid_params = {
        "optimizer__n_layers": trial.suggest_int("optimizer__n_layers", 1, 10),
        "optimizer__neurons_min": trial.suggest_int("optimizer__neurons_min", 4, 8),
        "optimizer__activation": trial.suggest_categorical("optimizer__activation", ["relu", "tanh"]),
        "optimizer__patience": trial.suggest_int("optimizer__patience", 2, 5),
        "batch_size": trial.suggest_int("batch_size", 16, 32),
        "epochs": trial.suggest_int("epochs", 20, 50),
    }
    
    clf = KerasClassifier(model=DNN, verbose=0, **grid_params)
    clf.fit(*data["Dev"]["Train"])
    score = clf.score(*data["Dev"]["Dev"])
    
    return score

study = optuna.create_study(direction="maximize", sampler=TPESampler(seed=42))
study.optimize(objective, n_trials=20)

params = study.best_params
clf = KerasClassifier(model=DNN, verbose=0, **params)

evaluate_model("Deep Neural Network (Optuna)", params)

Model: Deep Neural Network (Optuna) {'optimizer__n_layers': 6, 'optimizer__neurons_min': 7, 'optimizer__activation': 'relu', 'optimizer__patience': 2, 'batch_size': 24, 'epochs': 20}
Train set 	| Accuracy: 0.5866 	| AUC: 0.5743 		┆	 Train-Dev set 	| Accuracy: 0.5759 	| AUC: 0.5605
Dev set 	| Accuracy: 0.5333 	| AUC: 0.5179 		┆	 Test set 	| Accuracy: 0.3973 	| AUC: 0.4382
CPU times: user 32 s, sys: 2.43 s, total: 34.5 s
Wall time: 27 s


#### Hyperopt Models

##### 01 | k-Nearest Neighbors

In [37]:
%%time
np.random.seed(42)

def objective(params):
    clf = KNeighborsClassifier(**params)
    clf.fit(*data["Dev"]["Train"])
    score = clf.score(*data["Dev"]["Dev"])
    
    return {"loss": -score, "status": STATUS_OK}

grid_params = {
    "n_neighbors": hp.randint("n_neighbors", 2, 50),
}

trials = Trials()
best = fmin(
    fn=objective, space=grid_params,
    algo=tpe.suggest, max_evals=20,
    trials=trials, rstate=np.random.default_rng(42),
    show_progressbar=False,
)

params = space_eval(grid_params, best)
clf = KNeighborsClassifier(**params)

evaluate_model("k-Nearest Neighbors (Hyperopt)", params)

Model: k-Nearest Neighbors (Hyperopt) {'n_neighbors': 24}
Train set 	| Accuracy: 0.6592 	| AUC: 0.6519 		┆	 Train-Dev set 	| Accuracy: 0.6473 	| AUC: 0.6413
Dev set 	| Accuracy: 0.6444 	| AUC: 0.6339 		┆	 Test set 	| Accuracy: 0.5479 	| AUC: 0.5480
CPU times: user 77.6 ms, sys: 2.36 ms, total: 80 ms
Wall time: 78.4 ms


##### 02 | Logistic Regression

In [38]:
%%time
np.random.seed(42)

def objective(params):
    clf = LogisticRegression(**params)
    clf.fit(*data["Dev"]["Train"])
    score = clf.score(*data["Dev"]["Dev"])
    
    return {"loss": -score, "status": STATUS_OK}

grid_params = {
    "C": hp.uniform("C", 0.001, 1e+2),
    "class_weight": hp.choice("class_weight", ["balanced", "auto"]),
    "max_iter": hp.choice("max_iter", [10000]),
    "penalty": hp.choice("penalty", ["elasticnet"]),
    "solver": hp.choice("solver", ["saga"]),
    "l1_ratio": hp.uniform("l1_ratio", 0, 1),
}

trials = Trials()
best = fmin(
    fn=objective, space=grid_params,
    algo=tpe.suggest, max_evals=20,
    trials=trials, rstate=np.random.default_rng(42),
    show_progressbar=False,
)

params = space_eval(grid_params, best)
clf = LogisticRegression(**params)

evaluate_model("Logistic Regression (Hyperopt)", params)

Model: Logistic Regression (Hyperopt) {'C': 88.39277862063608, 'class_weight': 'auto', 'l1_ratio': 0.8136554076402878, 'max_iter': 10000, 'penalty': 'elasticnet', 'solver': 'saga'}
Train set 	| Accuracy: 0.5642 	| AUC: 0.5581 		┆	 Train-Dev set 	| Accuracy: 0.5848 	| AUC: 0.5776
Dev set 	| Accuracy: 0.5111 	| AUC: 0.4970 		┆	 Test set 	| Accuracy: 0.5479 	| AUC: 0.5818
CPU times: user 306 ms, sys: 1.73 ms, total: 308 ms
Wall time: 306 ms


##### 03 | Support Vector Machine

In [39]:
%%time
np.random.seed(42)

def objective(params):
    clf = SVC(**params)
    clf.fit(*data["Dev"]["Train"])
    score = clf.score(*data["Dev"]["Dev"])
    
    return {"loss": -score, "status": STATUS_OK}

grid_params = {
    "C": hp.randint("C", 1, 100),
    "kernel": hp.choice("kernel", ["linear", "poly", "rbf", "sigmoid"]),
    "gamma": hp.choice("gamma", ["scale"]),
}

trials = Trials()
best = fmin(
    fn=objective, space=grid_params,
    algo=tpe.suggest, max_evals=20,
    trials=trials, rstate=np.random.default_rng(42),
    show_progressbar=False,
)

params = space_eval(grid_params, best)
clf = SVC(**params)

evaluate_model("Support Vector Machine (Hyperopt)", params)

Model: Support Vector Machine (Hyperopt) {'C': 30, 'gamma': 'scale', 'kernel': 'sigmoid'}
Train set 	| Accuracy: 0.5587 	| AUC: 0.5561 		┆	 Train-Dev set 	| Accuracy: 0.4643 	| AUC: 0.4695
Dev set 	| Accuracy: 0.7333 	| AUC: 0.7321 		┆	 Test set 	| Accuracy: 0.4110 	| AUC: 0.3952
CPU times: user 1.67 s, sys: 3.2 ms, total: 1.67 s
Wall time: 1.67 s


##### 04 | Random Forest

In [40]:
%%time
np.random.seed(42)

def objective(params):
    clf = RandomForestClassifier(**params)
    clf.fit(*data["Dev"]["Train"])
    score = clf.score(*data["Dev"]["Dev"])
    
    return {"loss": -score, "status": STATUS_OK}

grid_params = {
    "bootstrap": hp.choice("bootstrap", [True,False]),
    "criterion": hp.choice("criterion", ["gini", "entropy"]),
    "max_depth": hp.randint("max_depth", 5, 15),
    "max_features": hp.choice("max_features", ["sqrt", "log2"]),
    "min_samples_leaf": hp.randint("min_samples_leaf", 10, 50),
    "n_estimators": hp.randint("n_estimators", 200, 500),
}

trials = Trials()
best = fmin(
    fn=objective, space=grid_params,
    algo=tpe.suggest, max_evals=20,
    trials=trials, rstate=np.random.default_rng(42),
    show_progressbar=False,
)

params = space_eval(grid_params, best)
clf = RandomForestClassifier(**params)

evaluate_model("Random Forest (Hyperopt)", params)

Model: Random Forest (Hyperopt) {'bootstrap': False, 'criterion': 'gini', 'max_depth': 12, 'max_features': 'sqrt', 'min_samples_leaf': 21, 'n_estimators': 303}
Train set 	| Accuracy: 0.6927 	| AUC: 0.6883 		┆	 Train-Dev set 	| Accuracy: 0.6830 	| AUC: 0.6804
Dev set 	| Accuracy: 0.6000 	| AUC: 0.5952 		┆	 Test set 	| Accuracy: 0.5890 	| AUC: 0.5710
CPU times: user 6.98 s, sys: 64 ms, total: 7.04 s
Wall time: 7.05 s


##### 05 | eXtreme Gradient Boosting

In [41]:
%%time
np.random.seed(42)

def objective(params):
    clf = XGBClassifier(**params)
    clf.fit(*data["Dev"]["Train"])
    score = clf.score(*data["Dev"]["Dev"])
    
    return {"loss": -score, "status": STATUS_OK}

grid_params = {
    "colsample_bytree": hp.uniform("colsample_bytree", 0, 1),
    "eval_metric": hp.choice("eval_metric", ["error"]),
    "gamma": hp.uniform("gamma", 0.1, 2.9),
    "learning_rate": hp.uniform("learning_rate", 0.01, 0.29),
    "max_depth": hp.randint("max_depth", 3, 10),
    "min_child_weight": hp.randint("min_child_weight", 0, 5),
    "n_estimators": hp.randint("n_estimators", 500, 1500),
    "objective": hp.choice("objective", ["binary:logistic"]),
    "reg_alpha": hp.uniform("reg_alpha", 0, 50),
    "reg_lambda": hp.uniform("reg_lambda", 0, 50),
    "scale_pos_weight": hp.uniform("scale_pos_weight", 1, 2),
    "subsample": hp.uniform("subsample", 0.3, 0.7),
}

trials = Trials()
best = fmin(
    fn=objective, space=grid_params,
    algo=tpe.suggest, max_evals=20,
    trials=trials, rstate=np.random.default_rng(42),
    show_progressbar=False,
)

params = space_eval(grid_params, best)
clf = XGBClassifier(**params)

evaluate_model("eXtreme Gradient Boosting (Hyperopt)", params)

Model: eXtreme Gradient Boosting (Hyperopt) {'colsample_bytree': 0.34187263717236394, 'eval_metric': 'error', 'gamma': 0.1153664931135201, 'learning_rate': 0.2577669527551195, 'max_depth': 8, 'min_child_weight': 2, 'n_estimators': 1488, 'objective': 'binary:logistic', 'reg_alpha': 2.9968094540700263, 'reg_lambda': 43.046524095671735, 'scale_pos_weight': 1.6082491621711366, 'subsample': 0.42016854191620334}
Train set 	| Accuracy: 0.7318 	| AUC: 0.7379 		┆	 Train-Dev set 	| Accuracy: 0.7232 	| AUC: 0.7306
Dev set 	| Accuracy: 0.6222 	| AUC: 0.6310 		┆	 Test set 	| Accuracy: 0.6164 	| AUC: 0.5695
CPU times: user 36.6 s, sys: 482 ms, total: 37.1 s
Wall time: 3.17 s


##### 06 | Deep Neural Network

In [42]:
%%time
np.random.seed(42)

def objective(params):
    clf = KerasClassifier(model=DNN, verbose=0, **params)
    clf.fit(*data["Dev"]["Train"])
    score = clf.score(*data["Dev"]["Dev"])
    
    return {"loss": -score, "status": STATUS_OK}

grid_params = {
    "optimizer__n_layers": hp.randint("optimizer__n_layers", 1, 10),
    "optimizer__neurons_min": hp.randint("optimizer__neurons_min", 4, 8),
    "optimizer__activation": hp.choice("optimizer__activation", ["relu", "tanh"]),
    "optimizer__patience": hp.randint("optimizer__patience", 2, 5),
    "batch_size": hp.randint("batch_size", 16, 32),
    "epochs": hp.randint("epochs", 20, 50),
}

trials = Trials()
best = fmin(
    fn=objective, space=grid_params,
    algo=tpe.suggest, max_evals=20,
    trials=trials, rstate=np.random.default_rng(42),
    show_progressbar=False,
)

params = space_eval(grid_params, best)
clf = KerasClassifier(model=DNN, verbose=0, **params)

evaluate_model("Deep Neural Network (Hyperopt)", params)

Model: Deep Neural Network (Hyperopt) {'batch_size': 17, 'epochs': 26, 'optimizer__activation': 'tanh', 'optimizer__n_layers': 1, 'optimizer__neurons_min': 6, 'optimizer__patience': 4}
Train set 	| Accuracy: 0.6425 	| AUC: 0.6337 		┆	 Train-Dev set 	| Accuracy: 0.5804 	| AUC: 0.5667
Dev set 	| Accuracy: 0.4889 	| AUC: 0.4732 		┆	 Test set 	| Accuracy: 0.3836 	| AUC: 0.4178
CPU times: user 34.1 s, sys: 2.65 s, total: 36.7 s
Wall time: 28.8 s


### Model Performance
---

In [43]:
df = pd.DataFrame()
for idx, val in metrics_log.items():
    df_merge = pd.DataFrame(
        index=[idx],
        data={
            "accuracy": val["accuracy"],
            "auc": val["auc"],
            "params": str(val["params"])
        }
    )
    df = pd.concat([df, df_merge])
df.sort_values(["accuracy", "auc"], ascending=False)

Unnamed: 0,accuracy,auc,params
Random Forest (BayesSearchCV),0.643836,0.627112,"OrderedDict([('bootstrap', False), ('criterion..."
Random Forest (Optuna),0.630137,0.610983,"{'bootstrap': False, 'criterion': 'entropy', '..."
Random Forest,0.616438,0.603303,
eXtreme Gradient Boosting (Optuna),0.616438,0.599078,"{'colsample_bytree': 0.38691249885012907, 'eva..."
eXtreme Gradient Boosting (Hyperopt),0.616438,0.569508,"{'colsample_bytree': 0.34187263717236394, 'eva..."
Random Forest (Hyperopt),0.589041,0.571045,"{'bootstrap': False, 'criterion': 'gini', 'max..."
Random Forest (RandomisedSearchCV),0.589041,0.554147,"{'bootstrap': False, 'criterion': 'entropy', '..."
Support Vector Machine,0.561644,0.581029,
Support Vector Machine (BayesSearchCV),0.561644,0.564132,"OrderedDict([('C', 19), ('gamma', 'scale'), ('..."
Support Vector Machine (RandomisedSearchCV),0.561644,0.559908,"{'C': 24, 'gamma': 'scale', 'kernel': 'rbf'}"
