In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import ExtraTreesClassifier, HistGradientBoostingClassifier

# Load the training frame and the "reserved" frame that predictions are
# produced for at the end of the script.
train = pd.read_csv("./electricity_train.csv")
reserved = pd.read_csv("./electricity_reserved.csv")

# Use the "class" column as the target when it exists; otherwise fall back to
# the last column of the training frame.
if "class" in train.columns:
    target_col = "class"
else:
    target_col = train.columns[-1]

# Target is cast to int; features are every remaining column.
y = train[target_col].astype(int)
X = train.drop(columns=[target_col])

# Align the reserved frame to the training feature columns (same names, same
# order). Columns that `reserved` lacks come back filled with NaN.
X_res = reserved.reindex(columns=X.columns)

# Columns with a numeric dtype feed the numeric branch of the preprocessor;
# all other columns are treated as categorical.
numeric_frame = X.select_dtypes(include=[np.number])
num_cols = numeric_frame.columns.tolist()
cat_cols = [col for col in X.columns if col not in num_cols]

# Numeric branch: fill missing values with the per-column median.
numeric = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
])

# Categorical branch: fill missing values with the most frequent level, then
# one-hot encode. Levels unseen during fit are encoded as all zeros instead of
# raising at predict time.
categorical = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("ohe", OneHotEncoder(handle_unknown="ignore")),
])

# Route each column group through its branch; columns in neither list are
# dropped.
prep = ColumnTransformer(
    transformers=[
        ("num", numeric, num_cols),
        ("cat", categorical, cat_cols),
    ],
    remainder="drop",
    # OneHotEncoder emits sparse matrices, and with the default threshold (0.3)
    # the stacked output may stay sparse. HistGradientBoostingClassifier only
    # accepts dense input, so force a dense result for every downstream model.
    # NOTE(review): this densifies the one-hot block; revisit if a categorical
    # column has very high cardinality.
    sparse_threshold=0.0,
)

# Shuffled, stratified 7-fold splitter with a fixed seed, shared by every
# hyper-parameter search below.
cv = StratifiedKFold(n_splits=7, shuffle=True, random_state=21)

# Search space for the histogram gradient boosting model. The "clf__" prefix
# addresses the "clf" step of the pipeline assembled in the search loop.
hgb_space = dict(
    clf__learning_rate=[0.03, 0.05, 0.07, 0.1, 0.15],
    clf__max_depth=[None, 3, 4, 5, 6, 7, 8],
    clf__max_iter=[200, 400, 600, 800, 1000],
    clf__min_samples_leaf=[5, 10, 20, 30, 50, 80],
    clf__l2_regularization=[0.0, 0.05, 0.1, 0.5, 1.0],
    clf__max_bins=[128, 255],
)

# Search space for the extra-trees model.
et_space = dict(
    clf__n_estimators=[400, 800, 1200, 1600],
    clf__max_depth=[None, 6, 8, 10, 12, 15, 20],
    clf__min_samples_split=[2, 4, 6, 10],
    clf__min_samples_leaf=[1, 2, 3, 5],
    clf__max_features=["sqrt", "log2", 0.5, 0.7, 1.0],
    clf__bootstrap=[False, True],
)

# Each candidate is (short name, unfitted estimator, parameter distributions).
candidates = [
    (
        "hgb",
        HistGradientBoostingClassifier(random_state=21),
        hgb_space,
    ),
    (
        "et",
        ExtraTreesClassifier(class_weight="balanced", n_jobs=-1, random_state=21),
        et_space,
    ),
]

# Running record of the best candidate seen so far. The score starts below any
# valid accuracy so the first successful search always replaces it.
best_name, best_params, best_estimator = None, None, None
best_score = -1.0

# Run one randomized search per candidate model and keep the winner by mean
# cross-validated accuracy.
for name, clf, param_dist in candidates:
    pipe = Pipeline(steps=[("prep", prep), ("clf", clf)])
    search = RandomizedSearchCV(
        pipe,
        param_dist,
        n_iter=80,
        scoring="accuracy",
        n_jobs=-1,
        cv=cv,
        verbose=1,
        random_state=21,
    )
    search.fit(X, y)

    # Strict ">" keeps the earlier candidate on ties and never accepts NaN.
    if search.best_score_ > best_score:
        best_name, best_score, best_params, best_estimator = (
            name,
            search.best_score_,
            search.best_params_,
            search.best_estimator_,
        )

print("BEST:", best_name, "CV accuracy =", round(best_score, 4))
print("BEST PARAMS:", best_params)

# `best_estimator` is the `best_estimator_` of a RandomizedSearchCV run with
# the default refit=True, so it has already been refitted on all of (X, y)
# with the winning parameters. Fitting it again would only repeat that work,
# so predict directly.
pred = best_estimator.predict(X_res).astype(int)

# Emit the predictions for the reserved set as a plain Python list.
pred_list = pred.tolist()
print(pred_list)

Fitting 7 folds for each of 80 candidates, totalling 560 fits
Fitting 7 folds for each of 80 candidates, totalling 560 fits
BEST: hgb CV accuracy = 0.9254
BEST PARAMS: {'clf__min_samples_leaf': 5, 'clf__max_iter': 800, 'clf__max_depth': 8, 'clf__max_bins': 255, 'clf__learning_rate': 0.15, 'clf__l2_regularization': 1.0}
[0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 