colab notebook

In [1]:
!pip install catboost
!pip install shap
!pip install flaml
!pip install optuna
!pip install mlxtend

Collecting catboost
  Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl (98.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.7
Collecting flaml
  Downloading FLAML-2.3.2-py3-none-any.whl.metadata (16 kB)
Downloading FLAML-2.3.2-py3-none-any.whl (313 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m313.9/313.9 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: flaml
Successfully installed flaml-2.3.2
Collecting optuna
  Downloading optuna-4.1.0-py3-none-any.whl.metadata (16 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.14.0-py3-none-any.whl.metadata (7.4 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Collecting Mako

In [2]:
import pandas as pd
import json
import os, sys
from tqdm import tqdm
import numpy as np
import polars as pl
from polars import col as c
from pprint import pprint
import matplotlib.pyplot as plt
import re
import shap
from flaml.automl import AutoML

sys.path.append("../")
pl.Config.set_tbl_rows(50)

# explicitly require this experimental feature
from sklearn.experimental import enable_iterative_imputer  # noqa

# now you can import normally from sklearn.impute
from sklearn.impute import IterativeImputer
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from catboost import CatBoostRegressor, CatBoostClassifier

In [4]:
FILES_DIR = "./"
### select data
path = "./"
os.makedirs(os.path.join(FILES_DIR, path), exist_ok=True)
file_name = "imputed_train_data.parquet"
file_path = os.path.join(FILES_DIR, path, file_name)
# train_data.write_parquet(file_path)
train_data = pl.read_parquet(file_path)
### split data
X_train, X_test, y_train, y_test = train_test_split(
    train_data.select(
        pl.selectors.by_dtype(pl.Float64, pl.Int64, pl.Int32),
    ).drop("target"),
    train_data["target"],
    test_size=0.2,
    random_state=1,
)

X_train, X_eval, y_train, y_eval = train_test_split(
    X_train,
    y_train,
    test_size=0.25,
    random_state=1,
)  # 0.25 x 0.8 = 0.2
cat_cols = X_train.select(
    pl.selectors.by_dtype(pl.String),
).columns

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



In [10]:
baseline_model = AutoML()

settings = {
    "metric": "roc_auc",
    "estimator_list": ["lgbm", "catboost", "rf"],  # list of ML learners
    "task": "classification",  # task type
    # "log_file_name": "/content/drive/MyDrive/1231_ABCD/baseline_model.log",  # flaml log file
    "time_budget": 200,
    "early_stop": True,
    "eval_method": "cv",
    "n_splits": 5,
    "verbose": 2,
    "seed": 0,
    "split_type": "stratified",
}

In [11]:
baseline_model.fit(
    X_train=X_train.to_pandas(),
    y_train=y_train.to_pandas(),
    **settings,
)

In [12]:
metric = roc_auc_score(
    y_test,
    baseline_model.predict_proba(X_test.to_pandas())[:, 1],
)
print(metric.round(3))

0.648
