In [6]:
# ! git clone https://github.com/sb-ai-lab/LightAutoML.git
# ! pip download -d lightautoml_packages LightAutoML
# ! pip install --no-index --find-links=lightautoml_packages LightAutoML

In [1]:
import multiprocessing
multiprocessing.set_start_method('spawn', force=True)

In [None]:
import os
import gc
import pickle
import time
import numpy as np
import polars as pl
from sklearn.metrics import roc_auc_score, average_precision_score, f1_score, precision_score, recall_score
from lightautoml.automl.presets.tabular_presets import TabularUtilizedAutoML
from lightautoml.tasks import Task


SEED = 42
np.random.seed(SEED)

'nlp' extra dependecy package 'gensim' isn't installed. Look at README.md in repo 'LightAutoML' for installation instructions.
'nlp' extra dependecy package 'nltk' isn't installed. Look at README.md in repo 'LightAutoML' for installation instructions.
'nlp' extra dependecy package 'transformers' isn't installed. Look at README.md in repo 'LightAutoML' for installation instructions.
'nlp' extra dependecy package 'gensim' isn't installed. Look at README.md in repo 'LightAutoML' for installation instructions.
'nlp' extra dependecy package 'nltk' isn't installed. Look at README.md in repo 'LightAutoML' for installation instructions.
'nlp' extra dependecy package 'transformers' isn't installed. Look at README.md in repo 'LightAutoML' for installation instructions.


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
DATA_DIR = "/home/rbparchiev/alpha_hackathon/alpha_step_2/data/data_fs_rf/"
OUTPUT_DIR = os.path.join(DATA_DIR, "lightautoml_results")
SUBMISSION_DIR = os.path.join(OUTPUT_DIR, "submissions")
os.makedirs(SUBMISSION_DIR, exist_ok=True)

datasets = [
    "fl_credit_card_tendency",
    "invest_prop_4",
    "outflow_12",
    "pd_fl",
    "pd_ul_9",
    "ul_leasing_outflow"
]

In [5]:
TOTAL_TIME = int(3600 * 4) - 600
TIMEOUT_PER_DATASET = TOTAL_TIME // len(datasets)
N_THREADS = 16
N_FOLDS = 5

In [None]:
def load_data(name):
    train_path = os.path.join(DATA_DIR, f"{name}_train_selected.parquet")
    test_path = os.path.join(DATA_DIR, f"{name}_test_selected.parquet")
    cv_splits_path = os.path.join(DATA_DIR, f"{name}_cv_splits.pkl")

    df_train = pl.read_parquet(train_path)
    df_test = pl.read_parquet(test_path)

    with open(cv_splits_path, "rb") as f:
        cv_splits = pickle.load(f)

    return df_train, df_test, cv_splits

def train_and_evaluate_lightautoml(df_train, cv_splits, dataset_name, timeout):
    train_pd = df_train.to_pandas()

    task = Task('binary')
    roles = {
        'target': 'target',
        'drop': ['id', 'smpl']
    }

    scores = []
    fold_timeout = timeout // len(cv_splits) if len(cv_splits) > 0 else timeout
    for fold_dict in cv_splits:
        tr_idx, val_idx = fold_dict["train_index"], fold_dict["validation_index"]
        X_train_fold = train_pd.iloc[tr_idx].reset_index(drop=True)
        X_val_fold = train_pd.iloc[val_idx].reset_index(drop=True)

        automl = TabularUtilizedAutoML(
            task=task,
            timeout=fold_timeout,
            cpu_limit=N_THREADS,
            reader_params={'n_jobs': N_THREADS, 'cv': N_FOLDS, 'random_state': SEED}
        )
        oof_preds_fold = automl.fit_predict(X_train_fold, roles=roles, verbose=1)
        val_pred = automl.predict(X_val_fold)
        val_pred_proba = val_pred.data[:, 0]

        score = roc_auc_score(X_val_fold['target'].values, val_pred_proba)
        scores.append(score)

        del automl
        gc.collect()

    if scores:
        print(f"LightAutoML CV scores for {dataset_name}: {scores}, mean: {np.mean(scores):.4f}")
    else:
        print(f"No CV splits provided for {dataset_name}, skipping CV evaluation.")

    automl_full = TabularUtilizedAutoML(
        task=task,
        timeout=timeout,
        cpu_limit=N_THREADS,
        reader_params={'n_jobs': N_THREADS, 'cv': N_FOLDS, 'random_state': SEED}
    )
    oof_preds_full = automl_full.fit_predict(train_pd, roles=roles, verbose=1)
    full_score = roc_auc_score(train_pd['target'].values, oof_preds_full.data[:, 0])
    print(f"Full training OOF score for {dataset_name}: {full_score:.4f}")

    return automl_full

def make_submission(df_test, model, dataset_name):
    test_pd = df_test.to_pandas()

    test_pred = model.predict(test_pd)
    test_pred_proba = test_pred.data[:, 0]

    submission = pl.DataFrame({
        "id": test_pd["id"],
        "prediction": test_pred_proba
    })

    submission_path = os.path.join(SUBMISSION_DIR, f"{dataset_name}_lightautoml_submission.csv")
    submission.write_csv(submission_path)
    print(f"Submission saved for {dataset_name}: {submission_path}")

In [None]:
start_time = time.time()

for i, dataset_name in enumerate(datasets):
    elapsed = time.time() - start_time
    remaining = TOTAL_TIME - elapsed
    if remaining <= 0:
        print("No time left to process remaining datasets.")
        break

    dataset_timeout = min(TIMEOUT_PER_DATASET, int(remaining))
    if dataset_timeout <= 0:
        print(f"No sufficient time left for {dataset_name}. Skipping.")
        continue

    print(f"Processing dataset: {dataset_name}, allocated timeout: {dataset_timeout} seconds")
    df_train, df_test, cv_splits = load_data(dataset_name)
    laml_model = train_and_evaluate_lightautoml(df_train, cv_splits, dataset_name, dataset_timeout)
    make_submission(df_test, laml_model, dataset_name)

    del df_train, df_test, cv_splits, laml_model
    gc.collect()

print("All submissions generated (or as many as time allowed) successfully!")

Processing dataset: fl_credit_card_tendency, allocated timeout: 2300 seconds
[02:10:45] Start automl [1mutilizator[0m with listed constraints:
[02:10:45] - time: 460.00 seconds
[02:10:45] - CPU: 16 cores
[02:10:45] - memory: 16 GB

[02:10:45] [1mIf one preset completes earlier, next preset configuration will be started[0m

[02:10:45] Start 0 automl preset configuration:
[02:10:45] [1mconf_0_sel_type_0.yml[0m, random state: {'reader_params': {'random_state': 42}, 'nn_params': {'random_state': 42}, 'general_params': {'return_all_predictions': False}}
[02:10:45] Stdout logging level is INFO.
[02:10:45] Task: binary

[02:10:45] Start automl preset with listed constraints:
[02:10:45] - time: 460.00 seconds
[02:10:45] - CPU: 16 cores
[02:10:45] - memory: 16 GB

[02:10:45] [1mTrain data shape: (147549, 39)[0m



In addition, using fork() with Python in general is a recipe for mysterious
deadlocks and crashes.

The most likely reason you are seeing this error is because you are using the
multiprocessing module on Linux, which uses fork() by default. This will be
fixed in Python 3.14. Until then, you want to use the "spawn" context instead.

See https://docs.pola.rs/user-guide/misc/multiprocessing/ for details.

or by setting POLARS_ALLOW_FORKING_THREAD=1.

  pid = os.fork()


[02:10:53] Layer [1m1[0m train process start. Time left 451.26 secs
[02:11:04] Start fitting [1mLvl_0_Pipe_0_Mod_0_LinearL2[0m ...
[02:11:27] Fitting [1mLvl_0_Pipe_0_Mod_0_LinearL2[0m finished. score = [1m0.7857183450784795[0m
[02:11:27] [1mLvl_0_Pipe_0_Mod_0_LinearL2[0m fitting and predicting completed
[02:11:27] Time left 417.77 secs

[02:11:37] Start fitting [1mLvl_0_Pipe_1_Mod_0_LightGBM[0m ...
[02:11:58] Fitting [1mLvl_0_Pipe_1_Mod_0_LightGBM[0m finished. score = [1m0.8035609379402759[0m
[02:11:58] [1mLvl_0_Pipe_1_Mod_0_LightGBM[0m fitting and predicting completed
[02:11:58] Start hyperparameters optimization for [1mLvl_0_Pipe_1_Mod_1_Tuned_LightGBM[0m ... Time budget is 25.24 secs
[02:11:58] Copying TaskTimer may affect the parent PipelineTimer, so copy will create new unlimited TaskTimer
[02:12:26] Hyperparameters optimization for [1mLvl_0_Pipe_1_Mod_1_Tuned_LightGBM[0m completed
[02:12:26] Start fitting [1mLvl_0_Pipe_1_Mod_1_Tuned_LightGBM[0m ...
[02:12: