In [4]:
# @title Подключение к диску с данными
import os
from google.colab import drive
drive.mount('/content/drive')
!pip install polars
!pip install Dask

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Collecting Dask
  Using cached dask-2026.1.2-py3-none-any.whl.metadata (3.8 kB)
Collecting partd>=1.4.0 (from Dask)
  Using cached partd-1.4.2-py3-none-any.whl.metadata (4.6 kB)
Collecting locket (from partd>=1.4.0->Dask)
  Downloading locket-1.0.0-py2.py3-none-any.whl.metadata (2.8 kB)
Downloading dask-2026.1.2-py3-none-any.whl (1.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m25.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading partd-1.4.2-py3-none-any.whl (18 kB)
Downloading locket-1.0.0-py2.py3-none-any.whl (4.4 kB)
Installing collected packages: locket, partd, Dask
Successfully installed Dask-2026.1.2 locket-1.0.0 partd-1.4.2


In [None]:
# 3_train_baseline_dask.py
# @title Dask-LightGBM baseline для Data Fusion Contest 2026 "Страж" с обучением по частям (для OOM)

!pip install --quiet dask[complete] lightgbm dask-ml dask-lightgbm
!pip install "dask[distributed]"
!pip install lightgbm

import polars as pl
import dask.dataframe as dd
import dask.array as da
from dask.distributed import Client
import lightgbm as lgb
from lightgbm.dask import DaskLGBMClassifier
from sklearn.metrics import average_precision_score
import numpy as np
import gc
import os
from datetime import datetime

# Announce the start of the run with a minute-resolution timestamp.
print("Начало обучения Dask-LightGBM baseline", f"{datetime.now():%Y-%m-%d %H:%M}")

# Feature files are read from the mounted Drive folder; trained models are
# written to a local directory that is created on demand.
DATA_PATH = "/content/drive/MyDrive/ml-vtb-data-fusion-strazh/data/"
MODEL_PATH = "./models/"
os.makedirs(MODEL_PATH, exist_ok=True)

# ─── Dask settings ────────────────────────────────────────────────────────────

N_WORKERS = 4  # tune to the available CPU (author's note: 2–4 for an i3)
THREADS_PER_WORKER = 1  # one thread per worker, chosen for LightGBM
MEMORY_LIMIT = "auto"  # or e.g. '8GB' per worker to cap memory explicitly

# ─── Model settings ───────────────────────────────────────────────────────────

N_ESTIMATORS = 2_500
LEARNING_RATE = 0.035
MAX_DEPTH = 9
NUM_LEAVES = 120
FEATURE_FRACTION = 0.75
BAGGING_FRACTION = 0.80
BAGGING_FREQ = 5
POS_WEIGHT = 350  # tune to the class ratio (author's note: ~1 : 1500–4000)

EARLY_STOPPING = 120  # rounds without improvement before stopping
VERBOSE_EVAL = 100  # period, in rounds, of evaluation logging

# ─── Categorical features ─────────────────────────────────────────────────────

# Columns to be cast to the categorical dtype before training.
# Extend this list if the dataset has further categorical columns.
cat_features = [
    # event / channel descriptors
    "event_type_nm",
    "channel_indicator_type",
    "channel_indicator_sub_type",
    # payment attributes
    "currency_iso_cd",
    "mcc_code",
    "pos_cd",
    # locale settings
    "accept_language",
    "browser_language",
    "timezone",
    # device attributes
    "operating_system_type",
    "device_system_version",
    "screen_size",
]

# ─── Dask Client ──────────────────────────────────────────────────────────────

print("Запуск Dask Client...")

# Start a local cluster sized by the Dask settings; workers run as separate
# processes (the author's choice, made for stability).
client = Client(
    processes=True,
    memory_limit=MEMORY_LIMIT,
    threads_per_worker=THREADS_PER_WORKER,
    n_workers=N_WORKERS,
)

# Dashboard URL for monitoring the cluster.
print(client.dashboard_link)

# ─── Chunked data loading with Dask ───────────────────────────────────────────

print("Считываем метаданные для разбиения...")

# The last part is held out for validation (author's note: roughly the final
# 1–2 months); the remaining parts are used for training.
train_parts = [1, 2]  # use [1] for a quick test, then add 2
valid_part = 3

valid_file = f"{DATA_PATH}train_features_part_{valid_part}.parquet"
train_files = [
    f"{DATA_PATH}train_features_part_{part_no}.parquet" for part_no in train_parts
]

# Lazy Dask DataFrames: nothing is read into memory at this point.
ddf_train = dd.read_parquet(train_files, engine="pyarrow")
ddf_valid = dd.read_parquet(valid_file, engine="pyarrow")

print(f"Train parts: {len(train_files)} | Valid: 1")

# ─── Feature selection ────────────────────────────────────────────────────────

# Identifier / timestamp columns and the label are not model inputs.
# Add any other service columns here.
exclude_cols = [
    "customer_id",
    "event_id",
    "event_dttm",
    "date",
    "target",  # the label is handled separately
]

# Everything else in the training frame is a feature, in its original order.
feature_cols = [name for name in ddf_train.columns if name not in exclude_cols]

print(f"Количество признаков: {len(feature_cols)}")

# Categorical handling on the Dask DataFrames.
#
# The frames are passed to LightGBM as Dask DataFrames rather than Dask arrays:
# an array carries no pandas categorical dtype, so converting with
# `to_dask_array` would discard the categorical information set up here.
# Dropping `to_dask_array(lengths=True)` also avoids the eager pass over the
# data that each of those four calls triggered just to measure partition lengths.
present_cats = [col for col in cat_features if col in feature_cols]

if present_cats:
    # `categorize` scans the training data once and stores the complete category
    # set in the frame's metadata, so all partitions share one value -> code mapping
    # (a plain `astype("category")` leaves the categories unknown per partition).
    ddf_train = ddf_train.categorize(columns=present_cats)

    for col in present_cats:
        # Reuse the training dtype so validation is encoded with the same
        # categories; values not seen in training become missing.
        ddf_valid[col] = ddf_valid[col].astype(ddf_train[col].dtype)

# X as Dask DataFrames, y as Dask Series (both derived from the same frame,
# so their partitions line up).
X_train = ddf_train[feature_cols]
y_train = ddf_train["target"]

X_valid = ddf_valid[feature_cols]
y_valid = ddf_valid["target"]

# ─── Dask-LightGBM ────────────────────────────────────────────────────────────

print("Запуск обучения Dask-LightGBM...")

# Distributed LightGBM binary classifier configured from the model settings.
model = DaskLGBMClassifier(
    objective="binary",
    metric="average_precision",          # PR-AUC
    # Without this argument the estimator falls back to its default number of
    # trees and the configured N_ESTIMATORS is silently ignored.
    n_estimators=N_ESTIMATORS,
    learning_rate=LEARNING_RATE,
    num_leaves=NUM_LEAVES,
    max_depth=MAX_DEPTH,
    feature_fraction=FEATURE_FRACTION,
    bagging_fraction=BAGGING_FRACTION,
    bagging_freq=BAGGING_FREQ,
    scale_pos_weight=POS_WEIGHT,         # up-weights the positive class
    random_state=1842,
    client=client,                       # train on the cluster started above
    n_jobs=-1,
    verbosity=-1,
)

# Training callbacks: stop after EARLY_STOPPING rounds without improvement on
# the validation metric and log the metric every VERBOSE_EVAL rounds.
# NOTE(review): confirm that early stopping via callbacks is supported by the
# lightgbm.dask estimators in the installed LightGBM version.
training_callbacks = [
    lgb.early_stopping(stopping_rounds=EARLY_STOPPING),
    lgb.log_evaluation(VERBOSE_EVAL),
]

# Fit on the training collections, scoring the held-out part with average precision.
model.fit(
    X_train,
    y_train,
    eval_set=[(X_valid, y_valid)],
    eval_metric="average_precision",
    callbacks=training_callbacks,
)

# ─── Validation score ─────────────────────────────────────────────────────────

print("Предсказание на валидации...")

# Class probabilities are produced lazily; column 1 is the positive class.
proba_valid = model.predict_proba(X_valid)
preds_valid = proba_valid[:, 1].compute()

# Materialise the labels and score with average precision (PR-AUC).
y_valid_np = y_valid.compute()
pr_auc = average_precision_score(y_valid_np, preds_valid)
print(f"PR-AUC на валидации: {pr_auc:.5f}")

# ─── Save the model ───────────────────────────────────────────────────────────

# The file name records which training parts were used and the validation PR-AUC.
parts_tag = "-".join(str(part_no) for part_no in train_parts)
model_file = f"{MODEL_PATH}lgb_dask_baseline_part_{parts_tag}_PR{pr_auc:.4f}.txt"
model.booster_.save_model(model_file)
print(f"Модель сохранена: {model_file}")

# Shut down the Dask client and report the finish time.
client.close()
print("Готово!", f"{datetime.now():%Y-%m-%d %H:%M}")

  Installing build dependencies ... [?25l[?25hdone
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mGetting requirements to build wheel[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m See above for output.
  
  [1;35mnote[0m: This error originates from a subprocess, and is likely not a problem with pip.
  Getting requirements to build wheel ... [?25l[?25herror
[1;31merror[0m: [1msubprocess-exited-with-error[0m

[31m×[0m [32mGetting requirements to build wheel[0m did not run successfully.
[31m│[0m exit code: [1;36m1[0m
[31m╰─>[0m See above for output.

[1;35mnote[0m: This error originates from a subprocess, and is likely not a problem with pip.
Collecting lightgbm
  Using cached lightgbm-4.6.0-py3-none-manylinux_2_28_x86_64.whl.metadata (17 kB)
Downloading lightgbm-4.6.0-py3-none-manylinux_2_28_x86_64.whl (3.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.6/3.6 MB[0m [31m78.5 MB/

INFO:distributed.http.proxy:To route to workers diagnostics web server please install jupyter-server-proxy: python -m pip install jupyter-server-proxy
INFO:distributed.scheduler:State start
INFO:distributed.scheduler:  Scheduler at:     tcp://127.0.0.1:38457
INFO:distributed.scheduler:  dashboard at:  http://127.0.0.1:8787/status
INFO:distributed.scheduler:Registering Worker plugin shuffle
INFO:distributed.nanny:        Start Nanny at: 'tcp://127.0.0.1:45975'


Начало обучения Dask-LightGBM baseline 2026-02-18 08:59
Запуск Dask Client...


INFO:distributed.nanny:        Start Nanny at: 'tcp://127.0.0.1:33803'
INFO:distributed.nanny:        Start Nanny at: 'tcp://127.0.0.1:39885'
INFO:distributed.nanny:        Start Nanny at: 'tcp://127.0.0.1:41883'
INFO:distributed.scheduler:Register worker addr: tcp://127.0.0.1:40283 name: 2
INFO:distributed.scheduler:Starting worker compute stream, tcp://127.0.0.1:40283
INFO:distributed.core:Starting established connection to tcp://127.0.0.1:60896
INFO:distributed.scheduler:Register worker addr: tcp://127.0.0.1:46625 name: 0
INFO:distributed.scheduler:Starting worker compute stream, tcp://127.0.0.1:46625
INFO:distributed.core:Starting established connection to tcp://127.0.0.1:60900
INFO:distributed.scheduler:Register worker addr: tcp://127.0.0.1:37503 name: 1
INFO:distributed.scheduler:Starting worker compute stream, tcp://127.0.0.1:37503
INFO:distributed.core:Starting established connection to tcp://127.0.0.1:60916
INFO:distributed.scheduler:Register worker addr: tcp://127.0.0.1:34827

http://127.0.0.1:8787/status
Считываем метаданные для разбиения...
Train parts: 2 | Valid: 1
Количество признаков: 39


INFO:distributed.core:Connection to tcp://127.0.0.1:60932 has been closed.
INFO:distributed.scheduler:Remove worker addr: tcp://127.0.0.1:34827 name: 3 (stimulus_id='handle-worker-cleanup-1771405227.5911317')
INFO:distributed.nanny:Worker process 3119 was killed by signal 15
INFO:distributed.scheduler:Register worker addr: tcp://127.0.0.1:36633 name: 3
INFO:distributed.scheduler:Starting worker compute stream, tcp://127.0.0.1:36633
INFO:distributed.core:Starting established connection to tcp://127.0.0.1:43072
INFO:distributed.core:Connection to tcp://127.0.0.1:60916 has been closed.
INFO:distributed.scheduler:Remove worker addr: tcp://127.0.0.1:37503 name: 1 (stimulus_id='handle-worker-cleanup-1771405291.3847587')
INFO:distributed.nanny:Worker process 3113 was killed by signal 15
INFO:distributed.scheduler:Register worker addr: tcp://127.0.0.1:35385 name: 1
INFO:distributed.scheduler:Starting worker compute stream, tcp://127.0.0.1:35385
INFO:distributed.core:Starting established connec