In [1]:
# @title Подключение к диску с данными
import os
from google.colab import drive
drive.mount('/content/drive')
!pip install polars
!pip install Dask
!pip install lightgbm

Mounted at /content/drive
Collecting polars
  Downloading polars-1.38.1-py3-none-any.whl.metadata (10 kB)
Collecting polars-runtime-32==1.38.1 (from polars)
  Downloading polars_runtime_32-1.38.1-cp310-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.5 kB)
Downloading polars-1.38.1-py3-none-any.whl (810 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m810.4/810.4 kB[0m [31m28.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading polars_runtime_32-1.38.1-cp310-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (45.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.8/45.8 MB[0m [31m79.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: polars-runtime-32, polars
Successfully installed polars-1.38.1 polars-runtime-32-1.38.1
Collecting Dask
  Downloading dask-2026.1.2-py3-none-any.whl.metadata (3.8 kB)
Collecting partd>=1.4.0 (from Dask)
  Downloading partd-1.4.2-py3-none-any.whl.metadata (4.6 kB)
Collecting locket (from 

In [None]:
# @title LightGBM baseline для Data Fusion Contest 2026 "Страж" с обучением по частям

# 3_train_baseline_chunked_fixed.py

import polars as pl
import lightgbm as lgb
import numpy as np
from sklearn.metrics import average_precision_score
import gc
import os
from datetime import datetime

# Start-of-run banner with a wall-clock timestamp.
print("Начало обучения LightGBM по частям (исправленная версия)", datetime.now().strftime("%Y-%m-%d %H:%M"))

# Parquet parts are read from Google Drive; trained models go to MODEL_PATH.
# NOTE(review): "./models/" is relative to the working directory — on Colab this
# is presumably lost when the runtime is recycled; confirm that is acceptable.
DATA_PATH = "/content/drive/MyDrive/ml-vtb-data-fusion-strazh/data/"
MODEL_PATH = "./models/"
os.makedirs(MODEL_PATH, exist_ok=True)

# ─── Model parameters ─────────────────────────────────────────────────────────

# N_ESTIMATORS is passed as num_boost_round for every training part.
N_ESTIMATORS      = 2500
LEARNING_RATE     = 0.035
MAX_DEPTH         = 9
NUM_LEAVES        = 120
FEATURE_FRACTION  = 0.75
BAGGING_FRACTION  = 0.80
BAGGING_FREQ      = 5
# Passed as scale_pos_weight; equals the 1 : 1378 imbalance printed by the
# class-balance cell of this notebook.
POS_WEIGHT        = 1378

# Early-stopping patience (rounds) and evaluation logging period (rounds).
EARLY_STOPPING    = 120
VERBOSE_EVAL      = 100

# ─── Categorical features ─────────────────────────────────────────────────────

# Columns that are integer-encoded and declared categorical to LightGBM.
cat_features_names = [
    "event_type_nm",
    "channel_indicator_type",
    "channel_indicator_sub_type",
    "currency_iso_cd",
    "mcc_code",
    "pos_cd",
    "accept_language",
    "browser_language",
    "timezone",
    "operating_system_type",
    "device_system_version",
    "screen_size",
    "event_desc",
    "battery"
]
# Columns never used as model inputs (identifiers, timestamps, the label).
# NOTE(review): session_id appears in the printed schema and is not excluded, so
# it is fed to the model as a numeric feature — confirm this is intended.
exclude_cols = ["customer_id", "event_id", "event_dttm", "date", "target"]
# ─── Part split ───────────────────────────────────────────────────────────────

# One part is held out for validation; the others are trained on sequentially.
valid_part = 3
train_parts = [1, 2]



# ─── Функция подготовки данных из одной части ────────────────────────────────

def load_and_prepare_part(part_num, is_train=True):
    """Read one parquet part and convert it to LightGBM-ready NumPy arrays.

    Args:
        part_num: Number substituted into ``train_features_part_{n}.parquet``.
        is_train: Only selects the label ("train" / "valid") shown in the log line.

    Returns:
        Tuple ``(X, y, feature_cols, cat_indices)``: the feature matrix, the
        float32 target vector (``None`` when the file has no ``target`` column),
        the feature names in matrix column order, and the positions of the
        categorical features within that order.
    """
    role = 'train' if is_train else 'valid'
    print(f"Загрузка части {part_num} ({role})...")

    frame = pl.read_parquet(f"{DATA_PATH}train_features_part_{part_num}.parquet")

    # Integer-encode each categorical column that is present, one column at a
    # time and in the declared order (string -> Categorical -> physical code).
    # NOTE(review): the codes come from Polars' categorical encoding of this
    # frame — verify they are stable across parts and across sessions.
    present = [name for name in cat_features_names if name in frame.columns]
    for name in present:
        frame = frame.with_columns(
            pl.col(name).cast(pl.Utf8).cast(pl.Categorical).to_physical()
        )

    feature_cols = [name for name in frame.columns if name not in exclude_cols]

    # LightGBM receives a plain matrix, so categoricals are identified by position.
    cat_indices = [
        pos for pos, name in enumerate(feature_cols) if name in cat_features_names
    ]

    X = frame.select(feature_cols).to_numpy()
    y = None
    if "target" in frame.columns:
        y = frame["target"].to_numpy().astype(np.float32)

    # Dump the schema after encoding as a sanity check.
    print("Проверка на типы данных:")
    for name in frame.columns:
        print(f"{name:25} → dtype: {frame[name].dtype}")

    # Release the frame before returning; only the NumPy copies are kept.
    del frame
    gc.collect()

    return X, y, feature_cols, cat_indices

# ─── Load the validation part ─────────────────────────────────────────────────

X_valid, y_valid, feature_cols, cat_indices_valid = load_and_prepare_part(valid_part, is_train=False)

print(f"Валидация: {X_valid.shape[0]:,} строк, {X_valid.shape[1]} признаков")
print(f"Категориальные индексы: {cat_indices_valid}")

# ─── Chunked training ─────────────────────────────────────────────────────────

params = {
    "objective": "binary",
    "metric": "average_precision",
    "learning_rate": LEARNING_RATE,
    "num_leaves": NUM_LEAVES,
    "max_depth": MAX_DEPTH,
    "feature_fraction": FEATURE_FRACTION,
    "bagging_fraction": BAGGING_FRACTION,
    "bagging_freq": BAGGING_FREQ,
    "scale_pos_weight": POS_WEIGHT,
    "verbosity": -1,
    "random_state": 1842,
    "n_jobs": -1,
}

booster = None
best_iteration = None
best_score = -np.inf

for part_idx, part in enumerate(train_parts, 1):
    print(f"\nОбучение на части {part} ({part_idx}/{len(train_parts)})")

    X_train_part, y_train_part, _, cat_indices_train = load_and_prepare_part(part, is_train=True)

    print(f"  → часть {part}: {X_train_part.shape[0]:,} строк")

    train_data = lgb.Dataset(
        X_train_part,
        label=y_train_part,
        categorical_feature=cat_indices_train,  # positional indices, not column names
        free_raw_data=False
    )

    # Rebuilt for every part because it has to reference the current training Dataset.
    valid_data = lgb.Dataset(
        X_valid,
        label=y_valid,
        categorical_feature=cat_indices_valid,
        reference=train_data,
        free_raw_data=False
    )

    # init_model=None (first part) trains from scratch; for later parts boosting
    # continues from the booster produced by the previous part.
    booster = lgb.train(
        params,
        train_data,
        num_boost_round=N_ESTIMATORS,
        init_model=booster,
        valid_sets=[valid_data],
        callbacks=[
            lgb.early_stopping(stopping_rounds=EARLY_STOPPING, verbose=True),
            lgb.log_evaluation(VERBOSE_EVAL),
        ]
    )

    # Track the iteration count with the best validation score across all parts.
    current_score = booster.best_score['valid_0']['average_precision']
    if current_score > best_score:
        best_score = current_score
        best_iteration = booster.best_iteration

    # Free this part's arrays and Datasets before the next part is loaded.
    del X_train_part, y_train_part, train_data, valid_data
    gc.collect()

# ─── Final evaluation and saving ──────────────────────────────────────────────

print("\nФинальное предсказание на валидации...")
preds_valid = booster.predict(X_valid, num_iteration=best_iteration)

pr_auc = average_precision_score(y_valid, preds_valid)
print(f"PR-AUC на валидации: {pr_auc:.5f}")

model_file = f"{MODEL_PATH}lgb_chunked_fixed_train_{'-'.join(map(str, train_parts))}_PR{pr_auc:.4f}.txt"
# Persist exactly the trees that were scored above. Without num_iteration the
# file is cut at the last run's booster.best_iteration, which differs from the
# tracked best_iteration whenever the last part did not improve the score.
booster.save_model(model_file, num_iteration=best_iteration)
print(f"Модель сохранена: {model_file}")

print("Готово!")

Начало обучения LightGBM по частям (исправленная версия) 2026-02-18 17:08
Загрузка части 3 (valid)...
Проверка на типы данных:
customer_id               → dtype: Int64
event_id                  → dtype: Int64
event_dttm                → dtype: Datetime(time_unit='us', time_zone=None)
event_type_nm             → dtype: UInt32
event_desc                → dtype: UInt32
channel_indicator_type    → dtype: UInt32
channel_indicator_sub_type → dtype: UInt32
operaton_amt              → dtype: Float64
currency_iso_cd           → dtype: UInt32
mcc_code                  → dtype: UInt32
pos_cd                    → dtype: UInt32
accept_language           → dtype: UInt32
browser_language          → dtype: UInt32
timezone                  → dtype: UInt32
session_id                → dtype: Int64
operating_system_type     → dtype: UInt32
battery                   → dtype: UInt32
device_system_version     → dtype: UInt32
screen_size               → dtype: UInt32
developer_tools           → dtype: Int32
p

In [None]:
import polars as pl

# Count positives and negatives over all parts to derive scale_pos_weight.
total_positive = 0
total_negative = 0

for i in [1, 2, 3]:
    # Only the label is needed, so read that single column instead of the whole
    # part; the small Series needs no explicit garbage collection afterwards.
    target = pl.read_parquet(
        f"{DATA_PATH}train_features_part_{i}.parquet", columns=["target"]
    )["target"]
    pos = target.sum()
    total_positive += pos
    total_negative += len(target) - pos

# Negatives per positive; falls back to 1 when there are no positives at all.
ratio = total_negative / total_positive if total_positive > 0 else 1
print(f"Реальный дисбаланс: 1 : {ratio:.0f}")
print(f"Рекомендуемый scale_pos_weight: {ratio:.0f} или {ratio * 0.8:.0f} – {ratio * 1.2:.0f}")

Реальный дисбаланс: 1 : 1378
Рекомендуемый scale_pos_weight: 1378 или 1103 – 1654


In [None]:
# @title LightGBM baseline для Data Fusion Contest 2026 "Страж" с обучением по частям 2 часть

# 3_train_baseline_chunked_fixed.py

import polars as pl
import lightgbm as lgb
import numpy as np
from sklearn.metrics import average_precision_score
import gc
import os
from datetime import datetime

# Start-of-run banner with a wall-clock timestamp.
print("Начало обучения LightGBM по частям (исправленная версия)", datetime.now().strftime("%Y-%m-%d %H:%M"))

# Parquet parts are read from Google Drive; trained models go to MODEL_PATH.
# NOTE(review): "./models/" is relative to the working directory — on Colab this
# is presumably lost when the runtime is recycled; confirm that is acceptable.
DATA_PATH = "/content/drive/MyDrive/ml-vtb-data-fusion-strazh/data/"
MODEL_PATH = "./models/"
os.makedirs(MODEL_PATH, exist_ok=True)

# ─── Model parameters ─────────────────────────────────────────────────────────

# N_ESTIMATORS is passed as num_boost_round for every training part.
N_ESTIMATORS      = 2500
LEARNING_RATE     = 0.035
MAX_DEPTH         = 9
NUM_LEAVES        = 120
FEATURE_FRACTION  = 0.75
BAGGING_FRACTION  = 0.80
BAGGING_FREQ      = 5
# Passed as scale_pos_weight; equals the 1 : 1378 imbalance printed by the
# class-balance cell of this notebook.
POS_WEIGHT        = 1378

# Early-stopping patience (rounds) and evaluation logging period (rounds).
EARLY_STOPPING    = 120
VERBOSE_EVAL      = 100

# ─── Categorical features ─────────────────────────────────────────────────────

# Columns that are integer-encoded and declared categorical to LightGBM.
cat_features_names = [
    "event_type_nm",
    "channel_indicator_type",
    "channel_indicator_sub_type",
    "currency_iso_cd",
    "mcc_code",
    "pos_cd",
    "accept_language",
    "browser_language",
    "timezone",
    "operating_system_type",
    "device_system_version",
    "screen_size",
    "event_desc",
    "battery"
]
# Columns never used as model inputs (identifiers, timestamps, the label).
# NOTE(review): session_id appears in the printed schema and is not excluded, so
# it is fed to the model as a numeric feature — confirm this is intended.
exclude_cols = ["customer_id", "event_id", "event_dttm", "date", "target"]
# ─── Part split ───────────────────────────────────────────────────────────────

# One part is held out for validation; the others are trained on sequentially.
valid_part = 3
train_parts = [1, 2]



# ─── Функция подготовки данных из одной части ────────────────────────────────

def load_and_prepare_part(part_num, is_train=True):
    """Read one parquet part and convert it to LightGBM-ready NumPy arrays.

    Args:
        part_num: Number substituted into ``train_features_part_{n}.parquet``.
        is_train: Only selects the label ("train" / "valid") shown in the log line.

    Returns:
        Tuple ``(X, y, feature_cols, cat_indices)``: the feature matrix, the
        float32 target vector (``None`` when the file has no ``target`` column),
        the feature names in matrix column order, and the positions of the
        categorical features within that order.
    """
    role = 'train' if is_train else 'valid'
    print(f"Загрузка части {part_num} ({role})...")

    frame = pl.read_parquet(f"{DATA_PATH}train_features_part_{part_num}.parquet")

    # Integer-encode each categorical column that is present, one column at a
    # time and in the declared order (string -> Categorical -> physical code).
    # NOTE(review): the codes come from Polars' categorical encoding of this
    # frame — verify they are stable across parts and across sessions.
    present = [name for name in cat_features_names if name in frame.columns]
    for name in present:
        frame = frame.with_columns(
            pl.col(name).cast(pl.Utf8).cast(pl.Categorical).to_physical()
        )

    feature_cols = [name for name in frame.columns if name not in exclude_cols]

    # LightGBM receives a plain matrix, so categoricals are identified by position.
    cat_indices = [
        pos for pos, name in enumerate(feature_cols) if name in cat_features_names
    ]

    X = frame.select(feature_cols).to_numpy()
    y = None
    if "target" in frame.columns:
        y = frame["target"].to_numpy().astype(np.float32)

    # Dump the schema after encoding as a sanity check.
    print("Проверка на типы данных:")
    for name in frame.columns:
        print(f"{name:25} → dtype: {frame[name].dtype}")

    # Release the frame before returning; only the NumPy copies are kept.
    del frame
    gc.collect()

    return X, y, feature_cols, cat_indices

# ─── Load the validation part ─────────────────────────────────────────────────

X_valid, y_valid, feature_cols, cat_indices_valid = load_and_prepare_part(valid_part, is_train=False)

print(f"Валидация: {X_valid.shape[0]:,} строк, {X_valid.shape[1]} признаков")
print(f"Категориальные индексы: {cat_indices_valid}")

# ─── Chunked training ─────────────────────────────────────────────────────────

# The previous version of this dict listed "objective" twice and carried
# "focal_loss_alpha" / "focal_loss_gamma". LightGBM has no such parameters, so
# they were ignored (silently, because verbosity is -1) and the model was never
# trained with focal loss. Focal loss needs a custom objective callable.
params = {
    "objective": "binary",
    "boosting_type": "gbdt",
    "metric": "average_precision",
    "learning_rate": LEARNING_RATE,
    "num_leaves": NUM_LEAVES,
    "max_depth": MAX_DEPTH,
    "feature_fraction": FEATURE_FRACTION,
    "bagging_fraction": BAGGING_FRACTION,
    "bagging_freq": BAGGING_FREQ,
    # Class imbalance is handled by scale_pos_weight alone; is_unbalance is kept
    # off because the two options are alternatives, not complements.
    "scale_pos_weight": POS_WEIGHT,
    "is_unbalance": False,
    "verbosity": -1,
    "random_state": 1842,
    "n_jobs": -1,
}

booster = None
best_iteration = None
best_score = -np.inf

for part_idx, part in enumerate(train_parts, 1):
    print(f"\nОбучение на части {part} ({part_idx}/{len(train_parts)})")

    X_train_part, y_train_part, _, cat_indices_train = load_and_prepare_part(part, is_train=True)

    print(f"  → часть {part}: {X_train_part.shape[0]:,} строк")

    train_data = lgb.Dataset(
        X_train_part,
        label=y_train_part,
        categorical_feature=cat_indices_train,  # positional indices, not column names
        free_raw_data=False
    )

    # Rebuilt for every part because it has to reference the current training Dataset.
    valid_data = lgb.Dataset(
        X_valid,
        label=y_valid,
        categorical_feature=cat_indices_valid,
        reference=train_data,
        free_raw_data=False
    )

    # init_model=None (first part) trains from scratch; for later parts boosting
    # continues from the booster produced by the previous part.
    booster = lgb.train(
        params,
        train_data,
        num_boost_round=N_ESTIMATORS,
        init_model=booster,
        valid_sets=[valid_data],
        callbacks=[
            lgb.early_stopping(stopping_rounds=EARLY_STOPPING, verbose=True),
            lgb.log_evaluation(VERBOSE_EVAL),
        ]
    )

    # Track the iteration count with the best validation score across all parts.
    current_score = booster.best_score['valid_0']['average_precision']
    if current_score > best_score:
        best_score = current_score
        best_iteration = booster.best_iteration

    # Free this part's arrays and Datasets before the next part is loaded.
    del X_train_part, y_train_part, train_data, valid_data
    gc.collect()

# ─── Final evaluation and saving ──────────────────────────────────────────────

print("\nФинальное предсказание на валидации...")
preds_valid = booster.predict(X_valid, num_iteration=best_iteration)

pr_auc = average_precision_score(y_valid, preds_valid)
print(f"PR-AUC на валидации: {pr_auc:.5f}")

model_file = f"{MODEL_PATH}lgb_chunked_fixed_train_{'-'.join(map(str, train_parts))}_PR{pr_auc:.4f}.txt"
# Persist exactly the trees that were scored above. Without num_iteration the
# file is cut at the last run's booster.best_iteration, which differs from the
# tracked best_iteration whenever the last part did not improve the score.
booster.save_model(model_file, num_iteration=best_iteration)
print(f"Модель сохранена: {model_file}")

print("Готово!")

Начало обучения LightGBM по частям (исправленная версия) 2026-02-18 17:44
Загрузка части 3 (valid)...
Проверка на типы данных:
customer_id               → dtype: Int64
event_id                  → dtype: Int64
event_dttm                → dtype: Datetime(time_unit='us', time_zone=None)
event_type_nm             → dtype: UInt32
event_desc                → dtype: UInt32
channel_indicator_type    → dtype: UInt32
channel_indicator_sub_type → dtype: UInt32
operaton_amt              → dtype: Float64
currency_iso_cd           → dtype: UInt32
mcc_code                  → dtype: UInt32
pos_cd                    → dtype: UInt32
accept_language           → dtype: UInt32
browser_language          → dtype: UInt32
timezone                  → dtype: UInt32
session_id                → dtype: Int64
operating_system_type     → dtype: UInt32
battery                   → dtype: UInt32
device_system_version     → dtype: UInt32
screen_size               → dtype: UInt32
developer_tools           → dtype: Int32
p

In [None]:
# @title LightGBM baseline для Data Fusion Contest 2026 "Страж" с обучением по частям 3 часть

# 3_train_baseline_chunked_fixed.py

import polars as pl
import lightgbm as lgb
import numpy as np
from sklearn.metrics import average_precision_score
import gc
import os
from datetime import datetime

# Start-of-run banner with a wall-clock timestamp.
print("Начало обучения LightGBM по частям (исправленная версия)", datetime.now().strftime("%Y-%m-%d %H:%M"))

# Both the parquet parts and the trained models live on Google Drive.
DATA_PATH = "/content/drive/MyDrive/ml-vtb-data-fusion-strazh/data/"
MODEL_PATH = "/content/drive/MyDrive/ml-vtb-data-fusion-strazh/models/"
os.makedirs(MODEL_PATH, exist_ok=True)

# ─── Model parameters ─────────────────────────────────────────────────────────

# N_ESTIMATORS is passed as num_boost_round for every training part.
N_ESTIMATORS      = 2500
LEARNING_RATE     = 0.035
MAX_DEPTH         = 9
NUM_LEAVES        = 120
FEATURE_FRACTION  = 0.75
BAGGING_FRACTION  = 0.80
BAGGING_FREQ      = 5
# NOTE(review): POS_WEIGHT is not referenced by the params dict of this cell,
# which sets is_unbalance instead — confirm whether it can be dropped.
POS_WEIGHT        = 1654

# Early-stopping patience (rounds) and evaluation logging period (rounds).
EARLY_STOPPING    = 120
VERBOSE_EVAL      = 100

# ─── Categorical features ─────────────────────────────────────────────────────

# Columns that are integer-encoded and declared categorical to LightGBM.
cat_features_names = [
    "event_type_nm",
    "channel_indicator_type",
    "channel_indicator_sub_type",
    "currency_iso_cd",
    "mcc_code",
    "pos_cd",
    "accept_language",
    "browser_language",
    "timezone",
    "operating_system_type",
    "device_system_version",
    "screen_size",
    "event_desc",
    "battery"
]
# Columns never used as model inputs (identifiers, timestamps, the label).
exclude_cols = ["customer_id", "event_id", "event_dttm", "date", "target"]
# ─── Part split ───────────────────────────────────────────────────────────────

# One part is held out for validation; the others are trained on sequentially.
valid_part = 3
train_parts = [1, 2]



# ─── Функция подготовки данных из одной части ────────────────────────────────
# def focal_loss(alpha=0.25, gamma=2.0):
#     def focal_loss_lgb(preds, dtrain):
#         labels = dtrain.get_label()
#         q = np.where(labels == 1, alpha, (1 - alpha))
#         p = 1. / (1. + np.exp(-preds))
#         grad = q * (labels - p) * (np.abs(labels - p) ** gamma)
#         hess = q * p * (1 - p) * (np.abs(labels - p) ** gamma)
#         return 'focal_loss', np.mean(grad), False
#     return focal_loss_lgb


def load_and_prepare_part(part_num, is_train=True):
    """Read one v2 parquet part and convert it to LightGBM-ready NumPy arrays.

    Args:
        part_num: Number substituted into ``train_features_v2_part_{n}.parquet``.
        is_train: Only selects the label ("train" / "valid") shown in the log line.

    Returns:
        Tuple ``(X, y, feature_cols, cat_indices)``: the feature matrix, the
        float32 target vector (``None`` when the file has no ``target`` column),
        the feature names in matrix column order, and the positions of the
        categorical features within that order.
    """
    role = 'train' if is_train else 'valid'
    print(f"Загрузка части {part_num} ({role})...")

    frame = pl.read_parquet(f"{DATA_PATH}train_features_v2_part_{part_num}.parquet")

    # Integer-encode each categorical column that is present, one column at a
    # time and in the declared order (string -> Categorical -> physical code).
    # NOTE(review): the codes come from Polars' categorical encoding of this
    # frame — verify they are stable across parts and across sessions.
    present = [name for name in cat_features_names if name in frame.columns]
    for name in present:
        frame = frame.with_columns(
            pl.col(name).cast(pl.Utf8).cast(pl.Categorical).to_physical()
        )

    feature_cols = [name for name in frame.columns if name not in exclude_cols]

    # LightGBM receives a plain matrix, so categoricals are identified by position.
    cat_indices = [
        pos for pos, name in enumerate(feature_cols) if name in cat_features_names
    ]

    X = frame.select(feature_cols).to_numpy()
    y = None
    if "target" in frame.columns:
        y = frame["target"].to_numpy().astype(np.float32)

    # Only the header of the schema dump is printed; the per-column listing is disabled.
    print("Проверка на типы данных:")

    # Release the frame before returning; only the NumPy copies are kept.
    del frame
    gc.collect()

    return X, y, feature_cols, cat_indices

# ─── Load the validation part ─────────────────────────────────────────────────

X_valid, y_valid, feature_cols, cat_indices_valid = load_and_prepare_part(valid_part, is_train=False)

print(f"Валидация: {X_valid.shape[0]:,} строк, {X_valid.shape[1]} признаков")
print(f"Категориальные индексы: {cat_indices_valid}")

# ─── Chunked training ─────────────────────────────────────────────────────────

params = {
    "objective": "binary",
    "metric": "average_precision",
    "learning_rate": LEARNING_RATE,
    "num_leaves": NUM_LEAVES,
    "max_depth": MAX_DEPTH,
    "feature_fraction": FEATURE_FRACTION,
    "bagging_fraction": BAGGING_FRACTION,
    "bagging_freq": BAGGING_FREQ,
    "verbosity": -1,
    "random_state": 1842,
    "n_jobs": -1,
    "is_unbalance": True,    # let LightGBM derive the class weights itself
    "boosting_type": "gbdt",
}

booster = None
best_iteration = None
best_score = -np.inf

for part_idx, part in enumerate(train_parts, 1):
    print(f"\nОбучение на части {part} ({part_idx}/{len(train_parts)})")

    X_train_part, y_train_part, _, cat_indices_train = load_and_prepare_part(part, is_train=True)

    print(f"  → часть {part}: {X_train_part.shape[0]:,} строк")

    train_data = lgb.Dataset(
        X_train_part,
        label=y_train_part,
        categorical_feature=cat_indices_train,  # positional indices, not column names
        free_raw_data=False
    )

    # Rebuilt for every part because it has to reference the current training Dataset.
    valid_data = lgb.Dataset(
        X_valid,
        label=y_valid,
        categorical_feature=cat_indices_valid,
        reference=train_data,
        free_raw_data=False
    )

    # init_model=None (first part) trains from scratch; for later parts boosting
    # continues from the booster produced by the previous part.
    booster = lgb.train(
        params,
        train_data,
        num_boost_round=N_ESTIMATORS,
        init_model=booster,
        valid_sets=[valid_data],
        callbacks=[
            lgb.early_stopping(stopping_rounds=EARLY_STOPPING, verbose=True),
            lgb.log_evaluation(VERBOSE_EVAL),
        ]
    )

    # Track the iteration count with the best validation score across all parts.
    current_score = booster.best_score['valid_0']['average_precision']
    if current_score > best_score:
        best_score = current_score
        best_iteration = booster.best_iteration

    # Free this part's arrays and Datasets before the next part is loaded.
    del X_train_part, y_train_part, train_data, valid_data
    gc.collect()

# ─── Final evaluation and saving ──────────────────────────────────────────────

# The model is saved only after the metric is computed: the file name embeds
# pr_auc, which does not exist (or is stale from another cell) before this point.
print("\nФинальное предсказание на валидации...")
preds_valid = booster.predict(X_valid, num_iteration=best_iteration)

pr_auc = average_precision_score(y_valid, preds_valid)
print(f"PR-AUC на валидации: {pr_auc:.5f}")

model_file = f"{MODEL_PATH}lgb_chunked_fixed_train_{'-'.join(map(str, train_parts))}_PR{pr_auc:.4f}.txt"
# Persist exactly the trees that were scored above. Without num_iteration the
# file is cut at the last run's booster.best_iteration, which differs from the
# tracked best_iteration whenever the last part did not improve the score.
booster.save_model(model_file, num_iteration=best_iteration)
print(f"Модель сохранена: {model_file}")

print("Готово!")

Начало обучения LightGBM по частям (исправленная версия) 2026-02-19 09:59
Загрузка части 3 (valid)...
Проверка на типы данных:
Валидация: 49,789,719 строк, 39 признаков
Категориальные индексы: [0, 1, 2, 3, 5, 6, 7, 8, 9, 10, 12, 13, 14, 15]

Обучение на части 1 (1/2)
Загрузка части 1 (train)...
Проверка на типы данных:
  → часть 1: 50,125,118 строк
Training until validation scores don't improve for 120 rounds


In [None]:
import polars as pl
import lightgbm as lgb
import numpy as np
from sklearn.metrics import average_precision_score
import gc
import os
from datetime import datetime

# Start-of-run banner with a wall-clock timestamp.
print("Начало обучения LightGBM по частям (исправленная версия)", datetime.now().strftime("%Y-%m-%d %H:%M"))

# Both the parquet parts and the trained models live on Google Drive.
DATA_PATH = "/content/drive/MyDrive/ml-vtb-data-fusion-strazh/data/"
MODEL_PATH = "/content/drive/MyDrive/ml-vtb-data-fusion-strazh/models/"
os.makedirs(MODEL_PATH, exist_ok=True)

# ─── Model parameters ─────────────────────────────────────────────────────────

# N_ESTIMATORS is passed as num_boost_round for every training part.
N_ESTIMATORS      = 2500
LEARNING_RATE     = 0.035
MAX_DEPTH         = 9
NUM_LEAVES        = 120
FEATURE_FRACTION  = 0.75
BAGGING_FRACTION  = 0.80
BAGGING_FREQ      = 5
# POS_WEIGHT is removed as we are using focal loss

# Early-stopping patience (rounds) and evaluation logging period (rounds).
EARLY_STOPPING    = 120
VERBOSE_EVAL      = 100

# ─── Categorical features ─────────────────────────────────────────────────────

# Columns that are integer-encoded and declared categorical to LightGBM.
cat_features_names = [
    "event_type_nm",
    "channel_indicator_type",
    "channel_indicator_sub_type",
    "currency_iso_cd",
    "mcc_code",
    "pos_cd",
    "accept_language",
    "browser_language",
    "timezone",
    "operating_system_type",
    "device_system_version",
    "screen_size",
    "event_desc",
    "battery"
]
# Columns never used as model inputs (identifiers, timestamps, the label).
exclude_cols = ["customer_id", "event_id", "event_dttm", "date", "target"]
# ─── Part split ───────────────────────────────────────────────────────────────

# Unlike the earlier cells, part 1 is held out here and parts 3 and 2 are
# trained on, in that order.
valid_part = 1
train_parts = [3, 2]


# ─── Функция Focal Loss для LightGBM ────────────────────────────────────────
# Reference: https://www.kaggle.com/code/rejpalcz/focal-loss-for-lightgbm/notebook
def focal_loss(alpha=0.25, gamma=2.0):
    """Build a focal-loss-style custom objective for ``lgb.train``.

    Args:
        alpha: Weight of positive samples; negatives are weighted ``1 - alpha``.
        gamma: Focusing exponent; larger values shrink the contribution of
            samples that are already classified confidently.

    Returns:
        A callable ``(y_pred, dtrain) -> (grad, hess)``. ``y_pred`` holds raw
        scores (logits) and ``dtrain`` must expose ``get_label()``.

    Note:
        The gradient is that of cross-entropy weighted by the focal modulating
        factor, with the factor treated as a constant: the ``gamma * log(.)``
        term of the exact focal-loss derivative is left out. The Hessian is a
        non-negative approximation rather than the exact second derivative.
    """
    def _focal_loss_objective(y_pred, dtrain):
        y_true = dtrain.get_label()
        z = np.asarray(y_pred, dtype=np.float64)

        # Overflow-free sigmoid: exp() only ever receives non-positive values,
        # so very negative scores no longer trigger an overflow.
        e = np.exp(-np.abs(z))
        p = np.where(z >= 0, 1.0 / (1.0 + e), e / (1.0 + e))
        q = 1.0 - p

        # Gradient w.r.t. the raw score.
        grad = np.where(y_true == 1,
                        alpha * q**gamma * (p - 1),
                        (1 - alpha) * p**gamma * p)

        # Same Hessian approximation as before, algebraically factored so that no
        # (gamma - 1) exponent appears. The unfactored form evaluated 0**(negative)
        # for gamma < 1 once the sigmoid saturated, producing inf * 0 = NaN.
        hess = np.where(y_true == 1,
                        alpha * q**gamma * p * (gamma + q),
                        (1 - alpha) * p**gamma * q * (gamma + p))

        return grad, hess
    return _focal_loss_objective


def load_and_prepare_part(part_num, is_train=True):
    """Read one v2 parquet part and convert it to LightGBM-ready NumPy arrays.

    Args:
        part_num: Number substituted into ``train_features_v2_part_{n}.parquet``.
        is_train: Only selects the label ("train" / "valid") shown in the log line.

    Returns:
        Tuple ``(X, y, feature_cols, cat_indices)``: the feature matrix, the
        float32 target vector (``None`` when the file has no ``target`` column),
        the feature names in matrix column order, and the positions of the
        categorical features within that order.
    """
    role = 'train' if is_train else 'valid'
    print(f"Загрузка части {part_num} ({role})...")

    frame = pl.read_parquet(f"{DATA_PATH}train_features_v2_part_{part_num}.parquet")

    # Integer-encode each categorical column that is present, one column at a
    # time and in the declared order (string -> Categorical -> physical code).
    # NOTE(review): the codes come from Polars' categorical encoding of this
    # frame — verify they are stable across parts and across sessions.
    present = [name for name in cat_features_names if name in frame.columns]
    for name in present:
        frame = frame.with_columns(
            pl.col(name).cast(pl.Utf8).cast(pl.Categorical).to_physical()
        )

    feature_cols = [name for name in frame.columns if name not in exclude_cols]

    # LightGBM receives a plain matrix, so categoricals are identified by position.
    cat_indices = [
        pos for pos, name in enumerate(feature_cols) if name in cat_features_names
    ]

    X = frame.select(feature_cols).to_numpy()
    y = None
    if "target" in frame.columns:
        y = frame["target"].to_numpy().astype(np.float32)

    # Only the header of the schema dump is printed; the per-column listing is disabled.
    print("Проверка на типы данных:")

    # Release the frame before returning; only the NumPy copies are kept.
    del frame
    gc.collect()

    return X, y, feature_cols, cat_indices

# ─── Load the validation part ─────────────────────────────────────────────────

X_valid, y_valid, feature_cols, cat_indices_valid = load_and_prepare_part(valid_part, is_train=False)

print(f"Валидация: {X_valid.shape[0]:,} строк, {X_valid.shape[1]} признаков")
print(f"Категориальные индексы: {cat_indices_valid}")

# ─── Chunked training ─────────────────────────────────────────────────────────

params = {
    "objective": focal_loss(alpha=0.25, gamma=2.0), # Use custom focal loss
    "metric": "average_precision",
    "learning_rate": LEARNING_RATE,
    "num_leaves": NUM_LEAVES,
    "max_depth": MAX_DEPTH,
    "feature_fraction": FEATURE_FRACTION,
    "bagging_fraction": BAGGING_FRACTION,
    "bagging_freq": BAGGING_FREQ,
    # scale_pos_weight / is_unbalance are intentionally absent: class imbalance
    # is left to the focal-loss objective.
    "verbosity": -1,
    "random_state": 1842,
    "n_jobs": -1,
    "boosting_type": "gbdt",
}

booster = None
best_iteration = None
best_score = -np.inf

for part_idx, part in enumerate(train_parts, 1):
    print(f"\nОбучение на части {part} ({part_idx}/{len(train_parts)})")

    X_train_part, y_train_part, _, cat_indices_train = load_and_prepare_part(part, is_train=True)

    print(f"  → часть {part}: {X_train_part.shape[0]:,} строк")

    train_data = lgb.Dataset(
        X_train_part,
        label=y_train_part,
        categorical_feature=cat_indices_train,  # positional indices, not column names
        free_raw_data=False
    )

    # Rebuilt for every part because it has to reference the current training Dataset.
    valid_data = lgb.Dataset(
        X_valid,
        label=y_valid,
        categorical_feature=cat_indices_valid,
        reference=train_data,
        free_raw_data=False
    )

    # init_model=None (first part) trains from scratch; for later parts boosting
    # continues from the booster produced by the previous part.
    booster = lgb.train(
        params,
        train_data,
        num_boost_round=N_ESTIMATORS,
        init_model=booster,
        valid_sets=[valid_data],
        callbacks=[
            lgb.early_stopping(stopping_rounds=EARLY_STOPPING, verbose=True),
            lgb.log_evaluation(VERBOSE_EVAL),
        ]
    )

    # Track the iteration count with the best validation score across all parts.
    current_score = booster.best_score['valid_0']['average_precision']
    if current_score > best_score:
        best_score = current_score
        best_iteration = booster.best_iteration

    # Free this part's arrays and Datasets before the next part is loaded.
    del X_train_part, y_train_part, train_data, valid_data
    gc.collect()

# ─── Final evaluation and saving ──────────────────────────────────────────────

print("\nФинальное предсказание на валидации...")
# NOTE(review): with a custom objective the booster presumably returns raw scores
# rather than probabilities; average precision is rank-based, so the metric is
# unaffected, but confirm before using these values as probabilities.
preds_valid = booster.predict(X_valid, num_iteration=best_iteration)

pr_auc = average_precision_score(y_valid, preds_valid)
print(f"PR-AUC на валидации: {pr_auc:.5f}")

model_file = f"{MODEL_PATH}lgb_chunked_fixed_train_{'-'.join(map(str, train_parts))}_PR{pr_auc:.4f}.txt"
# Persist exactly the trees that were scored above. Without num_iteration the
# file is cut at the last run's booster.best_iteration, which differs from the
# tracked best_iteration whenever the last part did not improve the score.
booster.save_model(model_file, num_iteration=best_iteration)
print(f"Модель сохранена: {model_file}")

print("Готово!")

In [None]:
# 3_train_baseline_dask.py
# @title Dask-LightGBM baseline для Data Fusion Contest 2026 "Страж" с обучением по частям (для OOM)

!pip install --quiet dask[complete] lightgbm dask-ml dask-lightgbm
!pip install --quiet "dask[distributed]"
!pip install --quiet lightgbm

import polars as pl
import dask.dataframe as dd
import dask.array as da
from dask.distributed import Client
import lightgbm as lgb
from lightgbm.dask import DaskLGBMClassifier
from sklearn.metrics import average_precision_score
import numpy as np
import gc
import os
from datetime import datetime

# Announce the start of the run together with a wall-clock timestamp.
print("Начало обучения Dask-LightGBM baseline", datetime.now().strftime("%Y-%m-%d %H:%M"))

# Feature files are read from DATA_PATH; trained models are written to MODEL_PATH.
DATA_PATH = "/content/drive/MyDrive/ml-vtb-data-fusion-strazh/data/"
MODEL_PATH = "./models/"
os.makedirs(MODEL_PATH, exist_ok=True)  # create the output folder if it is missing

# ─── Dask settings ────────────────────────────────────────────────────────────

N_WORKERS = 4                  # tune to the available CPU (2–4 on an i3)
THREADS_PER_WORKER = 2         # threads per worker, used by LightGBM
MEMORY_LIMIT = 'auto'          # or e.g. '8GB' per worker if memory must be capped

# ─── Model hyper-parameters ───────────────────────────────────────────────────

N_ESTIMATORS      = 2500
LEARNING_RATE     = 0.035
MAX_DEPTH         = 9
NUM_LEAVES        = 120
FEATURE_FRACTION  = 0.75
BAGGING_FRACTION  = 0.80
BAGGING_FREQ      = 5
POS_WEIGHT        = 350          # tune to the class ratio (~1 : 1500–4000)

EARLY_STOPPING    = 120          # passed as `stopping_rounds` to lgb.early_stopping
VERBOSE_EVAL      = 100          # passed to lgb.log_evaluation

# ─── Categorical features ─────────────────────────────────────────────────────

# Columns to be treated as categorical; only those actually present among the
# feature columns are converted later on.
cat_features = [
    "event_type_nm",
    "channel_indicator_type",
    "channel_indicator_sub_type",
    "currency_iso_cd",
    "mcc_code",
    "pos_cd",
    "accept_language",
    "browser_language",
    "timezone",
    "operating_system_type",
    "device_system_version",
    "screen_size",
    # add any further categorical columns here
]

# ─── Dask Client ──────────────────────────────────────────────────────────────

# Bring up the Dask client with the worker settings configured earlier and
# print the dashboard URL so the run can be monitored.
print("Запуск Dask Client...")
client = Client(
    processes=True,                         # process-based workers, chosen for stability
    memory_limit=MEMORY_LIMIT,
    threads_per_worker=THREADS_PER_WORKER,
    n_workers=N_WORKERS,
)
print(client.dashboard_link)

# ─── Loading the data parts with Dask ─────────────────────────────────────────

print("Считываем метаданные для разбиения...")

# The last part is held out for validation (roughly the final 1–2 months)
train_parts = [1, 2]   # use [1] for a quick test, then add 2
valid_part = 3

# Every part is stored as its own parquet file under DATA_PATH.
valid_file = f"{DATA_PATH}train_features_part_{valid_part}.parquet"
train_files = [
    f"{DATA_PATH}train_features_part_{part_no}.parquet"
    for part_no in train_parts
]

# Dask DataFrames: the reads are lazy.
ddf_valid = dd.read_parquet(valid_file, engine="pyarrow")
ddf_train = dd.read_parquet(train_files, engine="pyarrow")

print(f"Train parts: {len(train_files)} | Valid: 1")

# ─── Feature / target preparation ─────────────────────────────────────────────

# Service columns and the target are never used as model inputs.
exclude_cols = [
    "customer_id", "event_id", "event_dttm", "date",
    "target",                     # the target is handled separately
    # add other service columns here if there are any
]

feature_cols = [c for c in ddf_train.columns if c not in exclude_cols]

print(f"Количество признаков: {len(feature_cols)}")

# Categorical features.
# A plain `.astype("category")` on a Dask DataFrame produces "unknown"
# categoricals whose category sets are inferred separately per partition.
# `categorize()` scans the training data once and fixes a single category list
# per column; the validation frame is then cast to exactly the same dtypes so
# both frames share one encoding (validation values never seen in training
# become missing values).
present_cat_features = [col for col in cat_features if col in feature_cols]
if present_cat_features:
    ddf_train = ddf_train.categorize(columns=present_cat_features, index=False)
    ddf_valid = ddf_valid.astype(
        {col: ddf_train[col].dtype for col in present_cat_features}
    )

# Keep X / y as a Dask DataFrame / Series (lightgbm.dask accepts both).
# Converting them with `.to_dask_array()` would throw away the categorical dtype
# prepared above, so the model would never see these columns as categorical.
X_train = ddf_train[feature_cols]
y_train = ddf_train["target"]
X_train, y_train = client.persist([X_train, y_train])

X_valid = ddf_valid[feature_cols]
y_valid = ddf_valid["target"]
X_valid, y_valid = client.persist([X_valid, y_valid])
# ─── Dask-LightGBM ────────────────────────────────────────────────────────────

print("Запуск обучения Dask-LightGBM...")

# Distributed LightGBM binary classifier built from the hyper-parameter constants.
model = DaskLGBMClassifier(
    objective="binary",
    metric="average_precision",          # PR-AUC
    # N_ESTIMATORS was defined but never passed, so training silently used the
    # estimator's default of 100 boosting rounds instead of the configured value.
    n_estimators=N_ESTIMATORS,
    learning_rate=LEARNING_RATE,
    num_leaves=NUM_LEAVES,
    max_depth=MAX_DEPTH,
    feature_fraction=FEATURE_FRACTION,
    bagging_fraction=BAGGING_FRACTION,
    bagging_freq=BAGGING_FREQ,
    scale_pos_weight=POS_WEIGHT,
    random_state=1842,
    client=client,
    n_jobs=-1,
    verbosity=-1,
)

# NOTE(review): the callbacks are forwarded through **kwargs to the fit that
# runs on each worker; whether early stopping behaves reliably under
# lightgbm.dask for the installed LightGBM version is not verified here —
# TODO confirm against the LightGBM Dask documentation.
model.fit(
    X_train,
    y_train,
    eval_set=[(X_valid, y_valid)],
    eval_metric="average_precision",
    callbacks=[
        lgb.early_stopping(stopping_rounds=EARLY_STOPPING),
        lgb.log_evaluation(VERBOSE_EVAL),
    ],
)

# ─── Validation scoring ───────────────────────────────────────────────────────

print("Предсказание на валидации...")
# Materialise the true labels and the positive-class probabilities
# (column 1 of predict_proba) on the client before computing the metric.
y_valid_np = y_valid.compute()
preds_valid = model.predict_proba(X_valid)[:, 1].compute()

pr_auc = average_precision_score(y_valid_np, preds_valid)
print(f"PR-AUC на валидации: {pr_auc:.5f}")

# ─── Saving the model ─────────────────────────────────────────────────────────

# The file name records the training parts and the validation PR-AUC;
# the fitted model's underlying booster is what gets written to disk.
model_file = f"{MODEL_PATH}lgb_dask_baseline_part_{'-'.join(map(str, train_parts))}_PR{pr_auc:.4f}.txt"
model.booster_.save_model(model_file)
print(f"Модель сохранена: {model_file}")

# Close the Dask client now that training and scoring are finished.
client.close()
print("Готово!", datetime.now().strftime("%Y-%m-%d %H:%M"))

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/243.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m243.9/243.9 kB[0m [31m12.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mGetting requirements to build wheel[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m See above for output.
  
  [1;35mnote[0m: This error originates from a subprocess, and is likely not a problem with pip.
  Getting requirements to build wheel ... [?25l[?25herror
[1;31merror[0m: [1msubprocess-exited-with-error[0m

[31m×[0m [32mGetting requirements to build wheel[0m did not run successfully.
[31m│[0m exit code: [1;36m1[0m
[31m╰─>[0m See above for output.

[1;35mnote[0m: This error originates from a subprocess, and is likely not a problem with pip.
[2K   [90m━━━━━━━━━

INFO:distributed.http.proxy:To route to workers diagnostics web server please install jupyter-server-proxy: python -m pip install jupyter-server-proxy
INFO:distributed.scheduler:State start
INFO:distributed.scheduler:  Scheduler at:     tcp://127.0.0.1:42851
INFO:distributed.scheduler:  dashboard at:  http://127.0.0.1:8787/status
INFO:distributed.scheduler:Registering Worker plugin shuffle
INFO:distributed.nanny:        Start Nanny at: 'tcp://127.0.0.1:45677'
INFO:distributed.nanny:        Start Nanny at: 'tcp://127.0.0.1:39809'
INFO:distributed.nanny:        Start Nanny at: 'tcp://127.0.0.1:35517'
INFO:distributed.nanny:        Start Nanny at: 'tcp://127.0.0.1:44049'
INFO:distributed.scheduler:Register worker addr: tcp://127.0.0.1:41997 name: 3
INFO:distributed.scheduler:Starting worker compute stream, tcp://127.0.0.1:41997
INFO:distributed.core:Starting established connection to tcp://127.0.0.1:57688
INFO:distributed.scheduler:Register worker addr: tcp://127.0.0.1:36771 name: 1
INFO:

http://127.0.0.1:8787/status
Считываем метаданные для разбиения...
Train parts: 2 | Valid: 1
Количество признаков: 39
Запуск обучения Dask-LightGBM...




IndexError: list index out of range