## Сперва необходимо загрузить имеющиеся данные

In [1]:
import pandas as pd
import numpy as np


# Directory holding the competition files; every CSV below is read from it.
path = "/kaggle/input/intern-regression-courier-deficit-challenge/"

# These three tables have their "calendar_dt" column parsed into datetimes on load.
date_columns = ["calendar_dt"]
train = pd.read_csv(f"{path}train.csv", parse_dates=date_columns)
facts = pd.read_csv(f"{path}facts.csv", parse_dates=date_columns)
shifts = pd.read_csv(f"{path}shifts_prediction.csv", parse_dates=date_columns)

# test.csv is read without any date parsing.
test = pd.read_csv(f"{path}test.csv")

# Report the shapes of the two primary tables.
for label, frame in (("train", train), ("test", test)):
    print(f"Размер {label}: {frame.shape}")

Размер train: (8220, 3)
Размер test: (2438, 1)


## Для дальнейшего обучения необходимо объединить данные по дарксторам и датам

In [2]:
# Stamp every test row with one fixed date so that test can go through the
# same (calendar_dt, store_id) joins as train.
test["calendar_dt"] = pd.to_datetime("2025-11-24")

# Key shared by every join in this cell.
join_keys = ["calendar_dt", "store_id"]

train_full = train.merge(facts, on=join_keys, how="left")
train_full = train_full.merge(shifts, on=join_keys, how="left")

test_full = test.merge(facts, on=join_keys, how="left")
test_full = test_full.merge(shifts, on=join_keys, how="left")

# Lag feature: moving each known target 7 days forward means that, after the
# join, a row dated D carries the target the same store had on D - 7 days.
history_target = train[['calendar_dt', 'store_id', 'target']].copy()
history_target['calendar_dt'] = history_target['calendar_dt'] + pd.Timedelta(days=7)
history_target = history_target.rename(columns={'target': 'target_lag_1'})

train_full = train_full.merge(history_target, on=join_keys, how='left')
test_full = test_full.merge(history_target, on=join_keys, how='left')

# A left join never drops left rows, but it silently duplicates them when the
# right table repeats a key. Duplicated rows would distort training and break
# the one-prediction-per-test-row alignment used for the submission, so fail
# loudly here instead.
assert len(train_full) == len(train), (
    f"joins changed the train row count: {len(train)} -> {len(train_full)}"
)
assert len(test_full) == len(test), (
    f"joins changed the test row count: {len(test)} -> {len(test_full)}"
)

print(f"Размер собранного train_full: {train_full.shape}")
print(f"Размер собранного test_full: {test_full.shape}")

Размер собранного train_full: (8220, 18)
Размер собранного test_full: (2438, 17)


## Разобьем данные на полную обучающую выборку (X, y) и на обучающую выборку без последней даты (X_train, y_train), чтобы на последней дате можно было проверить качество модели

In [3]:
# Every missing value, in every column, is replaced by 0 before modelling.
train_full = train_full.fillna(0)
test_full = test_full.fillna(0)

# Columns that must not be fed to the model as features.
drop_cols = ["target", "calendar_dt"]

# Full labelled set (all dates).
y = train_full["target"]
X = train_full.drop(columns=drop_cols)

# Test features: test has no target, so only the date is removed.
X_test = test_full.drop(columns=["calendar_dt"])

# The most recent labelled date is held out for validation;
# everything strictly before it is used for fitting.
last_date = train_full["calendar_dt"].max()
is_before_last = train_full["calendar_dt"] < last_date
is_last = train_full["calendar_dt"] == last_date

fit_rows = train_full[is_before_last]
X_train = fit_rows.drop(columns=drop_cols)
y_train = fit_rows["target"]

holdout_rows = train_full[is_last]
X_val = holdout_rows.drop(columns=drop_cols)
y_val = holdout_rows["target"]



## Перейдем к обучению

In [4]:
from catboost import CatBoostRegressor, Pool

# Columns passed to CatBoost as categorical features.
cat_features = ['store_id', 'city_nm']

baseline_params = dict(
    iterations=1000,
    learning_rate=0.05,
    loss_function='MAE',
    eval_metric='MAE',
    random_seed=42,
    verbose=100,
)
model = CatBoostRegressor(**baseline_params)

# The held-out last date is the eval set; training stops once its MAE has
# not improved for 100 consecutive iterations.
model.fit(
    X_train,
    y_train,
    cat_features=cat_features,
    eval_set=(X_val, y_val),
    early_stopping_rounds=100,
)

val_preds = model.predict(X_val)

# WAPE = sum(|y - y_hat|) / sum(y), computed on the validation date.
total_abs_error = np.sum(np.abs(y_val - val_preds))
wape_score = total_abs_error / np.sum(y_val)
print(f"Финальный WAPE на валидации: {wape_score:.4f}")

# "Accuracy" here is simply (1 - WAPE) expressed as a percentage.
accuracy = (1 - wape_score) * 100
print(f"Точность модели по метрике WAPE: {accuracy:.2f}%")

0:	learn: 1.8684474	test: 1.9214833	best: 1.9214833 (0)	total: 65.3ms	remaining: 1m 5s
100:	learn: 0.4198736	test: 0.4660313	best: 0.4660313 (100)	total: 764ms	remaining: 6.8s
200:	learn: 0.3942081	test: 0.4517448	best: 0.4517448 (200)	total: 1.37s	remaining: 5.44s
300:	learn: 0.3756366	test: 0.4464869	best: 0.4464869 (300)	total: 2.07s	remaining: 4.81s
400:	learn: 0.3635340	test: 0.4508712	best: 0.4464869 (300)	total: 2.77s	remaining: 4.13s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.4464868835
bestIteration = 300

Shrink model to first 301 iterations.
Финальный WAPE на валидации: 0.1745
Точность модели по метрике WAPE: 82.55%


## Попробуем повысить точность

In [5]:
# Second attempt: more iterations at a lower learning rate, with shallower
# trees, a minimum leaf size and L2 leaf regularisation set explicitly.
tuned_params = dict(
    iterations=3000,
    learning_rate=0.01,
    depth=4,
    min_data_in_leaf=10,
    l2_leaf_reg=5,
    loss_function='MAE',
    eval_metric='MAE',
    random_seed=42,
    verbose=200,
)
model = CatBoostRegressor(**tuned_params)

# Same validation protocol as before, but with a patience of 200 iterations.
model.fit(
    X_train,
    y_train,
    cat_features=cat_features,
    eval_set=(X_val, y_val),
    early_stopping_rounds=200,
)

val_preds = model.predict(X_val)

# WAPE = sum(|y - y_hat|) / sum(y), computed on the validation date.
total_abs_error = np.sum(np.abs(y_val - val_preds))
wape_score = total_abs_error / np.sum(y_val)
print(f"Финальный WAPE на валидации: {wape_score:.4f}")

# "Accuracy" here is simply (1 - WAPE) expressed as a percentage.
accuracy = (1 - wape_score) * 100
print(f"Точность модели по метрике WAPE: {accuracy:.2f}%")

0:	learn: 1.9302395	test: 1.9836359	best: 1.9836359 (0)	total: 7.89ms	remaining: 23.6s
200:	learn: 0.5958020	test: 0.6275018	best: 0.6275018 (200)	total: 1.03s	remaining: 14.4s
400:	learn: 0.4608390	test: 0.4937295	best: 0.4937295 (400)	total: 1.86s	remaining: 12.1s
600:	learn: 0.4335728	test: 0.4689523	best: 0.4689523 (600)	total: 2.65s	remaining: 10.6s
800:	learn: 0.4240765	test: 0.4621134	best: 0.4621134 (800)	total: 3.49s	remaining: 9.58s
1000:	learn: 0.4148530	test: 0.4564073	best: 0.4564073 (1000)	total: 4.3s	remaining: 8.59s
1200:	learn: 0.4090120	test: 0.4530274	best: 0.4530245 (1192)	total: 5.16s	remaining: 7.73s
1400:	learn: 0.4073744	test: 0.4532243	best: 0.4525859 (1333)	total: 5.95s	remaining: 6.79s
1600:	learn: 0.4014606	test: 0.4491421	best: 0.4491421 (1600)	total: 6.73s	remaining: 5.88s
1800:	learn: 0.3989673	test: 0.4482141	best: 0.4482067 (1797)	total: 7.5s	remaining: 5s
2000:	learn: 0.3974794	test: 0.4476238	best: 0.4476238 (2000)	total: 8.31s	remaining: 4.15s
2200:	

## Сделаем перебор параметров

In [6]:
# Disabled experiment: the grid search below is wrapped in a string literal,
# so none of it executes; the cell only evaluates the string itself.
# If re-enabled, it would fit one CatBoost model per combination of depth,
# l2_leaf_reg, min_data_in_leaf and learning_rate, score each one by WAPE on
# (X_val, y_val), and keep the best-scoring parameter set in `best_params`.
"""
import itertools


depths = [2, 3, 4, 5] 

l2_regs = [5, 8, 10, 15]

min_leaves = [1, 3, 5]

learning_rates = [0.1, 0.05]

best_wape = 1.0
best_params = {}

print("Начинаю расширенный поиск параметров...")

combinations = list(itertools.product(depths, l2_regs, min_leaves, learning_rates))

for d, l2, ml, lr in combinations:
    model = CatBoostRegressor(
        iterations=2000,
        learning_rate=lr,
        loss_function='MAE',
        eval_metric='MAE',
        random_seed=42,
        verbose=False,
        depth=d,
        l2_leaf_reg=l2,
        min_data_in_leaf=ml,
        early_stopping_rounds=100
    )
    
    model.fit(
        X_train, y_train, 
        eval_set=(X_val, y_val), 
        cat_features=['store_id', 'city_nm']
    )
    
    preds = model.predict(X_val)
    current_wape = np.sum(np.abs(y_val - preds)) / np.sum(y_val)
    current_iter = model.get_best_iteration()
    
    print(f"Depth: {d}, L2: {l2}, ML: {ml}, LR: {lr} | WAPE: {current_wape:.4f} | Iter: {current_iter}")
    
    if current_wape < best_wape:
        best_wape = current_wape
        best_params = {
            'depth': d, 
            'l2_leaf_reg': l2, 
            'min_data_in_leaf': ml, 
            'learning_rate': lr,
            'best_iteration': current_iter
        }

print("\n--- ГЛОБАЛЬНЫЙ ПОИСК ЗАВЕРШЕН ---")
print(f"Лучший WAPE: {best_wape:.4f} (Точность: {(1-best_wape)*100:.2f}%)")
print(f"Лучшие параметры: {best_params}")
"""

'\nimport itertools\n\n\ndepths = [2, 3, 4, 5] \n\nl2_regs = [5, 8, 10, 15]\n\nmin_leaves = [1, 3, 5]\n\nlearning_rates = [0.1, 0.05]\n\nbest_wape = 1.0\nbest_params = {}\n\nprint("Начинаю расширенный поиск параметров...")\n\ncombinations = list(itertools.product(depths, l2_regs, min_leaves, learning_rates))\n\nfor d, l2, ml, lr in combinations:\n    model = CatBoostRegressor(\n        iterations=2000,\n        learning_rate=lr,\n        loss_function=\'MAE\',\n        eval_metric=\'MAE\',\n        random_seed=42,\n        verbose=False,\n        depth=d,\n        l2_leaf_reg=l2,\n        min_data_in_leaf=ml,\n        early_stopping_rounds=100\n    )\n    \n    model.fit(\n        X_train, y_train, \n        eval_set=(X_val, y_val), \n        cat_features=[\'store_id\', \'city_nm\']\n    )\n    \n    preds = model.predict(X_val)\n    current_wape = np.sum(np.abs(y_val - preds)) / np.sum(y_val)\n    current_iter = model.get_best_iteration()\n    \n    print(f"Depth: {d}, L2: {l2}, ML: {

In [7]:
import lightgbm as lgb

# Categorical columns, handled natively by both libraries.
categorical_cols = ['store_id', 'city_nm']

# --- CatBoost member of the blend ---
cat_model = CatBoostRegressor(
    depth=3,
    l2_leaf_reg=8,
    min_data_in_leaf=1,
    learning_rate=0.1,
    iterations=1000,
    loss_function='MAE',
    eval_metric='MAE',
    random_seed=42,
    verbose=200,
)
cat_model.fit(
    X_train,
    y_train,
    eval_set=(X_val, y_val),
    cat_features=categorical_cols,
)
cat_preds_val = cat_model.predict(X_val)

# --- LightGBM member of the blend ---
# LightGBM receives the categorical columns as pandas 'category' dtype;
# astype() returns new frames, so X_train / X_val themselves stay untouched.
category_dtypes = {col: 'category' for col in categorical_cols}
X_train_lgb = X_train.astype(category_dtypes)
X_val_lgb = X_val.astype(category_dtypes)

lgb_params = dict(
    n_estimators=1000,
    learning_rate=0.05,
    objective='regression_l1',
    metric='mae',
    random_seed=42,
    max_depth=4,
    num_leaves=15,
    verbose=-1,
)
lgb_model = lgb.LGBMRegressor(**lgb_params)

# Training halts once validation L1 has not improved for 100 rounds.
stop_early = lgb.early_stopping(stopping_rounds=100)
lgb_model.fit(
    X_train_lgb,
    y_train,
    eval_set=[(X_val_lgb, y_val)],
    callbacks=[stop_early],
)
lgb_preds_val = lgb_model.predict(X_val_lgb)

# Fixed-weight blend (70% CatBoost, 30% LightGBM), clipped at zero so that
# no negative prediction survives.
blended_preds_val = np.maximum(0, 0.7 * cat_preds_val + 0.3 * lgb_preds_val)

# WAPE = sum(|y - y_hat|) / sum(y), computed on the validation date.
total_abs_error = np.sum(np.abs(y_val - blended_preds_val))
wape_score = total_abs_error / np.sum(y_val)
print(f"\nБлендинг WAPE на валидации: {wape_score:.4f}")

accuracy = (1 - wape_score) * 100
print(f"Точность ансамбля: {accuracy:.2f}%")

  if entities is not ():


0:	learn: 1.7898797	test: 1.8493226	best: 1.8493226 (0)	total: 6.51ms	remaining: 6.51s
200:	learn: 0.4040343	test: 0.4386316	best: 0.4386316 (200)	total: 918ms	remaining: 3.65s
400:	learn: 0.3867309	test: 0.4355792	best: 0.4355789 (399)	total: 1.67s	remaining: 2.49s
600:	learn: 0.3824645	test: 0.4355921	best: 0.4353389 (414)	total: 2.41s	remaining: 1.6s
800:	learn: 0.3798073	test: 0.4358550	best: 0.4353389 (414)	total: 3.19s	remaining: 793ms
999:	learn: 0.3756022	test: 0.4396990	best: 0.4353389 (414)	total: 3.99s	remaining: 0us

bestTest = 0.4353389187
bestIteration = 414

Shrink model to first 415 iterations.
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[159]	valid_0's l1: 0.43142

Блендинг WAPE на валидации: 0.1688
Точность ансамбля: 83.12%


## Сделаем финальное предсказание

In [8]:
# Refit both models on every labelled row (fitting part + held-out date).
X_all = pd.concat([X_train, X_val])
y_all = pd.concat([y_train, y_val])

# LightGBM receives the categorical columns as pandas 'category' dtype.
X_all_lgb = X_all.copy()
X_test_lgb = X_test.copy()
for col in ['store_id', 'city_nm']:
    X_all_lgb[col] = X_all_lgb[col].astype('category')
    X_test_lgb[col] = X_test_lgb[col].astype('category')

# There is no eval set at this stage, so early stopping cannot run. The number
# of trees is therefore taken from the models validated in the blending cell,
# rather than from hand-copied literals (CatBoost) or the full 1000-tree
# budget (LightGBM), so that the submitted blend matches the one that was scored.
# CatBoost reports a zero-based best iteration, hence the +1.
best_cat_iteration = cat_model.get_best_iteration()
cat_tree_count = (
    best_cat_iteration + 1 if best_cat_iteration is not None else cat_model.tree_count_
)
# best_iteration_ is only set when early stopping was used; otherwise fall
# back to the configured budget.
lgb_tree_count = lgb_model.best_iteration_ or lgb_model.n_estimators

print("Обучение финального CatBoost...")
cat_final = CatBoostRegressor(
    iterations=cat_tree_count,
    learning_rate=0.1,
    loss_function='MAE',
    random_seed=42,
    verbose=100,
    depth=3,
    min_data_in_leaf=1,
    l2_leaf_reg=8,
)
cat_final.fit(X_all, y_all, cat_features=['store_id', 'city_nm'])
cat_preds_test = cat_final.predict(X_test)

print("Обучение финального LightGBM...")

lgb_final = lgb.LGBMRegressor(
    n_estimators=lgb_tree_count,
    learning_rate=0.05,
    objective='regression_l1',
    random_seed=42,
    max_depth=4,
    num_leaves=15,
    verbose=-1
)
lgb_final.fit(X_all_lgb, y_all)
lgb_preds_test = lgb_final.predict(X_test_lgb)

# Same fixed-weight blend as on validation (70% CatBoost, 30% LightGBM).
final_preds = (0.7 * cat_preds_test) + (0.3 * lgb_preds_test)

# Clip negatives to zero, then round to whole numbers for the submission.
final_preds = np.maximum(0, final_preds)
final_preds = np.round(final_preds).astype(int)

# Predictions are in test_full row order, so they pair up with its store_id
# column positionally.
submission = pd.DataFrame({
    'store_id': test_full['store_id'],
    'target': final_preds
})

submission = submission.sort_values('store_id')
submission.to_csv('submission.csv', index=False)

print("Файл 'submission.csv' успешно создан")

Обучение финального CatBoost...
0:	learn: 1.8300841	total: 8.34ms	remaining: 3.45s
100:	learn: 0.4388118	total: 629ms	remaining: 1.95s
200:	learn: 0.4178733	total: 1.13s	remaining: 1.21s
300:	learn: 0.4115391	total: 1.59s	remaining: 603ms
400:	learn: 0.4054368	total: 2.06s	remaining: 72ms
414:	learn: 0.4051476	total: 2.12s	remaining: 0us
Обучение финального LightGBM...
Файл 'submission.csv' успешно создан
