In [1]:
import pandas as pd
from sklearn.metrics import mean_absolute_error
from catboost import CatBoostRegressor, cv, Pool
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.model_selection import KFold
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the preprocessed training table (path is relative to the notebook's working directory).
df = pd.read_csv("../data/processed/preprocessed_train.csv")

In [3]:
# Remove the `year` column before modelling.
# `errors='ignore'` keeps this cell idempotent: re-running it after `year`
# has already been dropped no longer raises KeyError.
df = df.drop(columns=['year'], errors='ignore')

In [4]:
# Names of every object-dtype column; handed to CatBoost as `cat_features` in later cells.
categorical_cols = list(df.select_dtypes(include='object').columns)

In [None]:
# Hold-out experiment: fit CatBoost on `log_price`, then score it in price units.
X = df.drop(columns=['log_price'])
y = df['log_price']

# Random 80/20 split; the fixed seed keeps it reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Tuned values (identical to the "Best params" printed by the Optuna study in this notebook).
params = {
    'learning_rate': 0.03171940198545533,
    'depth': 7,
    'l2_leaf_reg': 2.537580754686049,
    'bagging_temperature': 1.4625568557921227,
}

# Hand-picked baseline settings. The four entries that also appear in `params`
# are overridden by the tuned values when the two dicts are merged below.
base_params = dict(
    iterations=4000,
    learning_rate=0.03,
    depth=8,
    loss_function='MAE',
    eval_metric='MAE',
    cat_features=categorical_cols,
    verbose=100,
    random_seed=42,
    early_stopping_rounds=100,
    l2_leaf_reg=3,
    bagging_temperature=1,
    task_type="GPU",
    devices='0',
)
model = CatBoostRegressor(**{**base_params, **params})

# NOTE(review): the same validation split drives early stopping / best-iteration
# selection and the metric reported below, so that metric is likely optimistic.
model.fit(X_train, y_train, eval_set=(X_val, y_val), use_best_model=True)

# Back-transform predictions and targets with expm1 before scoring
# (presumably `log_price` was built with log1p upstream -- TODO confirm).
preds_log = model.predict(X_val)
preds, y_val_real = np.expm1(preds_log), np.expm1(y_val)

mae = mean_absolute_error(y_val_real, preds)

Default metric period is 5 because MAE is/are not implemented for GPU


0:	learn: 0.7403141	test: 0.7251882	best: 0.7251882 (0)	total: 52.4ms	remaining: 3m 29s
100:	learn: 0.2666700	test: 0.2597600	best: 0.2597600 (100)	total: 4.98s	remaining: 3m 12s
200:	learn: 0.2195880	test: 0.2155612	best: 0.2155612 (200)	total: 9.82s	remaining: 3m 5s
300:	learn: 0.2042235	test: 0.2029654	best: 0.2029654 (300)	total: 14.5s	remaining: 2m 58s
400:	learn: 0.1953455	test: 0.1963813	best: 0.1963813 (400)	total: 19.3s	remaining: 2m 53s
500:	learn: 0.1891109	test: 0.1922026	best: 0.1922026 (500)	total: 24s	remaining: 2m 47s
600:	learn: 0.1844916	test: 0.1891856	best: 0.1891856 (600)	total: 28.7s	remaining: 2m 42s
700:	learn: 0.1807987	test: 0.1870843	best: 0.1870843 (700)	total: 33.2s	remaining: 2m 36s
800:	learn: 0.1776481	test: 0.1855482	best: 0.1855482 (800)	total: 37.8s	remaining: 2m 30s
900:	learn: 0.1750928	test: 0.1844806	best: 0.1844806 (900)	total: 42.4s	remaining: 2m 25s
1000:	learn: 0.1726124	test: 0.1834306	best: 0.1834306 (1000)	total: 46.9s	remaining: 2m 20s
110

In [14]:
# Hold-out summary: MAE in price units, the mean validation price, and their ratio.
print(
    f"Итоговый MAE: {mae:,.0f} руб.",
    f"Средняя цена в валидации: {y_val_real.mean():,.0f} руб.",
    f"Ошибка составляет примерно {(mae / y_val_real.mean()) * 100:.1f}% от средней цены.",
    sep="\n",
)

# Feature importances of the fitted model; only the first ten rows are printed.
feature_importance = model.get_feature_importance(prettified=True)
print("\nТоп-10 важных признаков:")
print(feature_importance.head(10))

Итоговый MAE: 241,570 руб.
Средняя цена в валидации: 1,570,479 руб.
Ошибка составляет примерно 15.4% от средней цены.

Топ-10 важных признаков:
       Feature Id  Importances
0         car_age    19.962118
1     horse_power    13.799699
2            mark     9.878870
3           model     5.326866
4      generation     5.040221
5    displacement     3.214603
6       body_type     3.174423
7  specific_power     3.077685
8          km_age     3.069204
9          region     3.015796


In [15]:
# Signed residuals (prediction minus actual, in price units) and their magnitudes.
errors = preds - y_val_real
abs_errors = np.abs(errors)

# Bias (mean / median of signed error) plus the size and tail of the absolute error.
error_summary = (
    ("Mean error:", errors.mean()),
    ("Median error:", np.median(errors)),
    ("MAE:", abs_errors.mean()),
    ("90 percentile error:", np.percentile(abs_errors, 90)),
    ("95 percentile error:", np.percentile(abs_errors, 95)),
)
for label, value in error_summary:
    print(label, value)

Mean error: -49258.139751136405
Median error: -196.9905320108519
MAE: 241570.12499334538
90 percentile error: 477665.415651987
95 percentile error: 759760.7672502147


Список машин, на которых модель ошибается сильнее всего (топ-20 по абсолютной ошибке):

In [16]:
# Validation features with actual price, predicted price and both error columns attached.
df_val = X_val.assign(
    real_price=y_val_real,
    pred_price=preds,
    abs_error=abs_errors,
    error=errors,
)

# The 20 rows with the largest absolute error.
df_val.sort_values(by='abs_error', ascending=False).head(20)

Unnamed: 0,mark,model,generation,configuration,complectation,body_type,color,displacement,drive_type,engine_type,...,hp_zscore,hp_age_interaction,log_km,power_segment,is_power_awd,owners_per_year,real_price,pred_price,abs_error,error
30749,Aurus,Komendant,Неизвестно,Внедорожник 5 дв.,Individual,пятидверный внедорожник,коричневый,4.4,полный,гибрид,...,0.0,598,12.078245,sport,1,0.0,52390000.0,21900920.0,30489080.0,-30489080.0
72961,ГАЗ,21 «Волга»,Неизвестно,Седан,Неизвестно,седан,пурпурный,2.5,задний,бензин,...,-1.443061,4550,3.931826,low,0,0.030303,18000000.0,1420885.0,16579110.0,-16579110.0
77787,Hummer,H1,Неизвестно,Внедорожник 5 дв.,Неизвестно,пятидверный внедорожник,серый,6.6,полный,дизель,...,0.866025,6405,9.615872,sport,1,0.045455,27200000.0,11312130.0,15887870.0,-15887870.0
53988,Bentley,Continental GT,II,Купе,Неизвестно,купе,белый,4.0,полный,бензин,...,0.645328,6380,4.219508,sport,1,0.083333,33900000.0,18316230.0,15583770.0,-15583770.0
21854,Mercedes-Benz,GLS AMG,II (X167) Рестайлинг,Внедорожник 5 дв.,GLS 63 4MATIC+,пятидверный внедорожник,серый,4.0,полный,бензин,...,0.621059,1224,8.006701,sport,1,0.333333,44390000.0,30507960.0,13882040.0,-13882040.0
22124,Toyota,Land Cruiser,200 Series Рестайлинг 2,Внедорожник 5 дв.,Неизвестно,пятидверный внедорожник,белый,4.5,полный,дизель,...,0.306537,2241,12.002297,high,1,0.1,18000000.0,6498452.0,11501550.0,-11501550.0
35563,Mercedes-Benz,S-Класс,VII (W223),Седан Long,Неизвестно,седан,чёрный,2.9,полный,дизель,...,-0.813631,1245,7.824446,high,1,0.166667,25000000.0,14282470.0,10717530.0,-10717530.0
79823,Cadillac,Escalade,V,Внедорожник 5 дв. ESV,Luxury ESV,пятидверный внедорожник,чёрный,6.2,полный,бензин,...,0.449604,1664,9.510519,sport,1,0.4,25000000.0,14469650.0,10530350.0,-10530350.0
60826,BMW,7 серии,V (F01/F02/F04),Седан Long,Неизвестно,седан,чёрный,4.4,полный,бензин,...,0.903704,5698,11.332614,sport,1,0.133333,12000000.0,3016452.0,8983548.0,-8983548.0
46420,Toyota,Mark II,VIII (X100),Седан,Неизвестно,седан,пурпурный,2.5,задний,бензин,...,0.976361,6000,11.813037,upper_mid,0,0.096774,9800000.0,823199.0,8976801.0,-8976801.0


In [59]:
import optuna

# Rebuild the same seeded 80/20 split that the hold-out experiment uses,
# so this cell does not depend on variables left over from other cells.
X = df.drop(columns=['log_price'])
y = df['log_price']

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

def objective(trial):
    """Optuna objective: train one CatBoost model and return its validation MAE.

    Samples learning rate, depth, L2 leaf regularisation and bagging
    temperature from `trial`, fits on the notebook-level `X_train`/`y_train`
    with early stopping against `X_val`/`y_val`, and scores the predictions
    after back-transforming both predictions and targets with `expm1`.

    Returns:
        Mean absolute error on the validation split, in back-transformed units.
    """
    # Search space -- the suggest_* calls are kept in their original order.
    tuned = {
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.1, log=True),
        "depth": trial.suggest_int("depth", 6, 10),
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1, 10),
        "bagging_temperature": trial.suggest_float("bagging_temperature", 0, 3),
    }

    # Settings shared by every trial.
    regressor = CatBoostRegressor(
        iterations=4000,
        **tuned,
        loss_function="MAE",
        eval_metric="MAE",
        random_seed=42,
        early_stopping_rounds=100,
        verbose=0,
        task_type="GPU",
        devices="0",
    )

    regressor.fit(
        X_train, y_train,
        eval_set=(X_val, y_val),
        cat_features=categorical_cols,
        use_best_model=True,
    )

    # Score after undoing the log transform on both sides.
    predicted = np.expm1(regressor.predict(X_val))
    actual = np.expm1(y_val)
    return mean_absolute_error(actual, predicted)


# Minimise the validation MAE returned by `objective` over 25 trials.
# The sampler is seeded explicitly: an unseeded default sampler proposes
# different hyperparameters on every run, so the search could not be reproduced.
# NOTE(review): GPU training itself may still introduce small run-to-run differences.
study = optuna.create_study(
    direction="minimize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=25)

print("Best MAE:", study.best_value)
print("Best params:", study.best_params)

[I 2026-02-16 13:15:58,517] A new study created in memory with name: no-name-b68391fd-6461-4dc0-86e4-a5ec31d44298
Default metric period is 5 because MAE is/are not implemented for GPU
[I 2026-02-16 13:20:21,971] Trial 0 finished with value: 243611.5059882936 and parameters: {'learning_rate': 0.010025543888734063, 'depth': 8, 'l2_leaf_reg': 4.6085555030621865, 'bagging_temperature': 0.8789059008351203}. Best is trial 0 with value: 243611.5059882936.
Default metric period is 5 because MAE is/are not implemented for GPU
[I 2026-02-16 13:22:59,908] Trial 1 finished with value: 242812.7876594353 and parameters: {'learning_rate': 0.02828254230279367, 'depth': 6, 'l2_leaf_reg': 9.111176724924345, 'bagging_temperature': 0.042280941467160105}. Best is trial 1 with value: 242812.7876594353.
Default metric period is 5 because MAE is/are not implemented for GPU
[I 2026-02-16 13:26:59,278] Trial 2 finished with value: 242296.23139719394 and parameters: {'learning_rate': 0.013302630528405867, 'depth

Best MAE: 240036.96350070368
Best params: {'learning_rate': 0.03171940198545533, 'depth': 7, 'l2_leaf_reg': 2.537580754686049, 'bagging_temperature': 1.4625568557921227}


In [34]:
# 5-fold cross-validation of a CatBoost model trained on `log_price`.
# Per-fold MAE is reported in price units: both the targets AND the predictions
# are back-transformed with expm1 before scoring. (Previously only the targets
# were back-transformed, so log-space predictions were compared against prices.)
#
# NOTE: this cell rebinds X_train / X_val / y_train / y_val and `model`;
# after it runs they refer to the last fold, not to the hold-out split.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
mae_train = []
mae_val = []

for fold, (train_idx, val_idx) in enumerate(kf.split(df), 1):
    train_df = df.iloc[train_idx]
    val_df = df.iloc[val_idx]

    X_train = train_df.drop(columns='log_price')
    y_train = train_df['log_price']

    X_val = val_df.drop(columns='log_price')
    y_val = val_df['log_price']

    # NOTE(review): these settings (1000 iterations, learning_rate=0.01, CPU)
    # differ from the tuned hold-out model, and the captured log shows the
    # best iteration at 999, i.e. training had not stopped early -- confirm
    # that this configuration is intentional.
    model = CatBoostRegressor(
        iterations=1000,
        learning_rate=0.01,
        depth=8,
        loss_function='MAE',
        eval_metric='MAE',
        cat_features=categorical_cols,
        verbose=100,
        random_seed=42,
        early_stopping_rounds=100
    )

    # The fold's validation part is also used for early stopping / best-iteration selection.
    model.fit(
        X_train, y_train,
        eval_set=(X_val, y_val),
        use_best_model=True
    )

    # Predictions are in log space, like the target.
    train_preds = model.predict(X_train)
    val_preds = model.predict(X_val)

    # Back-transform both sides so the MAE is expressed in price units.
    train_mae = mean_absolute_error(np.expm1(y_train), np.expm1(train_preds))
    val_mae = mean_absolute_error(np.expm1(y_val), np.expm1(val_preds))

    mae_train.append(train_mae)
    mae_val.append(val_mae)

    print(
        f'Fold {fold}: '
        f'Train MAE = {train_mae:,.0f}, '
        f'Val MAE = {val_mae:,.0f}'
    )

print(f'\nMean Train MAE: {np.mean(mae_train):,.0f}')
print(f'Mean Val MAE:   {np.mean(mae_val):,.0f}')
print(f'Std Val MAE:    {np.std(mae_val):,.0f}')

# Importances come from the model of the last fold only.
feature_importance = model.get_feature_importance(prettified=True)

0:	learn: 0.7449229	test: 0.7298585	best: 0.7298585 (0)	total: 153ms	remaining: 2m 32s
100:	learn: 0.3885204	test: 0.3789594	best: 0.3789594 (100)	total: 12s	remaining: 1m 46s
200:	learn: 0.2793670	test: 0.2717768	best: 0.2717768 (200)	total: 25.4s	remaining: 1m 41s
300:	learn: 0.2421601	test: 0.2358722	best: 0.2358722 (300)	total: 38.5s	remaining: 1m 29s
400:	learn: 0.2258840	test: 0.2206909	best: 0.2206909 (400)	total: 51.2s	remaining: 1m 16s
500:	learn: 0.2174949	test: 0.2131829	best: 0.2131829 (500)	total: 1m 4s	remaining: 1m 3s
600:	learn: 0.2119988	test: 0.2085108	best: 0.2085108 (600)	total: 1m 17s	remaining: 51.3s
700:	learn: 0.2082021	test: 0.2054041	best: 0.2054041 (700)	total: 1m 29s	remaining: 38.4s
800:	learn: 0.2048941	test: 0.2027867	best: 0.2027867 (800)	total: 1m 42s	remaining: 25.5s
900:	learn: 0.2020435	test: 0.2005508	best: 0.2005508 (900)	total: 1m 55s	remaining: 12.7s
999:	learn: 0.1994772	test: 0.1986313	best: 0.1986313 (999)	total: 2m 8s	remaining: 0us

bestTest