In [1]:
import pandas as pd
from sklearn.metrics import mean_absolute_error
from catboost import CatBoostRegressor, cv, Pool
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.model_selection import KFold

In [2]:
# Load the preprocessed training table; the relative path is kept in a named
# constant so it is easy to spot and change.
TRAIN_CSV_PATH = '../data/processed/preprocessed_train.csv'
df = pd.read_csv(TRAIN_CSV_PATH)

In [3]:
# Display the column index of `df` to see which fields are available
# before choosing the features and the target.
df.columns

Index(['id', 'price_rub', 'mark', 'model', 'generation', 'configuration',
       'complectation', 'body_type', 'color', 'displacement', 'drive_type',
       'engine_type', 'horse_power', 'transmission', 'wheel', 'km_age', 'year',
       'owners_count', 'condition', 'custom', 'pts', 'seller_type', 'region',
       'city', 'address', 'log_price', 'bin_horse_power', 'car_age',
       'city_price_ratio', 'count_options', 'has_heating', 'has_damage'],
      dtype='object')

Создадим базовый регрессор (бейзлайн), чтобы в дальнейшем сравнивать с ним все наработки и улучшения.

In [12]:
    # Columns handed to CatBoost as categorical features (see `cat_features`
    # in the model cell). The order is kept exactly as originally declared.
    categorical_cols = [
        # vehicle identity
        'mark', 'model', 'generation', 'configuration', 'complectation',
        # body and drivetrain
        'body_type', 'color', 'drive_type', 'engine_type', 'transmission',
        # steering wheel, state and paperwork
        'wheel', 'condition', 'custom', 'pts',
        # seller and location
        'seller_type', 'region', 'city', 'address',
        # binned numeric feature treated as a category
        'bin_horse_power',
    ]

In [13]:
# Baseline: CatBoost regressor trained on `log_price` with a single 80/20
# hold-out split. MAE is reported after mapping predictions back with expm1.

# One seed shared by the split and the model so the baseline is reproducible.
SEED = 42

# Feature matrix: everything except the two target representations and the id.
# NOTE(review): `city_price_ratio` stays in X - if it is derived from
# `price_rub`, it leaks the target; confirm in the preprocessing step.
X = df.drop(columns=['price_rub', 'log_price', 'id'])
y = df['log_price']

# Fail early with a readable message if a categorical column (declared in a
# separate cell) was renamed or dropped upstream.
missing_cat_cols = [col for col in categorical_cols if col not in X.columns]
assert not missing_cat_cols, f"categorical columns missing from X: {missing_cat_cols}"

X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=SEED
)

model = CatBoostRegressor(
    iterations=1000,
    learning_rate=0.05,
    depth=8,
    loss_function='MAE',
    eval_metric='MAE',
    cat_features=categorical_cols,
    verbose=100,
    random_seed=SEED,
    early_stopping_rounds=100
)

# NOTE(review): the same hold-out set drives early stopping / best-iteration
# selection and the MAE reported below, so the score is somewhat optimistic.
# NOTE(review): in the recorded run the best iteration was the last one (999),
# i.e. training hit the iteration cap rather than stopping early - consider
# raising `iterations` when tuning beyond the baseline.
model.fit(
    X_train, y_train,
    eval_set=(X_val, y_val),
    use_best_model=True
)

preds_log = model.predict(X_val)

# Map predictions and targets back from log space.
# Assumes `log_price` was built as log1p(price_rub) - TODO confirm in preprocessing.
preds = np.expm1(preds_log)
y_val_real = np.expm1(y_val)

mae = mean_absolute_error(y_val_real, preds)
mean_val_price = y_val_real.mean()  # computed once, reused in both report lines

print("--- Результаты ---")
print(f"Итоговый MAE: {mae:,.0f} руб.")
print(f"Средняя цена в валидации: {mean_val_price:,.0f} руб.")
print(f"Ошибка составляет примерно {(mae / mean_val_price) * 100:.1f}% от средней цены.")

# `feature_importance` is kept as a notebook-level name: the next cell prints it again.
feature_importance = model.get_feature_importance(prettified=True)
print("\nТоп-10 важных признаков:")
print(feature_importance.head(10))

0:	learn: 0.7214015	test: 0.7059434	best: 0.7059434 (0)	total: 493ms	remaining: 8m 12s
100:	learn: 0.2192651	test: 0.2130747	best: 0.2130747 (100)	total: 19.9s	remaining: 2m 57s
200:	learn: 0.2017842	test: 0.1996492	best: 0.1996492 (200)	total: 39.3s	remaining: 2m 36s
300:	learn: 0.1915475	test: 0.1922204	best: 0.1922204 (300)	total: 58.9s	remaining: 2m 16s
400:	learn: 0.1848042	test: 0.1883575	best: 0.1883575 (400)	total: 1m 19s	remaining: 1m 58s
500:	learn: 0.1796863	test: 0.1858126	best: 0.1858126 (500)	total: 1m 39s	remaining: 1m 38s
600:	learn: 0.1755941	test: 0.1841546	best: 0.1841546 (600)	total: 1m 59s	remaining: 1m 19s
700:	learn: 0.1723991	test: 0.1830838	best: 0.1830838 (700)	total: 2m 24s	remaining: 1m 1s
800:	learn: 0.1697315	test: 0.1822945	best: 0.1822918 (799)	total: 2m 54s	remaining: 43.4s
900:	learn: 0.1671903	test: 0.1816811	best: 0.1816811 (900)	total: 3m 25s	remaining: 22.6s
999:	learn: 0.1652764	test: 0.1812070	best: 0.1812070 (999)	total: 3m 47s	remaining: 0us

b

In [14]:
# Raise the pandas row-display limit (a global option that persists for later
# cells), then print the ten highest-ranked rows of `feature_importance`,
# which is produced by the training cell.
pd.set_option('display.max_rows', 500)
top_features = feature_importance.head(10)
# A single print with a newline separator emits the same text as two prints.
print("\nТоп-10 важных признаков:", top_features, sep="\n")


Топ-10 важных признаков:
        Feature Id  Importances
0             year    13.339338
1            model    10.584575
2          car_age     9.690368
3           km_age     9.033352
4      horse_power     8.981540
5             mark     8.851417
6       generation     7.788972
7  bin_horse_power     5.903746
8     displacement     3.907209
9       drive_type     3.408337
