In [1]:
import numpy as np
import pandas as pd
from catboost import CatBoostRegressor, cv, Pool
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import KFold
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split

In [2]:
# Load the preprocessed training table; the path is relative to the notebook's directory.
train_csv_path = '../data/processed/preprocessed_train.csv'
df = pd.read_csv(train_csv_path)

In [3]:
# Show the column index of the loaded frame to see which fields are available.
df.columns

Index(['id', 'price_rub', 'mark', 'model', 'generation', 'configuration',
       'complectation', 'body_type', 'color', 'displacement', 'drive_type',
       'engine_type', 'horse_power', 'transmission', 'wheel', 'km_age', 'year',
       'owners_count', 'condition', 'custom', 'pts', 'seller_type', 'region',
       'city', 'address', 'log_price', 'bin_horse_power', 'car_age',
       'city_price_ratio', 'count_options', 'has_heating', 'has_damage'],
      dtype='object')

Создадим базовый регрессор (бейзлайн), чтобы в дальнейшем сравнивать с ним все наработки и улучшения.

In [22]:
# Names of the columns handed to CatBoost through `cat_features`.
# The order is kept stable so that repeated runs build identical pools.
categorical_cols = [
    'mark',
    'model',
    'generation',
    'configuration',
    'complectation',
    'body_type',
    'color',
    'drive_type',
    'engine_type',
    'transmission',
    'wheel',
    'condition',
    'custom',
    'pts',
    'seller_type',
    'region',
    'city',
    'bin_horse_power',
]

In [56]:
# Baseline CatBoost regressor: trained on `log_price`, scored in rubles on a 20% hold-out.

# Columns that must never reach the model as features:
#   - `log_price` is the target itself;
#   - `price_rub` is the raw (un-logged) price, i.e. a direct target leak;
#   - `id` / `address` are an identifier and a free-text field that is not in `categorical_cols`.
# errors='ignore' keeps the cell working whether or not these columns were
# already removed from `df` by another cell.
# NOTE(review): `city_price_ratio` may be derived from the price as well - confirm
# in the preprocessing code that it is not computed from the target.
NON_FEATURE_COLS = ['log_price', 'price_rub', 'id', 'address']

X = df.drop(columns=NON_FEATURE_COLS, errors='ignore')
y = df['log_price']

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

model = CatBoostRegressor(
    iterations=3000,
    learning_rate=0.05,
    depth=9,
    loss_function='MAE',
    eval_metric='MAE',
    cat_features=categorical_cols,
    verbose=100,
    random_seed=42,
    early_stopping_rounds=100,
    l2_leaf_reg=3,
    bagging_temperature=1,
    task_type="GPU",  # requires a CUDA device; device 0 is used
    devices='0'
)

# The hold-out set is used both for early stopping and for the reported score,
# so the MAE below is slightly optimistic.
model.fit(
    X_train, y_train,
    eval_set=(X_val, y_val),
    use_best_model=True
)

preds_log = model.predict(X_val)

# Back-transform from log space to rubles.
# Assumes `log_price` was built with np.log1p - TODO confirm in preprocessing.
preds = np.expm1(preds_log)
y_val_real = np.expm1(y_val)

mae = mean_absolute_error(y_val_real, preds)

Default metric period is 5 because MAE is/are not implemented for GPU


0:	learn: 0.7338256	test: 0.7187627	best: 0.7187627 (0)	total: 104ms	remaining: 5m 10s
100:	learn: 0.2176961	test: 0.2152564	best: 0.2152564 (100)	total: 6.53s	remaining: 3m 7s
200:	learn: 0.1910157	test: 0.1950868	best: 0.1950868 (200)	total: 12.8s	remaining: 2m 58s
300:	learn: 0.1785513	test: 0.1881259	best: 0.1881259 (300)	total: 18.9s	remaining: 2m 49s
400:	learn: 0.1694296	test: 0.1844950	best: 0.1844919 (398)	total: 25.3s	remaining: 2m 43s
500:	learn: 0.1633236	test: 0.1828110	best: 0.1828056 (499)	total: 31.1s	remaining: 2m 35s
600:	learn: 0.1585551	test: 0.1817407	best: 0.1817407 (600)	total: 37s	remaining: 2m 27s
700:	learn: 0.1543602	test: 0.1810544	best: 0.1810544 (700)	total: 42.7s	remaining: 2m 20s
800:	learn: 0.1505782	test: 0.1806669	best: 0.1806636 (799)	total: 48.7s	remaining: 2m 13s
900:	learn: 0.1473636	test: 0.1802804	best: 0.1802804 (900)	total: 54.9s	remaining: 2m 7s
1000:	learn: 0.1442603	test: 0.1799975	best: 0.1799831 (998)	total: 1m	remaining: 2m
1100:	learn: 

In [57]:
# Summarise the baseline hold-out error: absolute (rubles) and relative to the mean price.
mean_val_price = y_val_real.mean()
relative_error_pct = (mae / mean_val_price) * 100

print(f"Итоговый MAE: {mae:,.0f} руб.")
print(f"Средняя цена в валидации: {mean_val_price:,.0f} руб.")
print(f"Ошибка составляет примерно {relative_error_pct:.1f}% от средней цены.")

# Feature importances of the fitted baseline, as a table.
feature_importance = model.get_feature_importance(prettified=True)
top_features = feature_importance.head(10)
print("\nТоп-10 важных признаков:")
print(top_features)

Итоговый MAE: 243,363 руб.
Средняя цена в валидации: 1,570,479 руб.
Ошибка составляет примерно 15.5% от средней цены.

Топ-10 важных признаков:
       Feature Id  Importances
0     horse_power     9.883300
1            mark     9.675907
2         car_age     8.557731
3            year     8.042009
4      generation     5.559801
5           model     5.346107
6          km_age     5.252429
7          region     5.150018
8       body_type     3.824697
9  specific_power     3.596415


Для улучшения точности модели попробуем Target Encoding, а перед этим добавим новые признаки и подберём гиперпараметры.

In [None]:
# Feature engineering on the preprocessed frame.

# Remove the identifier, the free-text address and the raw price (`price_rub` is
# the un-logged target and would leak it). errors='ignore' makes the cell safe to
# re-run: a second execution no longer raises KeyError on already-dropped columns.
df = df.drop(columns=['id', 'address', 'price_rub'], errors='ignore')

# String columns are deliberately left with their current dtype: they are passed
# to CatBoost by name via `cat_features`, so no `category` conversion is applied.
# (An un-assigned `select_dtypes(...).astype('category')` would have no effect.)

# Ratio features. A zero denominator is mapped to NaN first so the result is a
# missing value rather than +/-inf.
# NOTE(review): zero `displacement` / `car_age` rows are presumed possible
# (e.g. electric or current-year cars) - verify against the data.
df['specific_power'] = df['horse_power'] / df['displacement'].replace(0, np.nan)
df['km_per_year'] = df['km_age'] / df['car_age'].replace(0, np.nan)

# NOTE(review): despite the name this is a product (horse_power * car_age), not a
# per-year ratio. The column name is kept so existing references keep working.
df['hp_per_year'] = df['horse_power'] * df['car_age']

In [55]:
# Randomized hyper-parameter search for the CatBoost regressor.
# `RandomizedSearchCV` is imported in the notebook's top import cell.
#
# Cost warning: 20 candidates x 3 folds = 60 fits of up to 3000 iterations each,
# with no early stopping. The recorded run took ~7.6 min per fit and was interrupted.

# Search space sampled by RandomizedSearchCV.
param_dist = {
    'depth': [6, 7, 8, 9, 10],
    'learning_rate': np.linspace(0.01, 0.1, 10),
    'l2_leaf_reg': [1, 3, 5, 7, 9],
    'bagging_temperature': [0, 1, 2, 3]
}

model = CatBoostRegressor(
    iterations=3000,
    loss_function='MAE',
    task_type="GPU",
    random_seed=42,  # fixed seed so every candidate is trained reproducibly
    verbose=0
)

search = RandomizedSearchCV(
    model,
    param_dist,
    n_iter=20,
    scoring='neg_mean_absolute_error',  # MAE on the log-price scale, since y is log_price
    cv=3,
    random_state=42,  # fixed seed so the same 20 candidates are drawn on every run
    verbose=2
)

# `cat_features` is forwarded to CatBoostRegressor.fit for every fold.
search.fit(X_train, y_train, cat_features=categorical_cols)

Fitting 3 folds for each of 20 candidates, totalling 60 fits


Default metric period is 5 because MAE is/are not implemented for GPU


[CV] END bagging_temperature=1, depth=9, l2_leaf_reg=3, learning_rate=0.05000000000000001; total time= 7.6min


Default metric period is 5 because MAE is/are not implemented for GPU


[CV] END bagging_temperature=1, depth=9, l2_leaf_reg=3, learning_rate=0.05000000000000001; total time= 7.6min


Default metric period is 5 because MAE is/are not implemented for GPU


KeyboardInterrupt: 

In [34]:
# 5-fold cross-validation of a CatBoost regressor; MAE is reported in rubles.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
mae_train = []
mae_val = []

for fold, (train_idx, val_idx) in enumerate(kf.split(df), 1):
    train_df = df.iloc[train_idx]
    val_df = df.iloc[val_idx]

    X_train = train_df.drop(columns='log_price')
    y_train = train_df['log_price']

    X_val = val_df.drop(columns='log_price')
    y_val = val_df['log_price']

    # NOTE(review): in the recorded log the best iteration is the last one (999),
    # so the 1000-iteration budget at learning_rate=0.01 looks too small.
    model = CatBoostRegressor(
        iterations=1000,
        learning_rate=0.01,
        depth=8,
        loss_function='MAE',
        eval_metric='MAE',
        cat_features=categorical_cols,
        verbose=100,
        random_seed=42,
        early_stopping_rounds=100
    )

    # The validation fold also drives early stopping / best-model selection,
    # so the validation MAE below is slightly optimistic.
    model.fit(
        X_train, y_train,
        eval_set=(X_val, y_val),
        use_best_model=True
    )

    # The model predicts log-price; bring predictions back to rubles so they are
    # on the same scale as the expm1-transformed targets they are compared with.
    train_preds = np.expm1(model.predict(X_train))
    val_preds = np.expm1(model.predict(X_val))

    train_mae = mean_absolute_error(np.expm1(y_train), train_preds)
    val_mae = mean_absolute_error(np.expm1(y_val), val_preds)

    mae_train.append(train_mae)
    mae_val.append(val_mae)

    print(
        f'Fold {fold}: '
        f'Train MAE = {train_mae:,.0f}, '
        f'Val MAE = {val_mae:,.0f}'
    )

print(f'\nMean Train MAE: {np.mean(mae_train):,.0f}')
print(f'Mean Val MAE:   {np.mean(mae_val):,.0f}')
print(f'Std Val MAE:    {np.std(mae_val):,.0f}')

# Importances come from the model of the last fold only.
feature_importance = model.get_feature_importance(prettified=True)

0:	learn: 0.7449229	test: 0.7298585	best: 0.7298585 (0)	total: 153ms	remaining: 2m 32s
100:	learn: 0.3885204	test: 0.3789594	best: 0.3789594 (100)	total: 12s	remaining: 1m 46s
200:	learn: 0.2793670	test: 0.2717768	best: 0.2717768 (200)	total: 25.4s	remaining: 1m 41s
300:	learn: 0.2421601	test: 0.2358722	best: 0.2358722 (300)	total: 38.5s	remaining: 1m 29s
400:	learn: 0.2258840	test: 0.2206909	best: 0.2206909 (400)	total: 51.2s	remaining: 1m 16s
500:	learn: 0.2174949	test: 0.2131829	best: 0.2131829 (500)	total: 1m 4s	remaining: 1m 3s
600:	learn: 0.2119988	test: 0.2085108	best: 0.2085108 (600)	total: 1m 17s	remaining: 51.3s
700:	learn: 0.2082021	test: 0.2054041	best: 0.2054041 (700)	total: 1m 29s	remaining: 38.4s
800:	learn: 0.2048941	test: 0.2027867	best: 0.2027867 (800)	total: 1m 42s	remaining: 25.5s
900:	learn: 0.2020435	test: 0.2005508	best: 0.2005508 (900)	total: 1m 55s	remaining: 12.7s
999:	learn: 0.1994772	test: 0.1986313	best: 0.1986313 (999)	total: 2m 8s	remaining: 0us

bestTest