In [1]:
import pandas as pd
import numpy as np
from copy import copy

from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.linear_model import ElasticNet, LinearRegression, Ridge, Lasso
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, StackingRegressor

from sklearn.model_selection import train_test_split, GridSearchCV, RepeatedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import make_scorer

In [2]:
train = pd.read_csv('train_data.csv')
kaggle = pd.read_csv('test_data.csv')

In [3]:
train.head()

Unnamed: 0,bodyType,brand,color,complectation_dict,engineDisplacement,enginePower,equipment_dict,fuelType,mileage,modelDate,...,sell_id,vehicleTransmission,vendor,owners_number,pts,gear_type,steering_wheel,price,gearbox,years
0,58,1,13,1,2.0,249,1,0,60000,2016,...,1101602743,0,0,0,1,2,0,3088888.0,1,3
1,2,1,15,0,4.4,555,1,0,89000,2009,...,1102385278,0,0,2,1,2,0,1680000.0,1,9
2,2,1,15,0,3.0,313,1,3,87000,2014,...,1101787929,0,0,0,1,2,0,3990000.0,1,3
3,2,1,1,0,3.0,286,1,3,167302,2007,...,1101434027,0,0,2,1,2,0,1365000.0,1,11
4,2,1,12,1,3.0,306,1,0,94846,2013,...,1101592048,0,0,2,1,2,0,2295000.0,1,5


In [4]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22347 entries, 0 to 22346
Data columns (total 22 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   bodyType             22347 non-null  int64  
 1   brand                22347 non-null  int64  
 2   color                22347 non-null  int64  
 3   complectation_dict   22347 non-null  int64  
 4   engineDisplacement   22347 non-null  float64
 5   enginePower          22347 non-null  int64  
 6   equipment_dict       22347 non-null  int64  
 7   fuelType             22347 non-null  int64  
 8   mileage              22347 non-null  int64  
 9   modelDate            22347 non-null  int64  
 10  model_name           22347 non-null  int64  
 11  numberOfDoors        22347 non-null  int64  
 12  sell_id              22347 non-null  int64  
 13  vehicleTransmission  22347 non-null  int64  
 14  vendor               22347 non-null  int64  
 15  owners_number        22347 non-null 

In [5]:
train.describe()

Unnamed: 0,bodyType,brand,color,complectation_dict,engineDisplacement,enginePower,equipment_dict,fuelType,mileage,modelDate,...,sell_id,vehicleTransmission,vendor,owners_number,pts,gear_type,steering_wheel,price,gearbox,years
count,22347.0,22347.0,22347.0,22347.0,22347.0,22347.0,22347.0,22347.0,22347.0,22347.0,...,22347.0,22347.0,22347.0,22347.0,22347.0,22347.0,22347.0,22347.0,22347.0,22347.0
mean,33.957802,5.617577,9.583792,0.190853,2.42582,195.467445,0.811339,0.601289,145684.540296,2008.775406,...,1100301000.0,0.808744,0.33745,1.075267,0.882758,1.322638,0.029982,1410663.0,1.265673,10.100595
std,29.366156,3.435211,5.561925,0.392983,0.968769,86.866785,0.391248,1.199426,95704.631981,7.124645,...,15443990.0,1.122071,0.47285,0.847163,0.321715,0.711233,0.17054,1121662.0,0.869692,6.737997
min,0.0,0.0,0.0,0.0,0.6,7.0,0.0,0.0,1.0,1936.0,...,2665.0,0.0,0.0,0.0,0.0,0.0,0.0,30000.0,0.0,1.0
25%,2.0,2.0,4.0,0.0,1.8,140.0,1.0,0.0,75600.0,2006.0,...,1101566000.0,0.0,0.0,0.0,1.0,1.0,0.0,620000.0,1.0,5.0
50%,31.0,6.0,12.0,0.0,2.0,180.0,1.0,0.0,130333.0,2010.0,...,1102118000.0,0.0,0.0,1.0,1.0,1.0,0.0,1075000.0,1.0,9.0
75%,58.0,9.0,15.0,0.0,3.0,245.0,1.0,0.0,195000.0,2014.0,...,1102335000.0,2.0,1.0,2.0,1.0,2.0,0.0,1855000.0,2.0,13.0
max,87.0,11.0,15.0,1.0,6.6,612.0,1.0,4.0,1000000.0,2020.0,...,1102448000.0,3.0,1.0,2.0,1.0,2.0,1.0,6000000.0,3.0,84.0


In [6]:
train.nunique()

bodyType                  85
brand                     12
color                     16
complectation_dict         2
engineDisplacement        54
enginePower              288
equipment_dict             2
fuelType                   5
mileage                 8660
modelDate                 60
model_name               484
numberOfDoors              4
sell_id                21558
vehicleTransmission        4
vendor                     2
owners_number              3
pts                        2
gear_type                  3
steering_wheel             2
price                   2761
gearbox                    4
years                     59
dtype: int64

In [7]:
## Выделим числовые признаки, бинарные и категориальные.
## Последние нужны, в частности, для того, чтобы передать их CatBoost.

In [8]:
num_features = ['enginePower', 'mileage', 'modelDate', 'owners_number', 'years']
cat_features = ['bodyType', 'brand', 'color', 'engineDisplacement', 'fuelType', 'numberOfDoors', 'vehicleTransmission', 'gear_type', 'gearbox']
bin_features = ['complectation_dict', 'equipment_dict', 'vendor', 'pts', 'steering_wheel']

# Разбиение на трейн и валидацию

In [9]:
y = np.log(train.price.values)
X = train.drop(['price', 'sell_id'], axis=1)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.01, random_state=42)

In [10]:
# std_scaler_obj = StandardScaler()
# X_train = std_scaler_obj.fit_transform(X_train)
# X_val = std_scaler_obj.transform(X_val)


# std_scaler_lab = StandardScaler()
# y_train = std_scaler_lab.fit_transform(y_train)
# y_val = std_scaler_lab.transform(y_val)

# Тренировка моделей

In [11]:
# Напишем нашу метрику в виде функции
def mape(y_true, y_pred):
    y_true = np.exp(y_true)
    y_pred = np.exp(y_pred)
    return np.mean(np.abs((y_pred - y_true) / y_true)) * 100

MAPE_scorer = make_scorer(score_func=mape, greater_is_better=False)

In [12]:
# Начнём с простого -- обучим каждую модель в отдельности и найдём лучшие гиперпараметры
# Напишем функцию, которая на вход будет принимать модель и словарь гиперпараметров

def fit_best(model, param_grid):
    cv = RepeatedKFold(n_repeats=3)
    gs_model = GridSearchCV(estimator=model, param_grid=param_grid, scoring=MAPE_scorer, cv=cv, n_jobs=-1)
    if type(model).__name__ == 'CatBoostRegressor':
        X_train_aux = copy(X_train)
        X_val_aux = copy(X_val)
        X_train_aux[cat_features] = X_train_aux[cat_features].astype(str).values
        X_val_aux[cat_features] = X_val_aux[cat_features].astype(str).values
    else:
        X_train_aux = X_train.values
        X_val_aux = X_val.values
    gs_model.fit(X_train_aux, y_train)
    print(f'Best {type(model).__name__} has cv MAPE of {-gs_model.best_score_}')
    print(f'Best {type(model).__name__} has {gs_model.best_params_}')
    print(f'Validation score is {mape(y_val, gs_model.best_estimator_.predict(X_val_aux))}')

### ElasticNet -- смесь l1 и l2 регуляризаций

In [13]:
fit_best(model=ElasticNet(max_iter=3000),
               param_grid={'alpha': np.logspace(1e-2, 1e2, 4),
                           'l1_ratio': np.linspace(0.1, 0.9, 4)
                          }
              )

Best ElasticNet has cv MAPE of 40.06288728542806
Best ElasticNet has {'alpha': 1.023292992280754, 'l1_ratio': 1.0}
Validation score is 48.663961934591384


### RandomForestRegressor

In [14]:
fit_best(model=RandomForestRegressor(n_jobs=-1),
               param_grid={
                   'criterion': ['mse', 'mae'],
                   'n_estimators': [100, 250],
                   'min_samples_leaf': [10, 20]
                          }
              )

Best RandomForestRegressor has cv MAPE of 14.023459110254782
Best RandomForestRegressor has {'criterion': 'mse', 'min_samples_leaf': 10, 'n_estimators': 250}
Validation score is 15.69045063049733


# Boosting

### CatBoost


In [15]:
fit_best(model=CatBoostRegressor(verbose=0, loss_function='MAE', cat_features=cat_features),
               param_grid={
                   'n_estimators': [500, 1000],
                   'max_depth': [3, 5, 7],
                   'eta': [None, 0.01, 0.1, 1.]
               }
              )

Best CatBoostRegressor has cv MAPE of 12.441821836943147
Best CatBoostRegressor has {'eta': 0.1, 'max_depth': 7, 'n_estimators': 1000}
Validation score is 15.569577872920744


### LightGBM

In [16]:
fit_best(model=LGBMRegressor(),
               param_grid={
                   'n_estimators': [500, 1000],
                   'max_depth': [3, 5, 7],
                   'learning_rate': [0.01, 0.1, 1.],
               }
        )

Best LGBMRegressor has cv MAPE of 12.318309064688878
Best LGBMRegressor has {'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 1000}
Validation score is 13.539902310813984


### XGBoost

In [17]:
fit_best(model=XGBRegressor(verbosity=0),
               param_grid={
                   'n_estimators': [500, 1000],
                   'max_depth': [3, 5, 7],
                   'learning_rate': [0.01, 0.1, 1.],
               }
        )

Best XGBRegressor has cv MAPE of 12.072597455216158
Best XGBRegressor has {'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 1000}
Validation score is 13.129986403989873


# Stacking

Теперь обучим несколько базовых моделей, используя те лучшие конфигурации гиперпараметров, которые мы обнаружили. А дальше, поверх них, добавим мета-модель

In [16]:
stacked_model = StackingRegressor(estimators=[
    ('xgboost', XGBRegressor(verbosity=0, n_estimators=1000, max_depth=7, learning_rate=0.1)),
    ('lgbm', LGBMRegressor(n_estimators=1000, max_depth=7, learning_rate=0.1)),
    ('rf', RandomForestRegressor(n_estimators=250, criterion='mse', min_samples_leaf=10)),
    ('catboost', CatBoostRegressor(verbose=0, loss_function='MAE', n_estimators=1000, max_depth=7, eta=0.1))],
                                  final_estimator=Ridge(),
                                  cv=4
                                )

In [17]:
fit_best(model=stacked_model,
         param_grid={
        'final_estimator__fit_intercept': [True, False],
        'final_estimator__alpha': np.logspace(1e-2, 1e2, 4),
        'xgboost__n_estimators': [100, 150, 1000],
         }
        )

Best StackingRegressor has cv MAPE of 11.738248830460137
Best StackingRegressor has {}
Validation score is 13.515857640161283
