In [1]:
!pip install xgboost
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import json
import pickle
from functools import partial
from tqdm import tqdm

from sklearn.model_selection import KFold, cross_val_score, train_test_split, GridSearchCV
from sklearn.metrics import make_scorer
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import Ridge, Lasso, LinearRegression
from sklearn.base import clone

from catboost import CatBoostRegressor
import xgboost as xgb



In [2]:
#Настроим ноутбук
pd.options.display.float_format = '{:,.2f}'.format
plt.rcParams['figure.figsize'] = 12, 8
RANDOM_SEED=42

### Загрузим датасет из прошлой части

In [3]:
df = pd.read_csv('data2.csv')
df_train = df[df.is_train==1]
print(f"{df.shape=}")

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


df.shape=(71765, 693)


### Скопируем оттуда же набор функций для анализа
* fit_and_score_model - оценивает переданнух модель на кросс валидации, и выводит среднее значение метрики mape
* make_submit - делает предсказание модели на тестовых данныхЪ и сохраняет их в файл predict.csv
* prepare_data - подгаталивает тренировочный и тестовый набор (на основании переданного списка колонок)
* create_dataframe_from_json_column - создает датафрейм из json-колонки
* feature_enginering_add_equipment_features - добавляет к датасету список фич из признака equipment

In [38]:
def parse_dict_column(row, column_name):
    """Функция парсит значение в ячейке.
    Если это train строка то применяет функцию eval
    Если это test - то загружает методом json.loads
    Возвращает dict полученный из строкового представления.
    Или возвращает None если значение не указано, или словарь пустой"""
    if row[column_name] is np.nan:
        value = None
    elif row.is_train:
        value = eval(row[column_name])
    else:
        value = json.loads(row[column_name])
    
    return value if value else None

def create_dataframe_from_json_column(df, column_name):
    """Создает датафрейм из колонки, в которой храняться json(dict) параметров автомобиля """
    series_ = df.apply(partial(parse_dict_column, column_name=column_name), axis=1)
    dataframe_ = series_.apply(pd.Series)
    return dataframe_
equipment_dataframe = create_dataframe_from_json_column(df, 'equipment_dict')
equipment_dataframe.fillna(False, inplace=True)

def mape(y_true, y_pred):
    """оценка MEAN ABSOLUTE PERCENTAGE ERROR, которая используется в соревновании"""
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

    
def fit_and_score_model(model, X, y):
    """Оцениваем модель и данные на трех фолдах. Выводим средюю оценку, и оценки на фолдах"""
    
    kf = KFold(n_splits=3, shuffle=True, random_state=RANDOM_SEED)
    cv_results = cross_val_score(
        model, X, y, cv=kf, n_jobs=-1, scoring=make_scorer(mape))
    
    model.fit(X_train,y)
    train_score = mape(y, model.predict(X_train))
    
    print("#"*100)
    print(model)
    print(f"train shape {X.shape}")
    print(f"{cat_cols=}")
    print(f"{num_cols=}")
    print("#"*100)
    print(f"Train result: {train_score}")
    print(f'CV result: {np.mean(cv_results)} ({cv_results})')
    print("#"*100)


def make_submit_file(model, X_test, file_name='predict.csv'):
    """запишем предсказание модели в файл, готовый к отправке на kaggle"""
    pred = np.round(model.predict(X_test))
    answer = pd.Series(pred, index=df[~df.is_train].sell_id, name='price')
    answer.to_csv(file_name, index_label='sell_id')


def prepare_data(categorical_columns: list, numerical_columns: list):
    """Вытащим требуемые данные из глобальной переменной df
    к категориальным применим dummy кодирование
    к числовым Шкалирование"""
    X = df.loc[:, categorical_columns+numerical_columns+['is_train']]

    X = pd.get_dummies(X, columns=categorical_columns)
    X[numerical_columns] = StandardScaler().fit_transform(X[numerical_columns])
    
    X_train = X[X.is_train == 1].drop('is_train', axis=1)
    X_test = X[X.is_train == 0].drop('is_train', axis=1)

    y = df[df.is_train].price

    return X_train, X_test, y

def feature_enginering_add_equipment_features(df):
    """Добавим новые из колонки equipment_dict"""    
    return pd.concat([df, equipment_dataframe],
          axis=1)

In [7]:
cat_cols = ['bodyType', 'brand', 'fuelType', 'color', 'model_name',
            'vehicleTransmission', 'vendor', 'ПТС', 'Владельцы', 'Привод', 'Руль'] 
num_cols = ['productionDate', 'numberOfDoors',
            'enginePower_log', 'mileage_log', 'diff_date_model_production'] + list(equipment_dataframe.columns.values)

### Подготовим данные для моделей

In [6]:
X_train, X_test, y = prepare_data(cat_cols, num_cols)

### Случайный лес из 1000 деревьев

In [8]:
model = RandomForestRegressor(1000, random_state=RANDOM_STATE, verbose=True, n_jobs = -1)

In [9]:
%%time
fit_and_score_model(model, X_train, y)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:   10.6s
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:   54.9s
[Parallel(n_jobs=-1)]: Done 426 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done 776 tasks      | elapsed:  3.9min
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:  5.1min finished
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 176 tasks      | elapsed:    0.3s
[Parallel(n_jobs=12)]: Done 426 tasks      | elapsed:    0.9s
[Parallel(n_jobs=12)]: Done 776 tasks      | elapsed:    1.7s


####################################################################################################
RandomForestRegressor(n_estimators=1000, n_jobs=-1, random_state=42,
                      verbose=True)
train shape (37079, 1092)
cat_cols=['bodyType', 'brand', 'fuelType', 'color', 'model_name', 'vehicleTransmission', 'vendor', 'ПТС', 'Владельцы', 'Привод', 'Руль']
num_cols=['productionDate', 'numberOfDoors', 'enginePower_log', 'mileage_log', 'diff_date_model_production', 'engine-proof', 'tinted-glass', 'airbag-driver', 'aux', 'isofix', 'electro-window-front', 'ashtray-and-cigarette-lighter', 'airbag-passenger', 'computer', 'high-beam-assist', 'seat-transformation', 'isofix-front', 'wheel-power', 'alarm', 'lock', 'door-sill-panel', 'fabric-seats', 'electro-mirrors', 'airbag-rear-side', 'electro-window-back', 'steel-wheels', 'ptf', '16-inch-wheels', 'rain-sensor', 'airbag-side', 'audiosystem-cd', 'dark-interior', 'cooling-box', 'condition', 'abs', 'power-child-locks-rear-doors', 'front

[Parallel(n_jobs=12)]: Done 1000 out of 1000 | elapsed:    2.2s finished


In [10]:
pd.Series(model.feature_importances_,
          index=X_test.columns).sort_values(ascending=False).iloc[:20]

body-kit                     0.37
enginePower_log              0.28
productionDate               0.15
mileage_log                  0.04
228                          0.01
fuelType_бензин              0.01
model_name_W188              0.01
model_name_G-КЛАСС_AMG       0.01
fuelType_дизель              0.01
model_name_G-КЛАСС           0.01
model_name_V-КЛАСС           0.01
diff_date_model_production   0.00
model_name_S-КЛАСС           0.00
rear-camera                  0.00
feedback-alarm               0.00
19-inch-wheels               0.00
brand_MERCEDES               0.00
model_name_LAND_CRUISER      0.00
electro-trunk                0.00
301                          0.00
dtype: float64

In [11]:
make_submit_file(model, X_test, "6.1_random_forest_1000.csv") # 12.88

[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 176 tasks      | elapsed:    0.2s
[Parallel(n_jobs=12)]: Done 426 tasks      | elapsed:    0.6s
[Parallel(n_jobs=12)]: Done 776 tasks      | elapsed:    1.3s
[Parallel(n_jobs=12)]: Done 1000 out of 1000 | elapsed:    1.7s finished


### Случайный лес на 2000 деревьев

In [12]:
model = RandomForestRegressor(2000, random_state=RANDOM_STATE, verbose=True, n_jobs = -1)

In [13]:
fit_and_score_model(model, X_train, y)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:   11.0s
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:   56.5s
[Parallel(n_jobs=-1)]: Done 426 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done 776 tasks      | elapsed:  4.0min
[Parallel(n_jobs=-1)]: Done 1226 tasks      | elapsed:  6.3min
[Parallel(n_jobs=-1)]: Done 1776 tasks      | elapsed:  9.0min
[Parallel(n_jobs=-1)]: Done 2000 out of 2000 | elapsed: 10.2min finished
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 176 tasks      | elapsed:    0.3s
[Parallel(n_jobs=12)]: Done 426 tasks      | elapsed:    0.9s
[Parallel(n_jobs=12)]: Done 776 tasks      | elapsed:    1.7s
[Parallel(n_jobs=12)]: Done 1226 tasks      | elapsed:    2.8s
[Parallel(n_jobs=12)]: Done 1776 tasks      | elapsed:    4.0s


####################################################################################################
RandomForestRegressor(n_estimators=2000, n_jobs=-1, random_state=42,
                      verbose=True)
train shape (37079, 1092)
cat_cols=['bodyType', 'brand', 'fuelType', 'color', 'model_name', 'vehicleTransmission', 'vendor', 'ПТС', 'Владельцы', 'Привод', 'Руль']
num_cols=['productionDate', 'numberOfDoors', 'enginePower_log', 'mileage_log', 'diff_date_model_production', 'engine-proof', 'tinted-glass', 'airbag-driver', 'aux', 'isofix', 'electro-window-front', 'ashtray-and-cigarette-lighter', 'airbag-passenger', 'computer', 'high-beam-assist', 'seat-transformation', 'isofix-front', 'wheel-power', 'alarm', 'lock', 'door-sill-panel', 'fabric-seats', 'electro-mirrors', 'airbag-rear-side', 'electro-window-back', 'steel-wheels', 'ptf', '16-inch-wheels', 'rain-sensor', 'airbag-side', 'audiosystem-cd', 'dark-interior', 'cooling-box', 'condition', 'abs', 'power-child-locks-rear-doors', 'front

[Parallel(n_jobs=12)]: Done 2000 out of 2000 | elapsed:    4.5s finished


Случайный лес на 2000 деревьев показал схожий результат что и на 1000, при этом времени и ресурсов на него было затрачено больше. Вывод: не будем его сохранять.
### Подбор параметров для модели
Посмотрим какие параметры окажутся наиболее подходящими для случайного леса
* n_estimators — число «деревьев» в «случайном лесу».
* max_features — число признаков для выбора расщепления.
* max_depth — максимальная глубина деревьев.
* min_samples_split — минимальное число объектов, необходимое для того, * чтобы узел дерева мог бы расщепиться.
* min_samples_leaf — минимальное число объектов в листьях.
* bootstrap — использование для построения деревьев подвыборки с возвращением.

In [15]:
from sklearn.model_selection import RandomizedSearchCV
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 1000, num = 10)]
max_features = ['log2', 'sqrt']
max_depth = [int(x) for x in np.linspace(start = 1, stop = 15, num = 15)]
min_samples_split = [int(x) for x in np.linspace(start = 2, stop = 50, num = 10)]
min_samples_leaf = [int(x) for x in np.linspace(start = 2, stop = 50, num = 10)]
bootstrap = [True, False]
param_dist = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
model = RandomForestRegressor()
rs = RandomizedSearchCV(model, 
                        param_dist, 
                        n_iter = 100, 
                        cv = 3, 
                        verbose = 1, 
                        n_jobs=-1, 
                        random_state=42)

In [16]:
rs.fit(X_train, y)
rs.best_params_

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:  8.8min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed: 14.4min finished


{'n_estimators': 600,
 'min_samples_split': 23,
 'min_samples_leaf': 2,
 'max_features': 'sqrt',
 'max_depth': 15,
 'bootstrap': False}

Таким образом наилучшими параметрами для модели оказались:
* 'n_estimators': 600,
*  'min_samples_split': 23,
*  'min_samples_leaf': 2,
*  'max_features': 'sqrt',
*  'max_depth': 15,
*  'bootstrap': False   
Попробуем обучить модель на полученных гиперпараметрах

In [18]:
model = RandomForestRegressor(600,min_samples_split = 23,
                              min_samples_leaf = 2,
                              max_features = 'sqrt', max_depth = 15,
                              bootstrap = False, random_state=RANDOM_STATE,
                              verbose=True, n_jobs = -1)

In [19]:
%%time
fit_and_score_model(model, X_train, y)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    0.9s
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:    5.2s
[Parallel(n_jobs=-1)]: Done 426 tasks      | elapsed:   12.5s
[Parallel(n_jobs=-1)]: Done 600 out of 600 | elapsed:   17.5s finished
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 176 tasks      | elapsed:    0.1s


####################################################################################################
RandomForestRegressor(bootstrap=False, max_depth=15, max_features='sqrt',
                      min_samples_leaf=2, min_samples_split=23,
                      n_estimators=600, n_jobs=-1, random_state=42,
                      verbose=True)
train shape (37079, 1092)
cat_cols=['bodyType', 'brand', 'fuelType', 'color', 'model_name', 'vehicleTransmission', 'vendor', 'ПТС', 'Владельцы', 'Привод', 'Руль']
num_cols=['productionDate', 'numberOfDoors', 'enginePower_log', 'mileage_log', 'diff_date_model_production', 'engine-proof', 'tinted-glass', 'airbag-driver', 'aux', 'isofix', 'electro-window-front', 'ashtray-and-cigarette-lighter', 'airbag-passenger', 'computer', 'high-beam-assist', 'seat-transformation', 'isofix-front', 'wheel-power', 'alarm', 'lock', 'door-sill-panel', 'fabric-seats', 'electro-mirrors', 'airbag-rear-side', 'electro-window-back', 'steel-wheels', 'ptf', '16-inch-wheels', '

[Parallel(n_jobs=12)]: Done 426 tasks      | elapsed:    0.3s
[Parallel(n_jobs=12)]: Done 600 out of 600 | elapsed:    0.5s finished


## Используем в качестве модели CatBoost

In [8]:
X = df.loc[:, cat_cols+num_cols+['is_train']]
X_train = X[X.is_train == 1].drop('is_train', axis=1)
X_sub = X[X.is_train == 0].drop('is_train', axis=1)
y = df[df.is_train].price

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X_train, y, test_size=0.2, shuffle=True, random_state=RANDOM_SEED)

In [25]:
model = CatBoostRegressor(iterations=5000,
                          random_seed=RANDOM_SEED,
                          eval_metric='MAPE',
                          custom_metric=['R2', 'MAE'],
                          silent=True,)

In [26]:
%%time
model.fit(X_train, y_train,
         cat_features=cat_cols,
         eval_set=(X_test, y_test),
         verbose_eval=0,
         use_best_model=True,
         #plot=True
         )

Wall time: 4min 20s


<catboost.core.CatBoostRegressor at 0x23c11f7bb20>

In [27]:
pred = model.predict(X_test)

In [28]:
def mape(y_true, y_pred):
    """оценка MEAN ABSOLUTE PERCENTAGE ERROR, которая используется в соревновании"""
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

In [29]:
mape(y_test,pred)

12.524124588881538

Результат получился хуже, чем при модели случайного леса. Но для случайного леса свойственно переобучение, попробую обе модели в соревновании.

In [32]:
model.save_model('catboost.model')

### XGBOOST

In [39]:
X_train, X_test, y = prepare_data(cat_cols, num_cols)

In [40]:
model = xgb.XGBRegressor(objective='reg:squarederror', 
                          colsample_bytree=0.5,
                          learning_rate=0.1, 
                          max_depth=12, 
                          alpha=1,
                          n_estimators=2000,
                          n_jobs = -1,
                          silent=1)

In [41]:
%%time
fit_and_score_model(model, X_train, y)

Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


####################################################################################################
XGBRegressor(alpha=1, base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.5, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.1, max_delta_step=0, max_depth=12,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=2000, n_jobs=-1, num_parallel_tree=1, random_state=0,
             reg_alpha=1, reg_lambda=1, scale_pos_weight=1, silent=1,
             subsample=1, tree_method='exact', validate_parameters=1,
             verbosity=None)
train shape (3

### Точность хуже чем при предыдущих моделях
Попробуем применить ансамбль

In [42]:
def compute_meta_feature(clf, X_train, X_test, y_train, cv):    
    X_meta_train = np.zeros((len(y_train)), dtype=np.float32)

    splits = cv.split(X_train)
    
    for train_fold_index, predict_fold_index in splits:
        X_fold_train, X_fold_predict = X_train[train_fold_index], X_train[predict_fold_index]
        y_fold_train = y_train[train_fold_index]
        
        folded_clf = clone(clf)
        folded_clf.fit(X_fold_train, y_fold_train)
        
        X_meta_train[predict_fold_index] = folded_clf.predict(X_fold_predict)
    
    meta_clf = clone(clf)
    meta_clf.fit(X_train, y_train)
    
    X_meta_test = meta_clf.predict(X_test)
    
    return X_meta_train, X_meta_test

def generate_meta_features(classifiers, X_train, X_test, y_train, cv):
    
    features = [
        compute_meta_feature(clf, X_train, X_test, y_train, cv)
        for clf in tqdm(classifiers)
    ]
    
    stacked_features_train = np.hstack([
        features_train for features_train, features_test in features
    ]).reshape(-1,len(features), order='F')

    stacked_features_test = np.hstack([
        features_test for features_train, features_test in features
    ]).reshape(-1,len(features), order='F')
    
    return stacked_features_train, stacked_features_test

In [43]:
cv = KFold(n_splits=3, shuffle=True, random_state=RANDOM_SEED)

In [None]:
%%time
stacked_features_train, stacked_features_test = generate_meta_features([
    RandomForestRegressor(n_estimators=100, n_jobs=-1, random_state=RANDOM_SEED),
    ExtraTreesRegressor(n_estimators=100, random_state=RANDOM_SEED),
    Ridge(random_state=RANDOM_SEED),
    GradientBoostingRegressor(n_estimators=100, random_state=RANDOM_SEED),
    CatBoostRegressor(random_state=RANDOM_SEED)
], X_train.values, X_test.values, y.values, cv)

 20%|████████████████▌                                                                  | 1/5 [02:01<08:06, 121.53s/it]