## 1. Импорт библиотек

In [1]:
import re
import time
import warnings
from datetime import date, datetime
from itertools import combinations
from statistics import mode, multimode

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import scipy.stats as sst
import seaborn as sns
import statsmodels.api as sm
import xgboost as xgb
from bs4 import BeautifulSoup
from catboost import CatBoostRegressor
from scipy.stats import ttest_ind, pearsonr, f_oneway

from sklearn.feature_selection import f_classif, mutual_info_classif
from sklearn.preprocessing import (LabelEncoder, OneHotEncoder, StandardScaler,
                                   RobustScaler, MinMaxScaler, KBinsDiscretizer)

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression

from sklearn.ensemble import (ExtraTreesRegressor, RandomForestRegressor,
                              GradientBoostingRegressor, StackingRegressor)

from sklearn.metrics import (confusion_matrix, auc, roc_auc_score, roc_curve,
                             accuracy_score, precision_score, recall_score,
                             f1_score, precision_recall_curve,
                             average_precision_score, make_scorer)

%matplotlib inline
warnings.filterwarnings('ignore')

## 2. Определение функций

In [2]:
# MAPE
def mape(y_true, y_pred):
    """Return the mean absolute percentage error, expressed in percent.

    Parameters
    ----------
    y_true : array-like
        Observed target values. Each absolute error is divided by the
        matching element of this argument.
    y_pred : array-like
        Predicted values, broadcastable against ``y_true``.

    Returns
    -------
    numpy floating scalar
        ``mean(|(y_true - y_pred) / y_true| * 100)``.
    """
    actual = np.array(y_true)
    predicted = np.array(y_pred)
    # Scale to percent before averaging, in the same order as the formula above.
    pct_errors = np.abs((actual - predicted) / actual) * 100
    return np.mean(pct_errors)

## 3. Импорт данных

In [3]:
# Load the prepared tables: the full dataset and, judging by the file names,
# its log-transformed and scaled variants. Paths are relative to the notebook.
data, data_log, data_scaled = (
    pd.read_excel(workbook)
    for workbook in ('data_all.xlsx', 'data_log.xlsx', 'data_scaled.xlsx')
)
# The discretised variant is currently disabled:
# data_disc = pd.read_excel('data_disc.xlsx')

In [4]:
# Load the `sell_id` table; it is merged with the predictions when the
# submission file is built at the end of the notebook.
sell_id = pd.read_excel('sell_id.xlsx')

In [5]:
# Summary statistics of the identifier table.
# NOTE(review): the minimum sell_id is several orders of magnitude below the
# quartiles in the output below — confirm that value is a valid identifier.
sell_id.describe()

Unnamed: 0.1,Unnamed: 0,sell_id
count,34686.0,34686.0
mean,17342.5,1098300000.0
std,10013.130055,19112250.0
min,0.0,2665.0
25%,8671.25,1099049000.0
50%,17342.5,1100911000.0
75%,26013.75,1101245000.0
max,34685.0,1101375000.0


In [6]:
# Column the models are trained to predict.
target = 'price'

# Categorical features; these are one-hot encoded before modelling.
cat_cols = [
    'bodyType',
    'brand',
    'color',
    'description',
    'fuelType',
    'model_name',
    'vehicleTransmission',
    'Владельцы',
    'Привод',
    'vendor',
    'ПТС',
    'Руль',
]

# Numeric features.
num_cols = [
    'engineDisplacement',
    'enginePower',
    'mileage',
    'age',
    'Владение',
]

In [7]:
# One-hot encode every categorical feature into indicator columns.
categorical_part = data[cat_cols]
data_cat = pd.get_dummies(categorical_part, columns=cat_cols)
data_cat.shape

(153531, 788)

### 9. Моделирование

#### 9.1. Подготовка комбинаций датафреймов с различными вариантами обработки числовых признаков

In [8]:
# Append the target and the `flag` column to the scaled numeric features,
# then append the one-hot encoded categorical columns.
# NOTE: re-running this cell appends the same columns again; reload
# data_scaled before executing it a second time.
data_scaled = pd.concat([data_scaled, data[target], data['flag']], axis='columns')
data_scaled = pd.concat([data_scaled, data_cat], axis='columns')

In [9]:
# Same assembly for the log variant: target and `flag` first, then the
# one-hot encoded categorical columns.
# NOTE: re-running this cell appends the same columns again; reload
# data_log before executing it a second time.
data_log = pd.concat([data_log, data[target], data['flag']], axis='columns')
data_log = pd.concat([data_log, data_cat], axis='columns')

In [10]:
# Disabled: the discretised variant is not assembled because the line that
# loads data_disc is commented out in the data-import cell.
# data_disc = pd.concat([data_disc, data[target], data.flag], axis = 1)
# data_disc = pd.concat([data_disc, data_cat], axis = 1)

In [13]:
# Remove the 'Unnamed: 0' column (presumably the index saved by the Excel
# export — confirm). The cell raises KeyError if run twice.
data_scaled = data_scaled.drop(columns=['Unnamed: 0'])

In [14]:
# Preview the first rows of the assembled frame. A DataFrame is not callable,
# so `data_scaled()` raises TypeError; `.head()` yields the five-row preview
# shown in the output below.
data_scaled.head()

Unnamed: 0,engineDisplacement,enginePower,mileage,age,Владение,price,flag,bodyType_внедорожник 3 дв.,bodyType_внедорожник 5 дв.,bodyType_внедорожник открытый,...,Владельцы_3,Привод_задний,Привод_передний,Привод_полный,vendor_EUROPEAN,vendor_JAPANESE,ПТС_Дубликат,ПТС_Оригинал,Руль_Левый,Руль_Правый
0,0.151515,0.123153,0.147998,0.171429,0.695122,,0,0,0,0,...,1,0,1,0,1,0,0,1,1,0
1,0.272727,0.131363,0.121124,0.085714,0.609756,,0,0,0,0,...,0,0,1,0,1,0,0,1,1,0
2,0.333333,0.200328,0.175998,0.171429,0.695122,,0,0,0,0,...,0,0,1,0,1,0,0,1,1,0
3,0.272727,0.131363,0.189998,0.171429,0.695122,,0,0,0,0,...,0,0,1,0,1,0,0,1,1,0
4,0.333333,0.200328,0.11707,0.228571,0.678862,,0,0,0,0,...,0,0,1,0,1,0,0,1,1,0


#### 9.2. Extra Trees (ExtraTreesRegressor)

In [33]:
# Train/validation split of data_scaled: keep rows with flag == 1 and
# discard any row that contains a missing value.
data_fit = data_scaled.loc[data_scaled['flag'].eq(1)].dropna()
y = data_fit[target]
X = data_fit.drop(columns=[target])

# 80/20 shuffled split with a fixed seed so the partition is reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, shuffle=True, random_state=0)

In [35]:
# Train/validation split of data_scaled: keep rows with flag == 1 and
# discard any row that contains a missing value.
data_fit = data_scaled.loc[data_scaled['flag'].eq(1)].dropna()
y = data_fit[target]
X = data_fit.drop(columns=[target])

X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, shuffle=True, random_state=0)

# Fit and predict. Note that `rf` holds an ExtraTreesRegressor, not a
# RandomForestRegressor.
rf = ExtraTreesRegressor(n_estimators=300, bootstrap=True, n_jobs=-1,
                         random_state=0, verbose=1)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_val)

# Validation error in percent.
MAPE = mape(y_val, y_pred)
print(f'Mean Absolute Percentage Error: {MAPE}')
# MAPE = 15.299

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=-1)]: Done  44 tasks      | elapsed:  3.2min
[Parallel(n_jobs=-1)]: Done 194 tasks      | elapsed: 15.4min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed: 23.8min finished
[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    1.9s
[Parallel(n_jobs=3)]: Done 194 tasks      | elapsed:   14.2s


Mean Absolute Percentage Error: 15.299080423661566


[Parallel(n_jobs=3)]: Done 300 out of 300 | elapsed:   22.5s finished


#### 9.3. CatBoost

In [None]:
# Train/validation split of data_scaled: keep rows with flag == 1 and
# discard any row that contains a missing value.
data_fit = data_scaled.loc[data_scaled['flag'].eq(1)].dropna()
y = data_fit[target]
X = data_fit.drop(columns=[target])

# 80/20 shuffled split with a fixed seed so the partition is reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, shuffle=True, random_state=0)

In [None]:
# Training: tune a CatBoost regressor with CatBoost's built-in grid search.
cbr = CatBoostRegressor()

# Grid search space
param_grid = {'iterations': [500],
              'learning_rate': [0.01, 0.1, 0.25, 0.5],
              'l2_leaf_reg': [1, 3, 5, 10],
              'depth': np.arange(10, 14),
              'thread_count': [4]}

# Search and refit on the training split only. Passing the full X, y together
# with refit=True would train the final model on the X_val rows as well, so
# the validation MAPE computed afterwards would no longer be an out-of-sample
# estimate.
cbr.grid_search(param_grid=param_grid,
                X=X_train,
                y=y_train,
                cv=5,
                partition_random_seed=0,
                calc_cv_statistics=True,
                search_by_train_test_split=True,
                refit=True,
                shuffle=True,
                stratified=None,
                train_size=0.8,
                verbose=True,
                plot=True)

# Alternative scikit-learn search object; it is only constructed here, never
# fitted. `scoring` needs a scorer, not a bare metric function, and MAPE is a
# loss, hence greater_is_better=False.
grid_search = GridSearchCV(cbr, param_grid, n_jobs=-1,
                           scoring=make_scorer(mape, greater_is_better=False),
                           cv=5, refit=True,
                           return_train_score=True, verbose=5)

In [None]:
# Best parameters.
# NOTE(review): get_params() returns the parameters currently set on `cbr`;
# presumably these are the winning values after grid_search(refit=True) —
# confirm against the dictionary returned by grid_search.
cbr.get_params()

In [None]:
# Validation MAPE (in percent) of the tuned CatBoost model.
MAPE = mape(y_val, cbr.predict(X_val))
print(f'Mean Absolute Percentage Error: {MAPE}')
# MAPE = 14.2198 (value noted by the author from an earlier run; re-run to refresh)

#### 9.4. XGBoost

In [None]:
# Training: XGBoost regressor on the current train/validation split.
# `verbosity=0` replaces the removed `silent=1` flag, and `reg_alpha` is the
# documented scikit-learn-API name for the L1 term previously passed as `alpha`.
xg_reg = xgb.XGBRegressor(objective='reg:squarederror', colsample_bytree=0.5,
                          learning_rate=0.05, max_depth=12, reg_alpha=1,
                          n_estimators=1000, verbosity=0)
xg_reg.fit(X_train, y_train)
# Predictions on the validation split (name kept as in the original cell).
xg_red_pred = xg_reg.predict(X_val)

In [None]:
# Cross-validation (disabled).
# NOTE(review): `data_dmatrix` is not defined anywhere in the visible notebook;
# it would have to be built before this block can be re-enabled.
# params = {'objective': 'reg:squarederror',
#           'colsample_bytree': 0.5,
#           'learning_rate': 0.05,
#           'max_depth': 10, 
#           'alpha': 1,
#           'n_estimators': 1000}

# cv_results = xgb.cv(dtrain=data_dmatrix, params=params, nfold=5,
#                     num_boost_round=1000, early_stopping_rounds=10,
#                     metrics="rmse", as_pandas=True, seed=0)

In [None]:
# Validation MAPE (in percent) of the XGBoost model.
MAPE = mape(y_val, xg_reg.predict(X_val))
print(f'Mean Absolute Percentage Error: {MAPE}')
# MAPE = 15.645 (value noted by the author from an earlier run; re-run to refresh)

#### 9.5. Создание submission для выгрузки на kaggle

In [None]:
# Prediction of the selected model on the rows to be submitted (flag == 0).
X_test = data_scaled[data_scaled['flag'] == 0].drop([target], axis=1)
# Predict on X_test, not X_val: the submission needs one price per test row,
# whereas X_val is the validation part of the training data.
Y_pred = rf.predict(X_test)
X_test.shape

In [None]:
# Wrap the predictions in a single-column frame named 'price'.
Y_test = pd.DataFrame(data=Y_pred, columns=['price'])
# Attach sell_id by row position (index-to-index merge, keeping every
# prediction row). This assumes sell_id rows are in the same order as the
# predicted rows — TODO confirm.
# NOTE(review): this rebinds `data`, which held the full dataset loaded at the
# top of the notebook; earlier cells that read `data` cannot be re-run after
# this point without reloading it.
data = sell_id.merge(Y_test, left_index=True, right_index=True, how='right')
# Drop the 'Unnamed: 0' column that came with the sell_id table.
data = data.drop(['Unnamed: 0'], axis=1)
data.head()

In [None]:
# Sanity check: the number of rows should match X_test.shape[0] shown above.
data.shape

In [None]:
# Write the submission as CSV text without the DataFrame index.
# NOTE(review): the output name has no `.csv` extension — confirm this is intended.
data.to_csv(path_or_buf='submission_1.0', index=False)

#### 9.6. Выводы
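Лучший результат на тестовой выборке (Kaggle) получен с использованием XGBoost на нормализованных признаках: MAPE = 24.17. На валидационной выборке лучший MAPE показал CatBoost (14.22), далее ExtraTrees (15.30) и XGBoost (15.65).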