In [1]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import KFold
from lightgbm.sklearn import LGBMRegressor
from xgboost.sklearn import XGBRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
import time
from sklearn.model_selection import GridSearchCV
import datetime
from catboost import CatBoostRegressor
from sklearn.model_selection import cross_val_score
import pickle

In [2]:
# sample的

In [82]:
df1 = pd.read_csv('./data/sample_data.gz')

In [83]:
test = df1[df1['price'].isnull()]
X_train = df1[df1['price'].notnull()].drop(['price','regDate','creatDate','SaleID','regionCode'],axis=1)
Y_train = df1[df1['price'].notnull()]['price']
X_test = df1[df1['price'].isnull()].drop(['price','regDate','creatDate','SaleID','regionCode'],axis=1)
print("test information:")
X_test.info()
print("X_train information:")
X_train.info()

test information:
<class 'pandas.core.frame.DataFrame'>
Int64Index: 50000 entries, 150000 to 199999
Columns: 139 entries, model to new14*year
dtypes: float64(104), int64(35)
memory usage: 53.4 MB
X_train information:
<class 'pandas.core.frame.DataFrame'>
Int64Index: 150000 entries, 0 to 149999
Columns: 139 entries, model to new14*year
dtypes: float64(104), int64(35)
memory usage: 160.2 MB


In [5]:
X_train.columns

Index(['model', 'brand', 'bodyType', 'fuelType', 'gearbox', 'power',
       'kilometer', 'notRepairedDamage', 'v_0', 'v_1',
       ...
       'new5*year', 'new6*year', 'new7*year', 'new8*year', 'new9*year',
       'new10*year', 'new11*year', 'new12*year', 'new13*year', 'new14*year'],
      dtype='object', length=139)

In [6]:
# light GBM grid search

param = {}
param['gpu_id'] = 0
param['tree_method'] = 'gpu_hist'
lgb = LGBMRegressor(
    n_estimators=1000,
    
    boosting_type= 'gbdt',
    objective = 'regression_l1',
    num_leaves=31,
    min_child_samples = 20,
    feature_fraction = 0.8,
    bagging_freq = 1,
    bagging_fraction = 0.8,
    lambda_l2 = 2,
    random_state=2020,
    metric='mae',
    device = 'gpu'
)

param_grid_gb = {
        'depth': [6, 7, 8],
        'learning_rate': [0.1,0.3]
    }
gs_lgb = GridSearchCV(lgb, param_grid_gb)


skf = KFold(n_splits=5)

params_list = []

lgb_mae = 0
sub_lgb = 0
#交叉验证
for i, (trn_idx, val_idx) in enumerate(skf.split(X_train, Y_train)):
    print('--------------------- {} fold ---------------------'.format(i+1))
    print(datetime.datetime.now())
    trn_x, trn_y = X_train.iloc[trn_idx].reset_index(drop=True), Y_train[trn_idx]
    val_x, val_y = X_train.iloc[val_idx].reset_index(drop=True), Y_train[val_idx]
    # fit函数参数：eval_set=[(x_test,y_test)]  评估数据集,list类型,eval_set用来作为早期停止的验证集，一般我们放x_test和y_test
    # fit函数参数：eval_metric="mlogloss"      评估标准(多分类问题，使用mlogloss作为损失函数)
    # fit函数参数：early_stopping_rounds= 10   如果模型的loss十次内没有减小，则提前结束模型训练
    # fit函数参数：verbose = True              True显示，False不显示
    #import pdb;pdb.set_trace()
    gs_lgb.fit(
        trn_x, trn_y,
        eval_set=[(val_x, val_y)], 
                eval_metric='mae',
        early_stopping_rounds=300,
        #verbose_eval=300
        verbose=False
    )
    
    params_list.append(gs_lgb.best_params_)

    sub_lgb += np.expm1(gs_lgb.predict(X_test)) / skf.n_splits
    val_lgb = gs_lgb.predict(val_x)
    print('val mae:', mean_absolute_error(np.expm1(val_y), np.expm1(val_lgb)))
    lgb_mae += mean_absolute_error(np.expm1(val_y), np.expm1(val_lgb))/skf.n_splits
    print(datetime.datetime.now())
    print(params_list)
print('MAE of val with lgb:', lgb_mae)

--------------------- 1 fold ---------------------
2020-08-27 04:22:37.455078
val mae: 494.53909542936333
2020-08-27 05:01:16.009369
[{'depth': 6, 'learning_rate': 0.1}]
--------------------- 2 fold ---------------------
2020-08-27 05:01:16.013357
val mae: 496.5083744920362
2020-08-27 05:39:55.225751
[{'depth': 6, 'learning_rate': 0.1}, {'depth': 6, 'learning_rate': 0.1}]
--------------------- 3 fold ---------------------
2020-08-27 05:39:55.230736
val mae: 503.20864604908263
2020-08-27 06:18:49.020245
[{'depth': 6, 'learning_rate': 0.1}, {'depth': 6, 'learning_rate': 0.1}, {'depth': 6, 'learning_rate': 0.1}]
--------------------- 4 fold ---------------------
2020-08-27 06:18:49.025233
val mae: 493.8136790087865
2020-08-27 06:57:52.049557
[{'depth': 6, 'learning_rate': 0.1}, {'depth': 6, 'learning_rate': 0.1}, {'depth': 6, 'learning_rate': 0.1}, {'depth': 6, 'learning_rate': 0.1}]
--------------------- 5 fold ---------------------
2020-08-27 06:57:52.053546
val mae: 494.17437452166183


In [7]:
# Catboost grid search
skf = KFold(n_splits=5)
cat = CatBoostRegressor(loss_function='RMSE',task_type='GPU', 
#                         cat_features = ['brand','model','kilometer','fuelType','bodyType'],
                       early_stopping_rounds=300, n_estimators=3000
                       )
param_grid_cat = {
        'depth': [6, 7, 8]
    }

param_list_cat = []

gs_cat = GridSearchCV(cat, param_grid_cat)
cat_mae = 0
sub_cat = 0
for i, (trn_idx, val_idx) in enumerate(skf.split(X_train, Y_train)):
    print('--------------------- {} fold ---------------------'.format(i+1))
    print(datetime.datetime.now())
    trn_x, trn_y = X_train.iloc[trn_idx].reset_index(drop=True), Y_train[trn_idx]
    val_x, val_y = X_train.iloc[val_idx].reset_index(drop=True), Y_train[val_idx]
    # fit函数参数：eval_set=[(x_test,y_test)]  评估数据集,list类型,eval_set用来作为早期停止的验证集，一般我们放x_test和y_test
    # fit函数参数：eval_metric="mlogloss"      评估标准(多分类问题，使用mlogloss作为损失函数)
    # fit函数参数：early_stopping_rounds= 10   如果模型的loss十次内没有减小，则提前结束模型训练
    # fit函数参数：verbose = True              True显示，False不显示
    gs_cat.fit(
        trn_x, trn_y,
        verbose=False
    )
    sub_cat += np.expm1(gs_cat.predict(X_test)) / skf.n_splits
    val_cat= gs_cat.predict(val_x)
    print('val mae:', mean_absolute_error(np.expm1(val_y), np.expm1(val_cat)))
    cat_mae += mean_absolute_error(np.expm1(val_y), np.expm1(val_cat))/skf.n_splits
    param_list_cat.append(gs_cat.best_params_)
    print(param_list_cat)
    print(datetime.datetime.now())

print('MAE of val with xgb:', cat_mae)

--------------------- 1 fold ---------------------
2020-08-27 09:33:55.394485
val mae: 639.0691784241376
[{'depth': 8}]
2020-08-27 09:53:08.993182
--------------------- 2 fold ---------------------
2020-08-27 09:53:08.995177
val mae: 644.4911278385617
[{'depth': 8}, {'depth': 8}]
2020-08-27 10:12:16.760236
--------------------- 3 fold ---------------------
2020-08-27 10:12:16.762230
val mae: 645.396266936107
[{'depth': 8}, {'depth': 8}, {'depth': 8}]
2020-08-27 10:31:21.417263
--------------------- 4 fold ---------------------
2020-08-27 10:31:21.420255
val mae: 661.5011848952386
[{'depth': 8}, {'depth': 8}, {'depth': 8}, {'depth': 8}]
2020-08-27 10:50:22.189596
--------------------- 5 fold ---------------------
2020-08-27 10:50:22.192589
val mae: 657.3969323950894
[{'depth': 8}, {'depth': 8}, {'depth': 8}, {'depth': 8}, {'depth': 8}]
2020-08-27 11:09:24.100630
MAE of val with xgb: 649.5709380978269


In [97]:
save_file1 = open('sample_cat.pkl', 'wb')
# 将列表写入文件
pickle.dump(cat_mae, save_file1)
pickle.dump(sub_cat, save_file1)
save_file1.close()

In [98]:
save_file = open('sample_lgb.pkl', 'wb')
# 将列表写入文件
pickle.dump(lgb_mae, save_file)
pickle.dump(sub_lgb, save_file)
save_file.close()

In [15]:
sub_Weighted = (1-lgb_mae/(cat_mae+lgb_mae))*sub_lgb+(1-cat_mae/(cat_mae+lgb_mae))*sub_cat

In [19]:
res = {}
res['SaleID'] = test['SaleID']
res['price'] = sub_Weighted

res_df = pd.DataFrame(res)

res_df.to_csv('./data/cat_lgb_gs_sample.csv', index=None)

In [8]:
# 自己的

In [9]:
df2 = pd.read_csv('./data/merged_data_for_tree.gz')

In [10]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 199037 entries, 0 to 199036
Columns: 189 entries, SaleID to poly_feat_v_0 v_3 v_8 v_12
dtypes: float64(116), int64(73)
memory usage: 287.0 MB


In [11]:
X_train1 = df2[df2['price'].notnull()].drop(['price','regDate','creatDate','SaleID','regionCode','train', 'name', 'seller', 'offerType'],axis=1)
Y_train1 = df2[df2['price'].notnull()]['price']
X_test1 = df2[df2['price'].isnull()].drop(['price','regDate','creatDate','SaleID','regionCode','train', 'name', 'seller', 'offerType'],axis=1)
print("test information:")
X_test1.info()
print("X_train information:")
X_train1.info()

test information:
<class 'pandas.core.frame.DataFrame'>
Int64Index: 50000 entries, 149037 to 199036
Columns: 180 entries, model to poly_feat_v_0 v_3 v_8 v_12
dtypes: float64(115), int64(65)
memory usage: 69.0 MB
X_train information:
<class 'pandas.core.frame.DataFrame'>
Int64Index: 149037 entries, 0 to 149036
Columns: 180 entries, model to poly_feat_v_0 v_3 v_8 v_12
dtypes: float64(115), int64(65)
memory usage: 205.8 MB


In [12]:
X_train1.columns

Index(['model', 'brand', 'power', 'kilometer', 'v_0', 'v_1', 'v_2', 'v_3',
       'v_4', 'v_5',
       ...
       'poly_feat_v_0 v_8', 'poly_feat_v_0 v_12', 'poly_feat_v_3 v_8',
       'poly_feat_v_3 v_12', 'poly_feat_v_8 v_12', 'poly_feat_v_0 v_3 v_8',
       'poly_feat_v_0 v_3 v_12', 'poly_feat_v_0 v_8 v_12',
       'poly_feat_v_3 v_8 v_12', 'poly_feat_v_0 v_3 v_8 v_12'],
      dtype='object', length=180)

In [13]:
# light GBM grid search

param = {}
param['gpu_id'] = 0
param['tree_method'] = 'gpu_hist'
lgb = LGBMRegressor(
    n_estimators=1000,

    boosting_type= 'gbdt',
    objective = 'regression_l1',
    num_leaves=31,
    min_child_samples = 20,
    feature_fraction = 0.8,
    bagging_freq = 1,
    bagging_fraction = 0.8,
    lambda_l2 = 2,
    random_state=2020,
    metric='mae',
    device = 'gpu'
)

param_grid_gb = {
        'depth': [6, 7, 8],
    'learning_rate': [0.1,0.3]
    }
gs_lgb1 = GridSearchCV(lgb, param_grid_gb)


skf = KFold(n_splits=5)

params_list1 = []

lgb_mae1 = 0
sub_lgb1 = 0
#交叉验证
for i, (trn_idx, val_idx) in enumerate(skf.split(X_train1, Y_train1)):
    print('--------------------- {} fold ---------------------'.format(i+1))
    print(datetime.datetime.now())
    trn_x, trn_y = X_train1.iloc[trn_idx].reset_index(drop=True), Y_train1[trn_idx]
    val_x, val_y = X_train1.iloc[val_idx].reset_index(drop=True), Y_train1[val_idx]
    # fit函数参数：eval_set=[(x_test,y_test)]  评估数据集,list类型,eval_set用来作为早期停止的验证集，一般我们放x_test和y_test
    # fit函数参数：eval_metric="mlogloss"      评估标准(多分类问题，使用mlogloss作为损失函数)
    # fit函数参数：early_stopping_rounds= 10   如果模型的loss十次内没有减小，则提前结束模型训练
    # fit函数参数：verbose = True              True显示，False不显示
    #import pdb;pdb.set_trace()
    gs_lgb1.fit(
        trn_x, trn_y,
        eval_set=[(val_x, val_y)], 
                eval_metric='mae',
        early_stopping_rounds=300,
        #verbose_eval=300
        verbose=False
    )
    
    params_list1.append(gs_lgb1.best_params_)

    sub_lgb1 += np.expm1(gs_lgb1.predict(X_test1)) / skf.n_splits
    val_lgb1 = gs_lgb1.predict(val_x)
    print('val mae:', mean_absolute_error(np.expm1(val_y), np.expm1(val_lgb1)))
    lgb_mae1 += mean_absolute_error(np.expm1(val_y), np.expm1(val_lgb1))/skf.n_splits
    print(datetime.datetime.now())
    print(params_list1)
print('MAE of val with lgb:', lgb_mae1)

--------------------- 1 fold ---------------------
2020-08-27 11:09:34.680551
val mae: 492.81884632727883
2020-08-27 11:51:25.789853
[{'depth': 6, 'learning_rate': 0.1}]
--------------------- 2 fold ---------------------
2020-08-27 11:51:25.795838
val mae: 496.25993988474005
2020-08-27 12:34:03.187906
[{'depth': 6, 'learning_rate': 0.1}, {'depth': 6, 'learning_rate': 0.1}]
--------------------- 3 fold ---------------------
2020-08-27 12:34:03.195886
val mae: 506.60961970315265
2020-08-27 13:19:48.479596
[{'depth': 6, 'learning_rate': 0.1}, {'depth': 6, 'learning_rate': 0.1}, {'depth': 6, 'learning_rate': 0.1}]
--------------------- 4 fold ---------------------
2020-08-27 13:19:48.485581
val mae: 489.45286118136863
2020-08-27 14:06:20.217588
[{'depth': 6, 'learning_rate': 0.1}, {'depth': 6, 'learning_rate': 0.1}, {'depth': 6, 'learning_rate': 0.1}, {'depth': 6, 'learning_rate': 0.1}]
--------------------- 5 fold ---------------------
2020-08-27 14:06:20.226563
val mae: 504.7037225891942

In [14]:
# Catboost grid search
skf = KFold(n_splits=5)
cat1 = CatBoostRegressor(loss_function='RMSE',task_type='GPU', 
#                         cat_features = ['brand','model','kilometer','fuelType','bodyType'],
                       early_stopping_rounds=300,n_estimators=3000
                       )
param_grid_cat1 = {
        'depth': [6, 7, 8]
    }

param_list_cat1 = []

gs_cat1 = GridSearchCV(cat1, param_grid_cat1)
cat_mae1 = 0
sub_cat1 = 0
for i, (trn_idx, val_idx) in enumerate(skf.split(X_train1, Y_train1)):
    print('--------------------- {} fold ---------------------'.format(i+1))
    print(datetime.datetime.now())
    trn_x, trn_y = X_train1.iloc[trn_idx].reset_index(drop=True), Y_train1[trn_idx]
    val_x, val_y = X_train1.iloc[val_idx].reset_index(drop=True), Y_train1[val_idx]
    # fit函数参数：eval_set=[(x_test,y_test)]  评估数据集,list类型,eval_set用来作为早期停止的验证集，一般我们放x_test和y_test
    # fit函数参数：eval_metric="mlogloss"      评估标准(多分类问题，使用mlogloss作为损失函数)
    # fit函数参数：early_stopping_rounds= 10   如果模型的loss十次内没有减小，则提前结束模型训练
    # fit函数参数：verbose = True              True显示，False不显示
    gs_cat1.fit(
        trn_x, trn_y,
        verbose=False
    )
    sub_cat1 += np.expm1(gs_cat1.predict(X_test1)) / skf.n_splits
    val_cat1 = gs_cat1.predict(val_x)
    print('val mae:', mean_absolute_error(np.expm1(val_y), np.expm1(val_cat1)))
    cat_mae1 += mean_absolute_error(np.expm1(val_y), np.expm1(val_cat1))/skf.n_splits
    param_list_cat1.append(gs_cat1.best_params_)
    print(param_list_cat1)
    print(datetime.datetime.now())

print('MAE of val with xgb:', cat_mae1)

--------------------- 1 fold ---------------------
2020-08-27 14:51:55.241506
val mae: 683.59799850938
[{'depth': 8}]
2020-08-27 15:10:47.235782
--------------------- 2 fold ---------------------
2020-08-27 15:10:47.239772
val mae: 620.5030622023072
[{'depth': 8}, {'depth': 8}]
2020-08-27 15:29:08.575036
--------------------- 3 fold ---------------------
2020-08-27 15:29:08.579025
val mae: 665.7393007205118
[{'depth': 8}, {'depth': 8}, {'depth': 8}]
2020-08-27 15:46:52.285462
--------------------- 4 fold ---------------------
2020-08-27 15:46:52.289451
val mae: 629.6107849146656
[{'depth': 8}, {'depth': 8}, {'depth': 8}, {'depth': 8}]
2020-08-27 16:05:08.630275
--------------------- 5 fold ---------------------
2020-08-27 16:05:08.635261
val mae: 630.860063381233
[{'depth': 8}, {'depth': 8}, {'depth': 8}, {'depth': 8}, {'depth': 8}]
2020-08-27 16:23:35.319711
MAE of val with xgb: 646.0622419456196


In [22]:
sub_Weighted1 = (1-lgb_mae1/(cat_mae1+lgb_mae1))*sub_lgb1+(1-cat_mae1/(cat_mae1+lgb_mae1))*sub_cat1

In [24]:
res = {}
res['SaleID'] = test['SaleID']
res['price'] = sub_Weighted1

res_df = pd.DataFrame(res)

res_df.to_csv('./data/cat_lgb_gs_1.csv', index=None)

In [95]:
save_file2 = open('my_cat.pkl', 'wb')
# 将列表写入文件
pickle.dump(cat_mae1, save_file2)
pickle.dump(sub_cat1, save_file2)
save_file2.close()

In [96]:
save_file3 = open('my_lgb.pkl', 'wb')
# 将列表写入文件
pickle.dump(lgb_mae1, save_file3)
pickle.dump(sub_lgb1, save_file3)
save_file3.close()

In [55]:
# stacking
# from sklearn.ensemble import StackingRegressor
from sklearn.ensemble import BaggingRegressor
from mlxtend.regressor import StackingRegressor
# from lightgbm.sklearn import LGBMRegressor

In [51]:
LGBMRegressor.__base__

lightgbm.sklearn.LGBMModel

In [58]:
lgb_instance = LGBMRegressor(
    n_estimators=10000,
    boosting_type= 'gbdt',
    objective = 'regression_l1',
    num_leaves=31,
    min_child_samples = 20,
    feature_fraction = 0.8,
    bagging_freq = 1,
    bagging_fraction = 0.8,
    lambda_l2 = 2,
    random_state=2020,
    metric='mae',
    device = 'gpu',
    learning_rate = 0.02,
    verbose=4
)

cat_reg = CatBoostRegressor(loss_function='RMSE',task_type='GPU', verbose=False)

estimators = [lgb_instance, cat_reg]

reg = StackingRegressor(
     regressors=estimators, 
    meta_regressor=BaggingRegressor(n_estimators=100,random_state=42)
     )


In [59]:
reg.fit(X_train, Y_train)



StackingRegressor(meta_regressor=BaggingRegressor(n_estimators=100,
                                                  random_state=42),
                  regressors=[LGBMRegressor(bagging_fraction=0.8,
                                            bagging_freq=1, device='gpu',
                                            feature_fraction=0.8, lambda_l2=2,
                                            learning_rate=0.02, metric='mae',
                                            n_estimators=10000,
                                            objective='regression_l1',
                                            random_state=2020, verbose=4),
                              <catboost.core.CatBoostRegressor object at 0x000001E3FF968808>])

In [60]:
y_pre = reg.predict(X_test)

In [62]:
y_pre = np.expm1(y_pre)

In [63]:
res = {}
res['SaleID'] = test['SaleID']
res['price'] = y_pre

res_df = pd.DataFrame(res)

res_df.to_csv('./data/stacking_ensemble.csv', index=None)

In [84]:
cat_features=['brand','model','kilometer','fuelType','bodyType']

X_train[cat_features] = X_train[cat_features].astype(int)
X_train['brand'].head()

0     6
1     1
2    15
3    10
4     5
Name: brand, dtype: int32

In [85]:
cat_reg= CatBoostRegressor(loss_function='RMSE',task_type='GPU', verbose=True, cat_features=cat_features)
cat_reg.fit(X_train, Y_train)

Learning rate set to 0.084261
0:	learn: 1.2142988	total: 76.8ms	remaining: 1m 16s
1:	learn: 1.2078055	total: 138ms	remaining: 1m 9s
2:	learn: 1.1987691	total: 229ms	remaining: 1m 16s
3:	learn: 1.1915208	total: 283ms	remaining: 1m 10s
4:	learn: 1.1857408	total: 382ms	remaining: 1m 16s
5:	learn: 1.1851781	total: 473ms	remaining: 1m 18s
6:	learn: 1.1739317	total: 541ms	remaining: 1m 16s
7:	learn: 1.1699936	total: 636ms	remaining: 1m 18s
8:	learn: 1.1636908	total: 742ms	remaining: 1m 21s
9:	learn: 1.1627082	total: 831ms	remaining: 1m 22s
10:	learn: 1.1471680	total: 922ms	remaining: 1m 22s
11:	learn: 1.1280723	total: 1.01s	remaining: 1m 23s
12:	learn: 1.1264812	total: 1.11s	remaining: 1m 24s
13:	learn: 1.1083924	total: 1.21s	remaining: 1m 25s
14:	learn: 1.1028913	total: 1.31s	remaining: 1m 26s
15:	learn: 1.0918144	total: 1.38s	remaining: 1m 24s
16:	learn: 1.0904352	total: 1.47s	remaining: 1m 25s
17:	learn: 1.0803327	total: 1.54s	remaining: 1m 24s
18:	learn: 1.0667508	total: 1.6s	remaining: 

158:	learn: 0.4494426	total: 14.1s	remaining: 1m 14s
159:	learn: 0.4487501	total: 14.2s	remaining: 1m 14s
160:	learn: 0.4471183	total: 14.3s	remaining: 1m 14s
161:	learn: 0.4451141	total: 14.4s	remaining: 1m 14s
162:	learn: 0.4439006	total: 14.5s	remaining: 1m 14s
163:	learn: 0.4431630	total: 14.6s	remaining: 1m 14s
164:	learn: 0.4425122	total: 14.7s	remaining: 1m 14s
165:	learn: 0.4399405	total: 14.7s	remaining: 1m 14s
166:	learn: 0.4387798	total: 14.8s	remaining: 1m 14s
167:	learn: 0.4368515	total: 14.9s	remaining: 1m 13s
168:	learn: 0.4352781	total: 15s	remaining: 1m 13s
169:	learn: 0.4348846	total: 15.1s	remaining: 1m 13s
170:	learn: 0.4342275	total: 15.2s	remaining: 1m 13s
171:	learn: 0.4336907	total: 15.3s	remaining: 1m 13s
172:	learn: 0.4333005	total: 15.4s	remaining: 1m 13s
173:	learn: 0.4324072	total: 15.5s	remaining: 1m 13s
174:	learn: 0.4313273	total: 15.6s	remaining: 1m 13s
175:	learn: 0.4305152	total: 15.7s	remaining: 1m 13s
176:	learn: 0.4296273	total: 15.8s	remaining: 1m

317:	learn: 0.3629201	total: 28.5s	remaining: 1m 1s
318:	learn: 0.3626600	total: 28.6s	remaining: 1m 1s
319:	learn: 0.3625261	total: 28.7s	remaining: 1m 1s
320:	learn: 0.3621722	total: 28.8s	remaining: 1m
321:	learn: 0.3620908	total: 28.9s	remaining: 1m
322:	learn: 0.3616159	total: 29s	remaining: 1m
323:	learn: 0.3615441	total: 29.1s	remaining: 1m
324:	learn: 0.3614184	total: 29.2s	remaining: 1m
325:	learn: 0.3613455	total: 29.2s	remaining: 1m
326:	learn: 0.3612761	total: 29.3s	remaining: 1m
327:	learn: 0.3611278	total: 29.4s	remaining: 1m
328:	learn: 0.3609714	total: 29.5s	remaining: 1m
329:	learn: 0.3604536	total: 29.6s	remaining: 1m
330:	learn: 0.3602313	total: 29.7s	remaining: 1m
331:	learn: 0.3599794	total: 29.8s	remaining: 1m
332:	learn: 0.3597699	total: 29.9s	remaining: 59.9s
333:	learn: 0.3590817	total: 30s	remaining: 59.8s
334:	learn: 0.3589600	total: 30.1s	remaining: 59.8s
335:	learn: 0.3589010	total: 30.2s	remaining: 59.7s
336:	learn: 0.3587980	total: 30.3s	remaining: 59.6s


478:	learn: 0.3518639	total: 43.5s	remaining: 47.3s
479:	learn: 0.3518635	total: 43.5s	remaining: 47.2s
480:	learn: 0.3518632	total: 43.6s	remaining: 47.1s
481:	learn: 0.3518612	total: 43.7s	remaining: 47s
482:	learn: 0.3518608	total: 43.8s	remaining: 46.9s
483:	learn: 0.3518604	total: 43.9s	remaining: 46.8s
484:	learn: 0.3518599	total: 44s	remaining: 46.7s
485:	learn: 0.3518595	total: 44.1s	remaining: 46.6s
486:	learn: 0.3518592	total: 44.2s	remaining: 46.5s
487:	learn: 0.3518589	total: 44.3s	remaining: 46.4s
488:	learn: 0.3518433	total: 44.4s	remaining: 46.4s
489:	learn: 0.3518431	total: 44.4s	remaining: 46.3s
490:	learn: 0.3518427	total: 44.5s	remaining: 46.2s
491:	learn: 0.3518425	total: 44.6s	remaining: 46.1s
492:	learn: 0.3518422	total: 44.7s	remaining: 46s
493:	learn: 0.3518420	total: 44.8s	remaining: 45.9s
494:	learn: 0.3518418	total: 44.9s	remaining: 45.8s
495:	learn: 0.3518412	total: 45s	remaining: 45.7s
496:	learn: 0.3518410	total: 45.1s	remaining: 45.6s
497:	learn: 0.351840

637:	learn: 0.3518212	total: 58.1s	remaining: 32.9s
638:	learn: 0.3518212	total: 58.1s	remaining: 32.8s
639:	learn: 0.3518212	total: 58.2s	remaining: 32.8s
640:	learn: 0.3518212	total: 58.3s	remaining: 32.7s
641:	learn: 0.3518212	total: 58.4s	remaining: 32.6s
642:	learn: 0.3518212	total: 58.5s	remaining: 32.5s
643:	learn: 0.3518212	total: 58.6s	remaining: 32.4s
644:	learn: 0.3518212	total: 58.7s	remaining: 32.3s
645:	learn: 0.3518212	total: 58.8s	remaining: 32.2s
646:	learn: 0.3518212	total: 58.9s	remaining: 32.1s
647:	learn: 0.3518212	total: 59s	remaining: 32s
648:	learn: 0.3518212	total: 59.1s	remaining: 31.9s
649:	learn: 0.3518212	total: 59.1s	remaining: 31.8s
650:	learn: 0.3518212	total: 59.2s	remaining: 31.8s
651:	learn: 0.3518212	total: 59.3s	remaining: 31.7s
652:	learn: 0.3518212	total: 59.4s	remaining: 31.6s
653:	learn: 0.3518212	total: 59.5s	remaining: 31.5s
654:	learn: 0.3518211	total: 59.6s	remaining: 31.4s
655:	learn: 0.3518211	total: 59.7s	remaining: 31.3s
656:	learn: 0.35

798:	learn: 0.3518209	total: 1m 12s	remaining: 18.3s
799:	learn: 0.3518209	total: 1m 12s	remaining: 18.2s
800:	learn: 0.3518209	total: 1m 13s	remaining: 18.1s
801:	learn: 0.3518209	total: 1m 13s	remaining: 18s
802:	learn: 0.3518210	total: 1m 13s	remaining: 18s
803:	learn: 0.3518209	total: 1m 13s	remaining: 17.9s
804:	learn: 0.3518210	total: 1m 13s	remaining: 17.8s
805:	learn: 0.3518210	total: 1m 13s	remaining: 17.7s
806:	learn: 0.3518209	total: 1m 13s	remaining: 17.6s
807:	learn: 0.3518210	total: 1m 13s	remaining: 17.5s
808:	learn: 0.3518210	total: 1m 13s	remaining: 17.4s
809:	learn: 0.3518210	total: 1m 13s	remaining: 17.3s
810:	learn: 0.3518209	total: 1m 13s	remaining: 17.2s
811:	learn: 0.3518210	total: 1m 14s	remaining: 17.1s
812:	learn: 0.3518209	total: 1m 14s	remaining: 17s
813:	learn: 0.3518209	total: 1m 14s	remaining: 17s
814:	learn: 0.3518210	total: 1m 14s	remaining: 16.9s
815:	learn: 0.3518210	total: 1m 14s	remaining: 16.8s
816:	learn: 0.3518210	total: 1m 14s	remaining: 16.7s
8

955:	learn: 0.3518208	total: 1m 27s	remaining: 4.02s
956:	learn: 0.3518208	total: 1m 27s	remaining: 3.93s
957:	learn: 0.3518208	total: 1m 27s	remaining: 3.84s
958:	learn: 0.3518208	total: 1m 27s	remaining: 3.75s
959:	learn: 0.3518208	total: 1m 27s	remaining: 3.66s
960:	learn: 0.3518208	total: 1m 27s	remaining: 3.57s
961:	learn: 0.3518208	total: 1m 27s	remaining: 3.48s
962:	learn: 0.3518208	total: 1m 28s	remaining: 3.38s
963:	learn: 0.3518208	total: 1m 28s	remaining: 3.29s
964:	learn: 0.3518208	total: 1m 28s	remaining: 3.2s
965:	learn: 0.3518208	total: 1m 28s	remaining: 3.11s
966:	learn: 0.3518208	total: 1m 28s	remaining: 3.02s
967:	learn: 0.3518208	total: 1m 28s	remaining: 2.93s
968:	learn: 0.3518208	total: 1m 28s	remaining: 2.84s
969:	learn: 0.3518208	total: 1m 28s	remaining: 2.74s
970:	learn: 0.3518208	total: 1m 28s	remaining: 2.65s
971:	learn: 0.3518208	total: 1m 28s	remaining: 2.56s
972:	learn: 0.3518208	total: 1m 29s	remaining: 2.47s
973:	learn: 0.3518208	total: 1m 29s	remaining: 

<catboost.core.CatBoostRegressor at 0x1e47ba3ae88>

In [90]:
X_train.columns

Index(['model', 'brand', 'bodyType', 'fuelType', 'gearbox', 'power',
       'kilometer', 'notRepairedDamage', 'v_0', 'v_1',
       ...
       'new5*year', 'new6*year', 'new7*year', 'new8*year', 'new9*year',
       'new10*year', 'new11*year', 'new12*year', 'new13*year', 'new14*year'],
      dtype='object', length=139)

In [89]:
cat_reg.get_feature_importance()

array([5.54511440e+00, 5.29306869e-01, 3.07823693e-01, 1.50441225e+00,
       0.00000000e+00, 7.27617135e-01, 9.07696946e-01, 0.00000000e+00,
       7.86185043e-03, 1.83276701e-01, 1.77559468e+00, 7.21346330e-01,
       9.28439928e-01, 3.92467851e-01, 1.51325441e+00, 8.24270740e-02,
       4.19217348e-01, 9.20486111e-01, 2.07413736e+00, 2.90805401e+00,
       1.29643136e+00, 1.81339867e-02, 3.15818599e+00, 9.40870665e-02,
       3.41665666e+00, 7.18023555e-04, 1.58674106e-01, 1.39663782e-01,
       2.65587713e-02, 2.53926617e-04, 1.57698759e-02, 6.80024691e-06,
       4.90612118e-01, 7.54449894e-01, 5.91762554e-03, 1.40788897e-03,
       9.38758624e-02, 3.98037808e-03, 2.98421857e-03, 3.45950582e-02,
       2.34087627e-01, 7.78577959e-02, 3.81259007e-02, 7.88419711e-01,
       5.47490609e-02, 1.36788638e-01, 3.45708508e-02, 9.58423866e-01,
       0.00000000e+00, 0.00000000e+00, 3.72413336e-03, 0.00000000e+00,
       1.80871998e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
      

In [None]:
lgb_instance.fit(X_train, Y_train)