In [1]:
# 加载模块
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import KFold
from lightgbm.sklearn import LGBMRegressor
from xgboost.sklearn import XGBRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
import datetime
from sklearn.model_selection import GridSearchCV
import pickle
from catboost import CatBoostRegressor

In [2]:
df = pd.read_csv('./data/sample_data.gz')
df1 = df.copy() 

In [3]:
cat_cols = ['brand','model','kilometer','fuelType','bodyType']
df1[cat_cols] = df1[cat_cols].astype(int)

In [4]:
test = df1[df1['price'].isnull()]
X_train = df1[df1['price'].notnull()].drop(['price','regDate','creatDate','SaleID','regionCode'],axis=1)
Y_train = df1[df1['price'].notnull()]['price']
X_test = df1[df1['price'].isnull()].drop(['price','regDate','creatDate','SaleID','regionCode'],axis=1)
print("test information:")
test.info()
print("X_train information:")
X_train.info()

test information:
<class 'pandas.core.frame.DataFrame'>
Int64Index: 50000 entries, 150000 to 199999
Columns: 144 entries, SaleID to new14*year
dtypes: float64(101), int32(5), int64(36), object(2)
memory usage: 54.4+ MB
X_train information:
<class 'pandas.core.frame.DataFrame'>
Int64Index: 150000 entries, 0 to 149999
Columns: 139 entries, model to new14*year
dtypes: float64(100), int32(5), int64(34)
memory usage: 157.4 MB


In [30]:
skf = KFold(n_splits=5)

In [5]:
pickle_file = open('save_lgb_model.pkl','rb')
lgb = pickle.load(pickle_file)
lgb.get_params

<bound method LGBMModel.get_params of LGBMRegressor(bagging_fraction=0.8, bagging_freq=1, device='gpu',
              feature_fraction=0.8, lambda_l2=2, learning_rate=0.02,
              metric='mae', n_estimators=10000, objective='regression_l1',
              random_state=2020)>

In [26]:
drop_idx = np.where(lgb.feature_importances_ < np.percentile(lgb.feature_importances_, 25))

In [27]:
X_train = X_train.drop(columns=X_train.columns[drop_idx])
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 150000 entries, 0 to 149999
Columns: 104 entries, model to new14*year
dtypes: float64(79), int32(4), int64(21)
memory usage: 122.9 MB


In [None]:
X_test_copy = X_test.drop(columns=X_test.columns[drop_idx])

In [35]:

lgb_mae = 0
sub_lgb = 0
#交叉验证
for i, (trn_idx, val_idx) in enumerate(skf.split(X_train, Y_train)):
    print('--------------------- {} fold ---------------------'.format(i+1))
    trn_x, trn_y = X_train.iloc[trn_idx].reset_index(drop=True), Y_train[trn_idx]
    val_x, val_y = X_train.iloc[val_idx].reset_index(drop=True), Y_train[val_idx]
    # fit函数参数：eval_set=[(x_test,y_test)]  评估数据集,list类型,eval_set用来作为早期停止的验证集，一般我们放x_test和y_test
    # fit函数参数：eval_metric="mlogloss"      评估标准(多分类问题，使用mlogloss作为损失函数)
    # fit函数参数：early_stopping_rounds= 10   如果模型的loss十次内没有减小，则提前结束模型训练
    # fit函数参数：verbose = True              True显示，False不显示
    #import pdb;pdb.set_trace()
    lgb.fit(
        trn_x, trn_y,
        eval_set=[(val_x, val_y)], 
                eval_metric='mae',
        early_stopping_rounds=300,
        #verbose_eval=300
        verbose=False
    )
    
    sub_lgb += np.expm1(lgb.predict(X_test_copy)) / skf.n_splits
    val_lgb = lgb.predict(val_x)
    print('val mae:', mean_absolute_error(np.expm1(val_y), np.expm1(val_lgb)))
    lgb_mae += mean_absolute_error(np.expm1(val_y), np.expm1(val_lgb))/skf.n_splits
    print(lgb.feature_importances_)
print('MAE of val with lgb:', lgb_mae)

--------------------- 1 fold ---------------------


KeyboardInterrupt: 

KeyboardInterrupt: 

In [28]:
lgb.feature_importances_

array([ 2157,  1602,   617,   393,   558,  7291,  1460,  2148,  1926,
        7565,  4481,  1557,  3556,  3229,  8471,  3376,  3136,  3295,
        5179,  4699,  1222,  3132,  8290,  2168,  3608,  1212,  3090,
        4528,  2509,     0,   546,  4318,  3793,  2944,  1455,  1360,
        1733,  1075,  1063,  1512,  1171,  1896,  2778,  3415,  1908,
        1672,  3817,  3165,   272,  1185,  1753,   767,   441,    57,
         167,    90,    37,   536,    29,     7,    83,   127,   122,
         652,   557,   212,   405,   389,    58,     3,   978,   558,
           0,     0,   714,   245,   398,  1459,  3204,  1942,   314,
        3767,  2925,  4276,   637,   587,   604,  2769,   376,   883,
         519,  3685,    81,  4494,  1464,  6795,   920,   209,  1214,
        3687,   318,   222,   121,   874,  1484,   711,   180,   135,
           0,  6546,  1351,  3669,  4763,     0,   988,  2663,  1025,
        1071,     0,   889,  1957,  2045,   762,     0,  3786,  4844,
        3850,  6449,

In [9]:
len(lgb.feature_importances_)

139

In [5]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

In [6]:
cat = CatBoostRegressor(loss_function='RMSE',task_type='GPU', 
                        cat_features = cat_cols,
                       early_stopping_rounds=300,
                        n_estimators = 4000,
                        depth = 8,
                       )

In [7]:
sub_cat = 0
# val_cat = 0
cat_mae = 0

skf = KFold(n_splits=5)

for i, (trn_idx, val_idx) in enumerate(skf.split(X_train, Y_train)):
    print('--------------------- {} fold ---------------------'.format(i+1))
    trn_x, trn_y = X_train.iloc[trn_idx].reset_index(drop=True), Y_train[trn_idx]
    val_x, val_y = X_train.iloc[val_idx].reset_index(drop=True), Y_train[val_idx]
    # fit函数参数：eval_set=[(x_test,y_test)]  评估数据集,list类型,eval_set用来作为早期停止的验证集，一般我们放x_test和y_test
    # fit函数参数：eval_metric="mlogloss"      评估标准(多分类问题，使用mlogloss作为损失函数)
    # fit函数参数：early_stopping_rounds= 10   如果模型的loss十次内没有减小，则提前结束模型训练
    # fit函数参数：verbose = True              True显示，False不显示
    cat.fit(
        trn_x, trn_y,
#         eval_set=[(val_x, val_y)], 
#                 eval_metric='mae',
#         early_stopping_rounds=300,
        verbose=False,
#         plot=True
    )
    sub_cat += np.expm1(cat.predict(X_test)) / skf.n_splits
    val_cat= cat.predict(val_x)
    print('val mae:', mean_absolute_error(np.expm1(val_y), np.expm1(val_cat)))
    cat_mae += mean_absolute_error(np.expm1(val_y), np.expm1(val_cat))/skf.n_splits
print('MAE of val with lgb:', cat_mae)

--------------------- 1 fold ---------------------
val mae: 1288.729795373252
--------------------- 2 fold ---------------------
val mae: 1261.7963483120664
--------------------- 3 fold ---------------------
val mae: 1262.8251674019361
--------------------- 4 fold ---------------------
val mae: 1264.4928233426695
--------------------- 5 fold ---------------------
val mae: 1332.5109711418381
MAE of val with lgb: 1282.0710211143526


In [8]:
save_lgb = open('save_lgb.pkl', 'rb')
lgb_mae = pickle.load(save_lgb)
sub_lgb = pickle.load(save_lgb)
save_lgb.close()

In [9]:
save_xgb = open('save_xgb.pkl', 'rb')
xgb_mae = pickle.load(save_xgb)
sub_xgb = pickle.load(save_xgb)
save_xgb.close()

In [16]:
sub_Weighted = 3/6*sub_lgb + 2/6*sub_xgb + 1/6*sub_cat
sub_Weighted

array([1251.92311995, 2008.83201421, 8379.32150253, ..., 5445.0496962 ,
       4757.22316788, 5143.77697158])

In [17]:
res = {}
res['SaleID'] = test['SaleID']
res['price'] = sub_Weighted

res_df = pd.DataFrame(res)

res_df.to_csv('./data/submit_lgb_xgb_cat.csv', index=None)