In [1]:
# 加载模块
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import KFold
from lightgbm.sklearn import LGBMRegressor
from xgboost.sklearn import XGBRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
import time
from sklearn.model_selection import GridSearchCV

In [3]:
df = pd.read_csv('./data/sample_data.gz')
df1 = df.copy()

In [4]:
test = df1[df1['price'].isnull()]
X_train = df1[df1['price'].notnull()].drop(['price','regDate','creatDate','SaleID','regionCode'],axis=1)
Y_train = df1[df1['price'].notnull()]['price']
X_test = df1[df1['price'].isnull()].drop(['price','regDate','creatDate','SaleID','regionCode'],axis=1)
print("test information:")
test.info()
print("X_train information:")
X_train.info()

test information:
<class 'pandas.core.frame.DataFrame'>
Int64Index: 50000 entries, 150000 to 199999
Columns: 144 entries, SaleID to new14*year
dtypes: float64(105), int64(37), object(2)
memory usage: 55.3+ MB
X_train information:
<class 'pandas.core.frame.DataFrame'>
Int64Index: 150000 entries, 0 to 149999
Columns: 139 entries, model to new14*year
dtypes: float64(104), int64(35)
memory usage: 160.2 MB


In [6]:
skf = KFold(n_splits=5)

param = {}
param['gpu_id'] = 0
param['tree_method'] = 'gpu_hist'
clf = LGBMRegressor(
    n_estimators=10000,
    learning_rate=0.02,
    boosting_type= 'gbdt',
    objective = 'regression_l1',
    max_depth = -1,
    num_leaves=31,
    min_child_samples = 20,
    feature_fraction = 0.8,
    bagging_freq = 1,
    bagging_fraction = 0.8,
    lambda_l2 = 2,
    random_state=2020,
    metric='mae',
    device = 'gpu'
)


lgb_mae = 0
sub_lgb = 0
#交叉验证
for i, (trn_idx, val_idx) in enumerate(skf.split(X_train, Y_train)):
    print('--------------------- {} fold ---------------------'.format(i+1))
    trn_x, trn_y = X_train.iloc[trn_idx].reset_index(drop=True), Y_train[trn_idx]
    val_x, val_y = X_train.iloc[val_idx].reset_index(drop=True), Y_train[val_idx]
    # fit函数参数：eval_set=[(x_test,y_test)]  评估数据集,list类型,eval_set用来作为早期停止的验证集，一般我们放x_test和y_test
    # fit函数参数：eval_metric="mlogloss"      评估标准(多分类问题，使用mlogloss作为损失函数)
    # fit函数参数：early_stopping_rounds= 10   如果模型的loss十次内没有减小，则提前结束模型训练
    # fit函数参数：verbose = True              True显示，False不显示
    #import pdb;pdb.set_trace()
    clf.fit(
        trn_x, trn_y,
        eval_set=[(val_x, val_y)], 
                eval_metric='mae',
        early_stopping_rounds=300,
        #verbose_eval=300
        verbose=False
    )
    
    sub_lgb += np.expm1(clf.predict(X_test)) / skf.n_splits
    val_lgb = clf.predict(val_x)
    print('val mae:', mean_absolute_error(np.expm1(val_y), np.expm1(val_lgb)))
    lgb_mae += mean_absolute_error(np.expm1(val_y), np.expm1(val_lgb))/skf.n_splits
print('MAE of val with lgb:', lgb_mae)

--------------------- 1 fold ---------------------
val mae: 458.37453299308913
--------------------- 2 fold ---------------------
val mae: 458.6727836807148
--------------------- 3 fold ---------------------
val mae: 464.8628969892603
--------------------- 4 fold ---------------------
val mae: 458.7475290688512
--------------------- 5 fold ---------------------
val mae: 459.2733296111283
MAE of val with lgb: 459.98621446860875


In [7]:
xlf= XGBRegressor(tree_method='gpu_hist', gpu_id='0', n_estimators=1000, gamma=0, subsample=0.8,\
        colsample_bytree=0.9, max_depth=7)#, objective ='reg:squarederror'
param_grid = {
        'learning_rate': [0.01, 0.05, 0.1, 0.2],
    }
gbm = GridSearchCV(xlf, param_grid)
xgb_mae = 0
sub_xgb = 0
for i, (trn_idx, val_idx) in enumerate(skf.split(X_train, Y_train)):
    print('--------------------- {} fold ---------------------'.format(i+1))
    trn_x, trn_y = X_train.iloc[trn_idx].reset_index(drop=True), Y_train[trn_idx]
    val_x, val_y = X_train.iloc[val_idx].reset_index(drop=True), Y_train[val_idx]
    # fit函数参数：eval_set=[(x_test,y_test)]  评估数据集,list类型,eval_set用来作为早期停止的验证集，一般我们放x_test和y_test
    # fit函数参数：eval_metric="mlogloss"      评估标准(多分类问题，使用mlogloss作为损失函数)
    # fit函数参数：early_stopping_rounds= 10   如果模型的loss十次内没有减小，则提前结束模型训练
    # fit函数参数：verbose = True              True显示，False不显示
    gbm.fit(
        trn_x, trn_y,
        eval_set=[(val_x, val_y)], 
                eval_metric='mae',
        early_stopping_rounds=300,
        verbose=False
    )
    sub_xgb += np.expm1(gbm.predict(X_test)) / skf.n_splits
    val_xgb= gbm.predict(val_x)
    #print('val mae:', mean_absolute_error(np.expm1(val_y), np.expm1(val_xgb)))
    xgb_mae += mean_absolute_error(np.expm1(val_y), np.expm1(val_xgb))/skf.n_splits
print('MAE of val with xgb:', xgb_mae)

--------------------- 1 fold ---------------------
--------------------- 2 fold ---------------------
--------------------- 3 fold ---------------------
--------------------- 4 fold ---------------------
--------------------- 5 fold ---------------------
MAE of val with xgb: 509.85574668451943


In [8]:
sub_Weighted = (1-lgb_mae/(xgb_mae+lgb_mae))*sub_lgb+(1-xgb_mae/(xgb_mae+lgb_mae))*sub_xgb
sub_Weighted

array([1258.11720694, 1953.9416682 , 8372.53365622, ..., 5560.60565637,
       4771.6679077 , 5163.25814832])

In [9]:
res = {}
res['SaleID'] = test['SaleID']
res['price'] = sub_Weighted

res_df = pd.DataFrame(res)

res_df.to_csv('./data/submit_weighted.csv', index=None)

In [25]:
res['price']

array([1302.43386172, 1990.54114532, 8235.71275788, ..., 5644.25605559,
       4838.73657645, 4784.86552468])

In [10]:
import pickle

In [11]:
save_lgb = open('save_lgb.pkl', 'wb')
pickle.dump(lgb_mae, save_lgb)
pickle.dump(sub_lgb, save_lgb)
save_lgb.close()

In [12]:
save_xgb = open('save_xgb.pkl', 'wb')
pickle.dump(xgb_mae, save_xgb)
pickle.dump(sub_xgb, save_xgb)
save_xgb.close()

In [13]:
save_lgb_model = open('save_lgb_model.pkl', 'wb')
pickle.dump(clf, save_lgb_model)
save_lgb_model.close()

In [14]:
from lightgbm.sklearn import LGBMRegressor

In [15]:
ss = LGBMRegressor()

In [16]:
ss.feature_importances_

NotFittedError: No feature_importances found. Need to call fit beforehand.