In [None]:
# coding: utf-8
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Let figures render Chinese text: put SimHei first in the sans-serif list and
# use the ASCII hyphen for minus signs
plt.rcParams.update({
    'font.sans-serif': ['SimHei', *plt.rcParams['font.sans-serif']],
    'axes.unicode_minus': False,
})

In [None]:
# Load the space-delimited training set and preview its first rows
train_csv_path = './data/used_car_train_20200313.csv'
train_data = pd.read_csv(train_csv_path, delimiter=' ')
train_data.head()

In [None]:
# Show every column when a DataFrame is displayed (no column truncation)
pd.options.display.max_columns = None

In [None]:
# 150000 rows
train_data.info()

### 数据预处理

In [None]:
# Drop every row that contains a missing value (the data set is large enough,
# so model-based imputation of the gaps is not considered for now)
train_data.dropna(axis=0, how='any', inplace=True)

In [None]:
train_data.shape
# 135884 rows remain after dropping

In [None]:
# Renumber the rows 0..n-1 after the dropna above.
# drop=True discards the old index instead of inserting it as an 'index' column:
# that column would otherwise be picked up as a model feature, because the feature
# selection only excludes SaleID, price, regDate and creatDate. It also keeps this
# cell safe to re-run.
train_data.reset_index(drop=True, inplace=True)

In [None]:
# Split the yyyymmdd-style date columns into integer year / month / day columns
date_part_slices = {'year': slice(0, 4), 'month': slice(4, 6), 'day': slice(6, None)}
for date_col in ('regDate', 'creatDate'):
    date_text = train_data[date_col].astype('str')
    for part_name, part_slice in date_part_slices.items():
        train_data[f'{date_col}_{part_name}'] = date_text.str[part_slice].astype('int64')

train_data

In [None]:
# Inspect the distinct raw values of notRepairedDamage
train_data['notRepairedDamage'].unique()

In [None]:
# A '-' in notRepairedDamage also means "missing": turn it into NaN so it can be dropped.
# The result is assigned back to the column rather than calling
# replace(..., inplace=True) on the attribute: under pandas Copy-on-Write that chained
# in-place call acts on a temporary Series and leaves train_data unchanged.
train_data['notRepairedDamage'] = train_data['notRepairedDamage'].replace('-', np.nan)

In [None]:
# Drop the rows whose values were just turned into NaN
train_data.dropna(axis=0, how='any', inplace=True)

In [None]:
# Encode notRepairedDamage ('0.0' / '1.0') as integers 0 / 1.
# The converted column is assigned back explicitly: a chained
# replace(..., inplace=True) on the attribute does not update train_data under
# pandas Copy-on-Write, and replace alone can leave the column as object dtype.
# to_numeric raises on any value that is not numeric (e.g. a leftover '-'), and the
# integer cast raises on NaN, so an incomplete clean-up fails here instead of
# silently reaching the models.
train_data['notRepairedDamage'] = pd.to_numeric(train_data['notRepairedDamage']).astype('int64')

In [None]:
# Print the column dtypes and non-null counts again after the cleaning steps
train_data.info()

In [None]:
# Everything except the identifier, the target and the raw date columns is a feature
excluded_cols = {'SaleID', 'price', 'regDate', 'creatDate'}
feature_cols = [name for name in train_data.columns if name not in excluded_cols]
feature_data = train_data.loc[:, feature_cols]
label_data = train_data.loc[:, 'price']

In [None]:
# Histogram of the target (price) values
fig, ax = plt.subplots()
ax.hist(label_data)
plt.show()
plt.close(fig)

In [None]:
# %pip install scikit-learn lightgbm xgboost seaborn scipy ipython

In [None]:
import lightgbm as lgb
import seaborn as sns
import xgboost as xgb
from IPython.display import display, clear_output
from scipy.special import jn
from sklearn import linear_model
from sklearn import preprocessing
# Dimensionality reduction
from sklearn.decomposition import PCA, FastICA, FactorAnalysis, SparsePCA
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
# Parameter search and evaluation
from sklearn.model_selection import GridSearchCV, KFold, StratifiedKFold, cross_val_score, train_test_split
from sklearn.svm import SVR

In [None]:
# XGBoost regressor and its fixed hyper-parameters
xgb_params = {
    'n_estimators': 1600,
    'learning_rate': 0.05,
    'gamma': 0,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'max_depth': 12,
}
xgb_model = xgb.XGBRegressor(**xgb_params)

In [None]:
# 10-fold cross-validation of xgb_model, scored with mean absolute error (MAE)
scores_train = []  # per-fold MAE on the training part
scores = []        # per-fold MAE on the held-out part

# price is a regression target, so plain KFold is used. StratifiedKFold is meant for
# class labels: it would treat every distinct price as its own class (or reject the
# target outright if it were float-valued).
sk = KFold(n_splits=10, shuffle=True, random_state=0)
for train_ind, val_ind in sk.split(feature_data):

    train_x = feature_data.iloc[train_ind].values
    train_y = label_data.iloc[train_ind]
    val_x = feature_data.iloc[val_ind].values
    val_y = label_data.iloc[val_ind]

    xgb_model.fit(train_x, train_y)
    pred_train_xgb = xgb_model.predict(train_x)
    pred_xgb = xgb_model.predict(val_x)

    score_train = mean_absolute_error(train_y, pred_train_xgb)
    scores_train.append(score_train)
    score = mean_absolute_error(val_y, pred_xgb)
    scores.append(score)
# Average over all folds (the list scores_train, not the last fold's score_train)
print('Train mae:', np.mean(scores_train))
print('Val mae', np.mean(scores))

In [None]:
# Build and fit the XGBoost model
def build_model_xgb(x_train, y_train):
    """Fit an XGBoost regressor with fixed hyper-parameters.

    Args:
        x_train: training features passed to XGBRegressor.fit.
        y_train: training target values.

    Returns:
        The fitted xgb.XGBRegressor.
    """
    hyper_params = dict(
        n_estimators=1600,
        learning_rate=0.05,
        gamma=0,
        subsample=0.8,
        colsample_bytree=0.8,
        max_depth=12,
    )
    regressor = xgb.XGBRegressor(**hyper_params)
    regressor.fit(x_train, y_train)
    return regressor

In [None]:
# Build and fit the LightGBM model
def build_model_lgb(x_train, y_train, scoring='neg_mean_absolute_error'):
    """Grid-search the LightGBM learning rate and return the fitted search object.

    Args:
        x_train: training features passed to GridSearchCV.fit.
        y_train: training target values.
        scoring: scikit-learn scoring name used to rank the candidates. Defaults to
            negative MAE so that model selection uses the same metric this notebook
            reports; without it GridSearchCV falls back to the estimator's own
            score method (R^2 for scikit-learn style regressors).

    Returns:
        The fitted GridSearchCV. With the default refit=True its predict() uses the
        best estimator refitted on all of x_train.
    """
    estimator = lgb.LGBMRegressor(num_leaves=127, n_estimators=1600)
    param_grid = {
        'learning_rate': [0.01, 0.05, 0.1, 0.2],
    }
    gbm = GridSearchCV(estimator, param_grid, scoring=scoring)
    gbm.fit(x_train, y_train)
    return gbm

In [None]:
# Hold out 20% of the rows for validation. The seed is fixed so that the split,
# and the validation MAEs and blend weights derived from it, are reproducible.
x_train, x_val, y_train, y_val = train_test_split(
    feature_data, label_data, test_size=0.2, random_state=0
)

In [None]:
# Fit the LightGBM search on the training split and report its MAE on the held-out split
print('Train lgb...')
model_lgb = build_model_lgb(x_train, y_train)
val_lgb = model_lgb.predict(x_val)
MAE_lgb = mean_absolute_error(y_val, val_lgb)
print('MAE of val with lgb:', MAE_lgb)

In [None]:
# Fit XGBoost on the training split and report its MAE on the held-out split
print('Train xgb...')
model_xgb = build_model_xgb(x_train, y_train)
val_xgb = model_xgb.predict(x_val)
MAE_xgb = mean_absolute_error(y_val, val_xgb)
print('MAE of val with xgb:', MAE_xgb)

In [None]:
# Simple weighted blend: each model's weight is one minus its share of the summed MAE,
# so the model with the lower validation MAE gets the larger weight
weight_lgb = 1 - MAE_lgb / (MAE_xgb + MAE_lgb)
weight_xgb = 1 - MAE_xgb / (MAE_xgb + MAE_lgb)
val_Weighted = weight_lgb * val_lgb + weight_xgb * val_xgb
# Negative blended predictions are overwritten with 10
negative_mask = val_Weighted < 0
val_Weighted[negative_mask] = 10
print('MAE of val with Weighted ensemble:', mean_absolute_error(y_val, val_Weighted))

In [None]:
# Load the space-delimited test set (test B) and display it
testb_csv_path = './data/used_car_testB_20200421.csv'
testb_data = pd.read_csv(testb_csv_path, delimiter=' ')
testb_data

In [None]:
# Split the yyyymmdd-style date columns of the test set into integer year / month / day
test_date_part_slices = {'year': slice(0, 4), 'month': slice(4, 6), 'day': slice(6, None)}
for test_date_col in ('regDate', 'creatDate'):
    test_date_text = testb_data[test_date_col].astype('str')
    for test_part_name, test_part_slice in test_date_part_slices.items():
        testb_data[f'{test_date_col}_{test_part_name}'] = (
            test_date_text.str[test_part_slice].astype('int64')
        )

testb_data

In [None]:
# reset_index without drop=True moves the current index into a new column named 'index'.
# NOTE(review): whether that column is needed depends on the contents of feature_cols — verify.
# Re-running this cell inserts a further index column.
testb_data.reset_index(inplace=True)

In [None]:
# Keep exactly the feature columns selected for training, in the same order
test_data = testb_data.loc[:, feature_cols]

In [None]:
# Test rows cannot be dropped, so missing values are filled with the sentinel -1
test_data = test_data.fillna(value=-1)

In [None]:
# Encode notRepairedDamage as integers: '0.0' -> 0, '1.0' -> 1, and '-' -> 1.
# The converted column is assigned back explicitly: a chained
# replace(..., inplace=True) on the attribute does not update test_data under
# pandas Copy-on-Write, and replace alone can leave the column as object dtype.
# NOTE(review): training rows with '-' are dropped, whereas here '-' is mapped to 1 —
# confirm this is the intended treatment.
test_data['notRepairedDamage'] = pd.to_numeric(
    test_data['notRepairedDamage'].replace('-', '1.0')
).astype('int64')

In [None]:
# Refit the LightGBM search on the full training data and predict the test set
print('Predict lgb...')
model_lgb_pre = build_model_lgb(feature_data, label_data)
subA_lgb = model_lgb_pre.predict(test_data)
subA_lgb

In [None]:
# Refit XGBoost on the full training data and predict the test set
print('Predict xgb...')
model_xgb_pre = build_model_xgb(feature_data, label_data)
subA_xgb = model_xgb_pre.predict(test_data)
subA_xgb

In [None]:
# Blend the test predictions with the MAE-based weights measured on the validation split
scores_Weighted = (1 - MAE_lgb / (MAE_xgb + MAE_lgb)) * subA_lgb + (1 - MAE_xgb / (MAE_xgb + MAE_lgb)) * subA_xgb
# Overwrite negative price predictions with 10, exactly as is done for the validation
# blend, so the submitted prices get the same post-processing as the evaluated ones
scores_Weighted[scores_Weighted < 0] = 10

In [None]:
# Read the raw test file again; its SaleID column labels the submission rows
submission_source_path = './data/used_car_testB_20200421.csv'
new_test_data = pd.read_csv(submission_source_path, delimiter=' ')
new_test_data

In [None]:
# Assemble the submission (SaleID plus blended price) and write it without the index
scores = pd.DataFrame({
    'SaleID': new_test_data.SaleID,
    'price': scores_Weighted,
})
scores.to_csv("./sample_lgb_xgb.csv", index=False)