In [1]:
# coding: utf-8
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Let figures render Chinese text: keep the minus sign drawable and put SimHei
# at the front of the sans-serif font list.
plt.rcParams['axes.unicode_minus'] = False
sans_serif_fonts = plt.rcParams['font.sans-serif']
sans_serif_fonts.insert(0, 'SimHei')

In [None]:
# Load the training set (space-separated CSV) and preview its first rows.
train_data = pd.read_csv('./data/used_car_train_20200313.csv', sep=' ')
train_data.head()

In [12]:
# Column dtypes and non-null counts; the raw frame has 150000 rows.
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150000 entries, 0 to 149999
Data columns (total 31 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   SaleID             150000 non-null  int64  
 1   name               150000 non-null  int64  
 2   regDate            150000 non-null  int64  
 3   model              149999 non-null  float64
 4   brand              150000 non-null  int64  
 5   bodyType           145494 non-null  float64
 6   fuelType           141320 non-null  float64
 7   gearbox            144019 non-null  float64
 8   power              150000 non-null  int64  
 9   kilometer          150000 non-null  float64
 10  notRepairedDamage  150000 non-null  object 
 11  regionCode         150000 non-null  int64  
 12  seller             150000 non-null  int64  
 13  offerType          150000 non-null  int64  
 14  creatDate          150000 non-null  int64  
 15  price              150000 non-null  int64  
 16  v_

### 数据预处理

In [13]:
# Drop every row that contains a missing value. The dataset is large enough
# that model-based imputation of the gaps is not attempted for now.
# inplace=True mutates train_data itself; re-read the CSV to get the dropped rows back.
train_data.dropna(inplace=True)

In [14]:
train_data.shape
# 135884 rows remain after dropping the rows with missing values

(135884, 31)

In [15]:
# Date handling: regDate and creatDate hold yyyymmdd integers; split each one
# into separate integer year / month / day columns.
date_part_slices = {
    'year': slice(0, 4),
    'month': slice(4, 6),
    'day': slice(6, None),
}
for date_col in ('regDate', 'creatDate'):
    date_text = train_data[date_col].astype('str')
    for part_name, part_slice in date_part_slices.items():
        # Bind the slice as a default argument so each lambda keeps its own slice.
        train_data[f'{date_col}_{part_name}'] = date_text.apply(
            lambda text, part=part_slice: int(text[part])
        )

train_data

Unnamed: 0,SaleID,name,regDate,model,brand,bodyType,fuelType,gearbox,power,kilometer,notRepairedDamage,regionCode,seller,offerType,creatDate,price,v_0,v_1,v_2,v_3,v_4,v_5,v_6,v_7,v_8,v_9,v_10,v_11,v_12,v_13,v_14,regDate_year,regDate_month,regDate_day,creatDate_year,creatDate_month,creatDate_day
0,0,736,20040402,30.0,6,1.0,0.0,0.0,60,12.5,0.0,1046,0,0,20160404,1850,43.357796,3.966344,0.050257,2.159744,1.143786,0.235676,0.101988,0.129549,0.022816,0.097462,-2.881803,2.804097,-2.420821,0.795292,0.914762,2004,4,2,2016,4,4
1,1,2262,20030301,40.0,1,2.0,0.0,0.0,0,15.0,-,4366,0,0,20160309,3600,45.305273,5.236112,0.137925,1.380657,-1.422165,0.264777,0.121004,0.135731,0.026597,0.020582,-4.900482,2.096338,-1.030483,-1.722674,0.245522,2003,3,1,2016,3,9
2,2,14874,20040403,115.0,15,1.0,0.0,0.0,163,12.5,0.0,2806,0,0,20160402,6222,45.978359,4.823792,1.319524,-0.998467,-0.996911,0.251410,0.114912,0.165147,0.062173,0.027075,-4.846749,1.803559,1.565330,-0.832687,-0.229963,2004,4,3,2016,4,2
3,3,71865,19960908,109.0,10,0.0,0.0,1.0,193,15.0,0.0,434,0,0,20160312,2400,45.687478,4.492574,-0.050616,0.883600,-2.228079,0.274293,0.110300,0.121964,0.033395,0.000000,-4.509599,1.285940,-0.501868,-2.438353,-0.478699,1996,9,8,2016,3,12
4,4,111080,20120103,110.0,5,1.0,0.0,0.0,68,5.0,0.0,6977,0,0,20160313,5200,44.383511,2.031433,0.572169,-1.571239,2.246088,0.228036,0.073205,0.091880,0.078819,0.121534,-1.896240,0.910783,0.931110,2.834518,1.923482,2012,1,3,2016,3,13
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
149995,149995,163978,20000607,121.0,10,4.0,0.0,1.0,163,15.0,0.0,4576,0,0,20160327,5900,45.316543,-3.139095,-1.269707,-0.736609,-1.505820,0.280264,0.000310,0.048441,0.071158,0.019174,1.988114,-2.983973,0.589167,-1.304370,-0.302592,2000,6,7,2016,3,27
149996,149996,184535,20091102,116.0,11,0.0,0.0,0.0,125,10.0,0.0,2826,0,0,20160312,9500,45.972058,-3.143764,-0.023523,-2.366699,0.698012,0.253217,0.000777,0.084079,0.099681,0.079371,1.839166,-2.774615,2.553994,0.924196,-0.272160,2009,11,2,2016,3,12
149997,149997,147587,20101003,60.0,11,1.0,1.0,0.0,90,6.0,0.0,3302,0,0,20160328,7500,44.733481,-3.105721,0.595454,-2.279091,1.423661,0.233353,0.000705,0.118872,0.100118,0.097914,2.439812,-1.630677,2.290197,1.891922,0.414931,2010,10,3,2016,3,28
149998,149998,45907,20060312,34.0,10,3.0,1.0,0.0,156,15.0,0.0,1877,0,0,20160401,4999,45.658634,-3.204785,-0.441680,-1.179812,0.620680,0.256369,0.000252,0.081479,0.083558,0.081498,2.075380,-2.633719,1.414937,0.431981,-1.659014,2006,3,12,2016,4,1


In [16]:
# Partition the column names by dtype: everything that is not `object` counts
# as numerical, the `object` columns as categorical.
numerical_cols = train_data.select_dtypes(exclude='object').columns
categorical_cols = train_data.select_dtypes(include='object').columns

for column_group in (numerical_cols, categorical_cols):
    print(column_group)

Index(['SaleID', 'name', 'regDate', 'model', 'brand', 'bodyType', 'fuelType',
       'gearbox', 'power', 'kilometer', 'regionCode', 'seller', 'offerType',
       'creatDate', 'price', 'v_0', 'v_1', 'v_2', 'v_3', 'v_4', 'v_5', 'v_6',
       'v_7', 'v_8', 'v_9', 'v_10', 'v_11', 'v_12', 'v_13', 'v_14',
       'regDate_year', 'regDate_month', 'regDate_day', 'creatDate_year',
       'creatDate_month', 'creatDate_day'],
      dtype='object')
Index(['notRepairedDamage'], dtype='object')


In [17]:
# Model inputs: numerical columns minus the id, the target, the raw date
# integers, and any column whose name contains 'Type'.
non_feature_cols = ['SaleID', 'price', 'regDate', 'creatDate']
feature_cols = [
    col
    for col in numerical_cols
    if col not in non_feature_cols and 'Type' not in col
]
feature_cols

['name',
 'model',
 'brand',
 'gearbox',
 'power',
 'kilometer',
 'regionCode',
 'seller',
 'v_0',
 'v_1',
 'v_2',
 'v_3',
 'v_4',
 'v_5',
 'v_6',
 'v_7',
 'v_8',
 'v_9',
 'v_10',
 'v_11',
 'v_12',
 'v_13',
 'v_14',
 'regDate_year',
 'regDate_month',
 'regDate_day',
 'creatDate_year',
 'creatDate_month',
 'creatDate_day']

In [19]:
# Feature matrix and target used for modelling.
x_data = train_data[feature_cols]
y_data = train_data['price']

# The plotting, cross-validation, split and final-fit cells refer to the
# features and target as `feature_data` / `label_data`. Bind those names here
# so the notebook runs top to bottom on a fresh kernel instead of failing with
# NameError. They are aliases of x_data / y_data, not copies.
feature_data = x_data
label_data = y_data

x_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 135884 entries, 0 to 149999
Data columns (total 29 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   name             135884 non-null  int64  
 1   model            135884 non-null  float64
 2   brand            135884 non-null  int64  
 3   gearbox          135884 non-null  float64
 4   power            135884 non-null  int64  
 5   kilometer        135884 non-null  float64
 6   regionCode       135884 non-null  int64  
 7   seller           135884 non-null  int64  
 8   v_0              135884 non-null  float64
 9   v_1              135884 non-null  float64
 10  v_2              135884 non-null  float64
 11  v_3              135884 non-null  float64
 12  v_4              135884 non-null  float64
 13  v_5              135884 non-null  float64
 14  v_6              135884 non-null  float64
 15  v_7              135884 non-null  float64
 16  v_8              135884 non-null  floa

In [None]:
# Histogram of the target values held in label_data.
plt.hist(label_data)
plt.show()
# Close the current figure once it has been shown.
plt.close()

In [None]:
# Optional one-off environment setup (uncomment to run). On PyPI the package
# that provides `sklearn` is published as `scikit-learn`.
# %pip install scikit-learn lightgbm xgboost seaborn scipy ipython

In [None]:
import seaborn as sns
from scipy.special import jn
from IPython.display import display, clear_output
from sklearn import linear_model
from sklearn import preprocessing
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor
 
# 数据降维处理的
from sklearn.decomposition import PCA,FastICA,FactorAnalysis,SparsePCA
 
import lightgbm as lgb
import xgboost as xgb
 
# 参数搜索和评价的
from sklearn.model_selection import GridSearchCV,cross_val_score,StratifiedKFold,train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [None]:
# Configure the XGBoost regressor that the cross-validation loop fits.
xgb_params = {
    'n_estimators': 1600,
    'learning_rate': 0.05,
    'gamma': 0,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'max_depth': 12,
}
xgb_model = xgb.XGBRegressor(**xgb_params)

In [None]:
# 10-fold cross-validation of xgb_model: record the train and validation MAE
# of every fold, then report the mean of each.
scores_train = []
scores = []

# NOTE(review): StratifiedKFold stratifies on the distinct values of
# label_data. If label_data is the continuous price target, a plain KFold is
# probably what is wanted here - confirm.
sk = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)
for train_ind, val_ind in sk.split(feature_data, label_data):

    # Features go in as plain arrays; targets stay as pandas objects.
    train_x = feature_data.iloc[train_ind].values
    train_y = label_data.iloc[train_ind]
    val_x = feature_data.iloc[val_ind].values
    val_y = label_data.iloc[val_ind]

    xgb_model.fit(train_x, train_y)
    pred_train_xgb = xgb_model.predict(train_x)
    pred_xgb = xgb_model.predict(val_x)

    score_train = mean_absolute_error(train_y, pred_train_xgb)
    scores_train.append(score_train)
    score = mean_absolute_error(val_y, pred_xgb)
    scores.append(score)

# Average over all folds. The train figure must use the list `scores_train`;
# averaging the scalar `score_train` would report only the last fold.
print('Train mae:', np.mean(scores_train))
print('Val mae', np.mean(scores))

In [None]:
# Build and fit the XGBoost model
def build_model_xgb(x_train, y_train):
    """Fit a new XGBoost regressor with fixed hyper-parameters and return it.

    Parameters
    ----------
    x_train
        Training features, passed unchanged to ``XGBRegressor.fit``.
    y_train
        Training targets, passed unchanged to ``XGBRegressor.fit``.

    Returns
    -------
    xgb.XGBRegressor
        The fitted regressor.
    """
    hyper_params = dict(
        n_estimators=1600,
        learning_rate=0.05,
        gamma=0,
        subsample=0.8,
        colsample_bytree=0.8,
        max_depth=12,
    )
    regressor = xgb.XGBRegressor(**hyper_params)
    regressor.fit(x_train, y_train)
    return regressor

In [None]:
# Build and fit the LightGBM model
def build_model_lgb(x_train, y_train):
    """Grid-search the LightGBM learning rate and return the fitted search.

    Parameters
    ----------
    x_train
        Training features, passed unchanged to ``GridSearchCV.fit``.
    y_train
        Training targets, passed unchanged to ``GridSearchCV.fit``.

    Returns
    -------
    GridSearchCV
        The fitted search object wrapping an ``lgb.LGBMRegressor``.
    """
    candidate_learning_rates = [0.01, 0.05, 0.1, 0.2]
    base_regressor = lgb.LGBMRegressor(num_leaves=127, n_estimators=1600)
    # NOTE(review): no `scoring` is passed, so candidates are ranked by the
    # estimator's default score rather than by the MAE reported elsewhere.
    search = GridSearchCV(base_regressor, {'learning_rate': candidate_learning_rates})
    search.fit(x_train, y_train)
    return search

In [None]:
# Hold out 20% of the rows for validation. A fixed random_state makes the split
# identical on every run, so the validation MAEs and the blend weights derived
# from them are reproducible.
x_train, x_val, y_train, y_val = train_test_split(
    feature_data, label_data, test_size=0.2, random_state=0
)

In [None]:
# Fit the LightGBM grid search on the training split and score it on the hold-out split.
print('Train lgb...')
model_lgb = build_model_lgb(x_train, y_train)
val_lgb = model_lgb.predict(x_val)
# MAE_lgb is reused later to weight the LightGBM predictions in the blend.
MAE_lgb = mean_absolute_error(y_val, val_lgb)
print('MAE of val with lgb:', MAE_lgb)

In [None]:
# Fit the XGBoost model on the training split and score it on the hold-out split.
print('Train xgb...')
model_xgb = build_model_xgb(x_train, y_train)
val_xgb = model_xgb.predict(x_val)
# MAE_xgb is reused later to weight the XGBoost predictions in the blend.
MAE_xgb = mean_absolute_error(y_val, val_xgb)
print('MAE of val with xgb:', MAE_xgb)

In [None]:
# Simple weighted blend: each model's weight is one minus its share of the
# summed validation MAE, so the model with the lower MAE gets the larger weight.
mae_total = MAE_xgb + MAE_lgb
weight_lgb = 1 - MAE_lgb / mae_total
weight_xgb = 1 - MAE_xgb / mae_total
val_Weighted = weight_lgb * val_lgb + weight_xgb * val_xgb
# Negative blended predictions are overwritten with 10.
val_Weighted[val_Weighted < 0] = 10
print('MAE of val with Weighted ensemble:', mean_absolute_error(y_val, val_Weighted))

In [None]:
# Load the test-B set (space-separated CSV) and display it.
testb_data = pd.read_csv('./data/used_car_testB_20200421.csv', sep=' ')
testb_data

In [None]:
# Same date split as for the training frame: derive integer year / month / day
# columns from the digits of regDate and creatDate.
for source_col in ['regDate', 'creatDate']:
    digits = testb_data[source_col].astype('str')
    testb_data[source_col + '_year'] = digits.apply(lambda x: int(x[0:4]))
    testb_data[source_col + '_month'] = digits.apply(lambda x: int(x[4:6]))
    testb_data[source_col + '_day'] = digits.apply(lambda x: int(x[6:]))

testb_data

In [None]:
# Renumber the rows from 0; without drop=True the previous index is kept as a new column.
# NOTE(review): this mutates testb_data in place, so re-running the cell keeps
# adding index columns. The feature selection that follows does not use them.
testb_data.reset_index(inplace=True)

In [None]:
# Keep only the model input columns, using the same feature_cols list as the training features.
test_data = testb_data[feature_cols]

In [None]:
# Replace missing values in the test features with the sentinel -1.
# NOTE(review): the training frame dropped rows with missing values instead of
# filling them, so the models were not fitted with -1 as a missing-value marker - confirm this is intended.
test_data = test_data.fillna(-1)

In [None]:
# Encode notRepairedDamage ('0.0' -> 0, '-' -> 1, '1.0' -> 1) when it is one of
# the selected features. feature_cols is built from the non-object columns
# only, so the column is normally absent from test_data and unguarded attribute
# access would raise AttributeError. The result is assigned back explicitly
# rather than relying on an in-place replace through a chained attribute access.
if 'notRepairedDamage' in test_data.columns:
    test_data = test_data.assign(
        notRepairedDamage=test_data['notRepairedDamage'].replace(['0.0', '-', '1.0'], [0, 1, 1])
    )

In [None]:
# Refit the LightGBM grid search on all training rows, then predict the test features.
print('Predict lgb...')
model_lgb_pre = build_model_lgb(feature_data, label_data)
subA_lgb = model_lgb_pre.predict(test_data)
subA_lgb

In [None]:
# Refit the XGBoost model on all training rows, then predict the test features.
print('Predict xgb...')
model_xgb_pre = build_model_xgb(feature_data, label_data)
subA_xgb = model_xgb_pre.predict(test_data)
subA_xgb

In [None]:
# Blend the test predictions with the same MAE-derived weights as the
# validation blend.
scores_Weighted = (1 - MAE_lgb / (MAE_xgb + MAE_lgb)) * subA_lgb + (1 - MAE_xgb / (MAE_xgb + MAE_lgb)) * subA_xgb
# Apply the validation blend's post-processing too: a price cannot be negative,
# so negative blended predictions are replaced with 10 before submission.
scores_Weighted[scores_Weighted < 0] = 10

In [None]:
# Re-read the raw test-B file; the submission cell takes SaleID from this frame.
new_test_data = pd.read_csv('./data/used_car_testB_20200421.csv', sep=' ')
new_test_data

In [None]:
# Assemble the submission (SaleID plus blended price) and write it out without the index.
scores = pd.DataFrame({
    'SaleID': new_test_data.SaleID,
    'price': scores_Weighted,
})
scores.to_csv("./sample_lgb_xgb.csv", index=False)