In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')


### 加载数据

In [2]:
# Load the training set (space-separated file).
train_data = pd.read_csv('Datas/used_car_train_20200313.csv', sep=' ')
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() only appended a stray "None".
# `show_counts` is the current name of the old `null_counts` flag, which was
# removed in pandas 2.0.
train_data.info(verbose=True, show_counts=True)
print()

# Load the test set (same separator as the training file).
test_data = pd.read_csv('Datas/used_car_testB_20200421.csv', sep=' ')
test_data.info(verbose=True, show_counts=True)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150000 entries, 0 to 149999
Data columns (total 31 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   SaleID             150000 non-null  int64  
 1   name               150000 non-null  int64  
 2   regDate            150000 non-null  int64  
 3   model              149999 non-null  float64
 4   brand              150000 non-null  int64  
 5   bodyType           145494 non-null  float64
 6   fuelType           141320 non-null  float64
 7   gearbox            144019 non-null  float64
 8   power              150000 non-null  int64  
 9   kilometer          150000 non-null  float64
 10  notRepairedDamage  150000 non-null  object 
 11  regionCode         150000 non-null  int64  
 12  seller             150000 non-null  int64  
 13  offerType          150000 non-null  int64  
 14  creatDate          150000 non-null  int64  
 15  price              150000 non-null  int64  
 16  v_

### Exploratory Data Analysis

In [3]:
# QuickEDA
# import pandas_profiling
# temp = pandas_profiling.ProfileReport(train_data,minimal=True)
# temp.to_file("./exampleS.html")
# temp


In [4]:
# Names of the columns whose dtype is not `object` (treated as numeric features).
print('数值类型特征：', train_data.select_dtypes(exclude=['object']).columns.tolist(), '\n')

# Names of the columns whose dtype is `object` (treated as categorical features).
print('分类类型特征：', train_data.select_dtypes(include=['object']).columns.tolist())

数值类型特征： ['SaleID', 'name', 'regDate', 'model', 'brand', 'bodyType', 'fuelType', 'gearbox', 'power', 'kilometer', 'regionCode', 'seller', 'offerType', 'creatDate', 'price', 'v_0', 'v_1', 'v_2', 'v_3', 'v_4', 'v_5', 'v_6', 'v_7', 'v_8', 'v_9', 'v_10', 'v_11', 'v_12', 'v_13', 'v_14'] 

分类类型特征： ['notRepairedDamage']


In [5]:
# Preview the first five rows of the training set.
train_data.head(n=5)

Unnamed: 0,SaleID,name,regDate,model,brand,bodyType,fuelType,gearbox,power,kilometer,...,v_5,v_6,v_7,v_8,v_9,v_10,v_11,v_12,v_13,v_14
0,0,736,20040402,30.0,6,1.0,0.0,0.0,60,12.5,...,0.235676,0.101988,0.129549,0.022816,0.097462,-2.881803,2.804097,-2.420821,0.795292,0.914762
1,1,2262,20030301,40.0,1,2.0,0.0,0.0,0,15.0,...,0.264777,0.121004,0.135731,0.026597,0.020582,-4.900482,2.096338,-1.030483,-1.722674,0.245522
2,2,14874,20040403,115.0,15,1.0,0.0,0.0,163,12.5,...,0.25141,0.114912,0.165147,0.062173,0.027075,-4.846749,1.803559,1.56533,-0.832687,-0.229963
3,3,71865,19960908,109.0,10,0.0,0.0,1.0,193,15.0,...,0.274293,0.1103,0.121964,0.033395,0.0,-4.509599,1.28594,-0.501868,-2.438353,-0.478699
4,4,111080,20120103,110.0,5,1.0,0.0,0.0,68,5.0,...,0.228036,0.073205,0.09188,0.078819,0.121534,-1.89624,0.910783,0.93111,2.834518,1.923482


In [6]:
# Per-column count of missing values, keeping only columns that have any.
missing = train_data.isna().sum().loc[lambda counts: counts > 0]
missing

model          1
bodyType    4506
fuelType    8680
gearbox     5981
dtype: int64

### 数据预处理


#### 缺失值处理

In [7]:
# Missing-value imputation
def DfFillNa(df):
    """Fill gaps in ``model``, ``bodyType``, ``fuelType`` and ``gearbox`` with each column's mode.

    The mode is computed on ``df`` itself, column by column. ``df`` is modified
    in place and also returned. After filling, the per-column count of values
    that are still missing (over all columns) is printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the four columns listed above.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    for column in ('model', 'bodyType', 'fuelType', 'gearbox'):
        most_frequent = df[column].mode()[0]
        df[column] = df[column].fillna(most_frequent)
    # Report whatever is still missing anywhere in the frame.
    remaining = df.isnull().sum()
    print(remaining[remaining > 0])
    return df
#
# Impute both splits; DataFrame.pipe(f) simply calls f(frame).
train_data = train_data.pipe(DfFillNa)
test_data = test_data.pipe(DfFillNa)


Series([], dtype: int64)
Series([], dtype: int64)


#### 异常值处理

In [8]:
# Abnormal-value replacement
def AbnormalReplace(df, fill_value=None, power_range=(0, 600)):
    """Replace the '-' placeholder in ``notRepairedDamage`` and clamp ``power``.

    ``df`` is modified in place and also returned. The value counts of
    ``notRepairedDamage`` and the max/min of ``power`` are printed after the
    respective step.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``notRepairedDamage`` and ``power`` columns.
    fill_value : optional
        Value substituted for '-'. When omitted, the most frequent value of
        ``df['notRepairedDamage']`` other than '-' is used. The function no
        longer reads a global ``train_data``; pass the training-set mode
        explicitly to impute another frame with training statistics.
    power_range : tuple, default (0, 600)
        ``(lower, upper)`` bounds that ``power`` is clamped to.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    damage = df['notRepairedDamage']
    if fill_value is None:
        # Exclude the placeholder so it can never be chosen as its own replacement.
        candidates = damage[damage != '-'].mode()
        # With nothing to impute from, fall back to '-' (i.e. leave the column as is).
        fill_value = candidates[0] if len(candidates) else '-'
    df['notRepairedDamage'] = damage.replace({'-': fill_value})
    print(df.notRepairedDamage.value_counts())
    lower, upper = power_range
    # Vectorized clamp; equivalent to the two element-wise comparisons it replaces.
    df['power'] = df['power'].clip(lower=lower, upper=upper)
    print(df['power'].max(), df['power'].min())
    return df
#
# Clean both splits; DataFrame.pipe(f) simply calls f(frame).
train_data = train_data.pipe(AbnormalReplace)
test_data = test_data.pipe(AbnormalReplace)


0.0    135685
1.0     14315
Name: notRepairedDamage, dtype: int64
600 0
0.0    45293
1.0     4707
Name: notRepairedDamage, dtype: int64
600 0


#### 分类数据编码

In [9]:
def CategoryEncoding(df):
    """Convert ``notRepairedDamage`` to integers (via float) and print a preview.

    The column is cast to float first and then to int, so string values such
    as ``'0.0'`` / ``'1.0'`` end up as ``0`` / ``1``. ``df`` is modified in
    place and also returned; the first rows of the converted column are printed.
    """
    column = 'notRepairedDamage'
    encoded = df[column].astype('float').astype('int')
    df[column] = encoded
    preview = df[[column]].head()
    print(preview)
    return df
#
# Encode both splits; DataFrame.pipe(f) simply calls f(frame).
train_data = train_data.pipe(CategoryEncoding)
test_data = test_data.pipe(CategoryEncoding)


   notRepairedDamage
0                  0
1                  0
2                  0
3                  0
4                  0
   notRepairedDamage
0                  0
1                  0
2                  0
3                  0
4                  0


#### 特征筛选

In [10]:
# Feature-filter helper
def FeatureFilter(df,Cl):
    """Return a new DataFrame equal to ``df`` without the column(s) named in ``Cl``.

    ``Cl`` may be a single column label or a list of labels; ``df`` itself is
    left unchanged.
    """
    return df.drop(Cl, axis='columns')

# Filtering strategy: features that are unique per row, single-valued, or
# heavily skewed carry no discriminative information and are dropped.
Dcl1 = ['SaleID','seller','offerType']

# Apply the filter to both splits (training set first, then test set).
train_data, test_data = (FeatureFilter(frame, Dcl1) for frame in (train_data, test_data))


#### 特征衍生

##### 时间多尺度

In [11]:
import datetime
# Multi-scale date features: year, month, day and elapsed days
def TimeMultiscale(df,FeatureNames, today=None):
    """Expand ``YYYYMMDD`` columns into year / month / day / elapsed-days features.

    For every name in ``FeatureNames`` the columns ``<name>_year``,
    ``<name>_month``, ``<name>_day`` and ``<name>_duration`` are added (all
    cast to int) and the original column is dropped. ``df`` is modified in
    place and also returned.

    Year, month and day are sliced from the textual form of the value
    (characters 0-4, 4-6 and 6-end), so they are produced even for values that
    are not valid calendar dates. ``<name>_duration`` is the number of days
    between the parsed date and ``today``; values that cannot be parsed with
    ``%Y%m%d`` get the most frequent duration of that column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the date columns.
    FeatureNames : iterable of str
        Names of the ``YYYYMMDD`` columns to expand.
    today : optional
        Reference date for ``<name>_duration`` (anything ``pandas.Timestamp``
        accepts, timezone-naive). Defaults to the current local date, as
        before. Pass a fixed date to make the feature reproducible across
        runs and identical between separate calls.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    if today is None:
        today = pd.Timestamp.now()
    # Truncate to midnight so durations are whole days.
    today = pd.Timestamp(today).normalize()
    for FeatureName in FeatureNames:
        as_text = df[FeatureName].astype(str)
        # errors='coerce' turns unparseable values into NaT instead of raising.
        parsed = pd.to_datetime(df[FeatureName],format='%Y%m%d',errors='coerce')
        elapsed = (today - parsed).dt.days
        elapsed = elapsed.fillna(elapsed.mode()[0])
        df[FeatureName+'_year'] = as_text.str[0:4].astype('int')
        df[FeatureName+'_month'] = as_text.str[4:6].astype('int')
        df[FeatureName+'_day'] = as_text.str[6:].astype('int')
        df[FeatureName+'_duration'] = elapsed.astype('int')
        df.drop(columns=FeatureName, inplace=True)
    return df

In [12]:
#
# Derive the date features on the training set; pipe(f, arg) calls f(frame, arg).
train_data = train_data.pipe(TimeMultiscale, ['regDate', 'creatDate'])
print(train_data.columns)
# Same derivation on the test set.
test_data = test_data.pipe(TimeMultiscale, ['regDate', 'creatDate'])
print(test_data.columns)


Index(['name', 'model', 'brand', 'bodyType', 'fuelType', 'gearbox', 'power',
       'kilometer', 'notRepairedDamage', 'regionCode', 'price', 'v_0', 'v_1',
       'v_2', 'v_3', 'v_4', 'v_5', 'v_6', 'v_7', 'v_8', 'v_9', 'v_10', 'v_11',
       'v_12', 'v_13', 'v_14', 'regDate_year', 'regDate_month', 'regDate_day',
       'regDate_duration', 'creatDate_year', 'creatDate_month',
       'creatDate_day', 'creatDate_duration'],
      dtype='object')
Index(['name', 'model', 'brand', 'bodyType', 'fuelType', 'gearbox', 'power',
       'kilometer', 'notRepairedDamage', 'regionCode', 'v_0', 'v_1', 'v_2',
       'v_3', 'v_4', 'v_5', 'v_6', 'v_7', 'v_8', 'v_9', 'v_10', 'v_11', 'v_12',
       'v_13', 'v_14', 'regDate_year', 'regDate_month', 'regDate_day',
       'regDate_duration', 'creatDate_year', 'creatDate_month',
       'creatDate_day', 'creatDate_duration'],
      dtype='object')


##### 品牌价值

In [13]:
# Brand value: per-brand price statistics
def AddBrandValue(train_data, test_data):
    """Attach per-brand price statistics, computed from ``train_data``, to both frames.

    Rows of ``train_data`` are grouped by ``brand``; within each group only
    rows with ``price > 0`` are used. For every brand the row count and the
    max, min, 25% quantile, median, 75% quantile, mean, standard deviation,
    range (max - min) and inter-quartile range of ``price`` are computed and
    stored in columns prefixed ``brand_``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_merged, test_merged)``: left-joins of the inputs with the
        statistics table on ``brand``. A brand that does not occur in
        ``train_data`` gets NaN in the statistic columns.
    """
    def _summarise(prices):
        # One row of statistics for the price Series of a single brand.
        lowest, highest = prices.min(), prices.max()
        q25, q75 = prices.quantile(0.25), prices.quantile(0.75)
        return {
            'brand_amount': len(prices),
            'brand_priceMax': highest,
            'brand_priceMin': lowest,
            'brand_priceQ25': q25,
            'brand_priceMedian': prices.median(),
            'brand_priceQ75': q75,
            'brand_priceMean': prices.mean(),
            'brand_priceStd': prices.std(),
            'brand_pricePtp': highest - lowest,
            'brand_priceQPtp': q75 - q25,
        }

    per_brand = {
        brand: _summarise(rows.loc[rows['price'] > 0, 'price'])
        for brand, rows in train_data.groupby('brand')
    }
    # Brands become rows after the transpose; the index is turned into the join key.
    brand_info = pd.DataFrame(per_brand).T.reset_index().rename(columns={'index':'brand'})
    merged_train = pd.merge(train_data, brand_info, how='left', on='brand')
    merged_test = pd.merge(test_data, brand_info, how='left', on='brand')
    return merged_train, merged_test


In [14]:
#
# Join the brand statistics onto both splits and list the resulting columns.
train_data, test_data = AddBrandValue(train_data=train_data, test_data=test_data)
print(train_data.columns, test_data.columns, sep='\n')


Index(['name', 'model', 'brand', 'bodyType', 'fuelType', 'gearbox', 'power',
       'kilometer', 'notRepairedDamage', 'regionCode', 'price', 'v_0', 'v_1',
       'v_2', 'v_3', 'v_4', 'v_5', 'v_6', 'v_7', 'v_8', 'v_9', 'v_10', 'v_11',
       'v_12', 'v_13', 'v_14', 'regDate_year', 'regDate_month', 'regDate_day',
       'regDate_duration', 'creatDate_year', 'creatDate_month',
       'creatDate_day', 'creatDate_duration', 'brand_amount', 'brand_priceMax',
       'brand_priceMin', 'brand_priceQ25', 'brand_priceMedian',
       'brand_priceQ75', 'brand_priceMean', 'brand_priceStd', 'brand_pricePtp',
       'brand_priceQPtp'],
      dtype='object')
Index(['name', 'model', 'brand', 'bodyType', 'fuelType', 'gearbox', 'power',
       'kilometer', 'notRepairedDamage', 'regionCode', 'v_0', 'v_1', 'v_2',
       'v_3', 'v_4', 'v_5', 'v_6', 'v_7', 'v_8', 'v_9', 'v_10', 'v_11', 'v_12',
       'v_13', 'v_14', 'regDate_year', 'regDate_month', 'regDate_day',
       'regDate_duration', 'creatDate_year', '

数据准备完成

In [15]:
# Features for the tree-based models: every column except the label, unscaled.
X_train_TR = train_data.drop(columns='price')
X_test_TR = test_data

# Features for the gradient-descent models: standardised with a scaler that is
# fitted on the training features only and then applied to both splits.
from sklearn.preprocessing import StandardScaler
ss = StandardScaler().fit(X_train_TR)
X_train_GD = pd.DataFrame(ss.transform(X_train_TR), columns=X_train_TR.columns)
X_test_GD = pd.DataFrame(ss.transform(X_test_TR), columns=X_test_TR.columns)

# Label (shared by all models), kept as a one-column DataFrame.
y_train = train_data[['price']]


### 模型训练

#### 模型定义

In [16]:
# XGBoost regressor
import xgboost as xgb
xgrM = xgb.XGBRegressor(
    max_depth=6, learning_rate=0.05, n_estimators=10000,
    # 'reg:linear' is the deprecated alias of squared-error regression.
    objective='reg:squarederror',
    # NOTE(review): 'gpu_hist' requires a CUDA-enabled build; XGBoost >= 2.0
    # spells this tree_method='hist', device='cuda' - adjust to the installed version.
    tree_method='gpu_hist',
    subsample=0.8, colsample_bytree=0.8,
    # `min_child_samples` is a LightGBM parameter name that XGBoost does not
    # recognise. `min_child_weight` is the XGBoost counterpart; for squared
    # error it is the minimum number of instances required in a child node.
    min_child_weight=6,
    # 'auc' is a classification/ranking metric; MAE is the metric the
    # cross-validation in this notebook reports.
    eval_metric='mae',
    reg_lambda=0.5,
    random_state=42
)
# LightGBM regressor
import lightgbm as lgb
lgrM = lgb.LGBMRegressor(
            num_leaves=2**6-1, reg_alpha=0.25, reg_lambda=0.5, objective='regression',
            max_depth=-1, learning_rate=0.005, min_child_samples=6, random_state=42,
            n_estimators=10000,
            # Row subsampling (bagging) is only performed when subsample_freq
            # (bagging_freq) is non-zero; with the default of 0 the
            # subsample=0.8 setting was silently ignored.
            subsample=0.8, subsample_freq=1,
            colsample_bytree=0.8
        )

# Neural networks
from tensorflow import keras


def _build_mlp(hidden_units):
    """Build and compile a fully connected regressor over the scaled features.

    The network is ``Flatten`` (input width = number of columns of
    ``X_train_GD``), one ReLU ``Dense`` layer per entry of ``hidden_units``,
    and a single-unit output layer with an L2(0.02) kernel regularizer. It is
    compiled with MAE loss, the Adam optimizer and an MAE metric.
    """
    layers = [keras.layers.Flatten(input_shape=[X_train_GD.shape[1]])]
    layers += [keras.layers.Dense(units, activation='relu') for units in hidden_units]
    layers.append(keras.layers.Dense(1, kernel_regularizer=keras.regularizers.l2(0.02)))
    model = keras.Sequential(layers)
    model.compile(loss='mean_absolute_error', optimizer='Adam',metrics=['mae'])
    return model


# An earlier variant of both networks (previously kept here as commented-out
# code) also put the L2(0.02) kernel regularizer on every hidden layer; the
# active definitions regularize the output layer only.
nnr = _build_mlp([250, 250, 250])
nnr2 = _build_mlp([512, 256, 128, 64])





# Blending model: a small linear network (no activations) over 3 inputs,
# presumably one prediction per base model - TODO confirm against its caller.
mm = keras.Sequential()
mm.add(keras.layers.Flatten(input_shape=[3]))
mm.add(keras.layers.Dense(3))
mm.add(keras.layers.Dense(1))
mm.compile(loss='mean_absolute_error', optimizer='Adam')


#### 模型评估

In [17]:
#
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import StratifiedKFold,train_test_split

# K-fold cross-validation for any model exposing fit/predict
def KFoldCrossValidation(X, y, Mdol, batch_size=None, epochs=None, cv=5, X_test=0, reset_weights=True):
    """Cross-validate ``Mdol`` with MAE and average its test predictions over the folds.

    Parameters
    ----------
    X, y : pandas.DataFrame
        Features and label; rows are selected with ``.iloc`` and passed to the
        model as NumPy arrays.
    Mdol : object
        Model with ``fit(X, y)`` and ``predict(X)``. The same instance is
        fitted once per fold.
    batch_size, epochs : int, optional
        Forwarded to ``Mdol.fit`` only when both are truthy; otherwise
        ``fit`` is called with the data alone.
    cv : int, default 5
        Number of folds.
    X_test : DataFrame or array-like, optional
        Test features. The default ``0`` (or ``None``) means "no test set";
        previously that default crashed on ``len(0)``.
    reset_weights : bool, default True
        If the model exposes ``get_weights``/``set_weights`` (e.g. a Keras
        model), restore the weights it had on entry before every fold.
        Without this, ``fit`` continues from the previous fold, so later
        folds are validated on rows the model was already trained on. Pass
        ``False`` for the old carry-over behaviour.

    Returns
    -------
    numpy.ndarray or None
        Array of shape ``(len(X_test), 1)`` holding the mean test prediction
        over the folds, or ``None`` when no test set was given.
    """
    scores_train = []
    scores = []
    has_test = not (X_test is None or np.isscalar(X_test))
    if has_test:
        # Predict on a plain array, matching the arrays used for training.
        test_x = X_test.values if hasattr(X_test, 'values') else np.asarray(X_test)
        pred_test = np.zeros((len(test_x), 1))
    initial_weights = None
    if reset_weights and hasattr(Mdol, 'get_weights') and hasattr(Mdol, 'set_weights'):
        initial_weights = Mdol.get_weights()
    # NOTE(review): StratifiedKFold treats every distinct value of y as a class
    # and rejects continuous targets; KFold is the usual splitter for
    # regression. Kept as is so fold assignment does not change.
    sk = StratifiedKFold(n_splits=cv,shuffle=True,random_state=0)
    #
    for train_ind,val_ind in sk.split(X,y):
        # Split the data for this fold.
        train_x, train_y = X.iloc[train_ind].values, y.iloc[train_ind].values
        val_x, val_y = X.iloc[val_ind].values, y.iloc[val_ind].values
        # Start every fold from the same weights.
        # NOTE(review): optimizer state (e.g. Adam moments) is not reset here.
        if initial_weights:
            Mdol.set_weights(initial_weights)
        # Train.
        if batch_size and epochs: Mdol.fit(train_x,train_y, batch_size=batch_size, epochs=epochs)
        else: Mdol.fit(train_x,train_y)
        # Predict.
        pred_train = Mdol.predict(train_x)
        pred = Mdol.predict(val_x)
        if has_test:
            # Models may return (n,) or (n, 1); normalise so the in-place
            # accumulation into the (n, 1) buffer always broadcasts.
            pred_test += np.asarray(Mdol.predict(test_x)).reshape(-1, 1)
        # Evaluate.
        scores_train.append(mean_absolute_error(train_y,pred_train))
        scores.append(mean_absolute_error(val_y,pred))
    # Report the per-fold and aggregate scores.
    print('TrainScores:', scores_train)
    print('ValScores:', scores)
    print('TrainMean:', np.mean(scores_train), 'TrainStd:', np.std(scores_train))
    print('ValMean:  ', np.mean(scores), 'ValStd:  ', np.std(scores))
    print('\n')
    return (pred_test/cv) if has_test else None

In [18]:
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import StratifiedKFold,train_test_split
from keras.callbacks import EarlyStopping,ModelCheckpoint,RemoteMonitor,CSVLogger,LearningRateScheduler
import tensorflow.keras.backend as tfbackend
# K-fold cross-validation for Keras-style models (silent training)
def KFoldCrossValidationNN(X, y, Mdol, batch_size=None, epochs=None, cv=5, X_test=0, reset_weights=True):
    """Cross-validate a neural network with MAE and average its test predictions.

    Same procedure as ``KFoldCrossValidation``, except that ``Mdol.fit`` is
    always called with ``verbose=0``.

    Parameters
    ----------
    X, y : pandas.DataFrame
        Features and label; rows are selected with ``.iloc`` and passed to the
        model as NumPy arrays.
    Mdol : object
        Model with ``fit`` and ``predict``; ``fit`` must accept ``verbose``.
    batch_size, epochs : int, optional
        Forwarded to ``Mdol.fit`` when not ``None``. A ``None`` value is no
        longer forwarded, so the model's own default applies.
    cv : int, default 5
        Number of folds.
    X_test : DataFrame or array-like, optional
        Test features. The default ``0`` (or ``None``) means "no test set";
        previously that default crashed on ``len(0)``.
    reset_weights : bool, default True
        Restore the weights the model had on entry before every fold (when it
        exposes ``get_weights``/``set_weights``). Without this, ``fit``
        continues from the previous fold, so later folds are validated on rows
        the model was already trained on. Pass ``False`` for the old
        carry-over behaviour.

    Returns
    -------
    numpy.ndarray or None
        Array of shape ``(len(X_test), 1)`` holding the mean test prediction
        over the folds, or ``None`` when no test set was given.
    """
    scores_train = []
    scores = []
    has_test = not (X_test is None or np.isscalar(X_test))
    if has_test:
        # Predict on a plain array, matching the arrays used for training.
        test_x = X_test.values if hasattr(X_test, 'values') else np.asarray(X_test)
        pred_test = np.zeros((len(test_x), 1))
    initial_weights = None
    if reset_weights and hasattr(Mdol, 'get_weights') and hasattr(Mdol, 'set_weights'):
        initial_weights = Mdol.get_weights()
    # Only pass the options the caller actually provided.
    fit_options = {'verbose': 0}
    if batch_size is not None:
        fit_options['batch_size'] = batch_size
    if epochs is not None:
        fit_options['epochs'] = epochs
    # NOTE(review): StratifiedKFold treats every distinct value of y as a class
    # and rejects continuous targets; KFold is the usual splitter for
    # regression. Kept as is so fold assignment does not change.
    sk = StratifiedKFold(n_splits=cv,shuffle=True,random_state=0)
    # A ModelCheckpoint callback and a step-wise LearningRateScheduler (x0.1 at
    # epochs 1400, 1700 and 1900) were sketched here as commented-out code and
    # were never active. That sketch read the optimizer from the global `nnr`
    # rather than from `Mdol`; fix that before reviving it.
    for train_ind,val_ind in sk.split(X,y):
        # Split the data for this fold.
        train_x, train_y = X.iloc[train_ind].values, y.iloc[train_ind].values
        val_x, val_y = X.iloc[val_ind].values, y.iloc[val_ind].values
        # Start every fold from the same weights.
        # NOTE(review): optimizer state (e.g. Adam moments) is not reset here.
        if initial_weights:
            Mdol.set_weights(initial_weights)
        # Train silently.
        Mdol.fit(train_x, train_y, **fit_options)
        # Predict.
        pred_train = Mdol.predict(train_x)
        pred = Mdol.predict(val_x)
        if has_test:
            # Normalise to (n, 1) so the in-place accumulation always broadcasts.
            pred_test += np.asarray(Mdol.predict(test_x)).reshape(-1, 1)
        # Evaluate.
        scores_train.append(mean_absolute_error(train_y,pred_train))
        scores.append(mean_absolute_error(val_y,pred))
    # Report the per-fold and aggregate scores.
    print('TrainScores:', scores_train)
    print('ValScores:', scores)
    print('TrainMean:', np.mean(scores_train), 'TrainStd:', np.std(scores_train))
    print('ValMean:  ', np.mean(scores), 'ValStd:  ', np.std(scores))
    print('\n')
    return (pred_test/cv) if has_test else None

In [19]:
# XGboost
# KFoldCrossValidation(X_train_TR, y_train, xgrM, cv=5)


In [20]:
# LGBM
# KFoldCrossValidation(X_train_TR, y_train, lgrM, cv=5)


In [None]:
# NN: cross-validate nnr2 on the standardised features and average its test predictions.
pred_nnr_test = KFoldCrossValidation(
    X_train_GD, y_train, nnr2,
    batch_size=2048, epochs=300, cv=5,
    X_test=X_test_GD,
)
# Each row holds a single prediction; negative ones are replaced by 11
# (presumably the smallest price considered valid - TODO confirm).
pred_nnr_test = [11 if value < 0 else value for (value,) in pred_nnr_test]


Epoch 1/300
Epoch 2/300
Epoch 3/300
Epoch 4/300
Epoch 5/300
Epoch 6/300
Epoch 7/300
Epoch 8/300
Epoch 9/300
Epoch 10/300
Epoch 11/300
Epoch 12/300
Epoch 13/300
Epoch 14/300
Epoch 15/300
Epoch 16/300
Epoch 17/300
Epoch 18/300
Epoch 19/300
Epoch 20/300
Epoch 21/300
Epoch 22/300
Epoch 23/300
Epoch 24/300
Epoch 25/300
Epoch 26/300

In [None]:
# pred_nnr_test = KFoldCrossValidation(X_train_GD, y_train, nnr2, batch_size=2048, epochs=500, cv=2, X_test=X_test_GD)



In [None]:
# Put the NN predictions into the `price` column of the sample submission and save it.
df = pd.read_csv('Datas/used_car_sample_submit.csv').assign(price=pred_nnr_test)
df.to_csv('result.csv',index=False)