In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')


### 加载数据

In [2]:
# 加载训练集数据
train_data = pd.read_csv('Datas/used_car_train_20200313.csv', sep=' ')
print(train_data.info(verbose=True,null_counts=True),'\n')

# 加载测试集数据
test_data = pd.read_csv('Datas/used_car_testB_20200421.csv',sep=' ')
print(test_data.info(verbose=True,null_counts=True))


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150000 entries, 0 to 149999
Data columns (total 31 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   SaleID             150000 non-null  int64  
 1   name               150000 non-null  int64  
 2   regDate            150000 non-null  int64  
 3   model              149999 non-null  float64
 4   brand              150000 non-null  int64  
 5   bodyType           145494 non-null  float64
 6   fuelType           141320 non-null  float64
 7   gearbox            144019 non-null  float64
 8   power              150000 non-null  int64  
 9   kilometer          150000 non-null  float64
 10  notRepairedDamage  150000 non-null  object 
 11  regionCode         150000 non-null  int64  
 12  seller             150000 non-null  int64  
 13  offerType          150000 non-null  int64  
 14  creatDate          150000 non-null  int64  
 15  price              150000 non-null  int64  
 16  v_

### Exploratory Data Analysis

In [3]:
# QuickEDA
# import pandas_profiling
# temp = pandas_profiling.ProfileReport(train_data,minimal=True)
# temp.to_file("./exampleS.html")
# temp


In [4]:
# 查看数值类型特征
print('数值类型特征：',list(train_data.select_dtypes(exclude='object').columns),'\n')

# 查看分类类型特征
print('分类类型特征：',list(train_data.select_dtypes(include='object').columns))

数值类型特征： ['SaleID', 'name', 'regDate', 'model', 'brand', 'bodyType', 'fuelType', 'gearbox', 'power', 'kilometer', 'regionCode', 'seller', 'offerType', 'creatDate', 'price', 'v_0', 'v_1', 'v_2', 'v_3', 'v_4', 'v_5', 'v_6', 'v_7', 'v_8', 'v_9', 'v_10', 'v_11', 'v_12', 'v_13', 'v_14'] 

分类类型特征： ['notRepairedDamage']


In [5]:
train_data.head()

Unnamed: 0,SaleID,name,regDate,model,brand,bodyType,fuelType,gearbox,power,kilometer,...,v_5,v_6,v_7,v_8,v_9,v_10,v_11,v_12,v_13,v_14
0,0,736,20040402,30.0,6,1.0,0.0,0.0,60,12.5,...,0.235676,0.101988,0.129549,0.022816,0.097462,-2.881803,2.804097,-2.420821,0.795292,0.914762
1,1,2262,20030301,40.0,1,2.0,0.0,0.0,0,15.0,...,0.264777,0.121004,0.135731,0.026597,0.020582,-4.900482,2.096338,-1.030483,-1.722674,0.245522
2,2,14874,20040403,115.0,15,1.0,0.0,0.0,163,12.5,...,0.25141,0.114912,0.165147,0.062173,0.027075,-4.846749,1.803559,1.56533,-0.832687,-0.229963
3,3,71865,19960908,109.0,10,0.0,0.0,1.0,193,15.0,...,0.274293,0.1103,0.121964,0.033395,0.0,-4.509599,1.28594,-0.501868,-2.438353,-0.478699
4,4,111080,20120103,110.0,5,1.0,0.0,0.0,68,5.0,...,0.228036,0.073205,0.09188,0.078819,0.121534,-1.89624,0.910783,0.93111,2.834518,1.923482


In [6]:
missing = train_data.isnull().sum()
missing = missing[missing > 0]
missing

model          1
bodyType    4506
fuelType    8680
gearbox     5981
dtype: int64

### 数据预处理


#### 缺失值处理

In [7]:
# 缺失值填充
def DfFillNa(df):
    df['model'] = df['model'].fillna(df['model'].mode()[0])
    df['bodyType'] = df['bodyType'].fillna(df['bodyType'].mode()[0])
    df['fuelType'] = df['fuelType'].fillna(df['fuelType'].mode()[0])
    df['gearbox'] = df['gearbox'].fillna(df['gearbox'].mode()[0])
    missing = df.isnull().sum()
    missing = missing[missing > 0]
    print(missing)
    return df
#
train_data = DfFillNa(train_data)
test_data = DfFillNa(test_data)


Series([], dtype: int64)
Series([], dtype: int64)


#### 异常值处理

In [8]:
# 异常值转换
def AbnormalReplace(df):
    df['notRepairedDamage'] = df['notRepairedDamage'].replace({'-':train_data.notRepairedDamage.mode()[0]})
    print(df.notRepairedDamage.value_counts())
    return df
#
train_data = AbnormalReplace(train_data)
test_data = AbnormalReplace(test_data)


0.0    135685
1.0     14315
Name: notRepairedDamage, dtype: int64
0.0    45293
1.0     4707
Name: notRepairedDamage, dtype: int64


#### 分类数据编码

In [9]:
def CategoryEncoding(df):
    df[['notRepairedDamage']] = df[['notRepairedDamage']].astype('float').astype('int')
    print(df[['notRepairedDamage']].head())
    return df
#
train_data = CategoryEncoding(train_data)
test_data = CategoryEncoding(test_data)


   notRepairedDamage
0                  0
1                  0
2                  0
3                  0
4                  0
   notRepairedDamage
0                  0
1                  0
2                  0
3                  0
4                  0


#### 特征筛选

In [10]:
# 定义筛选函数
def FeatureFilter(df,Cl):
    return  df.drop(columns=Cl)

# 定义筛选策略
# 非重复特征、单一值特征、过度倾斜特征没有区分意义，要排除掉
Dcl1 = ['SaleID','seller','offerType']

# # 时间特征第一版先排除
# Dcl2 = ['creatDate','regDate']

# 实施筛选
train_data = FeatureFilter(train_data, Dcl1)
test_data = FeatureFilter(test_data, Dcl1)


#### 特征衍生

In [11]:
import datetime
# 时间多尺度：年、月、日
def TimeMultiscale(df,FeatureNames):
    today = datetime.datetime.now().strftime('%Y-%m-%d')
    today = pd.to_datetime(today,format='%Y-%m-%d')
    for FeatureName in FeatureNames:
        df[FeatureName+'_year'] = df[FeatureName].apply(lambda x: str(x)[0:4])
        df[FeatureName+'_month'] = df[FeatureName].apply(lambda x: str(x)[4:6])
        df[FeatureName+'_day'] = df[FeatureName].apply(lambda x: str(x)[6:])
        df[FeatureName] = pd.to_datetime(df[FeatureName],format='%Y%m%d',errors='coerce')
        df[FeatureName+'_duration'] = (today - df[FeatureName]).dt.days
        df[FeatureName+'_duration'] = df[FeatureName+'_duration'].fillna(df[FeatureName+'_duration'].mode()[0])
        df.drop(columns=FeatureName, inplace=True)
        df[[FeatureName+'_year', FeatureName+'_month', FeatureName+'_day', FeatureName+'_duration']] = df[[FeatureName+'_year', FeatureName+'_month', FeatureName+'_day', FeatureName+'_duration']].astype('int')
    return df


In [12]:
#
train_data = TimeMultiscale(train_data, ['regDate', 'creatDate'])
print(train_data.columns)
#
test_data = TimeMultiscale(test_data, ['regDate', 'creatDate'])
print(test_data.columns)


Index(['name', 'model', 'brand', 'bodyType', 'fuelType', 'gearbox', 'power',
       'kilometer', 'notRepairedDamage', 'regionCode', 'price', 'v_0', 'v_1',
       'v_2', 'v_3', 'v_4', 'v_5', 'v_6', 'v_7', 'v_8', 'v_9', 'v_10', 'v_11',
       'v_12', 'v_13', 'v_14', 'regDate_year', 'regDate_month', 'regDate_day',
       'regDate_duration', 'creatDate_year', 'creatDate_month',
       'creatDate_day', 'creatDate_duration'],
      dtype='object')
Index(['name', 'model', 'brand', 'bodyType', 'fuelType', 'gearbox', 'power',
       'kilometer', 'notRepairedDamage', 'regionCode', 'v_0', 'v_1', 'v_2',
       'v_3', 'v_4', 'v_5', 'v_6', 'v_7', 'v_8', 'v_9', 'v_10', 'v_11', 'v_12',
       'v_13', 'v_14', 'regDate_year', 'regDate_month', 'regDate_day',
       'regDate_duration', 'creatDate_year', 'creatDate_month',
       'creatDate_day', 'creatDate_duration'],
      dtype='object')


数据准备完成

In [18]:
X_train = train_data.drop(columns='price')
y_train = train_data[['price']]
X_test = test_data

模型训练

In [14]:
import xgboost as xgb
import lightgbm as lgb
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import cross_val_score,StratifiedKFold,train_test_split

### 模型评估

In [15]:
# XGboost
xgrM = xgb.XGBRegressor(max_depth=6, learning_rate=0.05, n_estimators=2000,
            objective='reg:linear', tree_method='gpu_hist',
            subsample=0.8, colsample_bytree=0.8,
            min_child_samples=3, eval_metric='auc', reg_lambda=0.5,
            random_state=42
)
# LGBM
lgrM = lgb.LGBMRegressor (
            num_leaves=2**6-1, reg_alpha=0.25, reg_lambda=0.25, objective='regression',
            max_depth=-1, learning_rate=0.005, min_child_samples=3, random_state=2021,
            n_estimators=2000, subsample=0.8, colsample_bytree=0.8
        )


In [215]:
## 模型交叉验证
def KFoldCrossValidation(X, y, Mdol, cv):
    scores_train = []
    scores = []
    sk = StratifiedKFold(n_splits=cv,shuffle=True,random_state=0)
    #
    for train_ind,val_ind in sk.split(X,y):
        #
        train_x, train_y = X.iloc[train_ind].values, y.iloc[train_ind].values
        val_x, val_y = X.iloc[val_ind].values, y.iloc[val_ind].values
        #
        Mdol.fit(train_x,train_y)
        pred_train=Mdol.predict(train_x)
        pred=Mdol.predict(val_x)
        #
        scores_train.append(mean_absolute_error(train_y,pred_train))
        scores.append(mean_absolute_error(val_y,pred))
    #
    print('TrainScores:', scores_train)
    print('ValScores:', scores)
    print('TrainMean:', np.mean(scores_train), 'TrainStd:', np.std(scores_train))
    print('ValMean:  ', np.mean(scores), 'ValStd:  ', np.std(scores))
    print('\n')


In [216]:
#
KFoldCrossValidation(X_train, y_train, xgrM, 5)
#
KFoldCrossValidation(X_train, y_train, lgrM, 5)

Parameters: { "min_child_samples" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { "min_child_samples" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { "min_child_samples" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { "min_child_samples" } might not be used.

  This may not be accurate due to some parameters are only 

### 模型训练

In [16]:
# 数据集分割
x_train,x_val,y_train,y_val = train_test_split(X_train,y_train,test_size=0.2)

In [17]:
xgrM.fit(x_train, y_train)
val_xgb = xgrM.predict(x_val)
MAE_xgb = mean_absolute_error(y_val,val_xgb)
MAE_xgb
lgrM.fit(x_train, y_train)
val_lgb = lgrM.predict(x_val)
MAE_lgb = mean_absolute_error(y_val,val_lgb)
MAE_lgb

Parameters: { "min_child_samples" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




627.4586076906778

In [65]:
val_Weighted = (1-MAE_lgb/(MAE_xgb+MAE_lgb))*val_lgb+(1-MAE_xgb/(MAE_xgb+MAE_lgb))*val_xgb
val_Weighted = [11 if i<11 else i for i in val_Weighted]


In [21]:
df = pd.read_csv('Datas/used_car_sample_submit.csv')

In [217]:
xgrM.fit(X_train, y_train)
test_xgb = xgrM.predict(X_test)
test_xgb = [11 if i<=0 else i for i in test_xgb]

Parameters: { "min_child_samples" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




[1277.3472,
 1898.5541,
 8496.933,
 1192.7019,
 1962.4825,
 1237.9575,
 524.02637,
 3358.5747,
 10056.489,
 617.4188,
 626.91656,
 2781.427,
 5792.512,
 8276.499,
 1517.7468,
 255.97115,
 1804.3505,
 8430.389,
 6857.0034,
 905.8019,
 24324.938,
 7637.946,
 1887.307,
 1594.5396,
 720.9709,
 7954.8823,
 468.69925,
 12530.172,
 3880.69,
 7926.5703,
 17886.707,
 13355.642,
 594.0285,
 14383.589,
 917.2426,
 1885.5195,
 13967.422,
 6080.3,
 1591.1154,
 345.56186,
 623.70557,
 27075.25,
 459.3643,
 8276.579,
 18601.176,
 545.6945,
 16183.062,
 18008.646,
 2089.6458,
 3188.9746,
 662.33777,
 4459.2603,
 2740.199,
 408.0093,
 14048.127,
 10516.301,
 4978.169,
 3149.1143,
 1016.5029,
 5236.1885,
 8568.896,
 1082.8252,
 5434.6196,
 8262.605,
 1266.2195,
 1518.2024,
 468.8802,
 10958.179,
 166.73096,
 17145.91,
 9257.132,
 338.0557,
 6775.8096,
 18904.797,
 3951.9282,
 15004.489,
 2999.0022,
 403.3266,
 376.48514,
 2865.8684,
 3113.453,
 2149.6172,
 4090.264,
 406.31808,
 1235.2323,
 1736.1451,
 

In [219]:
df['price'] = test_xgb
df.to_csv('result20210417.csv',index=False)


In [19]:
lgrM.fit(X_train, y_train)
test_lgr = lgrM.predict(X_test)
test_lgr = [11 if i<=0 else i for i in test_lgr]

In [22]:
df['price'] = test_lgr
df.to_csv('result_lgr_20210417.csv',index=False)

In [23]:
test_lgr = lgrM.predict(X_test)
xgrM.fit(X_train, y_train)
test_xgb = xgrM.predict(X_test)

test_Weighted = (1-MAE_lgb/(MAE_xgb+MAE_lgb))*test_lgr+(1-MAE_xgb/(MAE_xgb+MAE_lgb))*test_xgb
test_Weighted = [11 if i<11 else i for i in test_Weighted]

Parameters: { "min_child_samples" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




In [24]:
df['price'] = test_Weighted
df.to_csv('result_Weighted_20210417.csv',index=False)