In [3]:
# import basic tools
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler

Train_data = pd.read_csv('data/used_car_train_20200313.csv', sep=' ')
Test_data = pd.read_csv('data/used_car_testB_20200421.csv', sep=' ')

In [4]:
# check data
Train_data

Unnamed: 0,SaleID,name,regDate,model,brand,bodyType,fuelType,gearbox,power,kilometer,...,v_5,v_6,v_7,v_8,v_9,v_10,v_11,v_12,v_13,v_14
0,0,736,20040402,30.0,6,1.0,0.0,0.0,60,12.5,...,0.235676,0.101988,0.129549,0.022816,0.097462,-2.881803,2.804097,-2.420821,0.795292,0.914762
1,1,2262,20030301,40.0,1,2.0,0.0,0.0,0,15.0,...,0.264777,0.121004,0.135731,0.026597,0.020582,-4.900482,2.096338,-1.030483,-1.722674,0.245522
2,2,14874,20040403,115.0,15,1.0,0.0,0.0,163,12.5,...,0.251410,0.114912,0.165147,0.062173,0.027075,-4.846749,1.803559,1.565330,-0.832687,-0.229963
3,3,71865,19960908,109.0,10,0.0,0.0,1.0,193,15.0,...,0.274293,0.110300,0.121964,0.033395,0.000000,-4.509599,1.285940,-0.501868,-2.438353,-0.478699
4,4,111080,20120103,110.0,5,1.0,0.0,0.0,68,5.0,...,0.228036,0.073205,0.091880,0.078819,0.121534,-1.896240,0.910783,0.931110,2.834518,1.923482
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
149995,149995,163978,20000607,121.0,10,4.0,0.0,1.0,163,15.0,...,0.280264,0.000310,0.048441,0.071158,0.019174,1.988114,-2.983973,0.589167,-1.304370,-0.302592
149996,149996,184535,20091102,116.0,11,0.0,0.0,0.0,125,10.0,...,0.253217,0.000777,0.084079,0.099681,0.079371,1.839166,-2.774615,2.553994,0.924196,-0.272160
149997,149997,147587,20101003,60.0,11,1.0,1.0,0.0,90,6.0,...,0.233353,0.000705,0.118872,0.100118,0.097914,2.439812,-1.630677,2.290197,1.891922,0.414931
149998,149998,45907,20060312,34.0,10,3.0,1.0,0.0,156,15.0,...,0.256369,0.000252,0.081479,0.083558,0.081498,2.075380,-2.633719,1.414937,0.431981,-1.659014


In [5]:
# 合并方便后面的操作
df = pd.concat([Train_data, Test_data], ignore_index=True)


# 数据预处理

## 数据清洗
1. 空值转换
2. 数据类型转换

In [6]:
df["seller"].value_counts()

seller
0    199999
1         1
Name: count, dtype: int64

In [7]:
df["offerType"].value_counts()

offerType
0    200000
Name: count, dtype: int64

## 不合法数据处理

In [8]:
df.drop(['seller'], axis=1, inplace=True)
df.drop(['offerType'], axis=1, inplace=True)

### power数据范围的问题
power与price之间的相关性系数为0.566101，不能完全忽略

处理策略：
1. 中位数，平均数修改
2. 随机森林填补

In [9]:
# 查看power值大于600的数据分布情况
df[df['power'] > 600]['power'].count()

194

In [10]:
# 使用map函数，以power列的中位数来替换数值超出范围的power
df['power'] = df['power'].map(lambda x: df['power'].mean() if x > 600 else x)

In [11]:
# 检查是否替换成功
df['power'].plot.hist()

ImportError: matplotlib is required for plotting when the default backend "matplotlib" is selected.

### 日期数据清洗
1. 日期数据不合法：将int => str   str => datetime  当存在非法日期时，string是无法转换到datetime的

In [None]:
# there is an error date in regDate like xxxx00xx
def correct_date_errors(date_series):
    def fix_date(date):
        date_str = str(date)
        if date_str[4:6] == '00':
            date_str = date_str[:4] + '01' + date_str[6:]
        if date_str[6:] == '00':
            date_str = date_str[:6] + '01'
        return int(date_str)
    
    corrected_dates = date_series.apply(fix_date)
    return pd.to_datetime(corrected_dates, format='%Y%m%d')

# Convert date columns
df['regDate'] = correct_date_errors(df['regDate'])
df['creatDate'] = correct_date_errors(df['creatDate'])

# Extract year, and age
df['regYear'] = df['regDate'].dt.year
df['carAge'] = df['creatDate'].dt.year - df['regYear']

df.drop(['regDate'], axis=1, inplace=True)
df.drop(['creatDate'], axis=1, inplace=True)
df.drop(['regYear'], axis=1, inplace=True)

## 数据清洗
1. 空值转换
2. 数据类型转换

In [None]:
#notRepairedDamage的值是0和1，然后为-的值设置为0.5，在将它进行标签转换，0->1;0.5->2;1->3;这样符合神经网络的特征提取，不确定值位于两个确定值的中间
df.replace(to_replace = '-', value = 0.5, inplace = True)
le = LabelEncoder()
df['notRepairedDamage'] = le.fit_transform(df['notRepairedDamage'].astype(str))


## 线性相关性


1. 在选择需要删除的特征之前，考虑线性相关系数低的。第一步选中系数绝对值小于0.1的特征， 第二步，抛开线性相关系数，从现实角度思考每个特征对售价的影响
2. 删去 'name', 'v_2', 'v_6', 'v_1', 'v_14', 'regionCode', 'v_13', 'brand', 'v_7'

In [None]:
# 再次查看各特征与销售价格之间的线性相关系数
df.corr().unstack()['price'].sort_values(ascending=False)

price                1.000000
v_12                 0.692823
v_8                  0.685798
v_0                  0.628397
power                0.566168
gearbox              0.329075
bodyType             0.241303
fuelType             0.200536
v_5                  0.164317
model                0.136983
v_2                  0.085322
v_6                  0.068970
v_1                  0.060914
v_14                 0.035911
regionCode           0.014036
name                 0.002030
SaleID              -0.001043
v_13                -0.013993
brand               -0.043799
v_7                 -0.053024
v_4                 -0.147085
v_9                 -0.206205
notRepairedDamage   -0.232940
v_10                -0.246175
v_11                -0.275320
kilometer           -0.440519
carAge              -0.611814
v_3                 -0.730946
dtype: float64

In [None]:
drop_column = ['name', 'v_2', 'v_6', 'v_1', 'v_14', 'regionCode', 'v_13', 'brand', 'v_7']
df = df.drop(drop_column, axis=1)

In [None]:
df.columns

Index(['SaleID', 'model', 'bodyType', 'fuelType', 'gearbox', 'power',
       'kilometer', 'notRepairedDamage', 'price', 'v_0', 'v_3', 'v_4', 'v_5',
       'v_8', 'v_9', 'v_10', 'v_11', 'v_12', 'carAge'],
      dtype='object')

In [None]:
#填充众数
df.fillna(df.median(),inplace= True)

# select usefule feature
feature = ['model', 'bodyType', 'fuelType', 'gearbox', 'power',
            'kilometer', 'notRepairedDamage', 'v_0', 'v_3', 'v_4', 'v_5',
            'v_8', 'v_9', 'v_10', 'v_11', 'v_12', 'carAge']

## 切割数据,导出数据,作为神经网络的训练数据
nn_data = pd.DataFrame(df,columns=feature)
nn_data['price']=np.array(df['price'])
nn_data['SaleID']=np.array(df['SaleID'])

#特征归一化
scaler = MinMaxScaler()
scaler.fit(df[feature].values)
df = scaler.transform(df[feature].values)

print(nn_data.shape)
train_num = Train_data.shape[0]
test_num = Test_data.shape[0]
nn_data[0:int(train_num)].to_csv('data/train_nn.csv', index=0, sep=' ')
nn_data[train_num:train_num+test_num].to_csv('data/test_nn.csv', index=0, sep=' ')

print('NN模型数据已经准备完毕~~~~~~~')

(200000, 19)
NN模型数据已经准备完毕~~~~~~~


In [12]:
## import basic tools
import warnings
warnings.filterwarnings('ignore')
import tensorflow as tf
import tensorflow.keras.backend as K
from tensorflow.keras.callbacks import LearningRateScheduler
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error
import os



2024-06-11 06:18:54.479403: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-06-11 06:18:54.500648: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-06-11 06:18:54.500671: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-06-11 06:18:54.501315: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-06-11 06:18:54.505152: I tensorflow/core/platform/cpu_feature_guar

In [19]:
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))

Num GPUs Available:  1


In [18]:
tf.config.list_logical_devices('GPU')

[LogicalDevice(name='/device:GPU:0', device_type='GPU')]

In [23]:
# 读取神经网络模型数据
path = os.path.abspath(os.path.dirname(os.getcwd()) + os.path.sep + ".")
Train_NN_data = pd.read_csv('data/train_nn.csv', sep=' ')
Test_NN_data = pd.read_csv('data/test_nn.csv', sep=' ')

numerical_cols = Train_NN_data.columns
feature_cols = [col for col in numerical_cols if col not in ['price','SaleID']]
# 提前特征列，标签列构造训练样本和测试样本
X_data = Train_NN_data[feature_cols]
X_test = Test_NN_data[feature_cols]

x = np.array(X_data)
y = np.array(Train_NN_data['price'])
x_test = np.array(X_test)

# 调整训练过程的学习率
def scheduler(epoch, lr):
    # 到规定的epoch，学习率减小为原来的1/10
    if epoch in [1400, 1700, 1900]:
        new_lr = lr * 0.1
        print(f"Epoch {epoch}: Learning rate changed to {new_lr}")
        return new_lr
    return lr

reduce_lr = LearningRateScheduler(scheduler)

kfolder = KFold(n_splits=5, shuffle=True, random_state=2018)
oof_nn = np.zeros(len(x))
predictions_nn = np.zeros(len(x_test))
predictions_train_nn = np.zeros(len(x))
kfold = kfolder.split(x, y)
fold_ = 0

for train_index, vali_index in kfold:
    k_x_train = x[train_index]
    k_y_train = y[train_index]
    k_x_vali = x[vali_index]
    k_y_vali = y[vali_index]

    model = tf.keras.Sequential()
    model.add(tf.keras.layers.Dense(512, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.02)))
    model.add(tf.keras.layers.Dense(256, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.02)))
    model.add(tf.keras.layers.Dense(128, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.02)))
    model.add(tf.keras.layers.Dense(64, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.02)))
    model.add(tf.keras.layers.Dense(1, kernel_regularizer=tf.keras.regularizers.l2(0.02)))

    model.compile(loss='mean_absolute_error',
                  optimizer=tf.keras.optimizers.Adam(),
                  metrics=['mae'])

    model.fit(k_x_train, k_y_train, batch_size=2048, epochs=2000, validation_data=(k_x_vali, k_y_vali), callbacks=[reduce_lr])
    oof_nn[vali_index] = model.predict(k_x_vali).reshape((model.predict(k_x_vali).shape[0],))
    predictions_nn += model.predict(x_test).reshape((model.predict(x_test).shape[0],)) / kfolder.n_splits
    predictions_train_nn += model.predict(x).reshape((model.predict(x).shape[0],)) / kfolder.n_splits

print("NN score: {:<8.8f}".format(mean_absolute_error(oof_nn, y)))

Epoch 1/2000
Epoch 2/2000
Epoch 3/2000
Epoch 4/2000
Epoch 5/2000
Epoch 6/2000
Epoch 7/2000
Epoch 8/2000
Epoch 9/2000
Epoch 10/2000
Epoch 11/2000
Epoch 12/2000
Epoch 13/2000
Epoch 14/2000
Epoch 15/2000
Epoch 16/2000
Epoch 17/2000
Epoch 18/2000
Epoch 19/2000
Epoch 20/2000
Epoch 21/2000
Epoch 22/2000
Epoch 23/2000
Epoch 24/2000
Epoch 25/2000
Epoch 26/2000
Epoch 27/2000
Epoch 28/2000
Epoch 29/2000
Epoch 30/2000
Epoch 31/2000
Epoch 32/2000
Epoch 33/2000
Epoch 34/2000
Epoch 35/2000
Epoch 36/2000
Epoch 37/2000
Epoch 38/2000
Epoch 39/2000
Epoch 40/2000
Epoch 41/2000
Epoch 42/2000
Epoch 43/2000
Epoch 44/2000
Epoch 45/2000
Epoch 46/2000
Epoch 47/2000
Epoch 48/2000
Epoch 49/2000
Epoch 50/2000
Epoch 51/2000
Epoch 52/2000
Epoch 53/2000
Epoch 54/2000
Epoch 55/2000
Epoch 56/2000
Epoch 57/2000
Epoch 58/2000
Epoch 59/2000
Epoch 60/2000
Epoch 61/2000
Epoch 62/2000
Epoch 63/2000
Epoch 64/2000
Epoch 65/2000
Epoch 66/2000
Epoch 67/2000
Epoch 68/2000
Epoch 69/2000
Epoch 70/2000
Epoch 71/2000
Epoch 72/2000
E

In [30]:
# 测试集输出
sample = pd.read_csv('data/used_car_sample_submit.csv', sep=',')

predictions = predictions_nn
predictions[predictions < 0] = 0
sub = pd.DataFrame()
sub['SaleID'] = sample['SaleID']
sub['price'] = predictions
sub.to_csv('data/nn_test.csv', index=False)

# 验证集输出
oof_nn[oof_nn < 0] = 0
sub = pd.DataFrame()
sub['SaleID'] = Train_data['SaleID']
sub['price'] = oof_nn
sub.to_csv('data/nn_train.csv', index=False)