In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import missingno as mns

warnings.filterwarnings('ignore')

In [None]:
# Read the raw training table. The file carries no header row, so the column
# names are attached by hand right after loading.
data = pd.read_table('../附件/附件1：估价训练数据.txt', header=None)
columns = ['carId', 'tradeTime', 'brand', 'serial', 'model', 'mileage', 'color', 'cityId', 'carCode', 'tansferCount',
           'seatings', 'registerDate', 'licenseDate', 'country', 'makeType', 'modelYear', 'displacement', 'gearbox',
           'oilType', 'newPrice']
# Fifteen anonymous columns (unknown0 .. unknown14) followed by the target.
columns.extend('unknown' + str(k) for k in range(15))
columns.append('price')
data.columns = columns
data

In [None]:
data.tail(10)

In [None]:
# Visualise the missing-value pattern of the table and save the figure to disk.
mns.matrix(data)
plt.savefig('./solve1/missing_matrix.png', dpi=800)

In [None]:
# Distribution of every numeric column, one histogram panel per column.
data.hist(bins=50, figsize=(25, 20))
# Save immediately: calling plt.cla() first would wipe the current axes of the
# figure just before it is exported.
plt.savefig('./solve1/hist.png', dpi=800)

In [None]:
# Rename the date features and turn them into continuous durations (in years).

for col in ['tradeTime', 'registerDate', 'licenseDate']:
    data[col] = pd.to_datetime(data[col])
del col

# productionTime: years between the registration date and the trade date
# useTime:        years between the licence date and the trade date
# time:           age of the model year, counted back from 2021
data.rename(columns={'registerDate': 'productionTime', 'licenseDate': 'useTime', 'modelYear': 'time'}, inplace=True)

# Subtracting two datetime columns yields timedeltas; `.dt.days` reads the whole
# days directly. This avoids the string round trip through np.str / np.float
# (aliases removed from NumPy) and maps NaT to NaN instead of raising.
data['productionTime'] = (data['tradeTime'] - data['productionTime']).dt.days / 365
data['useTime'] = (data['tradeTime'] - data['useTime']).dt.days / 365
data['time'] = 2021 - data['time']
data.info()

In [None]:
# unknown11 is presumed to hold the vehicle dimensions as "a*b*c"; expand it
# into three continuous features, each divided by 1000.
dims = data['unknown11'].str.split('*', expand=True)
for pos in range(3):
    # pd.to_numeric keeps missing parts as NaN instead of failing on them.
    data['unknown11_' + str(pos + 1)] = pd.to_numeric(dims[pos]) / 1000
del dims, pos
data.drop(columns=['unknown11'], inplace=True)

In [None]:
# unknown12 looks like a year-month stamp (it is parsed with '%Y%m' below).
# Replace it by the number of years between that stamp and the trade date.
# Only the rows where it is present are touched; missing rows stay missing.

known = data['unknown12'].notna()
stamp = pd.to_datetime(data.loc[known, 'unknown12'], format='%Y%m')
# Work on `data` through .loc rather than on a filtered copy, and read the
# day count from the timedelta directly instead of parsing its string form.
data.loc[known, 'unknown12'] = (data.loc[known, 'tradeTime'] - stamp).dt.days / 365
del known, stamp
data

In [None]:
# A raw date cannot be fed to a regression model. Treat 2021-12-31 as "today"
# and replace tradeTime by the time elapsed since the trade date, in years.
data['tradeTime'] = (pd.to_datetime('2021-12-31') - data['tradeTime']).dt.days / 365

data

In [None]:
# Share of missing entries per column; only columns with at least one gap are kept.
missing = data.isna().mean()
missing = missing.loc[missing > 0.]
missing

In [None]:
# Remove two of the anonymous columns entirely.
data.drop(columns=['unknown6', 'unknown14'], inplace=True, axis=1)

# Keep only the rows where both carCode and gearbox are present.
data = data[data['carCode'].notna() & data['gearbox'].notna()]
# Narrow `missing` to the columns whose missing share lies strictly between 1% and 60%.
missing = missing[(missing > .01) & (missing < .6)]
data

In [None]:
# Mutual information between every fully observed feature (rows of `temp`) and
# every feature that still has missing values (columns of `temp`).

features = [i for i in data.columns if i not in missing.index and i != 'price']
from sklearn.feature_selection import mutual_info_classif
temp = pd.DataFrame(index=features, columns=missing.index)
for index in temp.index:
    for col in temp.columns:
        # dropna() returns a new frame, so `data` itself is never modified.
        pair = data[[index, col]].dropna(axis=0)
        try:
            # Seeded so that re-running the cell reproduces the same table.
            info = mutual_info_classif(pair[index].values.reshape(-1, 1), pair[col], random_state=42)
        except ValueError:
            # The estimator rejected this pair (presumably a target that is not
            # usable as class labels); the entry stays NaN and the whole column
            # is removed by the dropna below.
            continue
        temp.loc[index, col] = info[0]

# `float` replaces the np.float alias, which no longer exists in current NumPy.
temp = temp.astype(float)
temp.dropna(axis=1, inplace=True)
temp

In [None]:
# Heat map of the `temp` table, saved to disk.
sns.heatmap(temp, cmap='RdBu')
plt.savefig('./solve1/corr1.png', dpi=800)

In [None]:
plt.figure(figsize=(8, 8))
# Boolean mask with the shape of `temp`; the built-in `bool` replaces the
# np.bool alias, which no longer exists in current NumPy.
mask = np.zeros_like(temp[temp>1],dtype=bool)
# Hide the upper triangle (diagonal included).
mask[np.triu_indices_from(mask)] = True
sns.set_style(style="white")
# Show only the strongly related pairs: entries below the threshold of 1 are
# NaN and therefore drawn blank, which leaves the strong relations visible.
# The result is a heat map of the mutual-information values.
sns.heatmap(temp[temp>=1],annot=True,mask=mask,cbar=True, linewidths=.5)
plt.savefig('./solve1/corr2.png', dpi=800)

In [None]:
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.model_selection import train_test_split
# Decision-tree based feature selection: for every column with missing values,
# fit a classifier on the fully observed features and record both its
# feature importances and its hold-out score.
importance = pd.DataFrame(index=features, columns=missing.index)
score = []
for col in missing.index:
    _ = data[data[col].notna()]
    X = _[features]
    y = _[col]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    dt = DecisionTreeClassifier(random_state=42)
    try:
        dt.fit(X_train, y_train)
    except ValueError:
        # The classifier rejected the labels; cast them to int, split again and
        # fit once. (Previously the tree was also refitted after a successful
        # first fit, doing the work twice.)
        # NOTE(review): the retry uses test_size=.3 while the first attempt uses 0.2 — confirm intent.
        y = y.astype(int)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3, random_state=42)
        dt.fit(X_train, y_train)
    score.append(dt.score(X_test, y_test))
    importance[col] = dt.feature_importances_
    del _, X, y, X_train, X_test, y_train, y_test, dt

# Append the hold-out scores as a final 'score' row.
_ = importance.T
_['score'] = score
importance = _.T

plt.figure(figsize=(8, 8))
# The last row holds the scores, so it is left out of the importance heat map.
sns.heatmap(importance.iloc[:-1])
plt.savefig('./solve1/DecisionTreeFeatureSelection.png', dpi=800)

In [None]:
data[missing.index]

In [None]:
# 基于模型的缺失值处理，利用树模型对缺失值进行预测，以准确率和F1值综合评判模型效果
from sklearn.metrics import f1_score, r2_score
from sklearn.model_selection import train_test_split

# NOTE(review): `feature` keeps 'price', whereas the imputation loop later in the
# notebook builds its predictors without it — confirm which set is intended.
feature = [i for i in data.columns if i not in missing.index]
acc = pd.DataFrame(index=missing.index, columns=['accuracy', 'score'])

score = []
f1 = []

for col in missing.index:
    # 'time' and 'unknown12' are regressed; every other column is classified.
    if col in ['time', 'unknown12']:
        dt = DecisionTreeRegressor(random_state=42)
    else:
        dt = DecisionTreeClassifier(random_state=42)
    temp = data[data[col].notna()]
    X = temp[feature]
    y = temp[col]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3, random_state=42)
    try:
        dt.fit(X_train, y_train)
    except ValueError:
        # Labels rejected by the estimator: cast to int, split again, fit once.
        y = y.astype(int)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3, random_state=42)
        dt.fit(X_train, y_train)
    score.append(dt.score(X_test, y_test))
    predict = dt.predict(X_test)
    # Second metric: R² for the regressors, weighted F1 for the classifiers.
    if type(dt) is DecisionTreeRegressor:
        f1.append(r2_score(y_test, predict))
    else:
        f1.append(f1_score(y_test, predict, average='weighted'))

acc['accuracy'] = score
acc['score'] = f1
acc

In [None]:
# Columns whose accuracy and F1 both exceed 90% are imputed with the model:
# keep only the rows of `acc` where both values are above 0.9, best accuracy first.
acc = acc[acc>.9].dropna(axis=0)
acc = acc.sort_values(by='accuracy', ascending=False)
acc

In [None]:
from pickle import dump
from sklearn.preprocessing import LabelEncoder

cols = []
for col in acc.index:
    # Recompute the missing shares on every pass: a column filled in an earlier
    # pass no longer counts as missing and can serve as a predictor.
    missing = (data.shape[0] - data.count()) / data.shape[0]
    missing = missing[missing > 0.]

    # 'time' and 'unknown12' are regressed; every other column is classified.
    if col in ['time', 'unknown12']:
        dt = DecisionTreeRegressor(random_state=42)
    else:
        dt = DecisionTreeClassifier(random_state=42)

    temp = data[data[col].notna()]
    nan = data[data[col].isna()]

    predictors = [i for i in temp.columns if i not in missing.index and i != 'price']
    X = temp[predictors]
    y = temp[col]
    X_ = nan[predictors]
    cols.append(X.columns)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3, random_state=42)

    try:
        dt.fit(X_train, y_train)
    except ValueError:
        # The retry must cast the labels that are actually passed to fit/score;
        # casting only `y` left `y_train` unchanged, so the refit failed again.
        y = y.astype(int)
        y_train = y_train.astype(int)
        y_test = y_test.astype(int)
        dt.fit(X_train, y_train)

    print(col, dt.score(X_test, y_test))
    try:
        y_pre = dt.predict(X_)
        data.loc[data[col].isnull(), col] = y_pre
    except ValueError as err:
        # Still best-effort, but report the column instead of hiding the failure.
        print(col, 'was not imputed:', err)

    # Persist the fitted model; the file name embeds the estimator's repr.
    PATH = "../models/" + str(col) + "-" + str(dt) + '.pk'
    with open(PATH, 'wb') as f:
        dump(dt, f)

    # unknown10 is label-encoded once it has been filled; the encoder is saved too.
    if col == 'unknown10':
        label = LabelEncoder()
        data['unknown10'] = label.fit_transform(data['unknown10'])
        with open('../models/label.pk', 'wb') as f:
            dump(label, f)

# Save the predictor column lists, one entry per imputed column.
with open('./dict.pk', 'wb') as f:
    dump(cols, f)
    

In [None]:
# from pandas_profiling import ProfileReport
#
# pr = ProfileReport(data)
# pr.to_file('./solve1/report.html')

In [None]:
data

In [None]:
data['country'].unique()

In [None]:
# Pass the Series as `x=`: the first positional parameter of countplot is
# `data` in recent seaborn, which would not count the individual values.
sns.countplot(x=data['country'])

In [None]:
data[data['country'] == 0][['brand', 'serial']].value_counts()

In [None]:
data[data['brand']==68][['country', 'serial']].value_counts()

In [None]:
data[data['brand']==72][['country', 'serial']].value_counts()

In [None]:
# Assign the result back: an in-place replace on the selected column is not
# guaranteed to reach `data` (it operates on a copy under copy-on-write).
data['country'] = data['country'].replace(0, 779413)

In [None]:
# Rows where the used price exceeds the new-car price.
think = data.loc[data['price'] > data['newPrice'], ['price', 'newPrice']]
think['point'] = think['newPrice'] - think['price']
# Of those, the rows whose gap is larger than 30% of the new-car price.
_ = think[(think['point'] / think['newPrice']).abs() > .3]
_

In [None]:
# Remove from `data` the rows currently held in `_`, matched by index label.
data.drop(index=_.index, axis=0, inplace=True)
data

In [None]:
sns.distplot(data['price'])

In [None]:
from sklearn.preprocessing import StandardScaler

ss = StandardScaler()

# Cut the price range into 30 equal-width bins labelled 1..30 and store the bin as `level`.
level = pd.cut(data['price'], bins=30, labels=list(range(1, 31)))
data['level'] = level

# Features are every column except the two targets; both targets travel together in `y`.
X = data.drop(columns=['price', 'level'])
y = data[['price', 'level']]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3, random_state=42)

In [None]:
# Columns handled as continuous values.
number = ['tradeTime', 'mileage', 'productionTime', 'newPrice', 'unknown12', 'unknown11_1',
          'unknown11_2', 'unknown11_3', 'useTime']
# Every remaining column, excluding the target columns held in `y`.
classif = [i for i in data.columns if i not in number and i not in y.columns]

In [None]:
X_train_classif = X_train[classif]
X_train_number = X_train[number]

In [None]:
def ape(model, train, target):
    """Print and return the absolute percentage error of each prediction.

    The error is |target - model.predict(train)| / target, element-wise.
    """
    ratio = np.abs(target - model.predict(train)) / target
    print('ape: ', str(ratio))
    return ratio

def mape(model, train, target):
    """Print and return the mean of the absolute percentage errors."""
    errors = ape(model, train, target)
    mean_error = np.sum(errors) / errors.shape[0]
    print('mape: ', str(mean_error))
    return mean_error

def accuracy5(model, train, target):
    """Print and return the share of predictions with an error of at most 5%."""
    errors = ape(model, train, target)
    hit_rate = errors[errors <= .05].shape[0] / errors.shape[0]
    print('acc5: ', str(hit_rate))
    return hit_rate

def metrics(model, train, target):
    """Combined score: 0.2 * (1 - mape) + 0.8 * accuracy5.

    The (model, X, y) signature lets it be passed directly as `scoring=`.
    """
    mean_error = mape(model, train, target)
    hit_rate = accuracy5(model, train, target)
    return 0.2*(1-mean_error) + 0.8*hit_rate

In [None]:
from keras.layers import Dense, Dropout, concatenate
from keras.models import Model, Input

# Learn the scaling statistics from the training split only, then apply the
# very same transformation to the test split. Re-fitting on the test data would
# scale the two splits differently and leak test statistics.
X_train_number = ss.fit_transform(X_train_number)
X_test_number = ss.transform(X_test[number])
X_test_classif = X_test[classif]

In [None]:
# input_num = Input(shape=(9,), name='num_input1')
# input_class = Input(shape=(31,), name='class_input1')
# input_num_ = Input(shape=(9,), name='num_input2')
# input_class_ = Input(shape=(31,), name='class_input2')
#
# x1 = Dense(128, activation='relu', name='level_1')(input_num)
# x11 = Dropout(0.8)(x1)
# x2 = Dense(128, activation='sigmoid', name='level_2')(input_class_)
# x21 = Dropout(0.8)(x2)
# x3 = Dense(128, activation='relu', name='regression_1')(input_num_)
# x31 = Dropout(0.8)(x3)
# x4 = Dense(128, activation='sigmoid', name='regression_2')(input_class)
# x41 = Dropout(0.8)(x4)
#
# x12 = Dense(32, activation='relu', name='level_11')(x11)
# x13 = Dropout(0.75)(x12)
# x22 = Dense(32, activation='sigmoid', name='level_21')(x21)
# x23 = Dropout(.75)(x22)
# x32 = Dense(32, activation='relu', name='regression_11')(x31)
# x33 = Dropout(.75)(x32)
# x42 = Dense(32, activation='sigmoid', name='regression_21')(x41)
# x43 = Dropout(.75)(x42)
#
# X1 = concatenate([x13, x23])
# X2 = concatenate([x33, x43])
#
# X11 = Dense(128, activation='relu', name='level_concat')(X1)
# X11 = Dropout(.8)(X11)
# y1 = Dense(100, activation='softmax', name='level')(X11)
# X21 = Dense(64, activation='relu', name='regression_concat')(X2)
#
# X3 = concatenate([x21, y1])
# X31 = Dense(64, activation='relu', name='concat')(X3)
# X32 = Dropout(.9)(X31)
# y2 = Dense(1, name='target')(X32)

In [None]:
# model = Model([input_num, input_class, input_num_, input_class_], [y1, y2])

In [None]:
# model.summary()

In [None]:
# model.compile(optimizer='adam', loss='mse', metrics='mape')
# history = model.fit(x={
#     'num_input1': X_train_number,
#     'class_input1': X_train_classif,
#     'num_input2': X_train_number,
#     'class_input2': X_train_classif,
# }, y={
#     'target': y['price'],
#     'level': y['level']
# },
# epochs=64,
# validation_split=.3,
# batch_size=2048)

In [None]:
# plt.plot(history.history['target_loss'])

In [None]:
# from sklearn.metrics import r2_score
#
# r2_score(y_test['price'], model.predict([X_test_number, X_test_classif, X_test_number, X_test_classif])[1])

In [None]:
# pre = model.predict([X_test_number, X_test_classif, X_test_number, X_test_classif])
# pre[1]

In [None]:
# plt.plot(range(len(pre[1])), pre[1], 'r')
# plt.plot(range(len(pre[1])), y_test['price'], 'b')

In [None]:
from xgboost import XGBRegressor

# Baseline: XGBRegressor with default settings, fitted on the 'price' column of
# the training targets; the last line shows its score on the test split.
xgb = XGBRegressor()
xgb.fit(X_train, y_train['price'])
xgb.score(X_test, y_test['price'])

In [None]:
metrics(xgb, X_test, y_test['price'])

In [None]:
# Pass the Series as `x=`: the first positional parameter of countplot is
# `data` in recent seaborn, which would not count the individual values.
sns.countplot(x=data['level'])

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Binary target: level 1 versus everything else (encoded as -1).
level = data['level'].astype(int)
level[level!=1] = -1
# Keyword `x=` so that seaborn counts the values of the Series.
sns.countplot(x=level)

In [None]:
from imblearn.over_sampling import SMOTE

# Oversample with SMOTE, then plot the class counts of the resampled target.
smote = SMOTE(random_state=42)
X, y = smote.fit_resample(X, level)
# Keyword `x=` so that seaborn counts the values of the resampled target.
sns.countplot(x=y)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Seeded like every other tree in this notebook so the score is reproducible.
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train,y_train)
dt.score(X_test, y_test)

In [None]:
dt.score(data[[i for i in data.columns if i not in ['price', 'level']]], level)

In [None]:
f1_score(level, dt.predict(data[[i for i in data.columns if i not in ['price', 'level']]]))

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import f1_score, make_scorer

X = data[[i for i in data.columns if i not in ['price', 'level']]]
y = data['level']

In [None]:
def predict(model, train):
    """Return ``model.predict(train)``.

    The prediction used to be bound to a throwaway local and dropped, so the
    function always returned ``None``.
    """
    return model.predict(train)


In [None]:
# Restrict to the rows in price level 1 and split them for a price regression.
_ = data[data['level'] == 1]
X = _.drop(columns=['price', 'level'])
y = _['price']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3, random_state=42)

In [None]:
# XGBRegressor with hand-picked settings, fitted on the current training split;
# the last line shows its score on the current test split.
xgr = XGBRegressor(n_estimators=120, max_depth=7, eta=.1, random_state=42)
xgr.fit(X_train, y_train)
xgr.score(X_test, y_test)

In [None]:
cross_val_score(xgr, X_train, y_train, scoring=metrics, cv=5, n_jobs=-1)

In [None]:
plt.plot(xgr.predict(X_test))

In [None]:
corr = data[number].corr()
sns.heatmap(corr, annot=True, linewidths=.1)

In [None]:
from sklearn.linear_model import LinearRegression

# The `normalize` argument was removed from LinearRegression and now raises a
# TypeError. For plain least squares, rescaling the inputs does not change the
# fitted predictions, so the default constructor is the drop-in replacement.
lr = LinearRegression()
lr.fit(X_train, y_train)
lr.score(X_test, y_test)

In [None]:
data['price'].plot()

In [None]:
sns.distplot(data['price'])
print(data['price'].skew(), data['price'].kurt())
plt.savefig('./solve1/true.png', dpi=800)

In [None]:
price = np.log1p(data['price'])
sns.distplot(price)
print(price.skew(), price.kurt())
plt.savefig('./solve1/log(1+n).png', dpi=800)

In [None]:
X = data[[i for i in data.columns if i not in ['price', 'level']]]
X_train, X_test, y_train, y_test = train_test_split(X, price, test_size=.3, random_state=42)

lr.fit(X_train, y_train)
lr.score(X_test, y_test)

In [None]:
xgr = XGBRegressor()
xgr.fit(X_train, y_train)
xgr.score(X_test, y_test)

In [None]:
cross_val_score(xgr, X_train, y_train, scoring=metrics, cv=5, n_jobs=-1)

In [None]:
X.shape

In [None]:
# from keras.layers import Dense
# from keras.models import Sequential
#
# model = Sequential()
# model.add(Dense(32, input_dim=40, activation='relu'))
# model.add(Dense(8, activation='relu'))
# model.add(Dense(1))
#
# model.compile(optimizer='Adam', loss='mse')
# his = model.fit(X_train, y_train, epochs=16,  validation_split=.3)

In [None]:
# plt.plot(his.history['loss'], 'r', label='loss')
# plt.plot(his.history['val_loss'], 'b', label='val_loss')
# plt.legend()

In [None]:
# model.predict(X)

In [None]:
from lightgbm.sklearn import LGBMRegressor

gbm = LGBMRegressor()
gbm.fit(X_train.values, y_train)
gbm.score(X_test.values, y_test)

In [None]:
X_train

In [None]:
def Ape(y_predict, y):
    """Element-wise absolute percentage error, |y_predict - y| / y.

    Note the argument order: the prediction comes first, the reference value
    second (the reverse of scikit-learn's ``(y_true, y_pred)`` convention).
    """
    return np.abs(y_predict - y) / y

def Mape(y_predict, y):
    """Mean of the absolute percentage errors."""
    return Ape(y_predict, y).mean()

def Acc5(y_predict, y):
    """Share of predictions whose absolute percentage error is at most 5%.

    The bound is inclusive so that an error of exactly 0.05 counts as a hit,
    in agreement with ``accuracy5`` defined earlier in this notebook.
    """
    ape = Ape(y_predict, y)
    return ape[ape <= .05].shape[0] / y.shape[0]

def met(y_predict, y):
    """Combined score: 0.2 * (1 - Mape) + 0.8 * Acc5."""
    return .2 * (1 - Mape(y_predict, y)) + .8 * Acc5(y_predict, y)

def true(price_ln):
    """Map a log1p-transformed price back to the original scale.

    np.expm1 is the exact inverse of np.log1p and stays accurate near zero,
    unlike computing ``np.e ** x - 1`` directly.
    """
    return np.expm1(price_ln)

In [None]:
# Map predictions and targets through `true` before scoring with `met`.
pre = true(xgr.predict(X_test))
y_true = true(y_test)
met(pre, y_true)

In [None]:
# make_scorer invokes its function as f(y_true, y_pred), while `met` expects the
# prediction first; swap the arguments so errors are measured relative to the
# true values rather than to the predictions.
cross_val_score(xgr, X, price, cv=5, scoring=make_scorer(lambda y_true, y_pred: met(y_pred, y_true)))

In [None]:
Acc5(pre, y_true)

In [None]:
from sklearn.linear_model import Lasso, Ridge, LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from xgboost.sklearn import XGBRegressor
from lightgbm.sklearn import LGBMRegressor
from catboost import CatBoostRegressor

# Candidate regressors, each constructed with its default arguments.
models = [
    LinearRegression(),
    Lasso(),
    Ridge(),
    DecisionTreeRegressor(),
    GradientBoostingRegressor(),
    RandomForestRegressor(),
    XGBRegressor(),
    LGBMRegressor(),
    CatBoostRegressor()
]

In [None]:
result = dict()

for model in models:
    # The class name is a stable key for every estimator; str(model) is only
    # usable when the class provides a "Name(...)"-style repr.
    model_name = type(model).__name__
    # make_scorer calls f(y_true, y_pred) but `met` takes the prediction first,
    # so the arguments are swapped before delegating.
    scores = cross_val_score(model, X_train, y_train, cv=5,
                             scoring=make_scorer(lambda y_true, y_pred: met(y_pred, y_true)))
    result[model_name] = scores

# One column per model, one row per cross-validation fold.
result = pd.DataFrame(result)

In [None]:
result

In [None]:
best_model = CatBoostRegressor()
best_model.fit(X_train, y_train)
best_model.score(X_test, y_test)

In [None]:
# from sklearn.model_selection import GridSearchCV
#
# params = {
# 'iterations': [500, 1000],
# 'learning_rate':  [0.01, 0.05, 0.1],
# 'max_depth': [5, 10],
# 'l2_leaf_reg': [1, 3],
# 'task_type': ['GPU']
# }
#
# clf = GridSearchCV(CatBoostRegressor(), param_grid=params, cv=3, scoring=make_scorer(met))
# clf.fit(X_train, y_train)

In [None]:
# print(clf.best_score_, clf.best_params_)

In [None]:
# make_scorer invokes its function as f(y_true, y_pred), while `met` expects the
# prediction first; swap the arguments so errors are measured relative to the
# true values rather than to the predictions.
cross_val_score(CatBoostRegressor(iterations=1000, l2_leaf_reg=3, learning_rate=.1, max_depth=10,
                                  task_type='GPU'), X_train, y_train, cv=5,
                scoring=make_scorer(lambda y_true, y_pred: met(y_pred, y_true)))

In [None]:
# Fit a default CatBoostRegressor, tracking the test split as evaluation set.
model = CatBoostRegressor()
model.fit(X_train, y_train,eval_set=(X_test, y_test), plot=True)
# NOTE(review): plot=True presumably renders CatBoost's own widget rather than a
# matplotlib figure, in which case this savefig would not capture the training
# curves — confirm what ends up in fit.png.
plt.savefig('./solve1/fit.png', dpi=800)

In [None]:
fea = model.feature_importances_
fea_name = model.feature_names_
plt.barh(fea_name, fea, height=.5)
plt.savefig('./solve1/importance_.png', dpi=800)

In [None]:
for name, value in zip(fea_name, fea):
    print(name, value)

In [None]:
model.score(X_test, y_test)

In [None]:
y_pre = true(model.predict(X))
met(y_pre, data['price'])

In [None]:
met(model.predict(X), price)

In [None]:
Acc5(model.predict(X), price)

In [None]:
Mape(model.predict(X), price)

In [None]:
Acc5(y_pre, data['price'])

In [None]:
Mape(y_pre, data['price'])