<a href="https://colab.research.google.com/github/MHX1203/DataWhale-DataMining/blob/master/Model_Construction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import numpy as np
import pandas as pd
import os
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
pd.set_option('display.max_columns', 200)
sns.set_style('white')
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor, BaggingRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.linear_model import Ridge, RidgeCV
from sklearn.linear_model import ElasticNet, ElasticNetCV
from sklearn.svm import SVR
from mlxtend.regressor import StackingCVRegressor
import lightgbm as lgb
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor

# Stats
from scipy.stats import skew, norm
from scipy.special import boxcox1p
from scipy.stats import boxcox_normmax

# Misc
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import scale
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.decomposition import PCA
import pandas_profiling
pd.set_option('mode.chained_assignment', None)
%matplotlib inline

In [0]:
# Locations of the space-delimited training and test-A CSV files.
train_csv = '/content/drive/My Drive/Data/used_car_train_20200313.csv'
test_csv = '/content/drive/My Drive/Data/used_car_testA_20200313.csv'

# Read both files with identical parser settings, then show their shapes.
train_data, test_data = (pd.read_csv(csv_path, sep=' ') for csv_path in (train_csv, test_csv))
train_data.shape, test_data.shape

((150000, 31), (50000, 30))

In [0]:
# Replace the raw price with log(1 + price).
log_price = np.log1p(train_data['price'])
train_data['price'] = log_price

# Keep only rows that have a `model` value and whose power is at most 600.
has_model = train_data['model'].notna()
power_ok = train_data['power'] <= 600
train_data = train_data.loc[has_model & power_ok]



In [0]:
# Detach the target column from the training frame (pop returns it and drops it).
price = train_data.pop('price')

# Stack training rows on top of test rows so later transforms see both.
data = pd.concat([train_data, test_data])
data.shape

(199856, 30)

In [0]:
# Missing-value handling and basic feature engineering on the combined frame.

# bodyType, fuelType and gearbox: fill missing values directly with the column mode.
for col in ('bodyType', 'fuelType', 'gearbox'):
  data.loc[data[col].isnull(), col] = data[col].mode()[0]
  print('after handle missing, null value counts in %s: %d'%(col, data[col].isnull().sum()))

# The placeholder '-' gets its own code 2.0; every other value is cast to float.
data['notRepairedDamage'] = data['notRepairedDamage'].apply(lambda x: float(x) if x != '-' else 2.0)

# Per-bodyType column medians, computed only from rows with a positive power.
# (The deprecated `axis=0` argument and the reset_index/set_index round-trip
# are dropped; the resulting frame is the same, indexed by bodyType.)
group_bodyType = data[data['power'] > 0].groupby('bodyType').median()

# Rows with power == 0 (presumably "unknown" -- confirm against the data
# dictionary) take the median power of their bodyType. This is vectorized:
# the previous row-wise DataFrame.apply ran a Python-level lookup per row.
# np.where works positionally, so the duplicated index labels produced by
# concatenating train and test do not matter. A bodyType with no positive-power
# rows now yields NaN instead of raising KeyError.
zero_power = data['power'] == 0
data['power'] = np.where(zero_power, data['bodyType'].map(group_bodyType['power']), data['power'])

# The year is the first four characters of the date written as a string.
data['reg_year'] = data['regDate'].astype(str).str[:4].astype(int)

data['create_year'] = data['creatDate'].astype(str).str[:4].astype(int)

# Year difference bucketed into 5-year bins, with bins above 3 capped at 3.
data['delta_year'] = ((data['create_year'] - data['reg_year']) // 5).clip(upper=3)

after handle missing, null value counts in bodyType: 0
after handle missing, null value counts in fuelType: 0
after handle missing, null value counts in gearbox: 0


In [0]:
# Columns whose name starts with 'v_'.
digit_feats = [name for name in data.columns if name.startswith('v_')]

# Skewness per feature; select those with absolute skew above 0.5.
skews = data[digit_feats].skew().sort_values()
skew_idx = skews[np.abs(skews) > 0.5].index


# Features that contain values below 0: exponentiate first (the values here
# are small), then apply the Box-Cox transform.
exp_boxcoxs = ['v_14', 'v_11', 'v_2']
for col in exp_boxcoxs:
  exponentiated = np.exp(data[col])
  data[col] = boxcox1p(exponentiated, boxcox_normmax(exponentiated + 1))

# Remaining skewed features get Box-Cox directly on their raw values.
for col in skew_idx:
  if col in exp_boxcoxs:
    continue
  data[col] = boxcox1p(data[col], boxcox_normmax(data[col] + 1))

In [0]:
# Delete (placeholder cell: no code yet)

In [0]:
# The training rows were concatenated first, so they occupy the leading
# positions of `data`; take them back by position.
n_train_rows = len(train_data)
train_data = data.iloc[:n_train_rows]
train_data.shape

(149856, 33)

In [0]:
# Number of training rows per `model` value, most frequent first.
train_data.loc[:, 'model'].value_counts()

0.0      11756
19.0      9564
4.0       8438
1.0       6018
29.0      5182
         ...  
240.0        2
245.0        2
209.0        2
242.0        2
247.0        1
Name: model, Length: 248, dtype: int64