In [None]:
import os
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import cohen_kappa_score, accuracy_score, mean_absolute_error, f1_score
from sklearn.model_selection import GroupKFold, KFold, StratifiedKFold
from tqdm import tqdm
import lightgbm as lgb
import numpy as np
import pandas as pd
import math
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import gc
from datetime import datetime
from sklearn.preprocessing import PolynomialFeatures
from itertools import combinations

# Silence every warning for the rest of the notebook session.
warnings.filterwarnings('ignore')
# Remove pandas' display truncation: None means "no limit" for both the number
# of columns and the number of rows rendered when a frame is displayed.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [None]:
# Random seed; passed as random_state to the KFold split further down.
seed = 2020

In [None]:
# Load the raw competition files from ./raw_data; they are space-separated.
df_train = pd.read_csv('./raw_data/used_car_train_20200313.csv', sep=' ')
df_test = pd.read_csv('./raw_data/used_car_testA_20200313.csv', sep=' ')
# NOTE(review): df_sub is not referenced by any other cell visible here.
df_sub = pd.read_csv('./raw_data/used_car_sample_submit.csv', sep=' ')

In [None]:
# Stack the train and test rows into one frame so that every feature below is
# computed on both at once.
# ignore_index=True gives the combined frame a fresh, unique 0..n-1 index;
# without it the row labels of both inputs are kept as-is and can repeat,
# which makes any later label-based selection or assignment ambiguous.
df_feature = pd.concat([df_train, df_test], sort=False, ignore_index=True)

In [None]:
# Exploratory check: largest value present in the regionCode column.
df_feature['regionCode'].max()

In [None]:
# Preview the first rows of the combined train + test frame.
df_feature.head()

In [None]:
from scipy import stats

# Fill missing bodyType / fuelType / gearbox values with the most frequent
# value observed among rows that share the same (brand, model) pair.
cols = ['bodyType', 'fuelType', 'gearbox']
gp_col = 'gp'

# Temporary grouping key. The '_' separator keeps distinct pairs distinct:
# plain concatenation would map e.g. brand 1 / model 10.0 and
# brand 11 / model 0.0 to the same string '110.0'.
df_feature[gp_col] = (
    df_feature['brand'].astype('str') + '_' + df_feature['model'].astype('str'))


def _first_mode(values):
    """Return the most frequent non-null value of a Series (NaN if there is none).

    Series.mode() ignores NaN by default and returns the modes sorted, so ties
    resolve to the smallest value. Using it avoids depending on the return
    shape and NaN handling of scipy.stats.mode, which differ between SciPy
    releases (newer ones return a scalar for 1-D input, so `[0][0]` fails).
    """
    modes = values.mode()
    return modes.iloc[0] if len(modes) else np.nan


df_na = df_feature[cols].isna()
# Compute the mode of every target column per group.
df_mode = df_feature.groupby(gp_col)[cols].agg(_first_mode)

for col in cols:
    # Work with plain boolean / value arrays so the assignment is purely
    # positional and does not rely on df_feature having unique row labels.
    na_mask = df_na[col].values
    if not na_mask.any():
        continue
    names = df_feature.loc[na_mask, gp_col].values
    df_feature.loc[na_mask, col] = df_mode.loc[names, col].values

del df_feature[gp_col]
# Remaining missing values per column (groups with no observed value stay NaN).
df_feature[cols].isnull().sum()

In [None]:
# The '-' placeholder in notRepairedDamage is mapped to the numeric code 2,
# after which the whole column is stored as float.
damage_flag = df_feature['notRepairedDamage'].replace('-', 2)
df_feature['notRepairedDamage'] = damage_flag.astype('float')

In [None]:
# Preview the last rows of the combined frame after the cleaning steps above.
df_feature.tail()

In [None]:
# Remove the seller and offerType columns from the feature frame.
df_feature = df_feature.drop(columns=['seller', 'offerType'])

In [None]:
# Replace price with log(1 + price); every price statistic computed below is
# therefore on the log scale.
# NOTE: this overwrites the column in place, so re-running this cell without
# reloading the data applies the transform a second time.
df_feature['price'] = np.log1p(df_feature['price'])

# Feature engineering

In [None]:
# Names of the fifteen anonymous feature columns v_0 .. v_14.
v_cols = [f'v_{i}' for i in range(15)]

# Row-wise summaries across the v_* columns: v_mean, v_max, v_min, v_std.
for row_stat in ['mean', 'max', 'min', 'std']:
    df_feature[f'v_{row_stat}'] = getattr(df_feature[v_cols], row_stat)(axis=1)

# For each row-wise summary, broadcast its per-`name` mean / std / max / min
# back onto every row of that name group.
for col in ['v_mean', 'v_max', 'v_min', 'v_std']:
    for group_stat in ['mean', 'std', 'max', 'min']:
        df_feature[f'name_{col}_{group_stat}'] = (
            df_feature.groupby('name')[col].transform(group_stat))

In [None]:
# # Time-series features
# # NOTE: this entire cell is commented out, so it is never executed.
# df_sort = df_feature[['brand', 'model', 'creatDate', 'price']]
# df_sort = df_sort.groupby(['brand', 'model', 'creatDate'])[
#     'price'].mean().reset_index()
# df_sort.rename(columns={'price': 'brand_model_day_price_mean'}, inplace=True)
# df_sort = df_sort.sort_values(['brand', 'model', 'creatDate'])
# df_sort['brand_model_day_shift1_price_mean'] = df_sort.groupby(
#     ['brand', 'model'])['brand_model_day_price_mean'].shift(1)
# del df_sort['brand_model_day_price_mean']
# # # df_sort['brand_model_price_rolling3_mean'] = df_sort.groupby(
# # #     ['brand', 'model'])['price'].shift().rolling(window=3, min_periods=3).mean()
# # df_sort['brand_model_price_shift1'] = df_sort.groupby(
# #     ['brand', 'model'])['price'].shift()
# # df_sort.head()
# # print(df_feature.shape)
# df_feature = df_feature.merge(df_sort, how='left')
# # print(df_feature.shape)

In [None]:
# Number of non-null SaleID values sharing each `name`, repeated on every row
# of that name group.
sale_ids_by_name = df_feature.groupby(['name'])['SaleID']
df_feature['name_count'] = sale_ids_by_name.transform('count')

In [None]:
def date_parse(x):
    """Convert a YYYYMMDD value into a ``datetime``.

    ``x`` is turned into a string and sliced as year (chars 0-3), month
    (chars 4-5) and day (chars 6-7). A month below 1 (e.g. ``00``) is treated
    as January. Any other invalid component raises ``ValueError``, either from
    ``int`` or from the ``datetime`` constructor.
    """
    digits = str(x)
    year, month, day = int(digits[:4]), int(digits[4:6]), int(digits[6:8])
    return datetime(year, max(month, 1), day)


# Parse both raw date columns into datetimes.
for date_col in ['regDate', 'creatDate']:
    df_feature[date_col] = df_feature[date_col].apply(date_parse)

# Calendar parts derived from the parsed dates: (new column, source, part).
for new_col, src_col, part in [('regDate_year', 'regDate', 'year'),
                               ('creatDate_year', 'creatDate', 'year'),
                               ('creatDate_month', 'creatDate', 'month')]:
    df_feature[new_col] = getattr(df_feature[src_col].dt, part)

In [None]:
# Days elapsed between regDate and creatDate.
age_delta = df_feature['creatDate'] - df_feature['regDate']
df_feature['car_age_day'] = age_delta.dt.days
# The same span expressed in 365-day years, rounded to one decimal place.
df_feature['car_age_year'] = (df_feature['car_age_day'] / 365).round(1)

In [None]:
# Simple group statistics
def stat(df, df_merge, group_by, agg):
    """Aggregate ``df`` by ``group_by`` and left-join the result onto ``df_merge``.

    Parameters
    ----------
    df : DataFrame the statistics are computed from.
    df_merge : DataFrame that receives the statistics; it must contain the
        ``group_by`` columns.
    group_by : list of column names used as grouping / join keys.
    agg : dict mapping a column name to a list of aggregation method names,
        passed unchanged to ``groupby(...).agg``.

    Returns
    -------
    A new DataFrame: ``df_merge`` plus one column per (column, method) pair,
    named ``<group_by joined by '_'>_<column>_<method>``. Rows of ``df_merge``
    whose key has no match in the aggregated table get NaN.
    """
    prefix = '_'.join(group_by)
    group = df.groupby(group_by).agg(agg)

    # Flatten the (column, method) result columns into single-level names,
    # following the same order in which `agg` lists them.
    group.columns = [
        '{}_{}_{}'.format(prefix, on, method)
        for on, methods in agg.items()
        for method in methods
    ]
    group = group.reset_index()
    df_merge = df_merge.merge(group, on=group_by, how='left')

    # Release the intermediate table right away.
    del group
    gc.collect()

    return df_merge

In [None]:
# Per-key statistics of every v_* column: for each grouping key, add the
# mean / max / min / std of v_0 .. v_14 within that key's groups.
# All v_* columns are aggregated in a single stat() call per key, so the frame
# is grouped and merged once per key instead of once per (key, column) pair.
# The added columns keep the same names and order as the per-column version:
# '<key>_<v column>_<method>', v columns in v_cols order, methods in the order
# listed below.
l = ['name', 'model', 'brand', 'bodyType']
v_stat_methods = ['mean', 'max', 'min', 'std']
for f1 in tqdm(l):
    df_feature = stat(df_feature, df_feature, [f1],
                      {f2: v_stat_methods for f2 in v_cols})

In [None]:
def statis_feat(df_know, df_unknow):
    """Attach price statistics computed on ``df_know`` to ``df_unknow``.

    Statistics are taken over groups of ``df_know`` and joined onto
    ``df_unknow`` through ``stat``:

    * every pair of keys from name / model / brand / bodyType gets the
      mean, max, min, std and median of ``price``;
    * every single key gets the mean, max, min and std of ``price``.

    Returns the enriched copy of ``df_unknow``.
    """
    keys = ['name', 'model', 'brand', 'bodyType']
    pair_agg = {'price': ['mean', 'max', 'min', 'std', 'median']}
    single_agg = {'price': ['mean', 'max', 'min', 'std']}

    # Two-key groupings first, in the order produced by combinations().
    for first_key, second_key in combinations(keys, 2):
        df_unknow = stat(df_know, df_unknow, [first_key, second_key], pair_agg)

    # Then the single-key groupings, with a progress bar.
    for key in tqdm(keys):
        df_unknow = stat(df_know, df_unknow, [key], single_agg)

    return df_unknow

In [None]:
# 5-fold out-of-fold price statistics.
# Rows with a price form the training part, rows without one the test part.
has_price = ~df_feature['price'].isnull()
df_train = df_feature[has_price].reset_index(drop=True)
df_test = df_feature[~has_price]

kf = KFold(n_splits=5, random_state=seed, shuffle=True)
fold_frames = []
for train_index, val_index in kf.split(df_train):
    # Each validation fold only receives statistics computed from the prices
    # of the remaining folds.
    fold_frames.append(
        statis_feat(df_train.iloc[train_index], df_train.iloc[val_index]))
    gc.collect()

# Concatenate the folds once, in fold order.
df_stas_feat = pd.concat(fold_frames, axis=0)

# Test rows get statistics computed from the whole training part.
df_test = statis_feat(df_train, df_test)
df_feature = pd.concat([df_stas_feat, df_test], axis=0)

# Drop the large intermediates; only df_feature is needed from here on.
del fold_frames
del df_stas_feat
del df_train
del df_test
gc.collect()

In [None]:
# Pairwise sums of selected v_* columns; each (a, b) pair below produces the
# column 'v_a_add_v_b' = v_a + v_b.
for left, right in [(0, 4), (0, 8), (1, 3), (1, 4), (1, 5), (1, 12),
                    (2, 3), (4, 11), (4, 12)]:
    df_feature[f'v_{left}_add_v_{right}'] = (
        df_feature[f'v_{left}'] + df_feature[f'v_{right}'])

# One three-way sum.
df_feature['v_0_add_v_12_add_v_14'] = (
    df_feature['v_0'] + df_feature['v_12'] + df_feature['v_14'])

In [None]:
# "Add then subtract" combinations: each (a, b, c) triple below produces the
# column 'v_a_add_v_b_minu_v_c' = v_a + v_b - v_c.
for first, second, minus in [(4, 9, 13), (2, 4, 11), (2, 3, 11)]:
    df_feature[f'v_{first}_add_v_{second}_minu_v_{minus}'] = (
        df_feature[f'v_{first}'] + df_feature[f'v_{second}']
        - df_feature[f'v_{minus}'])

In [None]:
# Preview the first rows of the finished feature frame.
df_feature.head()

In [None]:
# (rows, columns) of the finished feature frame.
df_feature.shape

In [None]:
# Persist the finished feature frame to 'feature.pickle' (relative path, so it
# is written to the current working directory).
df_feature.to_pickle('feature.pickle')