### 该类型特征的 column name 都以 B_ 开头，意为Brand（品牌）

In [1]:
import pandas as pd
import numpy as np
from pandas.tseries.offsets import *
from xiao_utils import f

In [2]:
# Load the training data. Cells that contain only '-' (e.g. in level_id) are
# read as NaN; sale_date is kept as text so it can be parsed as year+month.
df = pd.read_csv(
    '../../data/origin/[new] yancheng_train_20171226.csv',
    dtype={'sale_date': str},
    na_values=['-'],
    low_memory=False,
)
df['sale_date'] = pd.to_datetime(df['sale_date'], format='%Y%m')

# Turn price_level into an ordered categorical and store its integer codes in
# the column (values outside the listed categories, including NaN, become -1).
# pd.Categorical is used because the `astype('category', categories=...,
# ordered=...)` keyword form is not accepted by current pandas releases.
df['price_level'] = pd.Categorical(
    df['price_level'],
    categories=['5WL', '5-8W', '8-10W', '10-15W', '15-20W', '20-25W', '25-35W', '35-50W', '50-75W'],
    ordered=True,
)
df['price_level'] = df['price_level'].cat.codes

# Alternative considered: duplicate every row whose power / torque value
# contains a '/', zero the sales on the copy, and give the original row and the
# copy the values before and after the slash respectively.
# Current approach: simply drop the slash and everything after it, so that
# statistics depending on the number of records are not affected.
def process_power_and_torque(s):
    """Return the part of the string ``s`` before its first '/'.

    If ``s`` contains no '/', it is returned unchanged.
    """
    head, _sep, _tail = s.partition('/')
    return head
# Stringify each column, keep only the text before any '/', then parse the
# result as float (the text 'nan' produced from missing cells parses to NaN).
# Original author's note on the power line: [18600]
df['power'] = df['power'].astype(str).map(process_power_and_torque).astype(float)
df['engine_torque'] = df['engine_torque'].astype(str).map(process_power_and_torque).astype(float)

# -------------------------------------------------------------
# Append the November 2017 rows (the period to be predicted) so that their
# features are filled in together with the training rows and can be used for
# the final submission.
empty_Nov = pd.read_csv(
    '../../data/origin/yancheng_testA_20171225.csv',
    dtype={'predict_date': str},
    na_values=['-'],
    low_memory=False,
)
empty_Nov['predict_date'] = pd.to_datetime(empty_Nov['predict_date'], format='%Y%m')
# Align the column names with the training frame.
empty_Nov.rename(columns={'predict_date': 'sale_date', 'predict_quantity': 'sale_quantity'}, inplace=True)

# Before concatenating, attach each car class's brand.
# One brand_id is taken per class_id (the first one seen) instead of averaging
# the ids, which would be meaningless if a class ever had more than one brand.
class_to_brand = df[['class_id', 'brand_id']].groupby('class_id').first().reset_index()
# Join on class_id and keep the result. Previously the merge result was bound
# to a misspelled name and brand_id was then copied over by row position,
# which pairs rows by index rather than by class.
# NOTE(review): assumes the test file has no brand_id column of its own;
# otherwise the merge would produce brand_id_x / brand_id_y - confirm.
empty_Nov = pd.merge(left=empty_Nov, right=class_to_brand, on='class_id', how='left')

# Loading is done: append the November rows to the training rows.
df = pd.concat([df, empty_Nov])
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 20297 entries, 0 to 139
Data columns (total 32 columns):
TR                       20157 non-null object
brand_id                 20297 non-null int64
car_height               20157 non-null float64
car_length               20157 non-null float64
car_width                20157 non-null float64
class_id                 20297 non-null int64
compartment              20157 non-null float64
cylinder_number          20157 non-null float64
department_id            20157 non-null float64
displacement             20157 non-null float64
driven_type_id           20157 non-null float64
emission_standards_id    20157 non-null float64
engine_torque            20138 non-null float64
equipment_quality        20157 non-null float64
front_track              20157 non-null float64
fuel_type_id             20154 non-null float64
gearbox_type             20157 non-null object
if_MPV_id                20157 non-null float64
if_charging              20157 non-

In [22]:
# Total sale_quantity per (brand_id, sale_date). The column is selected before
# summing so that the other columns of df (some of them non-numeric) are not
# aggregated only to be thrown away.
tt = df.groupby(['brand_id', 'sale_date'])['sale_quantity'].sum().reset_index()

In [28]:
# IPython magic (not plain Python): opens a Qt console attached to this
# notebook's kernel for interactive inspection.
%qtconsole

In [34]:
# Main logic
def calc_features_on_brand_and_time(df):
    """Build the brand-level (``B_``-prefixed) sales-history features.

    Args:
        df: The full data set. Must provide the columns ``brand_id``,
            ``sale_date`` and ``sale_quantity``.

    Returns:
        A DataFrame with one row per (brand_id, sale_date) containing the
        summed ``sale_quantity`` and, in this column order:

        * ``B_som_k`` (k = 1..62): per-brand result of the external helper
          ``f`` called with the offsets ``(-k, -(k - 1))``. ``f`` comes from
          ``xiao_utils`` and is not visible here; presumably this is the
          brand's sales k months before the row's month.
        * ``B_ssm_k`` (k = 2..61): ``B_som_1 + ... + B_som_k``.
        * ``B_fd_k`` / ``B_fr_k`` (k = 1..61): difference / ratio of
          ``B_som_k`` and ``B_som_{k+1}``.
        * ``B_sd_k``, ``B_sr_k``, ``B_dfr_k``, ``B_rfd_k`` (k = 1..60):
          difference of differences, ratio of ratios, difference of ratios
          and ratio of differences of neighbouring first-order columns.
        * ``B_fdy_y`` / ``B_fry_y`` (y = 1..4) and ``B_sdy_y``, ``B_sry_y``,
          ``B_dfry_y``, ``B_rfdy_y`` (y = 1..3): the same constructions on
          ``B_som_12``, ``B_som_24``, ... ``B_som_60``.

        Every ``inf``, ``-inf`` and ``0`` anywhere in the frame (key columns
        and ``sale_quantity`` included) is replaced by NaN.
    """
    n_month_lags = 62
    n_years = 4

    def difference(newer, older):
        return newer - older

    def ratio(newer, older):
        return newer / older

    def add_first_order(frame, tag, count, step):
        # Adds B_fd<tag>_k and B_fr<tag>_k (interleaved, in that order) from
        # B_som_{k*step} and B_som_{(k+1)*step}.
        for k in range(1, count + 1):
            newer = frame['B_som_' + str(k * step)]
            older = frame['B_som_' + str((k + 1) * step)]
            frame['B_fd' + tag + '_' + str(k)] = difference(newer, older)
            frame['B_fr' + tag + '_' + str(k)] = ratio(newer, older)

    def add_second_order(frame, tag, count):
        # Adds, one complete family after another, the four second-order
        # families derived from neighbouring first-order columns.
        families = (
            ('B_sd', 'B_fd', difference),    # difference of differences
            ('B_sr', 'B_fr', ratio),         # ratio of ratios
            ('B_dfr', 'B_fr', difference),   # difference of ratios
            ('B_rfd', 'B_fd', ratio),        # ratio of differences
        )
        for out_stem, in_stem, combine in families:
            for k in range(1, count + 1):
                newer = frame[in_stem + tag + '_' + str(k)]
                older = frame[in_stem + tag + '_' + str(k + 1)]
                frame[out_stem + tag + '_' + str(k)] = combine(newer, older)

    # Total sales per brand and month. Only sale_quantity is aggregated, so
    # unrelated (possibly non-numeric) columns of df are never summed.
    monthly = df.groupby(['brand_id', 'sale_date'])['sale_quantity'].sum().reset_index()
    per_brand = monthly.groupby('brand_id')

    # Per-month sales going back n_month_lags months. The columns are added to
    # the very frame that per_brand groups, as the original code did.
    feats = monthly
    for lag in range(1, n_month_lags + 1):
        feats['B_som_' + str(lag)] = per_brand.apply(f, -lag, 1 - lag).reset_index()['sale_date']

    # Running sums over the last 2..61 months. B_ssm_1 only seeds the
    # recurrence and is dropped afterwards because it duplicates B_som_1.
    feats['B_ssm_1'] = feats['B_som_1']
    for span in range(2, n_month_lags):
        feats['B_ssm_' + str(span)] = feats['B_ssm_' + str(span - 1)] + feats['B_som_' + str(span)]
    feats = feats.drop('B_ssm_1', axis=1)

    # Month-over-month features.
    add_first_order(feats, '', n_month_lags - 1, 1)
    add_second_order(feats, '', n_month_lags - 2)

    # Year-over-year features (same calendar month 1, 2, ... years back).
    add_first_order(feats, 'y', n_years, 12)
    add_second_order(feats, 'y', n_years - 1)

    # NOTE: a disabled (commented-out) variant used to sit here. It built the
    # same feature families on each brand's share of the month's total sales,
    # with column names starting with B_rcm_.

    # The ratios above can produce +/-inf; those are really missing values.
    # Zeros are also turned into NaN: according to the original author a
    # meaningful 0 cannot occur in these features.
    return feats.replace([np.inf, -np.inf, 0], np.nan)

In [35]:
# Build the brand-level feature table from the combined frame df.
tmp1 = calc_features_on_brand_and_time(df)

In [36]:
# Display the feature table as the cell output.
tmp1

Unnamed: 0,brand_id,sale_date,sale_quantity,B_som_1,B_som_2,B_som_3,B_som_4,B_som_5,B_som_6,B_som_7,...,B_sdy_3,B_sry_1,B_sry_2,B_sry_3,B_dfry_1,B_dfry_2,B_dfry_3,B_rfdy_1,B_rfdy_2,B_rfdy_3
0,12,2012-01-01,47.0,,,,,,,,...,,,,,,,,,,
1,12,2012-02-01,21.0,47.0,,,,,,,...,,,,,,,,,,
2,12,2012-03-01,10.0,21.0,47.0,,,,,,...,,,,,,,,,,
3,12,2012-04-01,10.0,10.0,21.0,47.0,,,,,...,,,,,,,,,,
4,12,2012-05-01,10.0,10.0,10.0,21.0,47.0,,,,...,,,,,,,,,,
5,12,2012-07-01,21.0,,10.0,10.0,10.0,21.0,47.0,,...,,,,,,,,,,
6,12,2012-10-01,5.0,,,21.0,,10.0,10.0,10.0,...,,,,,,,,,,
7,12,2012-11-01,10.0,5.0,,,21.0,,10.0,10.0,...,,,,,,,,,,
8,12,2012-12-01,16.0,10.0,5.0,,,21.0,,10.0,...,,,,,,,,,,
9,12,2013-01-01,31.0,16.0,10.0,5.0,,,21.0,,...,,,,,,,,,,


In [37]:
# Save the feature table to disk (the row index is not written).
tmp1.to_csv("../../data/features/B_features.csv",index=False)

In [None]:
# Per-brand sales statistics over time.
# NOTE(review): this definition has the same name as the earlier
# calc_features_on_brand_and_time in this file and would rebind that name if
# this cell were executed; it looks like an older variant with long column
# names - confirm before running.
def calc_features_on_brand_and_time(df):
    """Build brand-level sales-history features with descriptive column names.

    Args:
        df: The full data set. Must provide the columns ``brand_id``,
            ``sale_date`` and ``sale_quantity``.

    Returns:
        A DataFrame with one row per (brand_id, sale_date) containing
        ``brand_sale_quantity`` (the summed ``sale_quantity``) and:

        * ``brand_sale_of_month_k_ago`` (k = 1..37): per-brand result of the
          external helper ``f`` called with the offsets ``(-k, -(k - 1))``.
          ``f`` comes from ``xiao_utils`` and is not visible here; presumably
          this is the brand's sales k months earlier.
        * ``brand_sum_sale_of_last_k_month`` (k = 2..37): sum of the first k
          monthly columns.
        * For y = 1..3, the ratio and the difference between the columns for
          ``12 * y`` and ``12 * y + 1`` months ago.
        * Ratio and difference between the columns for 1 and 2 months ago.

        ``inf`` and ``-inf`` are replaced by NaN.
    """
    n_month_lags = 37

    def month_col(k):
        return 'brand_sale_of_month_' + str(k) + '_ago'

    def sum_col(k):
        return 'brand_sum_sale_of_last_' + str(k) + '_month'

    # Total sales per brand and month. Only sale_quantity is aggregated, so
    # unrelated (possibly non-numeric) columns of df are never summed.
    monthly = df.groupby(['brand_id', 'sale_date'])['sale_quantity'].sum().reset_index()
    per_brand = monthly.groupby('brand_id')

    # Monthly sales for each of the last 37 months (three years plus one
    # month). The columns are added to the very frame that per_brand groups,
    # as the original code did.
    feats = monthly
    for lag in range(1, n_month_lags + 1):
        feats[month_col(lag)] = per_brand.apply(f, -lag, 1 - lag).reset_index()['sale_date']

    # Running sums over the last 2..37 months, accumulated from the monthly
    # columns above rather than by regrouping. The 1-month sum only seeds the
    # recurrence and is dropped afterwards because it duplicates the
    # 1-month-ago column.
    feats[sum_col(1)] = feats[month_col(1)]
    for span in range(2, n_month_lags + 1):
        feats[sum_col(span)] = feats[sum_col(span - 1)] + feats[month_col(span)]
    feats = feats.drop(sum_col(1), axis=1)

    # For each of the past three years: the month 12*y months ago compared
    # with the month just before it (ratio first, then difference).
    for years_back in range(1, 4):
        this_month = feats[month_col(years_back * 12)]
        month_before = feats[month_col(years_back * 12 + 1)]
        feats['brand_rate_of_this_month_divby_last_month_' + str(years_back) + '_year_ago'] = this_month / month_before
        feats['brand_diff_of_this_month_sub_last_month_' + str(years_back) + '_year_ago'] = this_month - month_before

    # Last month compared with the month before last.
    last_month = feats[month_col(1)]
    month_before_last = feats[month_col(2)]
    feats['brand_rate_of_last_divby_lastlast'] = last_month / month_before_last
    feats['brand_diff_of_last_sub_lastlast'] = last_month - month_before_last

    # Rename the sales column so its meaning is explicit and it does not clash
    # with other columns when this table is joined later.
    feats = feats.rename(columns={'sale_quantity': 'brand_sale_quantity'})
    # The ratios above can produce +/-inf; those are really missing values.
    return feats.replace([np.inf, -np.inf], np.nan)