In [1]:
import pandas as pd
import numpy as np
from pandas.tseries.offsets import *
from xiao_utils import months_among, f, f1
# %matplotlib inline  # 本句用于在页面中直接画出图

#下面几句为了能把DataFrame显示完整
# pd.set_option('display.max_colwidth', 2000)
# pd.set_option('display.max_columns', 500)
# pd.set_option('display.max_rows', 200)

In [2]:
# Load the training data. Every '-' cell (e.g. in level_id) is read as NaN.
df = pd.read_csv('../../data/origin/[new] yancheng_train_20171226.csv', dtype={'sale_date':str}, na_values=['-'], low_memory=False)
df['sale_date'] = pd.to_datetime(df['sale_date'], format='%Y%m')

# Encode price_level as the integer code of an ordered categorical
# (0 for '5WL' ... 8 for '50-75W'; values outside the list, including NaN, become -1).
# pd.Categorical is used because Series.astype('category', categories=..., ordered=...)
# is no longer accepted by current pandas releases.
df['price_level'] = pd.Categorical(
    df['price_level'],
    categories=['5WL','5-8W','8-10W','10-15W','15-20W','20-25W','25-35W','35-50W','50-75W'],
    ordered=True,
).codes

# Alternative considered: duplicate every row whose power / torque value contains a '/',
# zero the sales of the copy, and give the original and the copy the values before and
# after the slash respectively.
# Current approach: simply discard the slash and everything after it, so that statistics
# based on the number of records are not affected.
def process_power_and_torque(s):
    """Return the part of the string ``s`` that precedes its first '/'.

    If ``s`` contains no '/', it is returned unchanged.
    """
    head, _sep, _tail = s.partition('/')
    return head
# Keep only the value before the '/' in both columns, then convert to float.
# Values are stringified first, so missing entries become the text 'nan' and
# convert back to NaN. (The original line carried the note "[18600]".)
df['power'] = df['power'].astype(str).map(process_power_and_torque).astype(float)
df['engine_torque'] = df['engine_torque'].astype(str).map(process_power_and_torque).astype(float)

# -------------------------------------------------------------
# Append the November 2017 rows (the months to predict) so that their features are
# filled in together with the training rows and can be used for the final submission.
empty_Nov = pd.read_csv('../../data/origin/yancheng_testA_20171225.csv', dtype={'predict_date':str}, na_values=['-'], low_memory=False)
empty_Nov['predict_date'] = pd.to_datetime(empty_Nov['predict_date'], format='%Y%m')
empty_Nov = empty_Nov.rename(columns={'predict_date': 'sale_date', 'predict_quantity': 'sale_quantity'})

# Before concatenating, attach each class's brand_id.
# The lookup must be joined on class_id: assigning class_to_brand['brand_id'] directly
# would align on the row index and only be correct if both tables happened to list the
# classes in the same order.
# Assumes every class_id maps to a single brand_id; if not, the first one seen is used.
class_to_brand = df[['class_id', 'brand_id']].drop_duplicates(subset='class_id')
empty_Nov = pd.merge(left=empty_Nov, right=class_to_brand, on='class_id', how='left')

# Stack the prediction rows under the training rows.
df = pd.concat([df, empty_Nov])
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 20297 entries, 0 to 139
Data columns (total 32 columns):
TR                       20157 non-null object
brand_id                 20297 non-null int64
car_height               20157 non-null float64
car_length               20157 non-null float64
car_width                20157 non-null float64
class_id                 20297 non-null int64
compartment              20157 non-null float64
cylinder_number          20157 non-null float64
department_id            20157 non-null float64
displacement             20157 non-null float64
driven_type_id           20157 non-null float64
emission_standards_id    20157 non-null float64
engine_torque            20138 non-null float64
equipment_quality        20157 non-null float64
front_track              20157 non-null float64
fuel_type_id             20154 non-null float64
gearbox_type             20157 non-null object
if_MPV_id                20157 non-null float64
if_charging              20157 non-

In [3]:
# 读入时间特征表


### 综合考虑车型和时间的特征（完整的应有5587行，而不包括十月的应略小于5587条）
### 该类型特征的 column name 都以 C_ 开头，意为Class（车型）
约定下列特征的 column name 中的数字（即下面这个列表里的 i ）表示所计算特征对应的目标月份距离当前月份的差值，如：

某行的 sale_date=201609, 则对该行计算的 som_1 表示 201608 的单月销量

1. 过去5年的单月销量【C_som_i，Sale of One Month】
2. 过去2个月到过去5年的分别的销量和【C_ssm_i, Sale of Sum of Months】
3. 过去5年单月销量的一阶差分【C_fd_i, First Difference】
4. 过去5年单月销量的二阶差分【C_sd_i, Second Difference】
5. 过去5年单月销量比相应上个月销量的比值（类似于差分，减法换成除法）【C_fr_i, First order Ratio】
7. 单月销量二阶比值（即上面第5条特征的相邻月之比）【C_sr_i, Second order Ratio】
10. 过去5年各相邻月一阶比值的差分【C_dfr_i, Difference of First-order Ratio】
11. 过去5年各相邻月一阶差分的比值【C_rfd_i, Ratio of First-order Difference】

8. 【未加入】从去年往前看，过去几年中，当前月的销量占当年销量的比例（本车型的）【C_ry_i, Ratio in Year】
2. 【未加入】从去年开始往前看，每年中该车型销量占全年销量的比例【C_rcy_i, Ratio of this Class sale in whole Year sale】

1. 过去5年单月车型销量在单月各车型全部销量中的占比【C_rcm_i, Ratio of this Class sale in Month sale】
1. 基于这个占比，又有一堆特征：
    1. 一阶差分【C_rcm_fd_i, rcm和fd的含义，参考上面的特征描述】
    2. 二阶差分【C_rcm_sd_i】
    1. 一阶比值【C_rcm_fr_i】
    2. 二阶比值【C_rcm_sr_i】
    1. 比值的差分【C_rcm_dfr_i】
    2. 差分的比值【C_rcm_rfd_i】
    
1. 相邻年同月的差分【C_fdy, First-order Difference over Year】
1. 相邻年同月的比值【C_fry, y 表示 over Year】
1. 相邻年同月的二阶差分【C_sdy, y 表示 over Year】
1. 相邻年同月的二阶比值【C_sry, y 表示 over Year】
1. 相邻年同月的比值的差分【C_dfry, y 表示 over Year】
1. 相邻年同月的差分的比值【C_rfdy, y 表示 over Year】

In [None]:
# tt = df.groupby(['class_id','sale_date']).sum().reset_index()[['class_id','sale_date','sale_quantity']]

# g_date = df[['sale_date','sale_quantity']].groupby('sale_date').sum()
# g_date = g_date.rename(columns={'sale_quantity':'T_som_0'})

# tt['sale_date'].apply(lambda x: g_date.loc[[x]]['T_som_0'])# .reset_index()

# g_cls_date = df.groupby(['class_id','sale_date']).sum().reset_index()[['class_id','sale_date','sale_quantity']]
# gg = g_cls_date.groupby('class_id')
# gg.mean()

# g_cls_date



# g_date = df[['sale_date','sale_quantity']].groupby('sale_date').sum()
# g_date = g_date.rename(columns={'sale_quantity':'T_som_0'})
    
# ratios = pd.merge(g_cls_date, g_date, how='left', left_on='sale_date', right_index=True)
# ratios['C_rcm_0'] = ratios['sale_quantity'] / ratios['T_som_0']
    
# gg = ratios.groupby('class_id')

# gg.apply(f, -(0+1), -0, 'C_rcm_0').reset_index()

In [11]:
# Main logic
def _add_change_features(features, series, prefix, suffix):
    """Add first- and second-order difference / ratio columns to ``features`` in place.

    Args:
        features: DataFrame that receives the new columns.
        series: list of Series ordered from the most recent period to the oldest;
            ``series[k - 1]`` is compared with ``series[k]``.
        prefix: text placed before the feature tag in each column name
            ('C_' or 'C_rcm_').
        suffix: text placed after the feature tag ('' for month-over-month,
            'y' for year-over-year).

    Columns are created in this order (k is the 1-based step):
        <prefix>fd<suffix>_k / <prefix>fr<suffix>_k  difference / ratio, interleaved
        <prefix>sd<suffix>_k    difference of consecutive differences
        <prefix>sr<suffix>_k    ratio of consecutive ratios
        <prefix>dfr<suffix>_k   difference of consecutive ratios
        <prefix>rfd<suffix>_k   ratio of consecutive differences
    """
    def col(tag, k):
        return '%s%s%s_%d' % (prefix, tag, suffix, k)

    count = len(series)

    # First order: each period against the period just before it.
    for k in range(1, count):
        newer, older = series[k - 1], series[k]
        features[col('fd', k)] = newer - older
        features[col('fr', k)] = newer / older

    # Second order: (output tag, first-order tag it is built from, combine by ratio?)
    second_order = (
        ('sd', 'fd', False),
        ('sr', 'fr', True),
        ('dfr', 'fr', False),
        ('rfd', 'fd', True),
    )
    for out_tag, src_tag, as_ratio in second_order:
        for k in range(1, count - 1):
            newer = features[col(src_tag, k)]
            older = features[col(src_tag, k + 1)]
            features[col(out_tag, k)] = newer / older if as_ratio else newer - older


def calc_features_on_class_and_time(df, n_months=62, n_years=4):
    """Build per-(class_id, sale_date) lag features from the full data set.

    Args:
        df: the complete data set; the columns read here are 'class_id',
            'sale_date' and 'sale_quantity'.
        n_months: how many monthly lag columns (C_som_k / C_rcm_k) to build.
        n_years: how many year-over-year first-order steps to build; needs lag
            columns up to 12 * (n_years + 1) months.

    Returns:
        DataFrame with one row per (class_id, sale_date), holding 'class_id',
        'sale_date', 'sale_quantity' and, in this order:
            C_som_k                 lagged monthly value for the class
            C_ssm_k                 C_som_1 + ... + C_som_k (k >= 2)
            C_fd/fr/sd/sr/dfr/rfd_k month-over-month changes of C_som
            C_fdy/fry/sdy/sry/dfry/rfdy_k  year-over-year changes of C_som
            C_rcm_k                 lagged share of the class in the month's total
            C_rcm_<tag>_k           the same change features computed on C_rcm
        Every +/-inf and every 0 in the result is replaced by NaN.

    Raises:
        ValueError: if n_months is too small for the requested n_years.

    Note:
        The lag values come from ``f`` (imported from xiao_utils, not visible
        here). It is called per class as ``f(group, -k, -(k - 1)[, column])`` and
        the value is read from the 'sale_date' column of the reset-index result.
        Presumably it aggregates ``column`` (default presumably 'sale_quantity')
        over the month k months before each row -- TODO confirm in xiao_utils.
    """
    if 12 * (n_years + 1) > n_months:
        raise ValueError('n_months must be at least 12 * (n_years + 1)')

    # Select sale_quantity before summing so that unrelated (and non-numeric)
    # columns of df are never aggregated.
    monthly = df.groupby(['class_id', 'sale_date'])['sale_quantity'].sum().reset_index()

    # Work on a copy: `monthly` is still grouped and merged below and must keep
    # only its three columns instead of growing with every feature added.
    features = monthly.copy()

    month_steps = range(1, n_months + 1)
    year_steps = range(1, n_years + 2)

    # ---- monthly sales of the class -------------------------------------
    by_class = monthly.groupby('class_id')
    for k in month_steps:
        features['C_som_%d' % k] = by_class.apply(f, -k, -(k - 1)).reset_index()['sale_date']

    # Running sums over the last 2 .. n_months - 1 lag columns.
    running = features['C_som_1']
    for k in range(2, n_months):
        running = running + features['C_som_%d' % k]
        features['C_ssm_%d' % k] = running

    _add_change_features(
        features, [features['C_som_%d' % k] for k in month_steps], 'C_', '')
    _add_change_features(
        features, [features['C_som_%d' % (12 * y)] for y in year_steps], 'C_', 'y')

    # ---- share of the class in each month's total sales -----------------
    month_totals = df[['sale_date', 'sale_quantity']].groupby('sale_date').sum()
    month_totals = month_totals.rename(columns={'sale_quantity': 'T_som_0'})

    ratios = pd.merge(monthly, month_totals, how='left', left_on='sale_date', right_index=True)
    ratios['C_rcm_0'] = ratios['sale_quantity'] / ratios['T_som_0']

    ratios_by_class = ratios.groupby('class_id')
    for k in month_steps:
        features['C_rcm_%d' % k] = ratios_by_class.apply(f, -k, -(k - 1), 'C_rcm_0').reset_index()['sale_date']

    _add_change_features(
        features, [features['C_rcm_%d' % k] for k in month_steps], 'C_rcm_', '')
    _add_change_features(
        features, [features['C_rcm_%d' % (12 * y)] for y in year_steps], 'C_rcm_', 'y')

    # The ratios above introduce inf, which really means "missing". Zeros produced
    # along the way are also treated as missing, since a meaningful 0 cannot occur here.
    return features.replace([np.inf, -np.inf, 0], np.nan)

In [12]:
# Build the class-by-month feature table from the combined training + prediction rows.
tmp = calc_features_on_class_and_time(df)

In [13]:
# Preview only the first rows; the full feature table is far too wide and long to render.
tmp.head(10)

Unnamed: 0,class_id,sale_date,sale_quantity,C_som_1,C_som_2,C_som_3,C_som_4,C_som_5,C_som_6,C_som_7,...,C_rcm_sdy_3,C_rcm_sry_1,C_rcm_sry_2,C_rcm_sry_3,C_rcm_dfry_1,C_rcm_dfry_2,C_rcm_dfry_3,C_rcm_rfdy_1,C_rcm_rfdy_2,C_rcm_rfdy_3
0,103507,2015-03-01,58.0,,,,,,,,...,,,,,,,,,,
1,103507,2015-04-01,232.0,58.0,,,,,,,...,,,,,,,,,,
2,103507,2015-05-01,226.0,232.0,58.0,,,,,,...,,,,,,,,,,
3,103507,2015-06-01,286.0,226.0,232.0,58.0,,,,,...,,,,,,,,,,
4,103507,2015-07-01,297.0,286.0,226.0,232.0,58.0,,,,...,,,,,,,,,,
5,103507,2015-08-01,355.0,297.0,286.0,226.0,232.0,58.0,,,...,,,,,,,,,,
6,103507,2015-09-01,442.0,355.0,297.0,286.0,226.0,232.0,58.0,,...,,,,,,,,,,
7,103507,2015-10-01,1050.0,442.0,355.0,297.0,286.0,226.0,232.0,58.0,...,,,,,,,,,,
8,103507,2015-11-01,481.0,1050.0,442.0,355.0,297.0,286.0,226.0,232.0,...,,,,,,,,,,
9,103507,2015-12-01,1433.0,481.0,1050.0,442.0,355.0,297.0,286.0,226.0,...,,,,,,,,,,


In [14]:
# Persist the feature table without the row index.
tmp.to_csv(path_or_buf="../../data/features/C_features.csv", index=False)