In [1]:
import pandas as pd
import numpy as np
pd.options.mode.chained_assignment = None

In [2]:
# Later cells test columns against the literal string 'null' (e.g. `temp['Date'] != 'null'`)
# and run string operations on every discount value. pandas' default NA parsing would turn
# 'null' into NaN and break both, so it is switched off here.
offline_train = pd.read_csv('../data_original/ccf_offline_stage1_train.csv', keep_default_na=False)

In [3]:
# Keep the literal 'null' markers as strings (pandas would otherwise parse them as NaN);
# the preprocessing below replaces / compares against the string 'null'.
offline_test = pd.read_csv('../data_original/ccf_offline_stage1_test_revised.csv', keep_default_na=False)

In [4]:
def DataHandle(dataset):
    '''Preprocess a raw coupon table and return a new, enriched DataFrame.

    The input frame is left untouched. In the returned copy:

    * ``Discount_rate`` (raw text) is renamed to ``Discount``;
    * ``Date_received_datetime`` (and ``Date_datetime`` when a ``Date`` column
      exists) hold the parsed ``YYYYMMDD`` dates, ``NaT`` when unparsable
      (e.g. ``'null'``);
    * ``Day_gap`` is ``Date - Date_received`` in days, ``-1`` when unknown;
    * ``Date_is_weekend`` / ``Date_received_is_weekend`` are 1 for Sat/Sun,
      0 for Mon-Fri and -1 when the date is missing;
    * ``Discount_type`` is 0 for ``'x:y'`` coupons, 1 for decimal-rate
      coupons, -1 otherwise;
    * ``Discount_rate`` is ``1 - y/x`` for ``'x:y'``, the decimal itself for
      rate coupons, -1 otherwise;
    * ``Min_cost`` is ``x`` for ``'x:y'``, 0 for rate coupons, -1 otherwise;
    * ``Distance`` has ``'null'`` replaced by -1 and is cast to int.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Needs ``User_id``, ``Merchant_id``, ``Coupon_id``, ``Discount_rate``,
        ``Distance`` and ``Date_received``; ``Date`` is optional.

    Returns
    -------
    pandas.DataFrame
    '''
    temp = dataset.copy()
    temp = temp.rename(columns={'Discount_rate': 'Discount'})

    def parse_day(values):
        # One vectorised parse instead of a Python-level call per row. Going through
        # str first keeps integer inputs such as 20160701 from being read as
        # nanoseconds since the epoch.
        return pd.to_datetime(values.astype(str), format='%Y%m%d', errors='coerce')

    def weekend_flag(stamps):
        weekday = stamps.dt.dayofweek  # Monday=0 ... Sunday=6, NaN for NaT
        flags = np.where(weekday.isna(), -1, np.where(weekday >= 5, 1, 0))
        return flags.astype('int64')

    def parse_discount(text):
        # Returns (Discount_type, Discount_rate, Min_cost) for one raw discount string.
        if ':' in text:
            threshold, reduction = (int(part) for part in text.split(':')[:2])
            return 0, 1 - reduction / threshold, threshold
        if '.' in text:
            return 1, float(text), 0
        return -1, -1, -1

    # Parsed dates
    if 'Date' in temp.columns:
        temp['Date_datetime'] = parse_day(temp['Date'])
    temp['Date_received_datetime'] = parse_day(temp['Date_received'])

    # Days between receiving and using the coupon (-1 when either date is missing).
    # Assigned back explicitly: a chained `temp[col].replace(..., inplace=True)`
    # does not update `temp` under pandas Copy-on-Write.
    if 'Date_datetime' in temp.columns:
        gap = (temp['Date_datetime'] - temp['Date_received_datetime']).dt.days
        temp['Day_gap'] = gap.fillna(-1)

    # Weekend indicators
    if 'Date_datetime' in temp.columns:
        temp['Date_is_weekend'] = weekend_flag(temp['Date_datetime'])
    temp['Date_received_is_weekend'] = weekend_flag(temp['Date_received_datetime'])

    # Coupon type, discount rate and minimum spend, parsed once per row.
    # str() lets non-string values (e.g. NaN) fall through to the -1 branch
    # instead of raising on `':' in value`.
    parsed = [parse_discount(str(value)) for value in temp['Discount']]
    temp['Discount_type'] = [kind for kind, _, _ in parsed]
    temp['Discount_rate'] = [rate for _, rate, _ in parsed]
    temp['Min_cost'] = [floor for _, _, floor in parsed]

    # Unknown distance becomes -1
    temp['Distance'] = temp['Distance'].replace('null', -1)

    # Final column types
    if 'Date_datetime' in temp.columns:
        temp['Date'] = temp['Date'].astype('str')
    temp['Date_received'] = temp['Date_received'].astype('str')
    temp['Coupon_id'] = temp['Coupon_id'].astype('str')
    temp['Merchant_id'] = temp['Merchant_id'].astype('int')
    temp['User_id'] = temp['User_id'].astype('int')
    temp['Distance'] = temp['Distance'].astype('int')

    return temp

In [5]:
# Preprocessed copy of the offline training table (parsed dates, discount and distance columns).
offline_train_handled = DataHandle(offline_train)

In [6]:
# Store the received date as text before preprocessing, so it is parsed as a date string.
offline_test['Date_received'] = offline_test['Date_received'].astype(str)

In [7]:
# Preprocessed copy of the offline test table; it is passed through the same DataHandle
# routine as the training table.
offline_test_handled = DataHandle(offline_test)

In [8]:
def SetLabel(dataset, random_state=0):
    '''Attach the binary target ``Label`` to every row that received a coupon.

    Rows whose ``Date_received`` is ``'null'`` are dropped. A remaining row is
    labelled 1 when the coupon was used (``Date != 'null'``) and ``Day_gap`` is
    at most 15 days, otherwise 0. The labelled rows are returned shuffled,
    keeping their original index labels.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Needs ``Date_received``, ``Date`` and ``Day_gap``. Not modified.
    random_state : int or None, default 0
        Seed for the shuffle so that re-running the notebook yields the same
        row order. Pass ``None`` for a different shuffle on every call.

    Returns
    -------
    pandas.DataFrame
        Copy of the coupon rows with an extra integer ``Label`` column.
    '''
    received = dataset['Date_received'] != 'null'

    # Explicit copy: the label is written to an independent frame, not to a slice.
    labelled = dataset[received].copy()

    # Unused coupons carry a Day_gap of -1 after preprocessing, so the
    # "within 15 days" test has to be combined with "was actually used".
    used = labelled['Date'] != 'null'
    labelled['Label'] = (used & (labelled['Day_gap'] <= 15)).astype('int64')

    # Shuffle the rows (seeded for reproducibility); nothing is written to disk here.
    return labelled.sample(frac=1, random_state=random_state)

In [9]:
# Label the preprocessed training rows and make sure the received date is stored as text,
# since the window functions below compare it against 'YYYYMMDD' strings.
offline_train_handled_with_label = SetLabel(offline_train_handled).astype({'Date_received': 'str'})

In [10]:
def SplitDataset(dataset):
    '''Cut the data into three feature windows by coupon-received date.

    ``Date_received`` is compared as a ``'YYYYMMDD'`` string; both window
    bounds are inclusive. Returns the three window frames as a tuple.
    '''
    received = dataset['Date_received']
    windows = (
        ('20160115', '20160315'),
        ('20160301', '20160430'),
        ('20160416', '20160615'),
    )
    first, second, third = (dataset[received.between(start, end)] for start, end in windows)
    return first, second, third


def SplitLabel(dataset, dataset_test):
    '''Build the three label windows.

    The first two are the rows of ``dataset`` received within the given
    (inclusive) date ranges, with the ``Date`` column removed; the third is
    ``dataset_test`` passed through unchanged.
    '''
    received = dataset['Date_received']

    def window(start, end):
        # Rows received in [start, end], without the consumption date.
        return dataset[received.between(start, end)].drop(labels='Date', axis='columns')

    return window('20160331', '20160430'), window('20160516', '20160615'), dataset_test

def SplitKong(dataset):
    '''Select the three gap windows, filtered on the consumption date ``Date``.

    ``Date`` is compared as a ``'YYYYMMDD'`` string with inclusive bounds.
    Returns the three frames as a tuple.
    '''
    spent_on = dataset['Date']
    bounds = [
        ('20160316', '20160330'),
        ('20160501', '20160515'),
        ('20160616', '20160630'),
    ]
    gaps = [dataset[spent_on.between(lower, upper)] for lower, upper in bounds]
    return gaps[0], gaps[1], gaps[2]

In [11]:
# Three feature windows of the labelled training data, selected by coupon-received date.
dataset_1, dataset_2, dataset_3 = SplitDataset(offline_train_handled_with_label)

In [12]:
def ExtractUserFeature(dataset):
    '''Build one row of aggregate features per ``User_id``.

    Returns a DataFrame with ``User_id`` followed by 22 ``user_*`` columns:
    receive / consume / redeem counts and their ratios, mean discount rate,
    mean distance, mean receive-to-consume delay, distinct coupon / discount
    rate / merchant counts, and weekend counts and ratios. "Consumed" rows are
    those with both ``Date`` and ``Date_received`` different from ``'null'``;
    "redeemed" rows are those with ``Label == 1``.

    Every statistic is joined back on ``User_id`` rather than aligned by
    position, and missing values are filled by explicit assignment (chained
    ``Series.replace(..., inplace=True)`` does nothing under pandas
    Copy-on-Write).

    NOTE: as in the original implementation, ``dataset`` is modified in place:
    ``Discount_rate`` is cast to float and every ``-1`` in ``Distance`` is
    replaced by NaN. Other code in this notebook may rely on that, so the side
    effect is kept, but it is now an explicit column assignment.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Labelled, preprocessed rows (needs ``User_id``, ``Merchant_id``,
        ``Coupon_id``, ``Date``, ``Date_received``, ``Label``,
        ``Discount_rate``, ``Distance``, ``Day_gap``,
        ``Date_is_weekend`` and ``Date_received_is_weekend``).

    Returns
    -------
    pandas.DataFrame
        One row per user, in order of first appearance in ``dataset``.
    '''
    keys = ['User_id']

    # Deliberate, documented side effects on the caller's frame (see docstring).
    dataset['Discount_rate'] = dataset['Discount_rate'].astype('float')
    dataset['Distance'] = dataset['Distance'].replace(-1, np.nan)  # -1 = unknown distance

    # Row subsets shared by several features.
    got_coupon = dataset['Date_received'] != 'null'
    consumed = dataset[(dataset['Date'] != 'null') & got_coupon]
    unused = dataset[(dataset['Date'] == 'null') & got_coupon]
    redeemed = dataset[dataset['Label'] == 1]
    weekend_get = dataset[dataset['Date_received_is_weekend'] == 1]
    weekend_cost = dataset[(dataset['Date_is_weekend'] == 1) & got_coupon]

    def add_stat(table, rows, column, how, name, fill=None):
        # Left-join the per-user aggregate `how` of `column`; optionally fill users without rows.
        stat = rows.groupby(keys, sort=False)[column].agg(how).rename(name).reset_index()
        table = table.merge(stat, how='left', on=keys)
        if fill is not None:
            table[name] = table[name].fillna(fill)
        return table

    def add_distinct(table, rows, column, name, fill=None):
        # Left-join the number of distinct `column` values seen per user.
        pairs = rows[keys + [column]].drop_duplicates()
        stat = pairs.groupby(keys, sort=False).size().rename(name).reset_index()
        table = table.merge(stat, how='left', on=keys)
        if fill is not None:
            table[name] = table[name].fillna(fill)
        return table

    user = dataset[keys].drop_duplicates()

    # Counts: received, received and consumed, received but not consumed, redeemed
    user = add_stat(user, dataset, 'Date_received', 'count', 'user_get_count')
    user = add_stat(user, consumed, 'Date', 'count', 'user_get_cost_count', fill=0)
    user = add_stat(user, unused, 'Date', 'count', 'user_get_not_cost_count', fill=0)
    user = add_stat(user, redeemed, 'Label', 'count', 'user_hexiao_count', fill=0)

    # Ratios between those counts (0/0 becomes 0 only for the last one, as before)
    user['user_get_cost_get_per'] = user['user_get_cost_count'] / user['user_get_count']
    user['user_hexiao_get_per'] = user['user_hexiao_count'] / user['user_get_count']
    user['user_hexiao_get_cost_per'] = (user['user_hexiao_count'] / user['user_get_cost_count']).fillna(0)

    # Mean discount rate: all received coupons / consumed coupons
    user = add_stat(user, dataset, 'Discount_rate', 'mean', 'user_get_discount_rate_mean')
    user = add_stat(user, consumed, 'Discount_rate', 'mean', 'user_get_cost_discount_rate_mean', fill=-1)

    # Mean distance (unknown distances are NaN and therefore ignored)
    user = add_stat(user, dataset, 'Distance', 'mean', 'user_get_distance_mean')
    user = add_stat(user, consumed, 'Distance', 'mean', 'user_get_cost_distance_mean', fill=-1)

    # Mean number of days between receiving and consuming
    user = add_stat(user, consumed, 'Day_gap', 'mean', 'user_get_get_cost_day_mean', fill=-1)

    # Distinct coupons / discount rates / merchants, received and consumed
    user = add_distinct(user, dataset, 'Coupon_id', 'user_get_kind_count', fill=0)
    user = add_distinct(user, consumed, 'Coupon_id', 'user_get_cost_kind_count', fill=0)
    user = add_distinct(user, dataset, 'Discount_rate', 'user_get_kind_discount_count', fill=0)
    user = add_distinct(user, consumed, 'Discount_rate', 'user_get_cost_kind_discount_count', fill=0)
    user = add_distinct(user, dataset, 'Merchant_id', 'user_get_kind_merchant_count', fill=0)
    user = add_distinct(user, consumed, 'Merchant_id', 'user_get_cost_kind_merchant_count', fill=0)

    # Weekend counts and ratios (the last ratio is NaN for users who never consumed)
    user = add_stat(user, weekend_get, 'Date_received_is_weekend', 'count', 'user_get_is_weekend_count', fill=0)
    user = add_stat(user, weekend_cost, 'Date_is_weekend', 'count', 'user_get_cost_is_weekend_count', fill=0)
    user['user_get_is_weekend_get_per'] = user['user_get_is_weekend_count'] / user['user_get_count']
    user['user_get_cost_is_weekend_cost_per'] = user['user_get_cost_is_weekend_count'] / user['user_get_cost_count']

    return user

In [13]:
# Per-user features for each of the three feature windows, computed in window order.
user_feature_1, user_feature_2, user_feature_3 = (
    ExtractUserFeature(window) for window in (dataset_1, dataset_2, dataset_3)
)

In [14]:
def ExtractMerchantFeature(dataset):
    '''Build one row of aggregate features per ``Merchant_id``.

    Returns a DataFrame with ``Merchant_id`` followed by 20 ``merchant_*``
    columns: receive / consume / redeem counts and ratios, mean discount rate,
    mean delays, mean distances, distinct user and coupon counts, mean minimum
    spend of ``Discount_type == 0`` coupons, and weekend counts and ratios.
    "Consumed" rows have both ``Date`` and ``Date_received`` different from
    ``'null'``; "redeemed" rows have ``Label == 1``.

    Both distance means treat ``-1`` (and NaN) as "unknown" and ignore it.
    Previously ``merchant_get_cost_distance_mean`` only did so if an earlier
    call to another function had already replaced ``-1`` by NaN in the same
    frame; the result no longer depends on call order.

    Statistics are joined back on ``Merchant_id`` instead of being aligned by
    position, and fills are explicit assignments (chained
    ``replace(..., inplace=True)`` does nothing under pandas Copy-on-Write).

    NOTE: as in the original implementation, ``dataset['Discount_rate']`` is
    cast to float in place. ``Distance`` is not modified.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Labelled, preprocessed rows (needs ``User_id``, ``Merchant_id``,
        ``Coupon_id``, ``Date``, ``Date_received``, ``Label``,
        ``Discount_rate``, ``Discount_type``, ``Min_cost``, ``Distance``,
        ``Day_gap``, ``Date_is_weekend`` and ``Date_received_is_weekend``).

    Returns
    -------
    pandas.DataFrame
        One row per merchant, in order of first appearance in ``dataset``.
    '''
    keys = ['Merchant_id']

    # Side effect kept from the original implementation (see docstring).
    dataset['Discount_rate'] = dataset['Discount_rate'].astype('float')

    # Row subsets shared by several features.
    got_coupon = dataset['Date_received'] != 'null'
    consumed = dataset[(dataset['Date'] != 'null') & got_coupon]
    unused = dataset[(dataset['Date'] == 'null') & got_coupon]
    redeemed = dataset[dataset['Label'] == 1]
    threshold_coupons = dataset[dataset['Discount_type'] == 0]
    weekend_get = dataset[dataset['Date_received_is_weekend'] == 1]
    weekend_cost = dataset[(dataset['Date_is_weekend'] == 1) & got_coupon]

    def known_distance(rows):
        # New frame in which the -1 "unknown distance" sentinel is NaN, so means skip it.
        return rows.assign(Distance=rows['Distance'].replace(-1, np.nan))

    def add_stat(table, rows, column, how, name, fill=None):
        # Left-join the per-merchant aggregate `how` of `column`; optionally fill the gaps.
        stat = rows.groupby(keys, sort=False)[column].agg(how).rename(name).reset_index()
        table = table.merge(stat, how='left', on=keys)
        if fill is not None:
            table[name] = table[name].fillna(fill)
        return table

    def add_distinct(table, rows, column, name, fill=None):
        # Left-join the number of distinct `column` values seen per merchant.
        pairs = rows[keys + [column]].drop_duplicates()
        stat = pairs.groupby(keys, sort=False).size().rename(name).reset_index()
        table = table.merge(stat, how='left', on=keys)
        if fill is not None:
            table[name] = table[name].fillna(fill)
        return table

    merchant = dataset[keys].drop_duplicates()

    # Counts: received, received and consumed, received but not consumed, redeemed
    merchant = add_stat(merchant, dataset, 'Date_received', 'count', 'merchant_get_count')
    merchant = add_stat(merchant, consumed, 'Date', 'count', 'merchant_get_cost_count', fill=0)
    merchant = add_stat(merchant, unused, 'Date', 'count', 'merchant_get_not_cost_count', fill=0)
    merchant = add_stat(merchant, redeemed, 'Label', 'count', 'merchant_hexiao_count', fill=0)

    # Ratios between those counts
    merchant['merchant_get_cost_get_per'] = merchant['merchant_get_cost_count'] / merchant['merchant_get_count']
    merchant['merchant_hexiao_get_per'] = (merchant['merchant_hexiao_count'] / merchant['merchant_get_count']).fillna(0)
    merchant['merchant_hexiao_get_cost_per'] = (merchant['merchant_hexiao_count'] / merchant['merchant_get_cost_count']).fillna(0)

    # Mean discount rate of received coupons
    merchant = add_stat(merchant, dataset, 'Discount_rate', 'mean', 'merchant_get_discount_rate_mean')

    # Mean days from receiving to consuming / to redeeming
    merchant = add_stat(merchant, consumed, 'Day_gap', 'mean', 'merchant_get_get_cost_day_mean', fill=-1)
    merchant = add_stat(merchant, redeemed, 'Day_gap', 'mean', 'merchant_get_hexiao_day_mean', fill=-1)

    # Mean distance of received / consumed coupons, ignoring unknown distances
    merchant = add_stat(merchant, known_distance(dataset), 'Distance', 'mean', 'merchant_get_distance_mean', fill=-1)
    merchant = add_stat(merchant, known_distance(consumed), 'Distance', 'mean', 'merchant_get_cost_distance_mean', fill=-1)

    # Distinct users who received / consumed this merchant's coupons.
    # NOTE(review): merchants without consumers get -1 here, while the comparable
    # count features use 0; kept as-is so existing feature values do not change.
    merchant = add_distinct(merchant, dataset, 'User_id', 'merchant_get_kind_user_count')
    merchant = add_distinct(merchant, consumed, 'User_id', 'merchant_get_cost_kind_user_count', fill=-1)

    # Mean minimum spend over coupons with Discount_type == 0
    merchant = add_stat(merchant, threshold_coupons, 'Min_cost', 'mean', 'merchant_get_man_min_cost_mean', fill=-1)

    # Number of distinct coupons issued
    merchant = add_distinct(merchant, dataset, 'Coupon_id', 'merchant_get_kind_coupon_count')

    # Weekend counts
    merchant = add_stat(merchant, weekend_get, 'Date_received_is_weekend', 'count', 'merchant_get_is_weekend_count', fill=0)
    merchant = add_stat(merchant, weekend_cost, 'Date_is_weekend', 'count', 'merchant_get_cost_is_weekend_count', fill=0)

    # Weekend ratios. The second column is weekend receipts / all receipts; its
    # historical name (ending in "_cost_per") is kept for downstream compatibility.
    merchant['merchant_get_cost_is_weekend_cost_per'] = merchant['merchant_get_cost_is_weekend_count'] / merchant['merchant_get_cost_count']
    merchant['merchant_get_is_weekend_cost_per'] = merchant['merchant_get_is_weekend_count'] / merchant['merchant_get_count']

    return merchant

In [15]:
# Per-merchant features for each of the three feature windows, computed in window order.
merchant_feature_1, merchant_feature_2, merchant_feature_3 = (
    ExtractMerchantFeature(window) for window in (dataset_1, dataset_2, dataset_3)
)

In [16]:
def ExtraceDiscountFeature(dataset):
    '''Build one row of aggregate features per raw ``Discount`` string.

    Returns a DataFrame with ``Discount``, ``discount_type``,
    ``discount_rate`` and 14 further ``discount_*`` columns: receive /
    consume / redeem counts and ratios, distinct user and merchant counts,
    and weekend counts and ratios. "Consumed" rows have both ``Date`` and
    ``Date_received`` different from ``'null'``; "redeemed" rows have
    ``Label == 1``.

    ``discount_type`` / ``discount_rate`` are taken from the first row in
    which each discount appears. As in the original code, rows whose
    ``Discount`` is ``'null'`` are excluded from ``discount_get_cost_count``
    and ``discount_hexiao_count``; such a discount now simply gets 0 there
    instead of causing a length-mismatch error.

    Statistics are joined back on ``Discount`` instead of being aligned by
    position or index, and fills are explicit assignments (chained
    ``replace(..., inplace=True)`` does nothing under pandas Copy-on-Write).
    ``dataset`` is not modified.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Labelled, preprocessed rows (needs ``User_id``, ``Merchant_id``,
        ``Discount``, ``Discount_type``, ``Discount_rate``, ``Date``,
        ``Date_received``, ``Label``, ``Date_is_weekend`` and
        ``Date_received_is_weekend``).

    Returns
    -------
    pandas.DataFrame
        One row per discount, in order of first appearance in ``dataset``.
    '''
    keys = ['Discount']

    # Row subsets shared by several features.
    got_coupon = dataset['Date_received'] != 'null'
    consumed = dataset[got_coupon & (dataset['Date'] != 'null')]
    redeemed = dataset[dataset['Label'] == 1]
    weekend_get = dataset[dataset['Date_received_is_weekend'] == 1]
    weekend_cost = dataset[(dataset['Date_is_weekend'] == 1) & got_coupon]

    def add_stat(table, rows, column, how, name, fill=None):
        # Left-join the per-discount aggregate `how` of `column`; optionally fill the gaps.
        stat = rows.groupby(keys, sort=False)[column].agg(how).rename(name).reset_index()
        table = table.merge(stat, how='left', on=keys)
        if fill is not None:
            table[name] = table[name].fillna(fill)
        return table

    def add_distinct(table, rows, column, name, fill=None):
        # Left-join the number of distinct `column` values seen per discount.
        pairs = rows[keys + [column]].drop_duplicates()
        stat = pairs.groupby(keys, sort=False).size().rename(name).reset_index()
        table = table.merge(stat, how='left', on=keys)
        if fill is not None:
            table[name] = table[name].fillna(fill)
        return table

    # Discount with its type and rate, taken from the first row of each discount
    discount = (
        dataset[['Discount', 'Discount_type', 'Discount_rate']]
        .drop_duplicates(subset='Discount')
        .rename(columns={'Discount_type': 'discount_type', 'Discount_rate': 'discount_rate'})
    )

    # Counts: received, consumed, redeemed ('null' discounts are left out of the last two)
    discount = add_stat(discount, dataset, 'Date_received', 'count', 'discount_get_count')
    discount = add_stat(discount, consumed[consumed['Discount'] != 'null'], 'Date', 'count',
                        'discount_get_cost_count', fill=0)
    discount = add_stat(discount, redeemed[redeemed['Discount'] != 'null'], 'Date', 'count',
                        'discount_hexiao_count', fill=0)

    # Ratios between those counts (0/0 becomes 0)
    discount['discount_get_cost_get_per'] = (discount['discount_get_cost_count'] / discount['discount_get_count']).fillna(0)
    discount['discount_hexiao_get_per'] = (discount['discount_hexiao_count'] / discount['discount_get_count']).fillna(0)
    discount['discount_hexiao_get_cost_per'] = (discount['discount_hexiao_count'] / discount['discount_get_cost_count']).fillna(0)

    # Distinct users / merchants, received and consumed
    discount = add_distinct(discount, dataset, 'User_id', 'discount_get_kind_user_count', fill=0)
    discount = add_distinct(discount, consumed, 'User_id', 'discount_get_cost_kind_user_count', fill=0)
    discount = add_distinct(discount, dataset, 'Merchant_id', 'discount_get_kind_merchant_count', fill=0)
    discount = add_distinct(discount, consumed, 'Merchant_id', 'discount_get_cost_kind_merchant_count', fill=0)

    # Weekend counts and ratios (the last ratio is NaN when nothing was consumed)
    discount = add_stat(discount, weekend_get, 'Date_received_is_weekend', 'count', 'discount_get_is_weekend_count', fill=0)
    discount = add_stat(discount, weekend_cost, 'Date_is_weekend', 'count', 'discount_get_cost_is_weekend_count', fill=0)
    discount['discount_get_is_weekend_get_per'] = discount['discount_get_is_weekend_count'] / discount['discount_get_count']
    discount['discount_get_cost_is_weekend_cost_per'] = discount['discount_get_cost_is_weekend_count'] / discount['discount_get_cost_count']

    return discount

In [17]:
# Per-discount features for each of the three feature windows, computed in window order.
discount_feature_1, discount_feature_2, discount_feature_3 = (
    ExtraceDiscountFeature(window) for window in (dataset_1, dataset_2, dataset_3)
)

In [18]:
def ExtraceUserMerchantFeature(dataset):
    '''Build one row of aggregate features per (``User_id``, ``Merchant_id``) pair.

    Returns a DataFrame with the two key columns followed by 12
    ``user_merchant_*`` columns: receive / consume / unused / redeem counts,
    their ratios, mean discount rates, distinct coupon counts and the mean
    receive-to-consume delay. "Consumed" rows have both ``Date`` and
    ``Date_received`` different from ``'null'``; "redeemed" rows have
    ``Label == 1``.

    Statistics are joined back on the key pair instead of being aligned by
    position, and fills are explicit assignments (chained
    ``replace(..., inplace=True)`` does nothing under pandas Copy-on-Write).
    ``dataset`` is not modified.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Labelled, preprocessed rows (needs ``User_id``, ``Merchant_id``,
        ``Coupon_id``, ``Date``, ``Date_received``, ``Label``,
        ``Discount_rate`` and ``Day_gap``).

    Returns
    -------
    pandas.DataFrame
        One row per pair, in order of first appearance in ``dataset``.
    '''
    keys = ['User_id', 'Merchant_id']

    # Row subsets shared by several features.
    got_coupon = dataset['Date_received'] != 'null'
    consumed = dataset[(dataset['Date'] != 'null') & got_coupon]
    unused = dataset[(dataset['Date'] == 'null') & got_coupon]
    redeemed = dataset[dataset['Label'] == 1]

    def add_stat(table, rows, column, how, name, fill=None):
        # Left-join the per-pair aggregate `how` of `column`; optionally fill the gaps.
        stat = rows.groupby(keys, sort=False)[column].agg(how).rename(name).reset_index()
        table = table.merge(stat, how='left', on=keys)
        if fill is not None:
            table[name] = table[name].fillna(fill)
        return table

    def add_distinct(table, rows, column, name, fill=None):
        # Left-join the number of distinct `column` values seen per pair.
        combos = rows[keys + [column]].drop_duplicates()
        stat = combos.groupby(keys, sort=False).size().rename(name).reset_index()
        table = table.merge(stat, how='left', on=keys)
        if fill is not None:
            table[name] = table[name].fillna(fill)
        return table

    user_merchant = dataset[keys].drop_duplicates()

    # Counts: received, received and consumed, received but not consumed, redeemed
    user_merchant = add_stat(user_merchant, dataset, 'Coupon_id', 'count', 'user_merchant_get_count')
    user_merchant = add_stat(user_merchant, consumed, 'Date', 'count', 'user_merchant_get_get_cost_count', fill=0)
    user_merchant = add_stat(user_merchant, unused, 'Date_received', 'count', 'user_merchant_get_not_cost_count', fill=0)
    user_merchant = add_stat(user_merchant, redeemed, 'Label', 'count', 'user_merchant_hexiao_count', fill=0)

    # Ratios between those counts (0/0 becomes 0)
    received_n = user_merchant['user_merchant_get_count']
    consumed_n = user_merchant['user_merchant_get_get_cost_count']
    redeemed_n = user_merchant['user_merchant_hexiao_count']
    user_merchant['user_merchant_get_cost_get_per'] = (consumed_n / received_n).fillna(0)
    user_merchant['user_merchant_hexiao_get_per'] = (redeemed_n / received_n).fillna(0)
    user_merchant['user_merchant_hexiao_get_cost_per'] = (redeemed_n / consumed_n).fillna(0)

    # Mean discount rate: all received coupons / consumed coupons
    user_merchant = add_stat(user_merchant, dataset, 'Discount_rate', 'mean', 'user_merchant_get_discount_rate_mean')
    user_merchant = add_stat(user_merchant, consumed, 'Discount_rate', 'mean',
                             'user_merchant_get_cost_discount_rate_mean', fill=-1)

    # Distinct coupons received / consumed
    user_merchant = add_distinct(user_merchant, dataset, 'Coupon_id', 'user_merchant_get_kind_count', fill=0)
    user_merchant = add_distinct(user_merchant, consumed, 'Coupon_id', 'user_merchant_get_cost_kind_count', fill=0)

    # Mean number of days between receiving and consuming
    user_merchant = add_stat(user_merchant, consumed, 'Day_gap', 'mean', 'user_merchant_get_get_cost_day_mean', fill=-1)

    return user_merchant

In [19]:
# Per (user, merchant) pair features for each of the three feature windows, in window order.
user_merchant_feature_1, user_merchant_feature_2, user_merchant_feature_3 = (
    ExtraceUserMerchantFeature(window) for window in (dataset_1, dataset_2, dataset_3)
)

In [20]:

def ExtraceUserDiscountFeature(dataset):
    '''Build per-(User_id, Discount) features from a feature-window dataset.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must provide the columns User_id, Discount, Merchant_id, Date,
        Date_received, Label, Discount_rate, Day_gap and Distance. A missing
        date is the string 'null' and a missing Distance is the sentinel -1.
        The frame is only read; it is never modified.

    Returns
    -------
    pandas.DataFrame
        One row per distinct (User_id, Discount) pair, in order of first
        appearance, followed by the ``user_discount_*`` feature columns.
    '''
    keys = ['User_id', 'Discount']
    needed = keys + ['Merchant_id', 'Date', 'Date_received', 'Label',
                     'Discount_rate', 'Day_gap', 'Distance']

    # Private working frame: the -1 distance sentinel becomes NaN so that means
    # skip unknown distances. Doing this on a derived frame (instead of an
    # in-place replace on the caller's column) leaves `dataset` untouched.
    records = dataset[needed].assign(Distance = dataset['Distance'].replace(-1, np.nan))

    # Rows where a coupon was received and a purchase was made.
    used = records[(records['Date'] != 'null') & (records['Date_received'] != 'null')]
    # Rows labelled as a redeemed coupon.
    redeemed = records[records['Label'] == 1]

    def _join_stat(features, frame, column, how, name, fill = None):
        '''Aggregate frame[column] per key pair with `how` and left-join it as `name`.

        Pairs absent from `frame` receive `fill` (left as NaN when fill is None).
        Joining on the keys avoids relying on two frames sharing a row order.
        '''
        stat = frame.groupby(keys, sort = False)[column].agg(how).rename(name).reset_index()
        features = features.merge(stat, how = 'left', on = keys)
        if fill is not None:
            # Assign the filled column back; a chained inplace replace would be
            # a silent no-op under pandas Copy-on-Write.
            features[name] = features[name].fillna(fill)
        return features

    user_discount = dataset[keys].drop_duplicates()

    # Number of coupons with this discount the user received.
    user_discount = _join_stat(user_discount, records, 'Date_received', 'count', 'user_discount_get_count')

    # Number of coupons with this discount the user received and then used.
    user_discount = _join_stat(user_discount, used, 'Date', 'count', 'user_discount_get_cost_count', 0)

    # Number of coupons with this discount the user received but did not use.
    user_discount['user_discount_get_not_cost_count'] = (
        user_discount['user_discount_get_count'] - user_discount['user_discount_get_cost_count'])

    # Number of coupons with this discount the user redeemed (Label == 1).
    user_discount = _join_stat(user_discount, redeemed, 'Label', 'count', 'user_discount_hexiao_count', 0)

    # used / received
    user_discount['user_discount_get_cost_get_per'] = (
        user_discount['user_discount_get_cost_count'] / user_discount['user_discount_get_count']).fillna(0)

    # redeemed / received
    user_discount['user_discount_hexiao_cost_get_per'] = (
        user_discount['user_discount_hexiao_count'] / user_discount['user_discount_get_count']).fillna(0)

    # redeemed / used (0/0 yields NaN, which is mapped to 0)
    user_discount['user_discount_hexiao_cost_per'] = (
        user_discount['user_discount_hexiao_count'] / user_discount['user_discount_get_cost_count']).fillna(0)

    # Mean discount rate over received coupons (left unfilled, as before).
    user_discount = _join_stat(user_discount, records, 'Discount_rate', 'mean', 'user_discount_get_discount_rate_mean')

    # Mean discount rate over used coupons.
    user_discount = _join_stat(user_discount, used, 'Discount_rate', 'mean', 'user_discount_get_cost_discount_rate_mean', -1)

    # Mean number of days between receiving and using the coupon.
    user_discount = _join_stat(user_discount, used, 'Day_gap', 'mean', 'user_discount_get_get_cost_day_mean', -1)

    # Mean known distance to the merchants whose coupons were received.
    user_discount = _join_stat(user_discount, records, 'Distance', 'mean', 'user_discount_get_distance', -1)

    # Mean known distance to the merchants whose coupons were used.
    user_discount = _join_stat(user_discount, used, 'Distance', 'mean', 'user_discount_get_get_cost_distance_mean', -1)

    # Number of distinct merchants from which the user received this discount.
    user_discount = _join_stat(user_discount, records, 'Merchant_id', 'nunique', 'user_discount_get_kind_count', 0)

    # Number of distinct merchants at which the user used this discount.
    user_discount = _join_stat(user_discount, used, 'Merchant_id', 'nunique', 'user_discount_get_cost_kind_count', 0)

    return user_discount

In [21]:
# Build the user-discount feature table for each of the three feature windows, in order.
user_discount_feature_1, user_discount_feature_2, user_discount_feature_3 = [
    ExtraceUserDiscountFeature(window) for window in (dataset_1, dataset_2, dataset_3)
]

In [22]:
# Split the raw offline training frame (as read from CSV) into the three "kong" (idle-period) subsets.
# NOTE(review): the only SplitKong definition visible in this part of the notebook sits in a
# later cell -- confirm it is also defined before this cell so a fresh "Run All" works.
kong_1, kong_2, kong_3 = SplitKong(offline_train)

In [23]:
def ExtractKongFeature(dataset):
    '''Build user, merchant and user-merchant features for an idle-period dataset.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must provide the columns User_id, Merchant_id, Date and Date_received,
        where a missing date is the string 'null'. The frame is only read.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(kong_user, kong_merchant, kong_user_merchant)``: one row per distinct
        User_id, Merchant_id and (User_id, Merchant_id) respectively, in order
        of first appearance, each followed by its ``kong_*`` feature columns.
    '''
    consumed = dataset['Date'] != 'null'
    with_coupon = dataset['Date_received'] != 'null'
    without_coupon = dataset['Date_received'] == 'null'

    cost_rows = dataset[consumed]                          # every purchase
    coupon_cost_rows = dataset[consumed & with_coupon]     # purchase with a coupon
    plain_cost_rows = dataset[consumed & without_coupon]   # purchase without a coupon
    coupon_rows = dataset[with_coupon]                     # every received coupon

    def _join_count(features, frame, keys, name, column = 'Date', fill = 0):
        '''Count non-missing frame[column] per key group and left-join it as `name`.

        Groups absent from `frame` receive `fill` (left as NaN when fill is None).
        Joining on the keys avoids relying on two frames sharing a row order.
        '''
        stat = frame.groupby(keys, sort = False)[column].count().rename(name).reset_index()
        features = features.merge(stat, how = 'left', on = keys)
        if fill is not None:
            # Assign the filled column back; a chained inplace replace would be
            # a silent no-op under pandas Copy-on-Write.
            features[name] = features[name].fillna(fill)
        return features

    def _entity_features(key, prefix):
        '''Purchase counts and coupon/no-coupon shares for one entity column.'''
        keys = [key]
        feats = dataset[keys].drop_duplicates()
        # Number of purchases.
        feats = _join_count(feats, cost_rows, keys, prefix + '_cost_count')
        # Number of purchases made with a coupon.
        feats = _join_count(feats, coupon_cost_rows, keys, prefix + '_get_cost_count')
        # Number of purchases made without a coupon.
        feats = _join_count(feats, plain_cost_rows, keys, prefix + '_not_get_cost_count')
        # Shares of all purchases (NaN when there is no purchase at all).
        feats[prefix + '_get_cost_cost_per'] = feats[prefix + '_get_cost_count'] / feats[prefix + '_cost_count']
        feats[prefix + '_not_get_cost_cost_per'] = feats[prefix + '_not_get_cost_count'] / feats[prefix + '_cost_count']
        return feats

    kong_user = _entity_features('User_id', 'kong_user')
    kong_merchant = _entity_features('Merchant_id', 'kong_merchant')

    pair = ['User_id', 'Merchant_id']
    kong_user_merchant = dataset[pair].drop_duplicates()

    # Number of this merchant's coupons the user received.
    kong_user_merchant = _join_count(kong_user_merchant, coupon_rows, pair,
                                     'kong_user_merchant_get_count', column = 'Date_received')

    # Number of rows for the pair with a non-missing Date. Unlike the user and
    # merchant counts this does not filter out the 'null' string.
    kong_user_merchant = _join_count(kong_user_merchant, dataset, pair,
                                     'kong_user_merchant_cost_count', fill = None)

    # Number of purchases the user made at this merchant with a coupon.
    kong_user_merchant = _join_count(kong_user_merchant, coupon_cost_rows, pair,
                                     'kong_user_merchant_get_get_cost_count')

    # Number of purchases the user made at this merchant without a coupon.
    kong_user_merchant = _join_count(kong_user_merchant, plain_cost_rows, pair,
                                     'kong_user_merchant_not_get_get_cost_count')

    # purchases / received coupons; a pair with no received coupon divides by
    # zero, and the resulting inf is mapped to 0.
    kong_user_merchant['kong_user_merchant_get_cost_get_per'] = (
        kong_user_merchant['kong_user_merchant_cost_count']
        / kong_user_merchant['kong_user_merchant_get_count']).replace(np.inf, 0)

    # purchases with a coupon / purchases
    kong_user_merchant['kong_user_merchant_get_cost_cost_per'] = (
        kong_user_merchant['kong_user_merchant_get_get_cost_count']
        / kong_user_merchant['kong_user_merchant_cost_count'])

    # purchases without a coupon / purchases
    kong_user_merchant['kong_user_merchant_not_get_cost_cost_per'] = (
        kong_user_merchant['kong_user_merchant_not_get_get_cost_count']
        / kong_user_merchant['kong_user_merchant_cost_count'])

    return kong_user, kong_merchant, kong_user_merchant

In [24]:
# Extract the (user, merchant, user-merchant) idle-period feature tables for each kong subset, in order.
kong_features_by_window = [ExtractKongFeature(kong) for kong in (kong_1, kong_2, kong_3)]
(
    (kong_user_feature_1, kong_merchant_feature_1, kong_user_merchant_feature_1),
    (kong_user_feature_2, kong_merchant_feature_2, kong_user_merchant_feature_2),
    (kong_user_feature_3, kong_merchant_feature_3, kong_user_merchant_feature_3),
) = kong_features_by_window
del kong_features_by_window

In [25]:
def SplitDataset(dataset):
    '''Cut the data into the three feature windows by coupon-received date.

    Each window keeps the rows whose Date_received string lies inside the
    inclusive range; consecutive windows overlap. Returns the three subsets
    as a tuple (dataset_1, dataset_2, dataset_3).
    '''
    received = dataset['Date_received']
    windows = (
        ('20160115', '20160315'),
        ('20160301', '20160430'),
        ('20160416', '20160615'),
    )
    # between() is inclusive on both ends, i.e. (received >= first) & (received <= last).
    dataset_1, dataset_2, dataset_3 = (dataset[received.between(first, last)] for first, last in windows)
    return dataset_1, dataset_2, dataset_3


def SplitLabel(dataset, dataset_test):
    '''Cut out the label windows by coupon-received date.

    The first two results are the rows of `dataset` whose Date_received string
    lies inside the inclusive range, with the Date column removed. The third
    result is `dataset_test` itself, passed through unchanged.
    '''
    received = dataset['Date_received']

    in_first_window = received.between('20160331', '20160430')
    in_second_window = received.between('20160516', '20160615')

    label_1 = dataset[in_first_window].drop(columns = 'Date')
    label_2 = dataset[in_second_window].drop(columns = 'Date')

    return label_1, label_2, dataset_test

def SplitKong(dataset):
    '''Cut out the three idle ("kong") windows by purchase date.

    Each result keeps the rows of `dataset` whose Date string lies inside the
    inclusive range. Returns the tuple (kong_1, kong_2, kong_3).
    '''
    purchased = dataset['Date']
    bounds = {
        'kong_1': ('20160316', '20160330'),
        'kong_2': ('20160501', '20160515'),
        'kong_3': ('20160616', '20160630'),
    }
    # between() is inclusive on both ends, i.e. (purchased >= start) & (purchased <= stop).
    parts = {label: dataset[purchased.between(start, stop)] for label, (start, stop) in bounds.items()}
    return parts['kong_1'], parts['kong_2'], parts['kong_3']

In [26]:
# Produce the three label-window frames via SplitLabel: the first two are date-filtered
# slices of the first argument, the third is the second argument passed through unchanged.
label_1, label_2, label_3 = SplitLabel(offline_train_handled_with_label, offline_test_handled)

In [27]:
import datetime

In [28]:
def ExtraceLabelFeature(dataset):
    '''提取标签特征'''
    label_user = dataset[['User_id']].drop_duplicates()
    label_merchant = dataset[['Merchant_id']].drop_duplicates()
    label_coupon = dataset[['Coupon_id']].drop_duplicates()
    label_user_merchant = dataset[['User_id', 'Merchant_id']].drop_duplicates()
    label_user_coupon = dataset[['User_id', 'Coupon_id']].drop_duplicates()
    label_user_discount = dataset[['User_id', 'Discount']].drop_duplicates()
    label_discount = dataset[['Discount']].drop_duplicates()
    temp = dataset.copy()
    
    ###############################################################################
    #用户的领券数量
    label_user['label_user_get_count'] = dataset.groupby('User_id', sort = False)['Date_received'].count().values
    
    #用户领取了多少种类的券
    temp_label = temp[['User_id']].drop_duplicates()
    temp_label_1 = temp[['User_id', 'Coupon_id']].drop_duplicates()
    temp_label_1['temp'] = temp.groupby(['User_id', 'Coupon_id'], sort = False)['Coupon_id'].count().values
    temp_label['label_user_get_kind_count'] = temp_label_1.groupby('User_id', sort = False)['temp'].count().values
    label_user = label_user.merge(temp_label, how = 'left', on = 'User_id')
    
    #用户领取了多少种折扣率的券
    temp_label = temp[['User_id']].drop_duplicates()
    temp_label_1 = temp[['User_id', 'Discount']].drop_duplicates()
    temp_label_1['temp'] = temp.groupby(['User_id', 'Discount'], sort = False)['Discount'].count().values
    temp_label['label_user_get_kind_discount_count'] = temp_label_1.groupby('User_id', sort = False)['temp'].count().values
    label_user = label_user.merge(temp_label, how = 'left', on = 'User_id')
    
    #用户领取了多少个不同商家的券
    temp_label = temp[['User_id']].drop_duplicates()
    temp_label_1 = temp[['User_id', 'Merchant_id']].drop_duplicates()
    temp_label_1['temp'] = temp.groupby(['User_id', 'Merchant_id'], sort = False)['Merchant_id'].count().values
    temp_label['label_user_get_kind_merchant_id_count'] = temp_label_1.groupby('User_id', sort = False)['temp'].count().values
    label_user = label_user.merge(temp_label, how = 'left', on = 'User_id')
    
    #用户领取的券的商家的平均距离
    temp['Distance'].replace(-1, np.nan, inplace = True)
    temp_label = temp[['User_id']].drop_duplicates()
    temp_label['label_user_get_distance_mean'] = temp.groupby('User_id', sort = False)['Distance'].mean().values
    label_user = label_user.merge(temp_label, how = 'left', on = 'User_id')
    label_user['label_user_get_distance_mean'].replace(np.nan, -1, inplace = True)
    temp['Distance'].replace(np.nan, -1, inplace = True)
    
    #用户领取的券的平均折扣率
    temp['Discount_rate'] = temp['Discount_rate'].astype('float')
    temp_label = temp[['User_id']].drop_duplicates()
    temp_label['label_user_get_discount_rate_mean'] = temp.groupby('User_id', sort = False)['Discount_rate'].mean().values
    label_user = label_user.merge(temp_label, how = 'left', on = 'User_id')
    
    #用户第一次领券与最后一次领券的天数差
    temp = dataset.copy()
    temp_label = temp[['User_id']].drop_duplicates()
    temp_label['Date_last_get'] = temp.groupby('User_id', sort = False)['Date_received_datetime'].max().values
    temp_label['Date_first_get'] = temp.groupby('User_id', sort = False)['Date_received_datetime'].min().values
    temp_label['label_user_first_get_last_get_day'] = [int(x.days) for x in pd.to_datetime(temp_label['Date_last_get']) - pd.to_datetime(temp_label['Date_first_get'])]
    temp_label.drop(['Date_last_get', 'Date_first_get'], axis = 1, inplace = True)
    label_user = label_user.merge(temp_label, how = 'left', on = 'User_id')
    
    #用户在这个月的前10天的领券次数
    temp = dataset.copy()
    temp['Date_received'] = temp['Date_received'].astype('str')
    temp['Date_received'] = pd.to_datetime(temp['Date_received'])
    max_date = temp['Date_received'].min() + datetime.timedelta(days = 10)
    temp = temp[temp['Date_received'] < max_date]
    label_temp = temp[['User_id']].drop_duplicates()
    label_temp['label_user_get_on_shangxun_count'] = temp.groupby('User_id', sort = False)['Date_received'].count().values
    label_user = label_user.merge(label_temp, how = 'left', on = 'User_id')
    label_user['label_user_get_on_shangxun_count'].replace(np.nan, 0, inplace = True)
    
    #用户在这个月的中间10天的领券次数
    temp = dataset.copy()
    temp['Date_received'] = temp['Date_received'].astype('str')
    temp['Date_received'] = pd.to_datetime(temp['Date_received'])
    min_date = max_date
    max_date = min_date + datetime.timedelta(days = 10)
    temp = temp[temp['Date_received'] >= min_date]
    temp = temp[temp['Date_received'] < max_date]
    label_temp = temp[['User_id']].drop_duplicates()
    label_temp['label_user_get_on_zhongxun_count'] = temp.groupby('User_id', sort = False)['Date_received'].count().values
    label_user = label_user.merge(label_temp, how = 'left', on = 'User_id')
    label_user['label_user_get_on_zhongxun_count'].replace(np.nan, 0, inplace = True)
    
    #用户在这个月的后10天的领券次数
    temp = dataset.copy()
    temp['Date_received'] = temp['Date_received'].astype('str')
    temp['Date_received'] = pd.to_datetime(temp['Date_received'])
    max_date = max_date
    temp = temp[temp['Date_received'] >= max_date]
    label_temp = temp[['User_id']].drop_duplicates()
    label_temp['label_user_get_on_xiaxun_count'] = temp.groupby('User_id', sort = False)['Date_received'].count().values
    label_user = label_user.merge(label_temp, how = 'left', on = 'User_id')
    label_user['label_user_get_on_xiaxun_count'].replace(np.nan, 0, inplace = True)
    
    #用户在上旬的领券次数与总的领券次数的占比
    label_user['label_user_get_on_shangxun_get_count'] = [x for x in label_user['label_user_get_on_shangxun_count'] / label_user['label_user_get_count']]
    
    #用户在中旬的领券次数与总的领券次数的占比
    label_user['label_user_get_on_zhongxun_get_count'] = [x for x in label_user['label_user_get_on_zhongxun_count'] / label_user['label_user_get_count']]
    
    #用户在下旬的领券次数与总的领券次数的占比
    label_user['label_user_get_on_xiaxun_get_count'] = [x for x in label_user['label_user_get_on_xiaxun_count'] / label_user['label_user_get_count']]
    
    #用户的领券日期是周末的次数
    temp = dataset[dataset['Date_received_is_weekend'] == 1]
    temp_user = temp[['User_id']].drop_duplicates()
    temp_user['label_user_get_is_weekend_count'] = temp.groupby('User_id', sort = False)['Date_received_is_weekend'].count().values
    label_user = label_user.merge(temp_user, how = 'left', on = 'User_id')
    label_user['label_user_get_is_weekend_count'].replace(np.nan, 0, inplace = True)
    
    #用户的领券日期是周末的次数 / 用户的领券数量
    label_user['label_user_get_is_weekend_get_per'] = [x for x in label_user['label_user_get_is_weekend_count'] / label_user['label_user_get_count']]
   
    return_dataset = dataset.merge(label_user, on = 'User_id', how = 'left')
    
    #用户是否是最后一次领券
    temp = dataset.copy()
    temp_label = temp[['User_id']].drop_duplicates()
    temp_label['Date_last_get'] = temp.groupby('User_id', sort = False)['Date_received_datetime'].max().values
    temp = temp.merge(temp_label, how = 'left', on = 'User_id')
    return_dataset['Label_user_is_last_get'] = [1 if x['Date_received_datetime'] == x['Date_last_get'] else 0 for i, x in temp.iterrows()]
    
    #用户是否是第一次领券
    temp = dataset.copy()
    temp_label = temp[['User_id']].drop_duplicates()
    temp_label['Date_first_get'] = temp.groupby('User_id', sort = False)['Date_received_datetime'].min().values
    temp = temp.merge(temp_label, how = 'left', on = 'User_id')
    return_dataset['Label_user_is_first_get'] = [1 if x['Date_received_datetime'] == x['Date_first_get'] else 0 for i, x in temp.iterrows()]
    ###############################################################################
    temp = dataset.copy()
    
    #商户被领取了多少券
    temp_label = temp[['Merchant_id']].drop_duplicates()
    temp_label['label_merchant_get_count'] = temp.groupby('Merchant_id', sort = False)['Date_received'].count().values
    label_merchant = label_merchant.merge(temp_label, how = 'left', on = 'Merchant_id')
    
    #商户在这个月的前10天的被领券次数
    temp = dataset.copy()
    temp['Date_received'] = temp['Date_received'].astype('str')
    temp['Date_received'] = pd.to_datetime(temp['Date_received'])
    max_date = temp['Date_received'].min() + datetime.timedelta(days = 10)
    temp = temp[temp['Date_received'] < max_date]
    label_temp = temp[['Merchant_id']].drop_duplicates()
    label_temp['label_merchant_get_on_shangxun_count'] = temp.groupby('Merchant_id', sort = False)['Date_received'].count().values
    label_merchant = label_merchant.merge(label_temp, how = 'left', on = 'Merchant_id')
    label_merchant['label_merchant_get_on_shangxun_count'].replace(np.nan, 0, inplace = True)
    
    #商户在这个月的中间10天的被领券次数
    temp = dataset.copy()
    temp['Date_received'] = temp['Date_received'].astype('str')
    temp['Date_received'] = pd.to_datetime(temp['Date_received'])
    min_date = max_date
    max_date = min_date + datetime.timedelta(days = 10)
    temp = temp[temp['Date_received'] >= min_date]
    temp = temp[temp['Date_received'] < max_date]
    label_temp = temp[['Merchant_id']].drop_duplicates()
    label_temp['label_merchant_get_on_zhongxun_count'] = temp.groupby('Merchant_id', sort = False)['Date_received'].count().values
    label_merchant = label_merchant.merge(label_temp, how = 'left', on = 'Merchant_id')
    label_merchant['label_merchant_get_on_zhongxun_count'].replace(np.nan, 0, inplace = True)
    
    #商户在这个月的后10天的被领券次数
    temp = dataset.copy()
    temp['Date_received'] = temp['Date_received'].astype('str')
    temp['Date_received'] = pd.to_datetime(temp['Date_received'])
    max_date = max_date
    temp = temp[temp['Date_received'] >= max_date]
    label_temp = temp[['Merchant_id']].drop_duplicates()
    label_temp['label_merchant_get_on_xiaxun_count'] = temp.groupby('Merchant_id', sort = False)['Date_received'].count().values
    label_merchant = label_merchant.merge(label_temp, how = 'left', on = 'Merchant_id')
    label_merchant['label_merchant_get_on_xiaxun_count'].replace(np.nan, 0, inplace = True)
    
    #商户在上旬的领券次数与总的领券次数的占比
    label_merchant['label_merchant_get_on_shangxun_get_count'] = [x for x in label_merchant['label_merchant_get_on_shangxun_count'] / label_merchant['label_merchant_get_count']]
    
    #商户在中旬的领券次数与总的领券次数的占比
    label_merchant['label_merchant_get_on_zhongxun_get_count'] = [x for x in label_merchant['label_merchant_get_on_zhongxun_count'] / label_merchant['label_merchant_get_count']]
    
    #商户在下旬的领券次数与总的领券次数的占比
    label_merchant['label_merchant_get_on_xiaxun_get_count'] = [x for x in label_merchant['label_merchant_get_on_xiaxun_count'] / label_merchant['label_merchant_get_count']]
    
    #商户被领取的券的平均折扣率
    temp = dataset.copy()
    temp['Discount_rate'] = temp['Discount_rate'].astype('float')
    temp_label = temp[['Merchant_id']].drop_duplicates()
    temp_label['label_merchant_coupon_discount_rate_mean'] = temp.groupby('Merchant_id', sort = False)['Discount_rate'].mean().values
    label_merchant = label_merchant.merge(temp_label, how = 'left', on = 'Merchant_id')
    
    #商家被领取的所有优惠券种类数目
    temp_label = temp[['Merchant_id']].drop_duplicates()
    temp_label_1 = temp[['Merchant_id', 'Coupon_id']].drop_duplicates()
    temp_label_1['temp'] = temp.groupby(['Merchant_id', 'Coupon_id'], sort = False)['Coupon_id'].count().values
    temp_label['label_merchant_coupon_kinds_count'] = temp_label_1.groupby('Merchant_id', sort = False)['temp'].count().values
    label_merchant = label_merchant.merge(temp_label, how = 'left', on = 'Merchant_id')
    
    #商家被领券的平均距离
    temp = dataset.copy()
    temp['Distance'].replace(-1, np.nan, inplace = True)
    temp_merchant = temp[['Merchant_id']].drop_duplicates()
    temp_merchant['label_merchant_coupon_distance_mean'] = temp.groupby('Merchant_id', sort = False)['Distance'].mean().values
    label_merchant = label_merchant.merge(temp_merchant, how = 'left', on = 'Merchant_id')
    
    #商户的领券日期是周末的次数
    temp = dataset[dataset['Date_received_is_weekend'] == 1]
    temp_user = temp[['Merchant_id']].drop_duplicates()
    temp_user['label_merchant_get_is_weekend_count'] = temp.groupby('Merchant_id', sort = False)['Date_received_is_weekend'].count().values
    label_merchant = label_merchant.merge(temp_user, how = 'left', on = 'Merchant_id')
    label_merchant['label_merchant_get_is_weekend_count'].replace(np.nan, 0, inplace = True)
    
    #商户的领券日期是周末的次数 / 商户的领券数量
    label_merchant['label_merchant_get_is_weekend_get_per'] = [x for x in label_merchant['label_merchant_get_is_weekend_count'] / label_merchant['label_merchant_get_count']]
    
    return_dataset = return_dataset.merge(label_merchant, on = 'Merchant_id', how = 'left')
    
    #商家是否最后一次被领券
    temp = dataset.copy()
    temp_label = temp[['Merchant_id']].drop_duplicates()
    temp_label['Date_last_get'] = temp.groupby('Merchant_id', sort = False)['Date_received_datetime'].max().values
    temp = temp.merge(temp_label, how = 'left', on = 'Merchant_id')
    return_dataset['Label_merchant_is_last_get'] = [1 if x['Date_received_datetime'] == x['Date_last_get'] else 0 for i, x in temp.iterrows()]   
    
    #商家是否第一次被领券
    temp = dataset.copy()
    temp_label = temp[['Merchant_id']].drop_duplicates()
    temp_label['Date_first_get'] = temp.groupby('Merchant_id', sort = False)['Date_received_datetime'].min().values
    temp = temp.merge(temp_label, how = 'left', on = 'Merchant_id')
    return_dataset['Label_merchant_is_first_get'] = [1 if x['Date_received_datetime'] == x['Date_first_get'] else 0 for i, x in temp.iterrows()]   
    ###############################################################################
    temp = dataset.copy()
    
    #优惠券被领取的次数
    temp = dataset.copy()
    temp_coupon = temp[['Coupon_id']].drop_duplicates()
    temp_coupon['label_coupon_get_count'] = temp.groupby('Coupon_id', sort = False)['Date_received'].count().values
    label_coupon = label_coupon.merge(temp_coupon, how = 'left', on = 'Coupon_id')
    
    #优惠券被多少不用的用户领取
    temp_coupon = temp[['Coupon_id']].drop_duplicates()
    temp_coupon_1 = temp[['Coupon_id', 'User_id']].drop_duplicates()
    temp_coupon_1['temp'] = temp_coupon_1.groupby(['Coupon_id', 'User_id'], sort = False)['User_id'].count().values
    temp_coupon['label_coupon_get_diff_user_count'] = temp_coupon_1.groupby('Coupon_id', sort = False)['temp'].count().values
    label_coupon = label_coupon.merge(temp_coupon, how = 'left', on = 'Coupon_id')
    
    #优惠券被用户领取的平均距离
    temp = dataset.copy()
    temp['Distance'].replace(-1, np.nan, inplace = True)
    temp_coupon = temp[['Coupon_id']].drop_duplicates()
    temp_coupon['label_coupon_distance_mean'] = temp.groupby('Coupon_id', sort = False)['Distance'].mean().values
    label_coupon = label_coupon.merge(temp_coupon, how = 'left', on = 'Coupon_id')
    label_coupon['label_coupon_distance_mean'].replace(np.nan, -1, inplace = True)
    
    #优惠券在这个月的前10天的被领券次数
    temp = dataset.copy()
    temp['Date_received'] = temp['Date_received'].astype('str')
    temp['Date_received'] = pd.to_datetime(temp['Date_received'])
    max_date = temp['Date_received'].min() + datetime.timedelta(days = 10)
    temp = temp[temp['Date_received'] < max_date]
    label_temp = temp[['Coupon_id']].drop_duplicates()
    label_temp['label_coupon_get_on_shangxun_count'] = temp.groupby('Coupon_id', sort = False)['Date_received'].count().values
    label_coupon = label_coupon.merge(label_temp, how = 'left', on = 'Coupon_id')
    label_coupon['label_coupon_get_on_shangxun_count'].replace(np.nan, 0, inplace = True)
    
    #优惠券在这个月的中间10天的被领券次数
    temp = dataset.copy()
    temp['Date_received'] = temp['Date_received'].astype('str')
    temp['Date_received'] = pd.to_datetime(temp['Date_received'])
    min_date = max_date
    max_date = min_date + datetime.timedelta(days = 10)
    temp = temp[temp['Date_received'] >= min_date]
    temp = temp[temp['Date_received'] < max_date]
    label_temp = temp[['Coupon_id']].drop_duplicates()
    label_temp['label_coupon_get_on_zhongxun_count'] = temp.groupby('Coupon_id', sort = False)['Date_received'].count().values
    label_coupon = label_coupon.merge(label_temp, how = 'left', on = 'Coupon_id')
    label_coupon['label_coupon_get_on_zhongxun_count'].replace(np.nan, 0, inplace = True)
    
    #优惠券在这个月的后10天的被领券次数
    temp = dataset.copy()
    temp['Date_received'] = temp['Date_received'].astype('str')
    temp['Date_received'] = pd.to_datetime(temp['Date_received'])
    max_date = max_date
    temp = temp[temp['Date_received'] >= max_date]
    label_temp = temp[['Coupon_id']].drop_duplicates()
    label_temp['label_coupon_get_on_xiaxun_count'] = temp.groupby('Coupon_id', sort = False)['Date_received'].count().values
    label_coupon = label_coupon.merge(label_temp, how = 'left', on = 'Coupon_id')
    label_coupon['label_coupon_get_on_xiaxun_count'].replace(np.nan, 0, inplace = True) 
    
    #优惠券在上旬的领券次数与总的领券次数的占比
    label_coupon['label_coupon_get_on_shangxun_get_count'] = [x for x in label_coupon['label_coupon_get_on_shangxun_count'] / label_coupon['label_coupon_get_count']]
    
    #优惠券在中旬的领券次数与总的领券次数的占比
    label_coupon['label_coupon_get_on_zhongxun_get_count'] = [x for x in label_coupon['label_coupon_get_on_zhongxun_count'] / label_coupon['label_coupon_get_count']]
    
    #优惠券在下旬的领券次数与总的领券次数的占比
    label_coupon['label_coupon_get_on_xiaxun_get_count'] = [x for x in label_coupon['label_coupon_get_on_xiaxun_count'] / label_coupon['label_coupon_get_count']]
    
    #优惠券的领券日期是周末的次数
    temp = dataset[dataset['Date_received_is_weekend'] == 1]
    temp_user = temp[['Coupon_id']].drop_duplicates()
    temp_user['label_coupon_get_is_weekend_count'] = temp.groupby('Coupon_id', sort = False)['Date_received_is_weekend'].count().values
    label_coupon = label_coupon.merge(temp_user, how = 'left', on = 'Coupon_id')
    label_coupon['label_coupon_get_is_weekend_count'].replace(np.nan, 0, inplace = True)
    
    #优惠券的领券日期是周末的次数 / 优惠券的领券数量
    label_coupon['label_coupon_get_is_weekend_get_per'] = [x for x in label_coupon['label_coupon_get_is_weekend_count'] / label_coupon['label_coupon_get_count']]
    
    return_dataset = return_dataset.merge(label_coupon, how = 'left', on = 'Coupon_id')
    
    #优惠券是否最后一次被领
    temp = dataset.copy()
    temp_label = temp[['Coupon_id']].drop_duplicates()
    temp_label['Date_last_get'] = temp.groupby('Coupon_id', sort = False)['Date_received_datetime'].max().values
    temp = temp.merge(temp_label, how = 'left', on = 'Coupon_id')
    return_dataset['Label_coupon_is_last_get'] = [1 if x['Date_received_datetime'] == x['Date_last_get'] else 0 for i, x in temp.iterrows()]    
    
    #优惠券是否第一次被领
    temp = dataset.copy()
    temp_label = temp[['Coupon_id']].drop_duplicates()
    temp_label['Date_first_get'] = temp.groupby('Coupon_id', sort = False)['Date_received_datetime'].min().values
    temp = temp.merge(temp_label, how = 'left', on = 'Coupon_id')
    return_dataset['Label_coupon_is_first_get'] = [1 if x['Date_received_datetime'] == x['Date_first_get'] else 0 for i, x in temp.iterrows()]  
    ###############################################################################
    temp = dataset.copy()
    
    #该折扣率的券被领取了多少次
    temp = dataset.copy()
    label_discount['label_discount_get_count'] = temp.groupby('Discount', sort = False)['Date_received'].count().values
    
    #该折扣率的券被多少不同用户领取
    temp_discount = dataset[['Discount']].drop_duplicates()
    temp_discount_1 = dataset[['Discount', 'User_id']].drop_duplicates()
    temp_discount_1['temp'] = dataset.groupby(['Discount', 'User_id'], sort = False)['User_id'].count().values
    temp_discount['Discount_get_kind_user_count'] = temp_discount_1.groupby('Discount', sort = False)['User_id'].count().values
    label_discount = label_discount.merge(temp_discount, how = 'left', on = 'Discount')
    
    #该折扣率的券被多少不同商家发放
    temp_discount = dataset[['Discount']].drop_duplicates()
    temp_discount_1 = dataset[['Discount', 'Merchant_id']].drop_duplicates()
    temp_discount_1['temp'] = dataset.groupby(['Discount', 'Merchant_id'], sort = False)['Merchant_id'].count().values
    temp_discount['Discount_get_kind_merchant_count'] = temp_discount_1.groupby('Discount', sort = False)['Merchant_id'].count().values
    label_discount = label_discount.merge(temp_discount, how = 'left', on = 'Discount')
    
    #该折扣率的券在这个月的前10天的被领券次数
    temp = dataset.copy()
    temp['Date_received'] = temp['Date_received'].astype('str')
    temp['Date_received'] = pd.to_datetime(temp['Date_received'])
    max_date = temp['Date_received'].min() + datetime.timedelta(days = 10)
    temp = temp[temp['Date_received'] < max_date]
    label_temp = temp[['Discount']].drop_duplicates()
    label_temp['label_discount_get_on_shangxun_count'] = temp.groupby('Discount', sort = False)['Date_received'].count().values
    label_discount = label_discount.merge(label_temp, how = 'left', on = 'Discount')
    label_discount['label_discount_get_on_shangxun_count'].replace(np.nan, 0, inplace = True)
    
    #该折扣率的券在这个月的中间10天的被领券次数
    temp = dataset.copy()
    temp['Date_received'] = temp['Date_received'].astype('str')
    temp['Date_received'] = pd.to_datetime(temp['Date_received'])
    min_date = max_date
    max_date = min_date + datetime.timedelta(days = 10)
    temp = temp[temp['Date_received'] >= min_date]
    temp = temp[temp['Date_received'] < max_date]
    label_temp = temp[['Discount']].drop_duplicates()
    label_temp['label_discount_get_on_zhongxun_count'] = temp.groupby('Discount', sort = False)['Date_received'].count().values
    label_discount = label_discount.merge(label_temp, how = 'left', on = 'Discount')
    label_discount['label_discount_get_on_zhongxun_count'].replace(np.nan, 0, inplace = True)
    
    #该折扣率的券在这个月的后10天的被领券次数
    temp = dataset.copy()
    temp['Date_received'] = temp['Date_received'].astype('str')
    temp['Date_received'] = pd.to_datetime(temp['Date_received'])
    max_date = max_date
    temp = temp[temp['Date_received'] >= max_date]
    label_temp = temp[['Discount']].drop_duplicates()
    label_temp['label_discount_get_on_xiaxun_count'] = temp.groupby('Discount', sort = False)['Date_received'].count().values
    label_discount = label_discount.merge(label_temp, how = 'left', on = 'Discount')
    label_discount['label_discount_get_on_xiaxun_count'].replace(np.nan, 0, inplace = True)
    
    #该折扣率的券在上旬的领券次数与总的领券次数的占比
    label_discount['label_discount_get_on_shangxun_get_count'] = [x for x in label_discount['label_discount_get_on_shangxun_count'] / label_discount['label_discount_get_count']]
    
    #该折扣率的券在中旬的领券次数与总的领券次数的占比
    label_discount['label_discount_get_on_zhongxun_get_count'] = [x for x in label_discount['label_discount_get_on_zhongxun_count'] / label_discount['label_discount_get_count']]
    
    #该折扣率的券在下旬的领券次数与总的领券次数的占比
    label_discount['label_discount_get_on_xiaxun_get_count'] = [x for x in label_discount['label_discount_get_on_xiaxun_count'] / label_discount['label_discount_get_count']]
    
    #优惠券的领券日期是周末的次数
    temp = dataset[dataset['Date_received_is_weekend'] == 1]
    temp_user = temp[['Discount']].drop_duplicates()
    temp_user['label_discount_get_is_weekend_count'] = temp.groupby('Discount', sort = False)['Date_received_is_weekend'].count().values
    label_discount = label_discount.merge(temp_user, how = 'left', on = 'Discount')
    label_discount['label_discount_get_is_weekend_count'].replace(np.nan, 0, inplace = True)
    
    #优惠券的领券日期是周末的次数 / 优惠券的领券数量
    label_discount['label_discount_get_is_weekend_get_per'] = [x for x in label_discount['label_discount_get_is_weekend_count'] / label_discount['label_discount_get_count']]
    
    return_dataset = return_dataset.merge(label_discount, how = 'left', on = 'Discount')
    
    #该折扣率的券是否最后一次被领取
    temp = dataset.copy()
    temp_label = temp[['Discount']].drop_duplicates()
    temp_label['Date_last_get'] = temp.groupby('Discount', sort = False)['Date_received_datetime'].max().values
    temp = temp.merge(temp_label, how = 'left', on = 'Discount')
    return_dataset['Label_discount_is_last_get'] = [1 if x['Date_received_datetime'] == x['Date_last_get'] else 0 for i, x in temp.iterrows()] 
    
    #该折扣率的券是否第一次被领取
    temp = dataset.copy()
    temp_label = temp[['Discount']].drop_duplicates()
    temp_label['Date_first_get'] = temp.groupby('Discount', sort = False)['Date_received_datetime'].min().values
    temp = temp.merge(temp_label, how = 'left', on = 'Discount')
    return_dataset['Label_discount_is_first_get'] = [1 if x['Date_received_datetime'] == x['Date_first_get'] else 0 for i, x in temp.iterrows()] 
    ###############################################################################
    temp = dataset.copy()
    
    #用户-商家:用户领取商家优惠券的次数
    temp_user_merchant = temp[['User_id', 'Merchant_id']].drop_duplicates()
    temp_user_merchant['label_user_merchant_get_count'] = temp.groupby(['User_id', 'Merchant_id'], sort = False)['Coupon_id'].count().values
    label_user_merchant = label_user_merchant.merge(temp_user_merchant, how = 'left', on = ['User_id', 'Merchant_id'])
     
    #用户-商家：用户领取商家优惠券的平均折扣率
    temp_user_merchant = temp[['User_id', 'Merchant_id']].drop_duplicates()
    temp_user_merchant['label_user_merchant_get_discount_rate_mean'] = temp.groupby(['User_id', 'Merchant_id'], sort = False)['Discount_rate'].mean().values
    label_user_merchant = label_user_merchant.merge(temp_user_merchant, how = 'left', on = ['User_id', 'Merchant_id'])
     
    #用户-商家：用户在这个商家的领券次数占用户总的领券次数的比例
    temp_user_merchant_1 = dataset[['User_id']].drop_duplicates()
    temp_user_merchant_1['label_user_get_count'] = dataset.groupby('User_id', sort = False)['Date_received'].count().values
    label_user_merchant = label_user_merchant.merge(temp_user_merchant_1, how = 'left', on = 'User_id')
    label_user_merchant['label_user_merchant_get_this_merchant_get_per'] = [x for x in label_user_merchant['label_user_merchant_get_count'] / label_user_merchant['label_user_get_count']]
    label_user_merchant.drop('label_user_get_count', axis = 1, inplace = True)
    
    #用户-商家：用户在这个月的前10天领取该券的次数
    temp = dataset.copy()
    temp['Date_received'] = temp['Date_received'].astype('str')
    temp['Date_received'] = pd.to_datetime(temp['Date_received'])
    max_date = temp['Date_received'].min() + datetime.timedelta(days = 10)
    temp = temp[temp['Date_received'] < max_date]
    label_temp = temp[['User_id', 'Merchant_id']].drop_duplicates()
    label_temp['label_user_merchant_get_on_shangxun_count'] = temp.groupby(['User_id', 'Merchant_id'], sort = False)['Date_received'].count().values
    label_user_merchant = label_user_merchant.merge(label_temp, how = 'left', on = ['User_id', 'Merchant_id'])
    label_user_merchant['label_user_merchant_get_on_shangxun_count'].replace(np.nan, 0, inplace = True)
    
    #用户-商家：用户在这个月的中间10天领取该券的次数
    temp = dataset.copy()
    temp['Date_received'] = temp['Date_received'].astype('str')
    temp['Date_received'] = pd.to_datetime(temp['Date_received'])
    min_date = max_date
    max_date = min_date + datetime.timedelta(days = 10)
    temp = temp[temp['Date_received'] >= min_date]
    temp = temp[temp['Date_received'] < max_date]
    label_temp = temp[['User_id', 'Merchant_id']].drop_duplicates()
    label_temp['label_user_merchant_get_on_zhongxun_count'] = temp.groupby(['User_id', 'Merchant_id'], sort = False)['Date_received'].count().values
    label_user_merchant = label_user_merchant.merge(label_temp, how = 'left', on = ['User_id', 'Merchant_id'])
    label_user_merchant['label_user_merchant_get_on_zhongxun_count'].replace(np.nan, 0, inplace = True)
    
    #用户-商家：用户在这个月的后10天领取该券的次数
    temp = dataset.copy()
    temp['Date_received'] = temp['Date_received'].astype('str')
    temp['Date_received'] = pd.to_datetime(temp['Date_received'])
    max_date = max_date
    temp = temp[temp['Date_received'] >= max_date]
    label_temp = temp[['User_id', 'Merchant_id']].drop_duplicates()
    label_temp['label_user_merchant_get_on_xiaxun_count'] = temp.groupby(['User_id', 'Merchant_id'], sort = False)['Date_received'].count().values
    label_user_merchant = label_user_merchant.merge(label_temp, how = 'left', on = ['User_id', 'Merchant_id'])
    label_user_merchant['label_user_merchant_get_on_xiaxun_count'].replace(np.nan, 0, inplace = True)
    
    #用户-商家：在上旬的领券次数与总的领券次数的占比
    label_user_merchant['label_user_merchant_get_on_shangxun_get_count'] = [x for x in label_user_merchant['label_user_merchant_get_on_shangxun_count'] / label_user_merchant['label_user_merchant_get_count']]
    
    #用户-商家：在中旬的领券次数与总的领券次数的占比
    label_user_merchant['label_user_merchant_get_on_zhongxun_get_count'] = [x for x in label_user_merchant['label_user_merchant_get_on_zhongxun_count'] / label_user_merchant['label_user_merchant_get_count']]
    
    #用户-商家：在下旬的领券次数与总的领券次数的占比
    label_user_merchant['label_user_merchant_get_on_xiaxun_get_count'] = [x for x in label_user_merchant['label_user_merchant_get_on_xiaxun_count'] / label_user_merchant['label_user_merchant_get_count']]
    
    return_dataset = return_dataset.merge(label_user_merchant, how = 'left', on = ['User_id', 'Merchant_id'])
    
    #用户-商家：用户是否是最后一次领这家店的券
    temp = dataset.copy()
    temp_label = temp[['User_id', 'Merchant_id']].drop_duplicates()
    temp_label['Date_last_get'] = temp.groupby(['User_id', 'Merchant_id'], sort = False)['Date_received_datetime'].max().values
    temp = temp.merge(temp_label, how = 'left', on = ['User_id', 'Merchant_id'])
    return_dataset['Label_user_merchant_is_last_get'] = [1 if x['Date_received_datetime'] == x['Date_last_get'] else 0 for i, x in temp.iterrows()] 
    
    #用户-商家：用户是否是第一次领这家店的券
    temp = dataset.copy()
    temp_label = temp[['User_id', 'Merchant_id']].drop_duplicates()
    temp_label['Date_first_get'] = temp.groupby(['User_id', 'Merchant_id'], sort = False)['Date_received_datetime'].min().values
    temp = temp.merge(temp_label, how = 'left', on = ['User_id', 'Merchant_id'])
    return_dataset['Label_user_merchant_is_first_get'] = [1 if x['Date_received_datetime'] == x['Date_first_get'] else 0 for i, x in temp.iterrows()] 
    ##############################################################################
    temp = dataset.copy()
     
    #用户-优惠券：用户领取该优惠券的次数
    temp_user_coupon = temp[['User_id', 'Coupon_id']].drop_duplicates()
    temp_user_coupon['label_user_coupon_get_count'] = temp.groupby(['User_id', 'Coupon_id'], sort = False)['Date_received'].count().values
    label_user_coupon = label_user_coupon.merge(temp_user_coupon, how = 'left', on = ['User_id', 'Coupon_id'])
     
    #用户-优惠券：用户领取这个优惠券的次数占用户总的领取次数的比例
    temp_user_coupon_1 = dataset[['User_id']].drop_duplicates()
    temp_user_coupon_1['label_user_get_count'] = dataset.groupby('User_id', sort = False)['Date_received'].count().values
    label_user_coupon = label_user_coupon.merge(temp_user_coupon_1, how = 'left', on = 'User_id')
    label_user_coupon['label_user_coupon_get_this_coupon_get_per'] = [x for x in label_user_coupon['label_user_coupon_get_count'] / label_user_coupon['label_user_get_count']]
    label_user_coupon.drop('label_user_get_count', axis = 1, inplace = True)
    
    #用户-优惠券：用户在这个月的前10天领取该券的次数
    temp = dataset.copy()
    temp['Date_received'] = temp['Date_received'].astype('str')
    temp['Date_received'] = pd.to_datetime(temp['Date_received'])
    max_date = temp['Date_received'].min() + datetime.timedelta(days = 10)
    temp = temp[temp['Date_received'] < max_date]
    label_temp = temp[['User_id', 'Coupon_id']].drop_duplicates()
    label_temp['label_user_coupon_get_on_shangxun_count'] = temp.groupby(['User_id', 'Coupon_id'], sort = False)['Date_received'].count().values
    label_user_coupon = label_user_coupon.merge(label_temp, how = 'left', on = ['User_id', 'Coupon_id'])
    label_user_coupon['label_user_coupon_get_on_shangxun_count'].replace(np.nan, 0, inplace = True)
    
    #用户-优惠券：用户在这个月的中间10天领取该券的次数
    temp = dataset.copy()
    temp['Date_received'] = temp['Date_received'].astype('str')
    temp['Date_received'] = pd.to_datetime(temp['Date_received'])
    min_date = max_date
    max_date = min_date + datetime.timedelta(days = 10)
    temp = temp[temp['Date_received'] >= min_date]
    temp = temp[temp['Date_received'] < max_date]
    label_temp = temp[['User_id', 'Coupon_id']].drop_duplicates()
    label_temp['label_user_coupon_get_on_zhongxun_count'] = temp.groupby(['User_id', 'Coupon_id'], sort = False)['Date_received'].count().values
    label_user_coupon = label_user_coupon.merge(label_temp, how = 'left', on = ['User_id', 'Coupon_id'])
    label_user_coupon['label_user_coupon_get_on_zhongxun_count'].replace(np.nan, 0, inplace = True)
    
    #用户-优惠券：用户在这个月的后10天领取该券的次数
    temp = dataset.copy()
    temp['Date_received'] = temp['Date_received'].astype('str')
    temp['Date_received'] = pd.to_datetime(temp['Date_received'])
    max_date = max_date
    temp = temp[temp['Date_received'] >= max_date]
    label_temp = temp[['User_id', 'Coupon_id']].drop_duplicates()
    label_temp['label_user_coupon_get_on_xiaxun_count'] = temp.groupby(['User_id', 'Coupon_id'], sort = False)['Date_received'].count().values
    label_user_coupon = label_user_coupon.merge(label_temp, how = 'left', on = ['User_id', 'Coupon_id'])
    label_user_coupon['label_user_coupon_get_on_xiaxun_count'].replace(np.nan, 0, inplace = True)
    
    #用户-优惠券：在上旬的领券次数与总的领券次数的占比
    label_user_coupon['label_user_coupon_get_on_shangxun_get_count'] = [x for x in label_user_coupon['label_user_coupon_get_on_shangxun_count'] / label_user_coupon['label_user_coupon_get_count']]
    
    #用户-优惠券：在中旬的领券次数与总的领券次数的占比
    label_user_coupon['label_user_coupon_get_on_zhongxun_get_count'] = [x for x in label_user_coupon['label_user_coupon_get_on_zhongxun_count'] / label_user_coupon['label_user_coupon_get_count']]
    
    #用户-优惠券：在下旬的领券次数与总的领券次数的占比
    label_user_coupon['label_user_coupon_get_on_xiaxun_get_count'] = [x for x in label_user_coupon['label_user_coupon_get_on_xiaxun_count'] / label_user_coupon['label_user_coupon_get_count']]
    
    return_dataset = return_dataset.merge(label_user_coupon, how = 'left', on = ['User_id', 'Coupon_id'])
    
    #用户-优惠券：用户是否是最后一次领券
    temp = dataset.copy()
    temp_label = temp[['User_id', 'Coupon_id']].drop_duplicates()
    temp_label['Date_last_get'] = temp.groupby(['User_id', 'Coupon_id'], sort = False)['Date_received_datetime'].max().values
    temp = temp.merge(temp_label, how = 'left', on = ['User_id', 'Coupon_id'])
    return_dataset['Label_user_coupon_is_last_get'] = [1 if x['Date_received_datetime'] == x['Date_last_get'] else 0 for i, x in temp.iterrows()] 
    
    #用户-优惠券：用户是否是第一次领券
    temp = dataset.copy()
    temp_label = temp[['User_id', 'Coupon_id']].drop_duplicates()
    temp_label['Date_first_get'] = temp.groupby(['User_id', 'Coupon_id'], sort = False)['Date_received_datetime'].min().values
    temp = temp.merge(temp_label, how = 'left', on = ['User_id', 'Coupon_id'])
    return_dataset['Label_user_coupon_is_first_get'] = [1 if x['Date_received_datetime'] == x['Date_first_get'] else 0 for i, x in temp.iterrows()] 
    ##############################################################################
    temp = dataset.copy()
    
    #用户-折扣：用户领取该折扣率的券的次数
    temp_user_discount = temp[['User_id', 'Discount']].drop_duplicates()
    temp_user_discount['label_user_discount_get_count'] = temp.groupby(['User_id', 'Discount'], sort = False)['Date_received'].count().values
    label_user_discount = label_user_discount.merge(temp_user_discount, how = 'left', on = ['User_id', 'Discount'])
     
    #用户-折扣：用户领取这个折扣率的券的次数占用户总的领取次数的比例
    temp_user_discount_1 = dataset[['User_id']].drop_duplicates()
    temp_user_discount_1['label_user_get_count'] = dataset.groupby('User_id', sort = False)['Date_received'].count().values
    label_user_discount = label_user_discount.merge(temp_user_discount_1, how = 'left', on = 'User_id')
    label_user_discount['label_user_discount_get_this_coupon_get_per'] = [x for x in label_user_discount['label_user_discount_get_count'] / label_user_discount['label_user_get_count']]
    label_user_discount.drop('label_user_get_count', axis = 1, inplace = True)
    
    #用户-折扣：用户在这个月的前10天领取该券的次数
    temp = dataset.copy()
    temp['Date_received'] = temp['Date_received'].astype('str')
    temp['Date_received'] = pd.to_datetime(temp['Date_received'])
    max_date = temp['Date_received'].min() + datetime.timedelta(days = 10)
    temp = temp[temp['Date_received'] < max_date]
    label_temp = temp[['User_id', 'Discount']].drop_duplicates()
    label_temp['label_user_discount_get_on_shangxun_count'] = temp.groupby(['User_id', 'Discount'], sort = False)['Date_received'].count().values
    label_user_discount = label_user_discount.merge(label_temp, how = 'left', on = ['User_id', 'Discount'])
    label_user_discount['label_user_discount_get_on_shangxun_count'].replace(np.nan, 0, inplace = True)
    
    #用户-折扣：用户在这个月的中间10天领取该券的次数
    temp = dataset.copy()
    temp['Date_received'] = temp['Date_received'].astype('str')
    temp['Date_received'] = pd.to_datetime(temp['Date_received'])
    min_date = max_date
    max_date = min_date + datetime.timedelta(days = 10)
    temp = temp[temp['Date_received'] >= min_date]
    temp = temp[temp['Date_received'] < max_date]
    label_temp = temp[['User_id', 'Discount']].drop_duplicates()
    label_temp['label_user_discount_get_on_zhongxun_count'] = temp.groupby(['User_id', 'Discount'], sort = False)['Date_received'].count().values
    label_user_discount = label_user_discount.merge(label_temp, how = 'left', on = ['User_id', 'Discount'])
    label_user_discount['label_user_discount_get_on_zhongxun_count'].replace(np.nan, 0, inplace = True)
    
    #用户-折扣：用户在这个月的后10天领取该券的次数
    temp = dataset.copy()
    temp['Date_received'] = temp['Date_received'].astype('str')
    temp['Date_received'] = pd.to_datetime(temp['Date_received'])
    max_date = max_date
    temp = temp[temp['Date_received'] >= max_date]
    label_temp = temp[['User_id', 'Discount']].drop_duplicates()
    label_temp['label_user_discount_get_on_xiaxun_count'] = temp.groupby(['User_id', 'Discount'], sort = False)['Date_received'].count().values
    label_user_discount = label_user_discount.merge(label_temp, how = 'left', on = ['User_id', 'Discount'])
    label_user_discount['label_user_discount_get_on_xiaxun_count'].replace(np.nan, 0, inplace = True)
    
    #用户-折扣：在上旬的领券次数与总的领券次数的占比
    label_user_discount['label_user_discount_get_on_shangxun_get_count'] = [x for x in label_user_discount['label_user_discount_get_on_shangxun_count'] / label_user_discount['label_user_discount_get_count']]
    
    #用户-折扣：在中旬的领券次数与总的领券次数的占比
    label_user_discount['label_user_discount_get_on_zhongxun_get_count'] = [x for x in label_user_discount['label_user_discount_get_on_zhongxun_count'] / label_user_discount['label_user_discount_get_count']]
    
    #用户-折扣：在下旬的领券次数与总的领券次数的占比
    label_user_discount['label_user_discount_get_on_xiaxun_get_count'] = [x for x in label_user_discount['label_user_discount_get_on_xiaxun_count'] / label_user_discount['label_user_discount_get_count']]
    
    return_dataset = return_dataset.merge(label_user_discount, how = 'left', on = ['User_id', 'Discount'])
    
    #用户-折扣：用户是否是最后一次领券
    temp = dataset.copy()
    temp_label = temp[['User_id', 'Discount']].drop_duplicates()
    temp_label['Date_last_get'] = temp.groupby(['User_id', 'Discount'], sort = False)['Date_received_datetime'].max().values
    temp = temp.merge(temp_label, how = 'left', on = ['User_id', 'Discount'])
    return_dataset['Label_user_discount_is_last_get'] = [1 if x['Date_received_datetime'] == x['Date_last_get'] else 0 for i, x in temp.iterrows()]
    
    #用户-折扣：用户是否是第一次领券
    temp = dataset.copy()
    temp_label = temp[['User_id', 'Discount']].drop_duplicates()
    temp_label['Date_first_get'] = temp.groupby(['User_id', 'Discount'], sort = False)['Date_received_datetime'].min().values
    temp = temp.merge(temp_label, how = 'left', on = ['User_id', 'Discount'])
    return_dataset['Label_user_discount_is_first_get'] = [1 if x['Date_received_datetime'] == x['Date_first_get'] else 0 for i, x in temp.iterrows()]
    
    return_dataset.drop('Date_received_datetime', axis = 1, inplace = True)
    
    return return_dataset

In [29]:
# Derive the label-window features for the three label sets (1, 2, 3), in that order.
label_feature_1, label_feature_2, label_feature_3 = [
    ExtraceLabelFeature(label_set) for label_set in (label_1, label_2, label_3)
]

In [30]:
def GenTrain(user_feature_2, merchant_feature_2, user_merchant_2, discount_feature_2, user_discount_2, label_feature_2, kong_user_feature_2, kong_merchant_feature_2, kong_user_merchant_feature_2):
    '''Build the training table.

    Every feature table is left-joined onto ``label_feature_2`` and the
    missing values produced by the joins are then filled.

    Parameters
    ----------
    user_feature_2, kong_user_feature_2 : DataFrame
        Joined on 'User_id'.
    merchant_feature_2, kong_merchant_feature_2 : DataFrame
        Joined on 'Merchant_id'.
    discount_feature_2 : DataFrame
        Joined on 'Discount'.
    user_merchant_2, kong_user_merchant_feature_2 : DataFrame
        Joined on ['User_id', 'Merchant_id'].
    user_discount_2 : DataFrame
        Joined on ['User_id', 'Discount'].
    label_feature_2 : DataFrame
        Left side of every join.

    Returns
    -------
    DataFrame
        The joined table. In every column that contains missing values they
        are replaced by 0 when the column name contains 'count' or 'per',
        and by -1 otherwise.
    '''
    # (feature table, join key) pairs, in the order the columns must appear.
    join_plan = [
        (user_feature_2, 'User_id'),
        (merchant_feature_2, 'Merchant_id'),
        (discount_feature_2, 'Discount'),
        (user_merchant_2, ['User_id', 'Merchant_id']),
        (user_discount_2, ['User_id', 'Discount']),
        (kong_user_feature_2, 'User_id'),
        (kong_merchant_feature_2, 'Merchant_id'),
        (kong_user_merchant_feature_2, ['User_id', 'Merchant_id']),
    ]
    train = label_feature_2
    for feature_table, keys in join_plan:
        train = train.merge(feature_table, how = 'left', on = keys)

    # Fill missing values.
    for col in train.columns:
        if not train[col].isnull().any():
            continue
        fill_value = 0 if ('count' in col or 'per' in col) else -1
        # Assign the result back instead of calling replace(..., inplace = True)
        # on train[col]: with pandas Copy-on-Write the in-place call works on a
        # temporary object and the frame itself keeps its NaNs.
        train[col] = train[col].replace(np.nan, fill_value)

    return train
    
def GenValidation(user_feature_1, merchant_feature_1, user_merchant_1, discount_feature_1, user_discount_1, label_feature_1, kong_user_feature_1, kong_merchant_feature_1, kong_user_merchant_feature_1):
    '''Build the validation table.

    Every feature table is left-joined onto ``label_feature_1`` and the
    missing values produced by the joins are then filled.

    Parameters
    ----------
    user_feature_1, kong_user_feature_1 : DataFrame
        Joined on 'User_id'.
    merchant_feature_1, kong_merchant_feature_1 : DataFrame
        Joined on 'Merchant_id'.
    discount_feature_1 : DataFrame
        Joined on 'Discount'.
    user_merchant_1, kong_user_merchant_feature_1 : DataFrame
        Joined on ['User_id', 'Merchant_id'].
    user_discount_1 : DataFrame
        Joined on ['User_id', 'Discount'].
    label_feature_1 : DataFrame
        Left side of every join.

    Returns
    -------
    DataFrame
        The joined table. In every column that contains missing values they
        are replaced by 0 when the column name contains 'count' or 'per',
        and by -1 otherwise.
    '''
    # (feature table, join key) pairs, in the order the columns must appear.
    join_plan = [
        (user_feature_1, 'User_id'),
        (merchant_feature_1, 'Merchant_id'),
        (discount_feature_1, 'Discount'),
        (user_merchant_1, ['User_id', 'Merchant_id']),
        (user_discount_1, ['User_id', 'Discount']),
        (kong_user_feature_1, 'User_id'),
        (kong_merchant_feature_1, 'Merchant_id'),
        (kong_user_merchant_feature_1, ['User_id', 'Merchant_id']),
    ]
    validation = label_feature_1
    for feature_table, keys in join_plan:
        validation = validation.merge(feature_table, how = 'left', on = keys)

    # Fill missing values.
    for col in validation.columns:
        if not validation[col].isnull().any():
            continue
        fill_value = 0 if ('count' in col or 'per' in col) else -1
        # Assign the result back instead of calling replace(..., inplace = True)
        # on validation[col]: with pandas Copy-on-Write the in-place call works
        # on a temporary object and the frame itself keeps its NaNs.
        validation[col] = validation[col].replace(np.nan, fill_value)
    return validation
                
def GenTest(user_feature_3, merchant_feature_3, user_merchant_3, discount_feature_3, user_discount_3, label_feature_3, kong_user_feature_3, kong_merchant_feature_3, kong_user_merchant_feature_3):
    '''Build the test table.

    Every feature table is left-joined onto ``label_feature_3`` and the
    missing values produced by the joins are then filled.

    Parameters
    ----------
    user_feature_3, kong_user_feature_3 : DataFrame
        Joined on 'User_id'.
    merchant_feature_3, kong_merchant_feature_3 : DataFrame
        Joined on 'Merchant_id'.
    discount_feature_3 : DataFrame
        Joined on 'Discount'.
    user_merchant_3, kong_user_merchant_feature_3 : DataFrame
        Joined on ['User_id', 'Merchant_id'].
    user_discount_3 : DataFrame
        Joined on ['User_id', 'Discount'].
    label_feature_3 : DataFrame
        Left side of every join.

    Returns
    -------
    DataFrame
        The joined table. In every column that contains missing values they
        are replaced by 0 when the column name contains 'count' or 'per',
        and by -1 otherwise.
    '''
    # (feature table, join key) pairs, in the order the columns must appear.
    join_plan = [
        (user_feature_3, 'User_id'),
        (merchant_feature_3, 'Merchant_id'),
        (discount_feature_3, 'Discount'),
        (user_merchant_3, ['User_id', 'Merchant_id']),
        (user_discount_3, ['User_id', 'Discount']),
        (kong_user_feature_3, 'User_id'),
        (kong_merchant_feature_3, 'Merchant_id'),
        (kong_user_merchant_feature_3, ['User_id', 'Merchant_id']),
    ]
    test = label_feature_3
    for feature_table, keys in join_plan:
        test = test.merge(feature_table, how = 'left', on = keys)

    # Fill missing values.
    for col in test.columns:
        if not test[col].isnull().any():
            continue
        fill_value = 0 if ('count' in col or 'per' in col) else -1
        # Assign the result back instead of calling replace(..., inplace = True)
        # on test[col]: with pandas Copy-on-Write the in-place call works on a
        # temporary object and the frame itself keeps its NaNs.
        test[col] = test[col].replace(np.nan, fill_value)

    return test


In [31]:
# Assemble the three modelling tables and persist each one as CSV.
# Arguments are passed by keyword so the mapping is explicit; note that the
# user_merchant_feature_N table is what feeds the user_merchant_N parameter.

# Training table (feature set 2).
print('开始生成训练集.....')
train = GenTrain(
    user_feature_2 = user_feature_2,
    merchant_feature_2 = merchant_feature_2,
    user_merchant_2 = user_merchant_feature_2,
    discount_feature_2 = discount_feature_2,
    user_discount_2 = user_discount_feature_2,
    label_feature_2 = label_feature_2,
    kong_user_feature_2 = kong_user_feature_2,
    kong_merchant_feature_2 = kong_merchant_feature_2,
    kong_user_merchant_feature_2 = kong_user_merchant_feature_2,
)
print('训练集的大小：', train.shape)
train.to_csv('train1.csv', index = None)

# Validation table (feature set 1).
print('开始生成验证集.....')
validation = GenValidation(
    user_feature_1 = user_feature_1,
    merchant_feature_1 = merchant_feature_1,
    user_merchant_1 = user_merchant_feature_1,
    discount_feature_1 = discount_feature_1,
    user_discount_1 = user_discount_feature_1,
    label_feature_1 = label_feature_1,
    kong_user_feature_1 = kong_user_feature_1,
    kong_merchant_feature_1 = kong_merchant_feature_1,
    kong_user_merchant_feature_1 = kong_user_merchant_feature_1,
)
print('验证集的大小：', validation.shape)
validation.to_csv('validation1.csv', index = None)

# Test table (feature set 3).
print('开始生成测试集.....')
test = GenTest(
    user_feature_3 = user_feature_3,
    merchant_feature_3 = merchant_feature_3,
    user_merchant_3 = user_merchant_feature_3,
    discount_feature_3 = discount_feature_3,
    user_discount_3 = user_discount_feature_3,
    label_feature_3 = label_feature_3,
    kong_user_feature_3 = kong_user_feature_3,
    kong_merchant_feature_3 = kong_merchant_feature_3,
    kong_user_merchant_feature_3 = kong_user_merchant_feature_3,
)
print('测试集的大小：', test.shape)
test.to_csv('test1.csv', index = None)

开始生成训练集.....
训练集的大小： (252586, 203)
开始生成验证集.....
验证集的大小： (140944, 203)
开始生成测试集.....
测试集的大小： (113640, 199)


In [41]:
# Separate the target column from the features for the two labelled tables.
# NOTE(review): the shapes printed when the tables were built (203 columns for
# train/validation, 199 for test) suggest train_x and test_x may not share the
# same column set unless an unseen cell aligns them -- confirm before training.
train_y = train['Label']
train_x = train.drop(columns = 'Label')
validation_y = validation['Label']
validation_x = validation.drop(columns = 'Label')

# The test table has no label; keep its identifying columns aside for the
# result frame and use the remaining columns as features.
test_name = test[['User_id', 'Coupon_id', 'Date_received']]
test_x = test.drop(columns = list(test_name.columns))

In [43]:
import xgboost as xgb

# Stack the training and validation rows into one fitting set.
# DataFrame.append / Series.append were removed in pandas 2.0; pd.concat is the
# supported replacement and, like append, keeps each input's original index.
train_and_vali_x = pd.concat([train_x, validation_x])
train_and_vali_y = pd.concat([train_y, validation_y])

# Labelled matrix used for fitting, unlabelled matrix used for prediction.
train_set = xgb.DMatrix(train_and_vali_x, label = train_and_vali_y)
test_set = xgb.DMatrix(test_x)

In [44]:
from sklearn.ensemble import GradientBoostingClassifier

In [45]:
# Booster settings: gradient-boosted trees with a logistic objective, AUC as
# the logged evaluation metric and a fixed seed.
params = dict(
    booster = 'gbtree',
    objective = 'binary:logistic',
    min_child_weight = 5,
    max_depth = 5,
    subsample = 0.7,
    colsample_bytree = 0.7,
    eta = 0.1,
    eval_metric = 'auc',
    seed = 7,
)
num_boost_round = 3500

# Only the fitting matrix is watched, so the AUC printed after each round is
# measured on the same rows the model is trained on.
watchlist = [(train_set, 'train')]

model = xgb.train(
    params,
    train_set,
    num_boost_round = num_boost_round,
    evals = watchlist,
)

# Attach the predicted probability to the identifying columns of the test rows.
xgb_result = test_name.assign(Probability = model.predict(test_set))
    

[0]	train-auc:0.847375
[1]	train-auc:0.861283
[2]	train-auc:0.866512
[3]	train-auc:0.868248
[4]	train-auc:0.869768
[5]	train-auc:0.869667
[6]	train-auc:0.871898
[7]	train-auc:0.874479
[8]	train-auc:0.881049
[9]	train-auc:0.883281
[10]	train-auc:0.88348
[11]	train-auc:0.885152
[12]	train-auc:0.885921
[13]	train-auc:0.886758
[14]	train-auc:0.887702
[15]	train-auc:0.888659
[16]	train-auc:0.889857
[17]	train-auc:0.890554
[18]	train-auc:0.89151
[19]	train-auc:0.892191
[20]	train-auc:0.892418
[21]	train-auc:0.892743
[22]	train-auc:0.89326
[23]	train-auc:0.893811
[24]	train-auc:0.894081
[25]	train-auc:0.89464
[26]	train-auc:0.89508
[27]	train-auc:0.895681
[28]	train-auc:0.896173
[29]	train-auc:0.896611
[30]	train-auc:0.896966
[31]	train-auc:0.897447
[32]	train-auc:0.897707
[33]	train-auc:0.898088
[34]	train-auc:0.898567
[35]	train-auc:0.898982
[36]	train-auc:0.899619
[37]	train-auc:0.89994
[38]	train-auc:0.900304
[39]	train-auc:0.900713
[40]	train-auc:0.901025
[41]	train-auc:0.90123
[42]	trai

[335]	train-auc:0.924602
[336]	train-auc:0.924628
[337]	train-auc:0.924667
[338]	train-auc:0.924717
[339]	train-auc:0.92475
[340]	train-auc:0.924776
[341]	train-auc:0.924806
[342]	train-auc:0.924844
[343]	train-auc:0.924883
[344]	train-auc:0.924922
[345]	train-auc:0.925001
[346]	train-auc:0.925025
[347]	train-auc:0.925074
[348]	train-auc:0.925126
[349]	train-auc:0.92518
[350]	train-auc:0.925202
[351]	train-auc:0.925229
[352]	train-auc:0.925259
[353]	train-auc:0.925308
[354]	train-auc:0.925365
[355]	train-auc:0.925411
[356]	train-auc:0.925441
[357]	train-auc:0.925498
[358]	train-auc:0.925526
[359]	train-auc:0.925575
[360]	train-auc:0.925609
[361]	train-auc:0.925643
[362]	train-auc:0.925665
[363]	train-auc:0.925698
[364]	train-auc:0.925759
[365]	train-auc:0.925778
[366]	train-auc:0.925819
[367]	train-auc:0.92586
[368]	train-auc:0.925889
[369]	train-auc:0.925909
[370]	train-auc:0.925926
[371]	train-auc:0.925952
[372]	train-auc:0.925983
[373]	train-auc:0.926015
[374]	train-auc:0.926022
[37

[665]	train-auc:0.933959
[666]	train-auc:0.933967
[667]	train-auc:0.933995
[668]	train-auc:0.934011
[669]	train-auc:0.934019
[670]	train-auc:0.934063
[671]	train-auc:0.934081
[672]	train-auc:0.934105
[673]	train-auc:0.934127
[674]	train-auc:0.934138
[675]	train-auc:0.934171
[676]	train-auc:0.934193
[677]	train-auc:0.934213
[678]	train-auc:0.934225
[679]	train-auc:0.934249
[680]	train-auc:0.934265
[681]	train-auc:0.934298
[682]	train-auc:0.934311
[683]	train-auc:0.934321
[684]	train-auc:0.934347
[685]	train-auc:0.934351
[686]	train-auc:0.934373
[687]	train-auc:0.934399
[688]	train-auc:0.934437
[689]	train-auc:0.934456
[690]	train-auc:0.934496
[691]	train-auc:0.934515
[692]	train-auc:0.934545
[693]	train-auc:0.934563
[694]	train-auc:0.934583
[695]	train-auc:0.934598
[696]	train-auc:0.934611
[697]	train-auc:0.93464
[698]	train-auc:0.934664
[699]	train-auc:0.934691
[700]	train-auc:0.934723
[701]	train-auc:0.934743
[702]	train-auc:0.934779
[703]	train-auc:0.934818
[704]	train-auc:0.934836
[

[995]	train-auc:0.939765
[996]	train-auc:0.939782
[997]	train-auc:0.939797
[998]	train-auc:0.939826
[999]	train-auc:0.939839
[1000]	train-auc:0.939854
[1001]	train-auc:0.939895
[1002]	train-auc:0.939907
[1003]	train-auc:0.939935
[1004]	train-auc:0.939946
[1005]	train-auc:0.939957
[1006]	train-auc:0.939977
[1007]	train-auc:0.939988
[1008]	train-auc:0.940002
[1009]	train-auc:0.940019
[1010]	train-auc:0.940024
[1011]	train-auc:0.940033
[1012]	train-auc:0.940046
[1013]	train-auc:0.940071
[1014]	train-auc:0.940094
[1015]	train-auc:0.940098
[1016]	train-auc:0.940101
[1017]	train-auc:0.94012
[1018]	train-auc:0.940127
[1019]	train-auc:0.940146
[1020]	train-auc:0.94017
[1021]	train-auc:0.940173
[1022]	train-auc:0.940206
[1023]	train-auc:0.94021
[1024]	train-auc:0.940225
[1025]	train-auc:0.940238
[1026]	train-auc:0.940268
[1027]	train-auc:0.940293
[1028]	train-auc:0.940301
[1029]	train-auc:0.940325
[1030]	train-auc:0.940344
[1031]	train-auc:0.940381
[1032]	train-auc:0.940394
[1033]	train-auc:0.9

[1312]	train-auc:0.944545
[1313]	train-auc:0.944563
[1314]	train-auc:0.944583
[1315]	train-auc:0.944596
[1316]	train-auc:0.944602
[1317]	train-auc:0.944624
[1318]	train-auc:0.944636
[1319]	train-auc:0.944667
[1320]	train-auc:0.944673
[1321]	train-auc:0.94469
[1322]	train-auc:0.944717
[1323]	train-auc:0.944742
[1324]	train-auc:0.944771
[1325]	train-auc:0.944785
[1326]	train-auc:0.944793
[1327]	train-auc:0.944812
[1328]	train-auc:0.944826
[1329]	train-auc:0.944829
[1330]	train-auc:0.944833
[1331]	train-auc:0.944839
[1332]	train-auc:0.944839
[1333]	train-auc:0.944846
[1334]	train-auc:0.94486
[1335]	train-auc:0.944869
[1336]	train-auc:0.944884
[1337]	train-auc:0.944907
[1338]	train-auc:0.944913
[1339]	train-auc:0.944909
[1340]	train-auc:0.944925
[1341]	train-auc:0.944935
[1342]	train-auc:0.944949
[1343]	train-auc:0.944958
[1344]	train-auc:0.944977
[1345]	train-auc:0.944993
[1346]	train-auc:0.945009
[1347]	train-auc:0.945032
[1348]	train-auc:0.945046
[1349]	train-auc:0.945066
[1350]	train-a

[1629]	train-auc:0.948558
[1630]	train-auc:0.948561
[1631]	train-auc:0.948596
[1632]	train-auc:0.948601
[1633]	train-auc:0.948614
[1634]	train-auc:0.948622
[1635]	train-auc:0.948643
[1636]	train-auc:0.948646
[1637]	train-auc:0.948651
[1638]	train-auc:0.948668
[1639]	train-auc:0.948671
[1640]	train-auc:0.948688
[1641]	train-auc:0.948707
[1642]	train-auc:0.94873
[1643]	train-auc:0.948748
[1644]	train-auc:0.948749
[1645]	train-auc:0.948753
[1646]	train-auc:0.948757
[1647]	train-auc:0.948773
[1648]	train-auc:0.948795
[1649]	train-auc:0.9488
[1650]	train-auc:0.948808
[1651]	train-auc:0.948807
[1652]	train-auc:0.948822
[1653]	train-auc:0.948828
[1654]	train-auc:0.948852
[1655]	train-auc:0.948857
[1656]	train-auc:0.948864
[1657]	train-auc:0.948873
[1658]	train-auc:0.948884
[1659]	train-auc:0.948899
[1660]	train-auc:0.948906
[1661]	train-auc:0.948908
[1662]	train-auc:0.948928
[1663]	train-auc:0.948944
[1664]	train-auc:0.948948
[1665]	train-auc:0.948977
[1666]	train-auc:0.948996
[1667]	train-au

[1946]	train-auc:0.951836
[1947]	train-auc:0.95184
[1948]	train-auc:0.95185
[1949]	train-auc:0.95188
[1950]	train-auc:0.951907
[1951]	train-auc:0.951917
[1952]	train-auc:0.951936
[1953]	train-auc:0.951956
[1954]	train-auc:0.951968
[1955]	train-auc:0.95198
[1956]	train-auc:0.952002
[1957]	train-auc:0.951997
[1958]	train-auc:0.952004
[1959]	train-auc:0.952009
[1960]	train-auc:0.952055
[1961]	train-auc:0.95207
[1962]	train-auc:0.952079
[1963]	train-auc:0.952074
[1964]	train-auc:0.95208
[1965]	train-auc:0.952098
[1966]	train-auc:0.952109
[1967]	train-auc:0.952131
[1968]	train-auc:0.952155
[1969]	train-auc:0.95219
[1970]	train-auc:0.952199
[1971]	train-auc:0.952218
[1972]	train-auc:0.952221
[1973]	train-auc:0.95223
[1974]	train-auc:0.952237
[1975]	train-auc:0.952237
[1976]	train-auc:0.952253
[1977]	train-auc:0.952247
[1978]	train-auc:0.952254
[1979]	train-auc:0.952255
[1980]	train-auc:0.952266
[1981]	train-auc:0.952264
[1982]	train-auc:0.952272
[1983]	train-auc:0.952265
[1984]	train-auc:0.9

[2263]	train-auc:0.954647
[2264]	train-auc:0.954652
[2265]	train-auc:0.954662
[2266]	train-auc:0.954671
[2267]	train-auc:0.954686
[2268]	train-auc:0.954697
[2269]	train-auc:0.954702
[2270]	train-auc:0.954705
[2271]	train-auc:0.95472
[2272]	train-auc:0.954727
[2273]	train-auc:0.954721
[2274]	train-auc:0.954749
[2275]	train-auc:0.954749
[2276]	train-auc:0.954754
[2277]	train-auc:0.954758
[2278]	train-auc:0.954762
[2279]	train-auc:0.954772
[2280]	train-auc:0.954773
[2281]	train-auc:0.954771
[2282]	train-auc:0.95479
[2283]	train-auc:0.954799
[2284]	train-auc:0.954808
[2285]	train-auc:0.954817
[2286]	train-auc:0.954827
[2287]	train-auc:0.954845
[2288]	train-auc:0.954859
[2289]	train-auc:0.954868
[2290]	train-auc:0.954879
[2291]	train-auc:0.954881
[2292]	train-auc:0.954885
[2293]	train-auc:0.95489
[2294]	train-auc:0.954899
[2295]	train-auc:0.954907
[2296]	train-auc:0.954904
[2297]	train-auc:0.954897
[2298]	train-auc:0.954892
[2299]	train-auc:0.954908
[2300]	train-auc:0.954917
[2301]	train-au

[2580]	train-auc:0.957049
[2581]	train-auc:0.957048
[2582]	train-auc:0.957065
[2583]	train-auc:0.957073
[2584]	train-auc:0.957092
[2585]	train-auc:0.957089
[2586]	train-auc:0.957101
[2587]	train-auc:0.957105
[2588]	train-auc:0.957108
[2589]	train-auc:0.957126
[2590]	train-auc:0.957123
[2591]	train-auc:0.957127
[2592]	train-auc:0.95713
[2593]	train-auc:0.957134
[2594]	train-auc:0.957137
[2595]	train-auc:0.957146
[2596]	train-auc:0.957161
[2597]	train-auc:0.957167
[2598]	train-auc:0.957158
[2599]	train-auc:0.957158
[2600]	train-auc:0.957177
[2601]	train-auc:0.957187
[2602]	train-auc:0.957177
[2603]	train-auc:0.957186
[2604]	train-auc:0.9572
[2605]	train-auc:0.957205
[2606]	train-auc:0.957209
[2607]	train-auc:0.957218
[2608]	train-auc:0.957218
[2609]	train-auc:0.957228
[2610]	train-auc:0.95723
[2611]	train-auc:0.957244
[2612]	train-auc:0.957255
[2613]	train-auc:0.957261
[2614]	train-auc:0.957265
[2615]	train-auc:0.957267
[2616]	train-auc:0.957278
[2617]	train-auc:0.95728
[2618]	train-auc:

[2897]	train-auc:0.959188
[2898]	train-auc:0.959183
[2899]	train-auc:0.959177
[2900]	train-auc:0.95919
[2901]	train-auc:0.959192
[2902]	train-auc:0.959195
[2903]	train-auc:0.959199
[2904]	train-auc:0.959222
[2905]	train-auc:0.959224
[2906]	train-auc:0.959227
[2907]	train-auc:0.959227
[2908]	train-auc:0.95923
[2909]	train-auc:0.959238
[2910]	train-auc:0.959242
[2911]	train-auc:0.959235
[2912]	train-auc:0.959256
[2913]	train-auc:0.959267
[2914]	train-auc:0.959281
[2915]	train-auc:0.959293
[2916]	train-auc:0.959304
[2917]	train-auc:0.959318
[2918]	train-auc:0.959328
[2919]	train-auc:0.959321
[2920]	train-auc:0.959333
[2921]	train-auc:0.959341
[2922]	train-auc:0.959358
[2923]	train-auc:0.95937
[2924]	train-auc:0.959378
[2925]	train-auc:0.959387
[2926]	train-auc:0.959387
[2927]	train-auc:0.959397
[2928]	train-auc:0.959393
[2929]	train-auc:0.959392
[2930]	train-auc:0.959412
[2931]	train-auc:0.959414
[2932]	train-auc:0.959421
[2933]	train-auc:0.959416
[2934]	train-auc:0.959417
[2935]	train-au

[3213]	train-auc:0.961093
[3214]	train-auc:0.9611
[3215]	train-auc:0.961095
[3216]	train-auc:0.961096
[3217]	train-auc:0.961095
[3218]	train-auc:0.961092
[3219]	train-auc:0.961092
[3220]	train-auc:0.961099
[3221]	train-auc:0.961101
[3222]	train-auc:0.961116
[3223]	train-auc:0.961121
[3224]	train-auc:0.961134
[3225]	train-auc:0.961146
[3226]	train-auc:0.961154
[3227]	train-auc:0.961157
[3228]	train-auc:0.961144
[3229]	train-auc:0.961138
[3230]	train-auc:0.961155
[3231]	train-auc:0.96116
[3232]	train-auc:0.961159
[3233]	train-auc:0.961173
[3234]	train-auc:0.961182
[3235]	train-auc:0.9612
[3236]	train-auc:0.961202
[3237]	train-auc:0.961203
[3238]	train-auc:0.961218
[3239]	train-auc:0.961219
[3240]	train-auc:0.961234
[3241]	train-auc:0.961241
[3242]	train-auc:0.961253
[3243]	train-auc:0.961255
[3244]	train-auc:0.961256
[3245]	train-auc:0.961249
[3246]	train-auc:0.961268
[3247]	train-auc:0.96127
[3248]	train-auc:0.961286
[3249]	train-auc:0.961289
[3250]	train-auc:0.961292
[3251]	train-auc:0

In [48]:
# Fit a scikit-learn GBDT on the combined train+validation data, score the
# test set, blend those probabilities with the ones in `xgb_result`
# (presumably the XGBoost predictions from an earlier cell -- confirm),
# and write the blended scores to the submission file.

# Blend weights, named so they can be tuned in one place.
XGB_WEIGHT = 0.6   # share of the final score taken from xgb_result
GBDT_WEIGHT = 0.4  # share of the final score taken from gbdt_result
# Fixed seed: without random_state the estimator is not guaranteed to give the
# same model on every run, so the submission would not be reproducible.
GBDT_SEED = 0

print('开始训练gbdt，很慢，请耐心等待.....')
model = GradientBoostingClassifier(random_state=GBDT_SEED)
model.fit(train_and_vali_x, train_and_vali_y)
gbdt_result = test_name.copy()
print('训练结束，开始预测.....')
# Column 1 of predict_proba is the probability of model.classes_[1]
# (the positive class if the labels are 0/1 -- TODO confirm).
gbdt_result['Probability'] = model.predict_proba(test_x)[:, 1]

print('两个模型都训练结束，开始进行融合.....')
result = test_name.copy()
# Series arithmetic aligns on the index; both frames are copies of test_name,
# so rows pair up one-to-one.
result['Probability'] = (
    XGB_WEIGHT * xgb_result['Probability']
    + GBDT_WEIGHT * gbdt_result['Probability']
)
print('开始保存结果.....')

# header=False is the documented way to omit the header row.
result.to_csv('submit.csv', index=False, header=False)
print('保存结束，去提交吧!!!!!!!!!!')

开始训练gbdt，很慢，请耐心等待.....
训练结束，开始预测.....
两个模型都训练结束，开始进行融合.....
开始保存结果.....


FileNotFoundError: [Errno 2] No such file or directory: '../reslut/submit.csv'

In [49]:
# Write the blended predictions held in `result` to the submission file in the
# current working directory, with no header row and no index column.
submission_path = 'submit.csv'
result.to_csv(submission_path, header=False, index=False)
print('保存结束，去提交吧!!!!!!!!!!')

保存结束，去提交吧!!!!!!!!!!
