In [3]:
# 查看当前挂载的数据集目录, 该目录下的变更重启环境后会自动还原
# View dataset directory. This directory will be recovered automatically after resetting environment. 
!ls /home/aistudio/data

data19383


In [4]:
# 查看工作区文件, 该目录下的变更将会持久保存. 请及时清理不必要的文件, 避免加载过慢.
# View personal work directory. All changes under this directory will be kept even after reset. Please clean unnecessary files in time to speed up environment loading.
!ls /home/aistudio/work

In [5]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import datetime
import time
raw = pd.read_csv('data/data19383/train.csv')
raw['goods_id'] = pd.factorize(raw['goods_id'])[0]

In [6]:
# 数据探索
raw.columns
raw['goods_id'].value_counts()

0       472943
5       171277
243      91748
73       67783
121      60263
14       59098
40       54200
34       52130
68       46610
55       44218
59       34846
10       31837
168      26849
8        23268
244      23071
315      21786
98       21633
23       20847
234      20554
90       19172
102      17790
35       17636
45       16480
38       15421
108      14653
92       14583
69       14461
43       14280
257      13722
333      13566
         ...  
1029         1
1030         1
1031         1
1032         1
958          1
1034         1
1024         1
1013         1
716          1
1012         1
940          1
939          1
936          1
946          1
933          1
932          1
931          1
763          1
999          1
1001         1
981          1
959          1
926          1
609          1
923          1
947          1
949          1
950          1
1011         1
1036         1
Name: goods_id, Length: 1037, dtype: int64

In [None]:
# 对Goods_id进行标签编码
raw['goods_id'] = pd.factorize(raw['goods_id'])[0]

In [5]:
# 对数据进行预处理
def prerpocess(raw, train='train'):
    st = time.time()
    # 性别：0未知，1男，2女
    data = pd.DataFrame(raw.groupby('customer_id')['customer_gender'].last().fillna(0))
    # 添加商品相关信息
    data[['goods_id_last', 'goods_status_last', 'goods_price_last', 'goods_has_discount_last', 'goods_list_time_last', 'goods_delist_time_last']] = raw.groupby('customer_id')['goods_id', 'goods_status', 'goods_price', 'goods_has_discount', 'goods_list_time', 'goods_delist_time'].last()
    # 添加订单相关信息
    data[['order_total_num_last', 'order_amount_last',
       'order_total_payment_last', 'order_total_discount_last', 'order_pay_time_last',
       'order_status_last', 'order_count_last', 'is_customer_rate_last',
       'order_detail_status_last', 'order_detail_goods_num_last', 'order_detail_amount_last',
       'order_detail_payment_last', 'order_detail_discount_last']] = raw.groupby('customer_id')['order_total_num', 'order_amount',
       'order_total_payment', 'order_total_discount', 'order_pay_time',
       'order_status', 'order_count', 'is_customer_rate',
       'order_detail_status', 'order_detail_goods_num', 'order_detail_amount',
       'order_detail_payment', 'order_detail_discount'].last()
    # 添加商品原始价格统计字段
    data[['good_price_std', 'good_price_mean', 'good_price_min', 'good_price_max']] = raw.groupby('customer_id')['goods_price'].agg({'good_price_std':'std', 'good_price_mean':'mean', 'good_price_min':'min', 'good_price_max':'max'})
    # 添加订单实付金额统计字段
    data[['order_detail_payment_std', 'order_detail_payment_mean', 'order_detail_payment_min', 'order_detail_payment_max']] = raw.groupby('customer_id')['order_detail_payment'].agg({'order_detail_payment_std':'std', 'order_detail_payment_mean':'mean', 'order_detail_payment_min':'min', 'order_detail_payment_max':'max'})
    # 用户购买的订单数量
    data['count'] = raw.groupby('customer_id')['order_id'].nunique()
    # 用户购买的商品数量
    data['goods_count'] = raw.groupby('customer_id')['order_total_num'].sum()
    # 用户所在省份
    data['customer_province'] = raw.groupby('customer_id')['customer_province'].last()
    # 用户所在城市
    data['customer_city'] = raw.groupby('customer_id')['customer_city'].last()
    # 用户是否评价 统计结果（平均，综合）
    data[['is_customer_rate_ratio','is_customer_rate_sum']] = raw.groupby('customer_id')['is_customer_rate'].agg({'is_customer_rate_ratio':np.mean,'is_customer_rate_sum':np.sum})
    # 用户购买的goods数量，一个订单商品，即order_detail_id（goods_id）
    data['order_detail_count'] = raw.groupby('customer_id')['customer_id'].count()
    # 商品折扣统计属性（sum, ave）
    data[['goods_has_discount_sum','goods_has_discount_ave']] = raw.groupby('customer_id')['goods_has_discount'].agg({'goods_has_discount_sum':np.sum,'goods_has_discount_ave':np.mean})
    # 订单实付金额 统计属性（sum, ave）
    data[['order_total_payment_sum','order_total_ave_pay']] = raw.groupby('customer_id')['order_total_payment'].agg({'order_total_payment_sum':np.sum,'order_total_ave_pay':np.mean})
    # 订单商品数量 统计属性（sum, ave）
    data[['order_total_num_sum', 'order_total_num_ave']] = raw.groupby('customer_id')['order_total_num'].agg({'order_total_num_sum':np.sum,'order_total_num_ave':np.mean})

    # 时间转换
    def time2multi(x):
        t=datetime.datetime.strptime(x, '%Y-%m-%d %H:%M:%S')
        return pd.Series([t.month,t.day,t.weekday(),t.hour,t.minute,t.second])
    # 订单付款时间
    data[['order_pay_time_last_m','order_pay_time_last_d','order_pay_time_last_week','order_pay_time_last_h','order_pay_time_last_min','order_pay_time_last_s']]=data['order_pay_time_last'].apply(time2multi)
    #data[['order_pay_time_last_m','order_pay_time_last_d','order_pay_time_last_week','order_pay_time_last_h','order_pay_time_last_min','order_pay_time_last_s']] = raw.groupby('customer_id')['order_pay_time_last_m','order_pay_time_last_d','order_pay_time_last_week','order_pay_time_last_h','order_pay_time_last_min','order_pay_time_last_s'].last()
    # 起始时间是从2013-01-01开始
    t_str='2013-01-01 00:00:00'
    t=datetime.datetime.strptime(t_str, '%Y-%m-%d %H:%M:%S')
    # 商品最新上架时间diff （距离起始时间）
    data['goods_list_time_diff'] = data['goods_list_time_last'].map(lambda x:(datetime.datetime.strptime(x, '%Y-%m-%d %H:%M:%S')-t).days/364)
    # 商品最新下架时间diff （距离起始时间）
    data['goods_delist_time_diff'] = data['goods_delist_time_last'].map(lambda x:(datetime.datetime.strptime(x, '%Y-%m-%d %H:%M:%S')-t).days/364)
    # 商品展示时间
    data['goods_diff'] = data['goods_delist_time_diff'] - data['goods_list_time_diff']
    # 付款时间diff (距离起始时间)
    data['order_pay_time_last_diff'] = data['order_pay_time_last'].map(lambda x:(datetime.datetime.strptime(x, '%Y-%m-%d %H:%M:%S')-t).days/364)
    ed = time.time()
    # 输出preprocess计算时间
    print(ed-st)
    
    return data

In [6]:
# 8月之前的数据作为训练集
train_raw = raw[raw['order_pay_time'] < '2013-07-31 23:59:59']
train_raw = prerpocess(train_raw)
# 8月份的数据作为label_raw
label_raw = set(raw[raw['order_pay_time'] > '2013-07-31 23:59:59']['customer_id'].dropna())
# 如果该用户在8月份完成了购买 label=1, 否则为0
train_raw['labels']=train_raw.index.map(lambda x:int(x in label_raw))
test = prerpocess(raw)

is deprecated and will be removed in a future version
is deprecated and will be removed in a future version
is deprecated and will be removed in a future version
is deprecated and will be removed in a future version
is deprecated and will be removed in a future version
is deprecated and will be removed in a future version


请点击[此处](https://ai.baidu.com/docs#/AIStudio_Project_Notebook/a38e5576)查看本环境基本用法.  <br>
Please click [here ](https://ai.baidu.com/docs#/AIStudio_Project_Notebook/a38e5576) for more detailed instructions. 

In [7]:
# 这些时间，之前已经根据时间提取了特征 原始的格式没法直接参与训练
train_data = train_raw.drop(['goods_list_time_last', 'goods_delist_time_last', 'order_pay_time_last'], axis=1)
# 暂时没有处理customer_province, customer_city 可以先去掉
train_data = train_data.drop(['customer_province', 'customer_city'], axis=1)
# 分类变量
catel = ['order_pay_time_last_h', 'order_pay_time_last_week', 'order_pay_time_last_m', 'order_detail_status_last', 'order_status_last', 'goods_status_last', 'goods_id_last', 'customer_gender']

In [8]:
!pip install lightgbm
from sklearn.model_selection import train_test_split
X_train, X_valid, y_train, y_valid = train_test_split(train_data.drop('labels', axis=1), np.array(train_data['labels']), test_size=0.2, random_state=33)
from sklearn.model_selection import KFold
# 采用CV=5折交叉验证
kf = KFold(n_splits=5,shuffle=True)

y_pre = 0
for train_index , test_index in kf.split(train_data):
    # 设置每一折的train和test
    X_train, X_valid, y_train, y_valid = train_data.drop('labels', axis=1).iloc[train_index], train_data.drop('labels', axis=1).iloc[test_index], np.array(train_data['labels'])[train_index], np.array(train_data['labels'])[test_index]
    import lightgbm as lgb
    param = {
    'num_leaves':121,
    'boosting_type': 'gbdt',
    'objective':'binary',
    'max_depth':7,
    'learning_rate':0.05,
    'metric':'binary_logloss'}
    # 使用lgb进行训练
    trn_data = lgb.Dataset(X_train, label=y_train)
    val_data = lgb.Dataset(X_valid, label=y_valid)
    lgbm = lgb.train(param,trn_data,valid_sets=[trn_data,val_data],num_boost_round = 10000, early_stopping_rounds=150,verbose_eval=50, categorical_feature=catel)
    # 得到test的预测结果
    test = test[X_train.columns]
    y_pre += lgbm.predict(test)

In [None]:
# 输出feature_importance
print(pd.DataFrame({
        'column': X_train.columns,
        'importance': lgbm.feature_importance(),
    }).sort_values(by='importance'))
# 5次结果求平均
y_pre = y_pre/5    

In [None]:
# 削弱0附近的影响，因为结果有log，一旦分错，接近0的部分log0影响非常大
def f(x):
    if x < 0.1:
        return 0.1
    if x > 0.9:
        return 0.9
    return x

In [None]:
test['result'] = y_pre
test['result'] = test['result'].map(f)
subm = pd.DataFrame(test['result'])
subm.to_csv('submission.csv')
# 平均购买情况
subm.mean()