In [1]:
import os
import zipfile
import time
import pickle
import gc

import pandas as pd
import numpy as np
from tqdm import tqdm

from utils import load_pickle, dump_pickle, get_feature_value, feature_spearmanr, feature_target_spearmanr, addCrossFeature, calibration
from utils import raw_data_path, feature_data_path, cache_pkl_path, analyse

In [2]:
# Name of the binary label column used throughout the notebook.
target = 'is_trade'

# Load the pickled table that the models below are trained and evaluated on.
all_data_path = feature_data_path + 'all_data_all_features.pkl'
all_data = load_pickle(all_data_path)

# Load the pickled list of feature column names and show how many there are.
features_path = feature_data_path + 'features_0418_fewer.pkl'
features = load_pickle(features_path)
len(features)

230

## 用20-23号数据训练 lgb（24号数据用于验证）

In [24]:
from sklearn.metrics import log_loss
from lightgbm import LGBMClassifier

# Fit on days 20-23 and keep day 24 aside for evaluation.
is_train_day = all_data.day.between(20, 23)
train_data = all_data[is_train_day]
test_data = all_data[all_data.day == 24]

# Columns LightGBM is told to treat as categorical.
cate_features = ['user_gender_id', 'user_occupation_id', 'user_click_rank_day']

lgb_clf = LGBMClassifier(n_estimators=200, max_depth=3)
lgb_clf.fit(
    train_data[features],
    train_data[target],
    feature_name=features,
    categorical_feature=cate_features,
    verbose=50,
)

# Log loss on the training days and on the held-out day.
train_proba = lgb_clf.predict_proba(train_data[features])
test_proba = lgb_clf.predict_proba(test_data[features])
loss_train = log_loss(train_data[target], train_proba)
loss_test = log_loss(test_data[target], test_proba)

loss_train, loss_test



(0.08110331184334206, 0.079060620253194033)

In [25]:
# Leaf indices returned by the fitted LightGBM model for every row of all_data,
# stored as an int32 DataFrame.
lgb_leaves = pd.DataFrame(lgb_clf.apply(all_data[features])).astype(np.int32)

# One-hot encode every leaf-index column; dummy_na=True also adds a NaN
# indicator column per input column.
lgb_leaves = pd.get_dummies(
    lgb_leaves,
    columns=lgb_leaves.columns,
    dummy_na=True,
)
lgb_leaves.shape

(496482, 1788)

## 用19-23号数据训练 xgb（24号数据用于验证）

In [5]:
from sklearn.metrics import log_loss
import xgboost as xgb

# Fit on days 19-23 and keep day 24 aside for evaluation.
train_data = all_data[all_data.day.between(19, 23)]
test_data = all_data[all_data.day == 24]

xgb_params = dict(
    objective='binary:logistic',
    # Boosting schedule.
    n_estimators=200,
    learning_rate=0.1,
    # Tree shape.
    max_depth=3,
    min_child_weight=1e-3,
    gamma=0,
    # Column / row subsampling.
    colsample_bytree=0.8,
    subsample=0.9,
    # Regularisation.
    reg_lambda=10,
    # NOTE(review): `min_split_gain` looks like a LightGBM parameter name;
    # confirm that the installed XGBoost version actually honours it.
    min_split_gain=0.,
    max_bin=63,
    n_jobs=6,
    silent=False,
)
xgb_clf = xgb.XGBClassifier(**xgb_params)

# No eval_set / early-stopping arguments are passed: the model always grows
# the full n_estimators trees.
xgb_clf.fit(train_data[features], train_data[target])

# Log loss on the training days and on the held-out day.
train_proba = xgb_clf.predict_proba(train_data[features])
test_proba = xgb_clf.predict_proba(test_data[features])
loss_train = log_loss(train_data[target], train_proba)
loss_test = log_loss(test_data[target], test_proba)

loss_train, loss_test

(0.082443637414406287, 0.07825402801893741)

In [6]:
# Leaf indices returned by the fitted XGBoost model for every row of all_data,
# stored as an int32 DataFrame.
xgb_leaves = pd.DataFrame(xgb_clf.apply(all_data[features])).astype(np.int32)

# One-hot encode every leaf-index column; dummy_na=True also adds a NaN
# indicator column per input column.
xgb_leaves = pd.get_dummies(
    xgb_leaves,
    columns=xgb_leaves.columns,
    dummy_na=True,
)
xgb_leaves.shape

(539370, 1794)

In [7]:
# Nominal (categorical) columns that are one-hot encoded next to the tree-leaf
# features.
# Disabled candidates, kept for reference: the `user_<key>_click_rank` and
# `user_<key>_click_rank_day` columns for <key> in item_id, item_brand_id,
# shop_id, context_page_id and category2_label.
nominal_feats = [
    'user_gender_id', 'user_occupation_id', 'context_page_id',
    'user_last_click_day', 'category2_label', 'category_predict_rank',
]

# nominal_feats = ['user_gender_id', 'user_occupation_id', 'context_page_id']

# numeric_feats = ['item_price_level', 'item_sales_level', 'item_collected_level', 'item_pv_level', 'user_age_level', 'user_star_level',
#                  'shop_review_num_level', 'shop_review_positive_rate', 'shop_star_level', 'shop_score_service', 'shop_score_delivery', 'shop_score_description']


## 原始特征与xgb叶节点特征合并

In [None]:


# Alternative kept for reference: xgb leaf features plus all raw features.
# all_data_with_leaves = pd.concat([all_data, xgb_leaves], axis=1)


def _min_max_scale(column):
    """Scale one column to [0, 1]; a constant column maps to 0 instead of NaN."""
    column = column.astype(float)
    lowest = column.min()
    span = column.max() - lowest
    if span == 0:
        # Zero range (for example a NaN-indicator column that never fires):
        # the plain formula would produce 0/0 = NaN, which the fillna(-1)
        # below would then turn into a constant -1 feature.
        return column - lowest
    return (column - lowest) / span


# Use only the xgb leaf features, together with the bookkeeping columns and
# the nominal features.
# pd.concat(axis=1) aligns rows on index labels, and xgb_leaves was built from
# a bare array (so it carries a default 0..n-1 index). Re-label it with
# all_data's index first; otherwise a non-default index on all_data would pair
# leaves with the wrong rows and introduce NaN rows.
if len(xgb_leaves) != len(all_data):
    raise ValueError(
        'xgb_leaves has %d rows but all_data has %d; re-run the xgb leaf cell'
        % (len(xgb_leaves), len(all_data)))
all_data_with_leaves = pd.concat(
    [all_data[['day', 'is_trade', 'instance_id'] + nominal_feats],
     xgb_leaves.set_index(all_data.index)],
    axis=1)

# One-hot encode the nominal columns (plus a NaN indicator for each of them).
all_data_with_leaves = pd.get_dummies(
    all_data_with_leaves, dummy_na=True, columns=nominal_feats)

# Every column except the label and the bookkeeping columns is a model input.
target = 'is_trade'
features = [column for column in all_data_with_leaves.columns
            if column not in ('is_trade', 'instance_id', 'day')]

# Min-max normalisation of the feature columns.
all_data_with_leaves[features] = all_data_with_leaves[features].apply(_min_max_scale)

# Any value that is still missing becomes -1.
all_data_with_leaves = all_data_with_leaves.fillna(-1)


## 数据转换为 mxnet.ndarray 格式

In [28]:
# Days 18-23 form the training set; day 24 is held out for evaluation.
train_data = all_data_with_leaves[(all_data_with_leaves.day >= 18) & (all_data_with_leaves.day <= 23)]
test_data = all_data_with_leaves[all_data_with_leaves.day == 24]

from mxnet import ndarray as nd
from mxnet import autograd
from mxnet import gluon
import mxnet as mx

# `.values` replaces DataFrame.as_matrix(), which was deprecated in pandas 0.23
# and removed in pandas 1.0; both return the underlying NumPy array.
X_train = nd.array(train_data[features].values)
X_test = nd.array(test_data[features].values)

# The builtin `int` replaces the `np.int` alias (removed in NumPy 1.24).
# Labels are reshaped into a single column, one row per sample.
y_train = nd.array(train_data[target].astype(int).values).reshape((-1, 1))
y_test = nd.array(test_data[target].astype(int).values).reshape((-1, 1))

X_train.shape

(420693, 944)

## 定义网络结构和loss

In [29]:
from mxnet.gluon import nn

# Device used for all parameters and batches: the first GPU when it is usable,
# otherwise the CPU (previously switched by hand by editing this cell).
try:
    ctx = mx.gpu(0)
    # Creating a context object never fails by itself; allocate a tiny array
    # on it to find out whether the GPU can really be used.
    nd.zeros((1,), ctx=ctx).wait_to_read()
except mx.base.MXNetError:
    ctx = mx.cpu(0)


def get_lr():
    """Build and initialise a logistic-regression style network.

    The model is a single 2-unit dense layer (one output per class) whose
    parameters are initialised on the global ``ctx`` device.

    Returns:
        The initialised ``nn.Sequential`` network.
    """
    model = nn.Sequential()
    with model.name_scope():
        # A 64-unit ReLU hidden layer could be added before the output layer
        # to turn this into a small MLP.
        output_layer = nn.Dense(2)
        model.add(output_layer)
    model.initialize(ctx=ctx)
    return model


def get_net_dropout(drop_prob1, drop_prob2):
    """Build and initialise a two-hidden-layer MLP with dropout.

    Layer order: Dense(32, relu) -> Dropout(drop_prob1) -> Dense(32, relu)
    -> Dropout(drop_prob2) -> Dense(2). Parameters are initialised on the
    global ``ctx`` device.

    Args:
        drop_prob1: Dropout rate applied after the first hidden layer.
        drop_prob2: Dropout rate applied after the second hidden layer.

    Returns:
        The initialised ``gluon.nn.Sequential`` network.
    """
    model = gluon.nn.Sequential()
    with model.name_scope():
        # Each hidden block is a fully connected ReLU layer followed by dropout.
        for drop_prob in (drop_prob1, drop_prob2):
            model.add(nn.Dense(32, activation="relu"))
            model.add(nn.Dropout(drop_prob))
        # Output layer: one unit per class.
        model.add(nn.Dense(2))
    model.initialize(ctx=ctx)
    return model


# Loss object shared by evaluate_loss() and train(): softmax cross-entropy
# computed from the network outputs and the class labels.
softmax_cross_entropy = gluon.loss.SoftmaxCrossEntropyLoss()


def evaluate_loss(net, data_iter):
    """Return the mean softmax cross-entropy of ``net`` over ``data_iter``.

    Args:
        net: Network called on each data batch.
        data_iter: Iterable yielding ``(data, label)`` batches; every batch is
            copied to the global ``ctx`` device before the forward pass.

    Returns:
        Sum of the per-sample losses divided by the total number of label
        elements seen.

    Raises:
        ZeroDivisionError: If ``data_iter`` yields no batches.
    """
    loss_sum, sample_count = 0., 0
    for batch_data, batch_label in data_iter:
        batch_label = batch_label.as_in_context(ctx)
        batch_output = net(batch_data.as_in_context(ctx))
        batch_loss = softmax_cross_entropy(batch_output, batch_label)
        loss_sum += nd.sum(batch_loss).asscalar()
        sample_count += batch_label.size
    return loss_sum / sample_count

## Train

In [30]:
%matplotlib inline
import matplotlib as mpl
mpl.rcParams['figure.dpi'] = 120
import matplotlib.pyplot as plt
from time import time
import utils

def train(net, X_train, y_train, X_test, y_test, epochs,
          verbose_epoch, batch_size, learning_rate, weight_decay, lr_decay, lr_decay_epoch):
    """Train ``net`` with Adam and plot the per-epoch loss curves.

    The parameters of ``net`` are force-reinitialised on the global ``ctx``
    device before training, so every call starts from fresh weights.

    Args:
        net: Gluon network to train (modified in place).
        X_train, y_train: Training features and labels.
        X_test, y_test: Optional evaluation features and labels; pass
            ``X_test=None`` to skip evaluation on a test set.
        epochs: Number of passes over the training data.
        verbose_epoch: First epoch (0-based) from which losses are evaluated,
            recorded and printed.
        batch_size: Mini-batch size for both data loaders.
        learning_rate: Adam learning rate.
        weight_decay: Weight decay (``wd``) passed to the trainer.
        lr_decay, lr_decay_epoch: Accepted for the learning-rate schedule,
            which is currently disabled, so both values are unused.

    Returns:
        ``(train_loss, test_loss)`` of the final model when ``X_test`` is
        given, otherwise just the final training loss.
    """
    print("Start training on ", ctx)

    has_test = X_test is not None
    train_loss = []
    test_loss = []
    # Defined up front so the return statement stays valid even when no epoch
    # reaches the `verbose_epoch` threshold (or when epochs == 0).
    cur_train_loss = None
    cur_test_loss = None

    dataset_train = gluon.data.ArrayDataset(X_train, y_train)
    data_iter_train = utils.DataLoader(
        dataset_train, batch_size, shuffle=True)

    data_iter_test = None
    if has_test:
        dataset_test = gluon.data.ArrayDataset(X_test, y_test)
        data_iter_test = gluon.data.DataLoader(
            dataset_test, batch_size, shuffle=False)

    trainer = gluon.Trainer(net.collect_params(), 'adam',
                            {'learning_rate': learning_rate, 'wd': weight_decay})

    net.collect_params().initialize(force_reinit=True, ctx=ctx)

    for epoch in range(epochs):
        start = time()
        for data, label in data_iter_train:
            with autograd.record():
                data = data.as_in_context(ctx)
                label = label.as_in_context(ctx)
                output = net(data)
                loss = softmax_cross_entropy(output, label)

            loss.backward()
            # Normalise by the number of samples actually in this batch, so a
            # shorter final batch is not scaled by the nominal batch_size.
            trainer.step(data.shape[0])
            nd.waitall()

        # Learning-rate decay (multiply the rate by lr_decay every
        # lr_decay_epoch epochs) is currently disabled.

        if epoch >= verbose_epoch:
            cur_train_loss = evaluate_loss(net, data_iter_train)
            train_loss.append(cur_train_loss)

            if has_test:
                cur_test_loss = evaluate_loss(net, data_iter_test)
                test_loss.append(cur_test_loss)
                print("Epoch %d, train loss: %f, test loss: %f, Time %.1f sec" % (
                    epoch, cur_train_loss, cur_test_loss, time() - start))
            else:
                print("Epoch %d, train loss: %f, Time %.1f sec" %
                      (epoch, cur_train_loss, time() - start))

    if cur_train_loss is None:
        # No epoch was evaluated inside the loop; evaluate the final model
        # once so the function always returns real loss values.
        cur_train_loss = evaluate_loss(net, data_iter_train)
        if has_test:
            cur_test_loss = evaluate_loss(net, data_iter_test)

    plt.plot(train_loss)
    plt.legend(['train'])
    if has_test:
        plt.plot(test_loss)
        plt.legend(['train', 'test'])
    plt.show()

    if has_test:
        return cur_train_loss, cur_test_loss
    return cur_train_loss

## 模型参数

In [None]:
# Optimisation settings.
epochs = 50
batch_size = 10000
learning_rate = 0.005
weight_decay = 0.1

# First epoch from which losses are evaluated and printed.
verbose_epoch = 0

# Learning-rate schedule settings passed through to train().
lr_decay = 0.2
lr_decay_epoch = 30

# Dropout rates for get_net_dropout(); not used by the get_lr() model.
drop_prob1 = 0.2
drop_prob2 = 0.2

# Swap in get_net_dropout(drop_prob1, drop_prob2) to train the MLP instead.
net = get_lr()

train_loss, test_loss = train(
    net, X_train, y_train, X_test, y_test,
    epochs=epochs,
    verbose_epoch=verbose_epoch,
    batch_size=batch_size,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    lr_decay=lr_decay,
    lr_decay_epoch=lr_decay_epoch,
)

In [None]:
def softmax(X):
    """Row-wise softmax of a 2-D NDArray.

    Args:
        X: NDArray with one row of scores per sample.

    Returns:
        NDArray of the same shape as ``X`` whose rows sum to 1.
    """
    # Subtract each row's maximum before exponentiating. The result is
    # mathematically unchanged, but exp() can no longer overflow to inf (which
    # would make the division below return NaN).
    shifted = X - nd.max(X, axis=1, keepdims=True)
    exp = nd.exp(shifted)
    # Sum over each row, keeping axis 1 so the result has shape (nrows, 1)
    # and broadcasts against `exp`.
    partition = exp.sum(axis=1, keepdims=True)
    return exp / partition

def _positive_class_proba(inputs):
    """Return the class-1 probability of every row of ``inputs`` as a NumPy array."""
    probabilities = softmax(net(inputs.as_in_context(ctx)))
    return probabilities[:, 1].as_in_context(mx.cpu()).asnumpy()


train_predict = _positive_class_proba(X_train)
test_predict = _positive_class_proba(X_test)

# Log loss of the trained network on the training days and the held-out day.
loss_train = log_loss(train_data[target], train_predict)
loss_test = log_loss(test_data[target], test_predict)

loss_train, loss_test