In [17]:
import os
import zipfile
import time
import pickle
import gc

import pandas as pd
import numpy as np
from tqdm import tqdm

from utils import load_pickle, dump_pickle, get_feature_value, feature_spearmanr, feature_target_spearmanr, addCrossFeature, calibration
from utils import raw_data_path, feature_data_path, cache_pkl_path, analyse
import utils

In [None]:
# Load the pickled frame of samples (file name suggests it holds all engineered
# features -- confirm against the feature-building notebook).
all_data_path = feature_data_path + 'all_data_all_features.pkl'
all_data = load_pickle(all_data_path)

# Force the listed level columns to int64 so they are handled as integers downstream.
type_convert = ['item_price_level', 'item_sales_level', 'item_collected_level', 'item_pv_level', 'shop_review_num_level', 'shop_star_level']
all_data[type_convert] = all_data[type_convert].astype(np.int64)

target = 'is_trade'

# The pickled feature list is the one actually used for training.  The list
# previously derived from `all_data.columns` was overwritten on the very next
# statement, so that dead computation has been dropped.
features = load_pickle(feature_data_path + 'feature_list.pkl')

# Fail early, with a readable message, if the list and the frame are out of sync.
missing_features = [name for name in features if name not in all_data.columns]
assert not missing_features, 'features missing from all_data: %s' % missing_features

len(features)

## 用20-23号数据训练 xgb，24号数据验证

In [4]:
from sklearn.metrics import log_loss
import xgboost as xgb

# Split by day: days 20..23 (inclusive) are fitted on, day 24 is held out.
in_train_window = all_data.day.between(20, 23)
is_holdout_day = all_data.day == 24
train_data = all_data[in_train_window]
test_data = all_data[is_holdout_day]

# Gradient-boosted trees with the library's default hyper-parameters.
xgb_clf = xgb.XGBClassifier()
xgb_clf.fit(train_data[features], train_data[target])

# Log loss on the fitted rows and on the held-out day.
proba_train = xgb_clf.predict_proba(train_data[features])
proba_test = xgb_clf.predict_proba(test_data[features])
loss_train = log_loss(train_data[target], proba_train)
loss_test = log_loss(test_data[target], proba_test)

loss_train, loss_test

(0.083681341116011845, 0.079189066746593845)

In [5]:
# Leaf indices that the fitted booster assigns to every row of all_data.
# NOTE: `features` must still be the raw feature list here; a later cell
# rebinds `features` to the leaf columns, so re-running this cell after that
# one needs the feature list to be reloaded first.
xgb_leaves = xgb_clf.apply(all_data[features])

# Keep all_data's index so that any later index-aligned concat pairs each
# sample with its own leaves, rather than relying on all_data happening to
# carry a default 0..n-1 index.
xgb_leaves = pd.DataFrame(xgb_leaves, index=all_data.index).astype(np.int32)

# One-hot encode every leaf-index column.
# The int32 frame cannot contain NaN, so `dummy_na=True` only adds one
# all-zero `*_nan` column per input column; it is kept so the resulting
# column layout stays the same as before.
xgb_leaves = pd.get_dummies(xgb_leaves, dummy_na=True, columns=xgb_leaves.columns)
xgb_leaves.shape

(496482, 895)

## 原始特征与xgb叶节点特征合并

In [8]:
nominal_feats = ['user_gender_id', 'user_occupation_id', 'context_page_id']

numeric_feats = ['item_price_level', 'item_sales_level', 'item_collected_level', 'item_pv_level', 'user_age_level', 'user_star_level',
                 'shop_review_num_level', 'shop_review_positive_rate', 'shop_star_level', 'shop_score_service', 'shop_score_delivery', 'shop_score_description']

# Alternative inputs, kept for reference:
#   * xgb leaves + every original column:
#       all_data_with_leaves = pd.concat([all_data, xgb_leaves], axis=1)
#   * xgb leaves + the nominal/numeric columns listed above:
#       all_data_with_leaves = pd.concat(
#           [all_data[['day', 'is_trade', 'instance_id'] + nominal_feats + numeric_feats], xgb_leaves], axis=1)
#     followed by one-hot encoding of the nominal columns:
#       all_data_with_leaves = pd.get_dummies(
#           all_data_with_leaves, dummy_na=True, columns=nominal_feats)

# Active variant: xgb leaf features only, plus the bookkeeping columns.
# The bookkeeping columns must come from `all_data` (every day), not from
# `train_data` (days 20-23 only): with the subset, rows of the other days got
# NaN for `day`, the fillna below turned that into -1, and the later
# `day == 24` test split came out empty.
# `xgb_leaves` was computed from `all_data` row by row, so both frames are
# aligned by position; resetting both indexes makes the concat independent of
# whatever index either frame carries.
meta_cols = ['day', 'is_trade', 'instance_id']
all_data_with_leaves = pd.concat(
    [all_data[meta_cols].reset_index(drop=True), xgb_leaves.reset_index(drop=True)],
    axis=1)

# From here on `features` names the model inputs of the merged frame
# (this rebinds the raw feature list used to fit the xgb model).
features = [col for col in all_data_with_leaves.columns if col not in meta_cols]
target = 'is_trade'


def _min_max_scale(col):
    """Scale one column to [0, 1]; a constant column maps to 0 instead of 0/0 = NaN."""
    # Cast first: dummy columns may be uint8 or bool, and bool does not support `-`.
    col = col.astype(float)
    low = col.min()
    span = col.max() - low
    if span == 0:
        # Constant column (e.g. the all-zero `*_nan` dummies): avoid dividing by zero.
        return col * 0.0
    return (col - low) / span


# Min-max normalisation, column by column.
all_data_with_leaves[features] = all_data_with_leaves[features].apply(_min_max_scale)

# Any value that is still missing is encoded as -1.
all_data_with_leaves = all_data_with_leaves.fillna(-1)

len(features)

895

## 数据转换为 mxnet.ndarray 格式

In [11]:
from mxnet import ndarray as nd
from mxnet import autograd
from mxnet import gluon
import mxnet as mx

# Same day-based split as for the xgb model, now taken from the merged frame.
# NOTE: this rebinds `train_data` / `test_data`, which earlier held slices of `all_data`.
train_data = all_data_with_leaves[(all_data_with_leaves.day >= 20) & (all_data_with_leaves.day <= 23)]
test_data = all_data_with_leaves[all_data_with_leaves.day == 24]

# `.values` replaces `DataFrame.as_matrix()` (removed in pandas 1.0) and
# builtin `int` replaces the `np.int` alias (removed in NumPy 1.24); both are
# drop-in equivalents of what the cell used before.
X_train = nd.array(train_data[features].values)
X_test = nd.array(test_data[features].values)

# Labels become column vectors of shape (n_samples, 1).
y_train = nd.array(train_data[target].astype(int).values).reshape((-1, 1))
y_test = nd.array(test_data[target].astype(int).values).reshape((-1, 1))

X_train.shape

(271505, 895)