In [1]:
import pandas as pd
import numpy as np
import gc

import os
import sys
sys.path.append("../")


from sklearn.metrics import log_loss
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve, auc, roc_auc_score
from sklearn.preprocessing.data import OneHotEncoder
from sklearn.preprocessing import StandardScaler

import xgboost as xgb
import matplotlib.pyplot as plt

import Tool.utils as utils
import Tool.config as config

from scipy.sparse import hstack
from scipy.sparse import vstack

In [2]:
# Load the pickled table `data.pkl` from the directory configured in Tool.config.
# NOTE(review): unpickling can execute arbitrary code — only load files you trust.
data = pd.read_pickle(config.data_prefix_path + 'data.pkl')

In [3]:
# Categorical columns. They are excluded from the numeric `features` list that
# is built later in the notebook.
cat_features = [
    'user_gender_id',
    'user_occupation_id',
    'context_id',
    'context_page_id',
    'item_category_list',
    'hour',
    'category_1',
    'category_2',
]

# Further columns that are excluded from the numeric `features` list.
object_features = [
    'predict_category_1',
    'predict_category_2',
    'predict_category_0',
    'predict_property_0',
    'predict_property_1',
    'predict_property_2',
    'property_1',
    'property_0',
    'property_2',
    'category_0',
    'hour_and_category_1',
    'category_cross_0',
    'category_cross_1',
    'category_cross_2',
]

In [4]:
# Model inputs: every column of `data` that is not excluded by name below and
# is not listed in `object_features` or `cat_features`.
features = [
    col
    for col in data.columns
    if col not in {
        'is_trade', 'instance_id', 'index',
        'context_id', 'time', 'day', 'context_timestamp',
        'property_list', 'category_list', 'property_predict_list', 'category_predict_list',
        'item_category_list', 'item_property_list', 'predict_category_property',
        'user_id', 'item_id', 'item_brand_id', 'item_city_id', 'shop_id',
    }
    and col not in object_features
    and col not in cat_features
]

# Label column, kept as a one-element list.
target = ['is_trade']

In [5]:
# Parameters handed to xgb.train().
# NOTE(review): num_leaves, subsample_for_bin, min_split_gain, min_child_samples
# and subsample_freq look like LightGBM parameter names — confirm that the
# installed XGBoost actually uses them.
params = dict(
    booster='gbtree',
    num_leaves=35,
    max_depth=7,
    eta=0.05,
    max_bin=425,
    subsample_for_bin=50000,
    objective='binary:logistic',
    min_split_gain=0,
    min_child_weight=6,
    min_child_samples=10,
    # Disabled: colsample_bytree=0.8 — fraction of features sampled when
    # building each tree (default 1).
    # Disabled: subsample=0.9 — fraction of the training rows sampled to
    # train the model.
    subsample_freq=1,
    colsample_bytree=1,
    # L2 regularisation on the weights that controls model complexity; the
    # larger the value, the less prone the model is to overfitting.
    reg_lambda=4,
    # L1 regularisation.
    alpha=4,
    seed=2018,
    nthread=7,
    silent=True,
    gamma=0.2,
    eval_metric='logloss',
)

In [6]:
# Rows with a non-null `is_trade` become the training frame; the remaining
# rows (null `is_trade`) become the test frame.
labelled = data.is_trade.notnull()
train = data[labelled]
test = data[~labelled]

# The combined frame is no longer needed; release it.
del data, labelled
gc.collect()

7

## One_hot类别特征

In [11]:
def getOnehotTrain(df_train,df_test,cat_features):
    """One-hot encode the categorical columns of train and test together.

    The two frames are stacked so that a single encoder sees every category
    value, then the encoded matrix is split back into its train and test rows.

    Args:
        df_train: DataFrame holding at least the columns in ``cat_features``.
        df_test: DataFrame holding at least the columns in ``cat_features``.
        cat_features: list of column names to encode.

    Returns:
        None. The two encoded matrices are printed (shape only) and written
        with ``utils.save_sparse_csr`` under the names ``'onehot_train'`` and
        ``'onehot_test'``.
    """
    # Work on explicit copies: assigning through .loc on a column slice of the
    # caller's frame raises SettingWithCopyWarning and makes it unclear whether
    # the caller's data is modified.
    train_cat = df_train[cat_features].copy()
    test_cat = df_test[cat_features].copy()

    # Negative codes are remapped to the sentinel 2018.
    # NOTE(review): presumably negatives mark missing values and the encoder
    # needs non-negative integers — confirm 2018 cannot collide with a real code.
    for cat_feature in cat_features:
        train_cat.loc[train_cat[cat_feature] < 0, cat_feature] = 2018
        test_cat.loc[test_cat[cat_feature] < 0, cat_feature] = 2018

    n_train = train_cat.shape[0]
    df_merge = np.concatenate((train_cat.values, test_cat.values), axis=0)
    del train_cat, test_cat
    gc.collect()

    df_merge = df_merge.astype(np.int32)

    encoder = OneHotEncoder()
    df_trans = encoder.fit_transform(df_merge)

    # Split the jointly encoded rows back into their original partitions.
    train = df_trans[:n_train]
    test = df_trans[n_train:]

    print(train.shape)
    print(test.shape)

    utils.save_sparse_csr('onehot_train',train)
    utils.save_sparse_csr('onehot_test',test)

(420627, 8)
(57405, 8)


In [12]:
# NOTE(review): the recorded output of this cell is a ValueError ("could not
# convert string to float"), and X_train / X_val are only assigned in a later
# cell, so this cell cannot run from a fresh kernel — consider removing it.
encoder = OneHotEncoder()
trans = encoder.fit_transform(np.concatenate((X_train, X_val), axis=0))

ValueError: could not convert string to float: '7908382889764677758;5755694407684602296'

## Xgboost新特征

In [7]:
# Hold out day 24 for validation; rows with day < 24 are used for fitting.
X_train, X_val = train[train.day < 24], train[train.day == 24]

# NOTE: `train` is rebound here from the labelled DataFrame to a plain feature
# array, so this cell cannot be re-run without re-creating `train` first.
train, train_label = X_train[features].values, X_train[target].values.ravel()
val, val_label = X_val[features].values, X_val[target].values.ravel()

xgb_train = xgb.DMatrix(train, label=train_label)
xgb_val = xgb.DMatrix(val, label=val_label)

# Evaluation sets reported during boosting.
watchlist = [(xgb_train, 'train'), (xgb_val, 'val')]

del X_train, X_val
gc.collect()

27

In [8]:
# Boost for at most 300 rounds, stopping early after 50 rounds without
# improvement on the evaluation sets in `watchlist`.
model = xgb.train(
    params,
    xgb_train,
    num_boost_round=300,
    evals=watchlist,
    early_stopping_rounds=50,
)

[0]	train-logloss:0.648024	val-logloss:0.647773
Multiple eval metrics have been passed: 'val-logloss' will be used for early stopping.

Will train until val-logloss hasn't improved in 50 rounds.
[1]	train-logloss:0.6072	val-logloss:0.606728
[2]	train-logloss:0.570103	val-logloss:0.56941
[3]	train-logloss:0.536261	val-logloss:0.535388
[4]	train-logloss:0.50529	val-logloss:0.504233
[5]	train-logloss:0.476862	val-logloss:0.475627
[6]	train-logloss:0.450702	val-logloss:0.449298
[7]	train-logloss:0.426575	val-logloss:0.425012
[8]	train-logloss:0.404277	val-logloss:0.402551
[9]	train-logloss:0.383632	val-logloss:0.381774
[10]	train-logloss:0.364486	val-logloss:0.362419
[11]	train-logloss:0.346709	val-logloss:0.344502
[12]	train-logloss:0.330179	val-logloss:0.327853
[13]	train-logloss:0.314792	val-logloss:0.312328
[14]	train-logloss:0.300453	val-logloss:0.297854
[15]	train-logloss:0.287075	val-logloss:0.284311
[16]	train-logloss:0.274589	val-logloss:0.271699
[17]	train-logloss:0.26292	val-log

[164]	train-logloss:0.079898	val-logloss:0.079813
[165]	train-logloss:0.07988	val-logloss:0.079806
[166]	train-logloss:0.079811	val-logloss:0.079799
[167]	train-logloss:0.079773	val-logloss:0.079794
[168]	train-logloss:0.079726	val-logloss:0.079784
[169]	train-logloss:0.079665	val-logloss:0.079773
[170]	train-logloss:0.079604	val-logloss:0.079774
[171]	train-logloss:0.07956	val-logloss:0.079766
[172]	train-logloss:0.07948	val-logloss:0.079761
[173]	train-logloss:0.079436	val-logloss:0.079764
[174]	train-logloss:0.07938	val-logloss:0.07977
[175]	train-logloss:0.079355	val-logloss:0.079772
[176]	train-logloss:0.079328	val-logloss:0.079769
[177]	train-logloss:0.079261	val-logloss:0.07977
[178]	train-logloss:0.079216	val-logloss:0.079766
[179]	train-logloss:0.079186	val-logloss:0.079761
[180]	train-logloss:0.079146	val-logloss:0.079753
[181]	train-logloss:0.079072	val-logloss:0.079753
[182]	train-logloss:0.079015	val-logloss:0.079758
[183]	train-logloss:0.07898	val-logloss:0.079758
[184]	t

In [10]:
# With pred_leaf=True the booster returns leaf assignments instead of scores;
# these become the categorical inputs of the downstream linear models.
train_leaves, val_leaves = (
    model.predict(dmatrix, pred_leaf=True) for dmatrix in (xgb_train, xgb_val)
)
train_rows, cols = train_leaves.shape

print(train_leaves.shape, val_leaves.shape, sep="\n")

# The DMatrix objects are not needed once the leaves are extracted.
del xgb_train, xgb_val
gc.collect()

NameError: name 'xgb_train' is not defined

In [10]:
# The booster is no longer needed after leaf extraction; drop it to free memory.
del model
gc.collect()

0

In [11]:
# Load the previously saved sparse matrices for the train and validation rows.
# NOTE(review): the training matrix is read from 'tree_leaves_train.npz' while
# the validation matrix comes from 'onehot_val.npz' — confirm the first file
# name is intended, given the variable is called `one_hot_train`.
one_hot_train = utils.load_sparse_csr('tree_leaves_train.npz')
one_hot_val = utils.load_sparse_csr('onehot_val.npz')
print(one_hot_train.shape)
print(one_hot_val.shape)

(420629, 76)
(57406, 76)


In [12]:
# One-hot encode the leaf assignments. Train and validation rows are stacked
# first so a single encoder sees every leaf value.
gbdtenc = OneHotEncoder()
trans = gbdtenc.fit_transform(
    np.concatenate([train_leaves, val_leaves])  # default axis=0: stack rows
)
print(trans.shape)

(478035, 13575)


In [15]:
# Split the encoded leaves back into train / validation rows and append the
# loaded one-hot matrices column-wise.
trans_train = hstack((trans[:train_rows], one_hot_train))
trans_val = hstack((trans[train_rows:], one_hot_val))

del trans, train_leaves, val_leaves
gc.collect()

NameError: name 'trans' is not defined

In [13]:
# Disabled code kept as a bare string literal: it would save the leaf one-hot
# matrices for the train / validation split. `trans` has already been deleted
# by an earlier cell, so this cannot simply be re-enabled here.
'''
utils.save_sparse_csr('xgboost_onehot_train_non24', trans[:train_rows,:])
utils.save_sparse_csr('xgboost_onehot_val_non24', trans[train_rows:,:])
'''

"\nutils.save_sparse_csr('xgboost_onehot_train_non24', trans[:train_rows,:])\nutils.save_sparse_csr('xgboost_onehot_val_non24', trans[train_rows:,:])\n"

### LR

In [14]:
gc.collect()

# Logistic regression on the stacked sparse features.
lr = LogisticRegression(C=0.0004, n_jobs=6).fit(trans_train, train_label)

# Probability of the positive class on each split.
y_train_pred = lr.predict_proba(trans_train)[:, 1]
train_log = log_loss(train_label, y_train_pred)

y_val_pred = lr.predict_proba(trans_val)[:, 1]
val_log = log_loss(val_label, y_val_pred)

# Train log-loss, validation log-loss, validation ROC-AUC (in that order).
print(train_log)
print(val_log)
print(roc_auc_score(val_label, y_val_pred))

  " = {}.".format(self.n_jobs))


0.0814638101058
0.0806875148726
0.702671568724


### FM_FTRL

In [17]:
# NOTE(review): FM_FTRL is not imported in any visible cell; it presumably
# comes from wordbatch (`from wordbatch.models import FM_FTRL`) — confirm and
# add it to the imports cell.
clf = FM_FTRL(
    alpha=0.05,   # FTRL hyper-parameter alpha for w0 and w
    beta=3,   # FTRL hyper-parameter beta for w0 and w
    L1=0.1,    # L1 regularisation for w0 and w
    L2=10,    # L2 regularisation for w0 and w
    # The model is fitted on `trans_train`, so its dimensionality must match
    # that matrix, not the raw feature array `train`.
    D=trans_train.shape[1],

    alpha_fm=0.05, # FTRL hyper-parameter alpha for v
    L2_fm=5, # L2 regularisation for v

    init_fm=0.01,
    D_fm=8,
    e_noise=0.01,
    iters=3,
    inv_link="sigmoid",
    threads=10,
)

In [None]:
# Fit the FM_FTRL model on the same stacked sparse features used for the LR.
clf.fit(trans_train,train_label)

In [None]:
# Predictions of the fitted FM_FTRL model on the train and validation features.
y_train = clf.predict(trans_train)
y_val = clf.predict(trans_val)

In [None]:
# Report log-loss and ROC-AUC of the FM_FTRL predictions on both splits.
print("train_logloss: "+ str(log_loss(train_label,y_train)))
print("val_logloss: "+ str(log_loss(val_label,y_val)))

print("train_auc: "+ str(roc_auc_score(train_label,y_train)))
print("val_auc: "+ str(roc_auc_score(val_label,y_val)))

## 输出

In [5]:
# Load the train / test CSV files used for the final submission model.
# NOTE(review): PATH is not defined in any visible cell — define it (or reuse
# config.data_prefix_path if that is the intended directory) before this cell.
df_train = pd.read_csv(PATH + "Train0403.csv")
df_test = pd.read_csv(PATH + "Test0403.csv")

In [6]:
# Model inputs for the submission run: every column of df_train that is not in
# the exclusion set below, does not end in '0' or 'var', and is not one of the
# categorical columns in `cat_features`.
# NOTE: this rebinds `features`, and `target` becomes a plain string here.
features = [
    name
    for name in df_train.columns
    if name not in {
        'instance_id', 'item_id', 'item_category_list', 'item_property_list',
        'item_brand_id', 'item_city_id', 'user_id',
        'context_id', 'context_timestamp', 'predict_category_property', 'shop_id',
        'time', 'day', 'index_x', 'index_y', 'item_category_list_isPreTrue',
        'item_brand_id_PurchaseRate', 'item_city_id_PurchaseRate', 'hour_PurchaseRate',
        'is_trade',
        'user_gender_id_user_id_cnt', 'user_gender_id_user_id_prob',
        'user_age_level_user_id_cnt', 'user_age_level_user_id_prob',
        'user_occupation_id_user_id_cnt', 'user_occupation_id_user_id_prob',
        'item_property_list_shop_id_cnt', 'item_property_list_shop_id_prob',
        'item_property_list_shop_review_num_level_cnt', 'item_property_list_shop_review_num_level_prob',
        'item_property_list_shop_star_level_cnt', 'item_property_list_shop_star_level_prob',
        'item_brand_id_item_id_cnt', 'item_brand_id_item_id_prob',
        'item_city_id_item_id_cnt', 'item_city_id_item_id_prob',
        'item_price_level_item_id_cnt', 'item_price_level_item_id_prob',
        'item_sales_level_item_id_cnt', 'item_sales_level_item_id_prob',
        'item_collected_level_item_id_cnt', 'item_collected_level_item_id_prob',
        'item_pv_level_item_id_cnt', 'item_pv_level_item_id_prob',
        'item_id_shop_id_cnt', 'item_id_shop_id_prob',
        'item_id_shop_review_num_level_cnt', 'item_id_shop_review_num_level_prob',
        'item_id_shop_star_level_cnt', 'item_id_shop_star_level_prob',
        'item_id_user_id_cnt', 'item_id_user_id_prob',
    }
    and not name.endswith(('0', 'var'))
    and name not in cat_features
]
target = 'is_trade'

In [7]:
# Dense feature arrays for the final fit on all labelled rows.
train, test = df_train[features].values, df_test[features].values
train_label = df_train[target].values.ravel()

xgb_train = xgb.DMatrix(train, label=train_label)
xgb_test = xgb.DMatrix(test)

# Only the training set is reported during boosting in this run.
watchlist = [(xgb_train, 'train')]

# df_test is kept: it is needed later to write the submission file.
del train, test, df_train
gc.collect()

42

In [8]:
# Final fit on all labelled rows for at most 268 rounds; the only evaluation
# set in `watchlist` is the training data itself.
model = xgb.train(
    params,
    xgb_train,
    num_boost_round=268,
    evals=watchlist,
    early_stopping_rounds=50,
)

[0]	train-auc:0.57895
Will train until train-auc hasn't improved in 50 rounds.
[1]	train-auc:0.631263
[2]	train-auc:0.632452
[3]	train-auc:0.647459
[4]	train-auc:0.646546
[5]	train-auc:0.647965
[6]	train-auc:0.64889
[7]	train-auc:0.653438
[8]	train-auc:0.655599
[9]	train-auc:0.655248
[10]	train-auc:0.655015
[11]	train-auc:0.656357
[12]	train-auc:0.657418
[13]	train-auc:0.657121
[14]	train-auc:0.657152
[15]	train-auc:0.658859
[16]	train-auc:0.659382
[17]	train-auc:0.662484
[18]	train-auc:0.664326
[19]	train-auc:0.664309
[20]	train-auc:0.665625
[21]	train-auc:0.666368
[22]	train-auc:0.66675
[23]	train-auc:0.666515
[24]	train-auc:0.666906
[25]	train-auc:0.667971
[26]	train-auc:0.668208
[27]	train-auc:0.669743
[28]	train-auc:0.670695
[29]	train-auc:0.672612
[30]	train-auc:0.672692
[31]	train-auc:0.673213
[32]	train-auc:0.674004
[33]	train-auc:0.675088
[34]	train-auc:0.676075
[35]	train-auc:0.676554
[36]	train-auc:0.677614
[37]	train-auc:0.678682
[38]	train-auc:0.679251
[39]	train-auc:0.680

In [9]:
# With pred_leaf=True the booster returns leaf assignments instead of scores.
train_leaves = model.predict(xgb_train, pred_leaf=True)
test_leaves = model.predict(xgb_test, pred_leaf=True)
(train_rows, cols) = train_leaves.shape

print(train_leaves.shape)
print(test_leaves.shape)

# Only the DMatrix objects are released here: `train` and `test` were already
# deleted in the cell that built the DMatrix objects, so deleting them again
# would raise NameError on a fresh Run All.
del xgb_train, xgb_test
gc.collect()

(478035, 268)
(18371, 268)


21

In [10]:
# Load the saved sparse one-hot matrices for the train and test rows
# ('onehot_train' / 'onehot_test' are the names getOnehotTrain saves under).
one_hot_train = utils.load_sparse_csr('onehot_train.npz')
one_hot_test = utils.load_sparse_csr('onehot_test.npz')
print(one_hot_train.shape)
print(one_hot_test.shape)

(478035, 76)
(18371, 76)


In [11]:
# One-hot encode the leaf assignments. Train and test rows are stacked first
# so a single encoder sees every leaf value.
gbdtenc = OneHotEncoder()
trans = gbdtenc.fit_transform(
    np.concatenate([train_leaves, test_leaves])  # default axis=0: stack rows
)

In [12]:
# Split the encoded leaves back into train / test rows and append the loaded
# one-hot matrices column-wise.
trans_train = hstack((trans[:train_rows], one_hot_train))
trans_test = hstack((trans[train_rows:], one_hot_test))
gc.collect()

24

In [13]:
gc.collect()
lr = LogisticRegression(n_jobs=6,C=0.0004)
    
lr.fit(trans_train, train_label)
    
y_train_pred = lr.predict_proba(trans_train)[:,1]
train_log = log_loss(train_label,y_train_pred)
print(train_log)
print(trans[train_rows:, :].shape)

  " = {}.".format(self.n_jobs))


0.0808350610736
(18371, 13655)


In [14]:
# Score the test rows with the fitted LR (probability of the positive class)
# and write a space-separated file with instance_id and predicted_score.
# NOTE(review): PATH is not defined in any visible cell — confirm where it is set.
df_test['predicted_score']= lr.predict_proba(trans_test)[:,1]
df_test[['instance_id', 'predicted_score']].to_csv(PATH + 'xgboost_LR_baseline_1.txt',sep=" ",index=False)

In [15]:
# Save the leaf one-hot matrices (train rows, then test rows) via
# utils.save_sparse_csr.
utils.save_sparse_csr('xgboost_onehot_train', trans[:train_rows,:])
utils.save_sparse_csr('xgboost_onehot_test', trans[train_rows:,:])