In [1]:
import pandas as pd
import numpy as np
import gc

import os
import sys
sys.path.append("../")


from sklearn.metrics import log_loss
from wordbatch.models import FM_FTRL
from sklearn.metrics import roc_curve, auc, roc_auc_score


import Tool.utils as utils
import Tool.config as config
from Feature import one_hot_feature

from scipy.sparse import hstack
from scipy.sparse import vstack
from scipy.sparse import csc_matrix
from scipy.sparse import coo_matrix

In [2]:
# Load the pickled table that the next cell splits into labelled and
# unlabelled rows via its `is_trade` column.
# NOTE(review): unpickling can execute code embedded in the file, so this
# path should only ever point at data produced by this project.
data = pd.read_pickle(config.data_prefix_path + 'data.pkl')

In [3]:
# Rows that carry an `is_trade` label form the training set; rows whose
# label is missing are the ones to be predicted.
train = data.loc[data['is_trade'].notnull()]
test = data.loc[data['is_trade'].isnull()]

# The combined frame is no longer needed once both halves exist.
del data
gc.collect()

0

## FM_FTRL线下验证

In [4]:
# Offline validation split: rows with day < 24 are used for fitting and
# day 24 is held out for evaluation.
X_train = train[train.day < 24]
X_val = train[train.day == 24]

# `train` and `test` are deliberately NOT deleted here: the submission and
# OOF sections further down still read both names, so dropping them made the
# notebook fail with NameError when executed top to bottom.
gc.collect()

0

In [5]:
# Sparse one-hot matrices for the offline ('val') split.
train_X, val_X = one_hot_feature.get_one_hot_data('val')

# Labels as flat arrays.
# NOTE(review): assumes the matrix rows are in the same order as the rows of
# X_train / X_val - confirm against one_hot_feature.
train_Y = X_train.is_trade.values.ravel()
val_Y = X_val.is_trade.values.ravel()

# Only the matrices and label arrays are needed from here on.
del X_train, X_val
gc.collect()

获取类别特征one_hot线下验证数据
训练集长度: 420627
验证集长度：57405
获取xgboost特征one_hot线下验证数据
训练集长度: 420627
测试集长度：57405


31

In [57]:
# FM_FTRL model for the offline validation run.
clf = FM_FTRL(
    # --- parameters for w0 and w ---
    D=train_X.shape[1],   # set to the number of feature columns
    alpha=0.005,          # FTRL hyperparameter alpha for w0 and w
    beta=0.01,            # FTRL hyperparameter beta for w0 and w
    L1=130,               # L1 regularisation for w0 and w
    L2=1200,              # L2 regularisation for w0 and w

    # --- parameters carrying the `_fm` suffix (v) ---
    D_fm=2,
    alpha_fm=0.005,       # FTRL hyperparameter alpha for v
    L2_fm=10,             # L2 regularisation for v
    init_fm=0.01,

    # --- remaining settings ---
    iters=3,
    e_noise=0.0001,
    inv_link="sigmoid",
    threads=16,
)

In [58]:
# Fit on the training rows, then score the training and held-out matrices
# (in that order) so both metrics can be compared in the next cell.
clf.fit(train_X, train_Y)
y_train, y_val = clf.predict(train_X), clf.predict(val_X)

In [59]:
# Log-loss on the training rows and on the held-out rows.
print("train_logloss: %s" % log_loss(train_Y, y_train))
print("val_logloss: %s" % log_loss(val_Y, y_val))

# ROC AUC for the same two sets of predictions.
print("train_auc: %s" % roc_auc_score(train_Y, y_train))
print("val_auc: %s" % roc_auc_score(val_Y, y_val))

train_logloss: 0.0831814288019
val_logloss: 0.0800659975731
train_auc: 0.788745386694
val_auc: 0.71648677306


## FM_FTRL线上提交

In [4]:
# Sparse one-hot matrices for the submission ('train') mode: all labelled
# rows for fitting and the unlabelled rows for prediction.
train_X,test_X = one_hot_feature.get_one_hot_data('train')

# NOTE(review): assumes the rows of train_X follow the row order of `train`
# - confirm against one_hot_feature.
train_Y = train['is_trade'].values.ravel()

# `train` is intentionally kept: the OOF section below still reads its
# labels and writes its out-of-fold column, so deleting it here broke a
# top-to-bottom run with NameError.
gc.collect()

获取类别特征one_hot线上提交数据
训练集长度: 478032
测试集长度：42888
获取xgboost特征one_hot线上提交数据
训练集长度: 478032
测试集长度：42888


24

In [5]:
# FM_FTRL model for the submission run.
clf = FM_FTRL(
    # --- parameters for w0 and w ---
    D=train_X.shape[1],   # set to the number of feature columns
    alpha=0.005,          # FTRL hyperparameter alpha for w0 and w
    beta=0.01,            # FTRL hyperparameter beta for w0 and w
    L1=130,               # L1 regularisation for w0 and w
    L2=1200,              # L2 regularisation for w0 and w

    # --- parameters carrying the `_fm` suffix (v) ---
    D_fm=2,
    alpha_fm=0.005,       # FTRL hyperparameter alpha for v
    L2_fm=10,             # L2 regularisation for v
    init_fm=0.01,

    # --- remaining settings ---
    iters=3,
    e_noise=0.0001,
    inv_link="sigmoid",
    threads=16,
)

In [6]:
# Fit the submission model on every labelled row.
clf.fit(train_X,train_Y)

In [7]:
# Score the unlabelled rows and store the result as a new column on `test`.
# NOTE(review): assumes test_X rows are in the same order as `test` - confirm.
test['predicted_score'] = clf.predict(test_X)

In [8]:
# Write the submission file: space-separated id / score pairs, header kept,
# DataFrame index omitted.
test[['instance_id', 'predicted_score']].to_csv(
    config.data_prefix_path + 'sub0421_xgboost_FM_FTRL.txt',
    index=False,
    sep=" ",
)

## OOF

In [4]:
# 读取one_hot特征
# Load the one-hot features for the offline ('val') split.
train_X,val_X = one_hot_feature.get_one_hot_data('val')

# Stack both parts back into one matrix covering every labelled row.
# CSR is requested directly from vstack: the fold loop indexes rows
# (train_X[idx]), which needs a row-indexable format. The previous
# `coo_matrix.tocsr(vstack(...))` called the COO method unbound and therefore
# only worked when vstack happened to return a COO matrix.
train_X = vstack([train_X, val_X], format='csr')

# NOTE(review): the stacked rows are ordered "fitting part first, held-out
# part last"; this assumes `train` is stored in that same order - confirm,
# otherwise labels and feature rows are misaligned.
train_Y = train['is_trade'].values.ravel()

del val_X,test
gc.collect()

获取类别特征one_hot线下验证数据
训练集长度: 420627
验证集长度：57405
获取xgboost特征one_hot线下验证数据
训练集长度: 420627
测试集长度：57405


24

In [5]:
from sklearn.model_selection import KFold

# Four shuffled folds with a fixed seed so the split is repeatable.
folds = KFold(
    n_splits=4,
    shuffle=True,
    random_state=2018,
)

# Buffer for the out-of-fold predictions, one slot per stacked row.
pred = np.zeros(train_X.shape[0], dtype=float)

In [6]:
# FM_FTRL model for the out-of-fold run.
clf = FM_FTRL(
    # --- parameters for w0 and w ---
    D=train_X.shape[1],   # set to the number of feature columns
    alpha=0.005,          # FTRL hyperparameter alpha for w0 and w
    beta=0.01,            # FTRL hyperparameter beta for w0 and w
    L1=130,               # L1 regularisation for w0 and w
    L2=1200,              # L2 regularisation for w0 and w

    # --- parameters carrying the `_fm` suffix (v) ---
    D_fm=2,
    alpha_fm=0.005,       # FTRL hyperparameter alpha for v
    L2_fm=10,             # L2 regularisation for v
    init_fm=0.01,

    # --- remaining settings ---
    iters=3,
    e_noise=0.0001,
    inv_link="sigmoid",
    threads=16,
)

In [8]:
def _new_fm_ftrl(n_features):
    """Build an untrained FM_FTRL with the hyperparameters used in this notebook.

    n_features: number of feature columns, passed to the model as `D`.
    Returns a new FM_FTRL instance that has not been fitted yet.
    """
    return FM_FTRL(
        alpha=0.005,       # FTRL hyperparameter alpha for w0 and w
        beta=0.01,         # FTRL hyperparameter beta for w0 and w
        L1=130,            # L1 regularisation for w0 and w
        L2=1200,           # L2 regularisation for w0 and w
        D=n_features,

        alpha_fm=0.005,    # FTRL hyperparameter alpha for v
        L2_fm=10,          # L2 regularisation for v

        init_fm=0.01,
        D_fm=2,
        e_noise=0.0001,
        iters=3,
        inv_link="sigmoid",
        threads=16,
    )


for n_fold,(trn_idx,val_idx) in enumerate(folds.split(train_X,train_Y)):

    # A fresh model is built for every fold. Re-fitting one shared instance
    # is only safe if fit() discards all previous weights; if it does not
    # (NOTE(review): depends on the installed wordbatch version - confirm),
    # later folds would be scored by a model that already trained on their
    # rows, which makes the out-of-fold predictions leak the label.
    clf = _new_fm_ftrl(train_X.shape[1])
    clf.fit(train_X[trn_idx], train_Y[trn_idx])

    # Predictions for the held-out rows go into their own slots only.
    pred[val_idx] = clf.predict(train_X[val_idx])
    loss = log_loss(train_Y[val_idx],pred[val_idx])

    print("\t Fold %d : %.6f" % (n_fold+1, loss))

# Attach the out-of-fold predictions to the labelled rows.
train['is_trade'+'_oof'] = pred

	 Fold 1 : 0.080835
	 Fold 2 : 0.079849
	 Fold 3 : 0.077058
	 Fold 4 : 0.075606


In [9]:
# 保存结果,用作stacking
# Persist id, true label and out-of-fold prediction as input for stacking.
train[['instance_id', 'is_trade', 'is_trade'+'_oof']].to_csv(
    config.data_prefix_path + 'xgboost_FM_FTRL_oof.txt',
    index=False,
    sep=" ",
)