In [1]:
import numpy as np
import pandas as pd
from wordbatch.models import FM_FTRL
from scipy.sparse import csr_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import log_loss
from scipy.sparse import hstack
from scipy.sparse import vstack
import utils
import gc
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

## 验证

In [2]:
onehot_train_non24 = utils.load_sparse_csr('onehot_train_non24.npz')
onehot_val_non24 = utils.load_sparse_csr('onehot_val_non24.npz')

In [3]:
xgboost_train_non24 = utils.load_sparse_csr('xgboost_onehot_train_non24.npz')
xgboost_val_non24 = utils.load_sparse_csr('xgboost_onehot_val_non24.npz')

In [4]:
PATH= '../DataSet/IJCAI-18 Alimama Sponsored Search Conversion Rate(CVR) Prediction Contest/'
df_train = pd.read_csv(PATH + "valTrain0403.csv")

In [5]:
cat_features = ['user_gender_id',
               'user_occupation_id',
               'context_id',
               'context_page_id',
               'item_category_list',
               'hour']

In [6]:
features= [ feature for feature in df_train.head(0) if feature not in ['instance_id','item_id','item_category_list','item_property_list',\
                                                         'item_brand_id','item_city_id','user_id',\
                                                         'context_id','context_timestamp','predict_category_property','shop_id',\
                                                         'time','day','index_x','index_y','item_category_list_isPreTrue',\
                                                         'item_brand_id_PurchaseRate','item_city_id_PurchaseRate','hour_PurchaseRate',\
                                                         'is_trade',\
                                                         'user_gender_id_user_id_cnt','user_gender_id_user_id_prob','user_age_level_user_id_cnt','user_age_level_user_id_prob','user_occupation_id_user_id_cnt','user_occupation_id_user_id_prob',\
                                                         'item_property_list_shop_id_cnt','item_property_list_shop_id_prob','item_property_list_shop_review_num_level_cnt','item_property_list_shop_review_num_level_prob','item_property_list_shop_star_level_cnt','item_property_list_shop_star_level_prob',\
                                                         'item_brand_id_item_id_cnt','item_brand_id_item_id_prob','item_city_id_item_id_cnt','item_city_id_item_id_prob','item_price_level_item_id_cnt','item_price_level_item_id_prob','item_sales_level_item_id_cnt','item_sales_level_item_id_prob',\
                                                         'item_collected_level_item_id_cnt','item_collected_level_item_id_prob','item_pv_level_item_id_cnt','item_pv_level_item_id_prob',\
                                                         'item_id_shop_id_cnt','item_id_shop_id_prob','item_id_shop_review_num_level_cnt','item_id_shop_review_num_level_prob','item_id_shop_star_level_cnt','item_id_shop_star_level_prob',\
                                                         'item_id_user_id_cnt','item_id_user_id_prob']\
                                        and not feature.endswith('0') and not feature.endswith('var')
                                        and feature not in cat_features]
target = 'is_trade'

In [7]:
X_train = df_train.loc[df_train.day < 24]  # 18,19,20,21,22,23,24
X_val = df_train.loc[df_train.day == 24]  # 暂时先使用第24天作为验证集

In [8]:
origin_train = X_train[features].values
origin_val = X_val[features].values
train_label = X_train[target].values.ravel()
val_label = X_val[target].values.ravel()

In [9]:
del X_train,X_val,df_train
gc.collect()

84

In [10]:
train = hstack([xgboost_train_non24,onehot_train_non24])
val = hstack([xgboost_val_non24,onehot_val_non24])

In [11]:
del xgboost_train_non24,onehot_train_non24
del xgboost_val_non24,onehot_val_non24
gc.collect()

0

In [12]:
print(train.shape)
print(val.shape)
print(train_label.shape)
print(val_label.shape)

(420629, 13651)
(57406, 13651)
(420629,)
(57406,)


### 合并原始特征

In [13]:
scaler = StandardScaler()

In [14]:
origin_train = scaler.fit_transform(origin_train)
origin_val = scaler.fit_transform(origin_val)
gc.collect()

0

In [15]:
train = hstack([train, origin_train])
val = hstack([val, origin_val])

del origin_train,origin_val
gc.collect()

0

### LR

In [None]:
lr = LogisticRegression(n_jobs=6,C=0.00045)
lr.fit(train, train_label)

  " = {}.".format(self.n_jobs))


In [None]:
y_train = lr.predict_proba(train)[:,1]
y_val = lr.predict_proba(val)[:,1]

In [None]:
print("train_logloss: "+ str(log_loss(train_label,y_train)))
print("val_logloss: "+ str(log_loss(val_label,y_val)))

print("train_auc: "+ str(roc_auc_score(train_label,y_train)))
print("val_auc: "+ str(roc_auc_score(val_label,y_val)))

### FM_FTRL

In [255]:
clf = FM_FTRL(
    alpha=0.01,    #w0和w的FTRL超参数alpha
    beta=0.01,     #w0和w的FTRL超参数beta
    L1=150,        #w0和w的L1正则
    L2=1200,          #w0和w的L2正则
    D=train.shape[1], 
    
    alpha_fm=0.01, #v的FTRL超参数alpha
    L2_fm=10, #v的L2正则
    
    init_fm=0.01,
    D_fm=2, 
    e_noise=0.0001, 
    iters=3, 
    inv_link="sigmoid", 
    threads=10,
)

In [None]:
clf.fit(train,train_label)

In [253]:
y_train = clf.predict(train)
y_val = clf.predict(val)

In [254]:
print("train_logloss: "+ str(log_loss(train_label,y_train)))
print("val_logloss: "+ str(log_loss(val_label,y_val)))

print("train_auc: "+ str(roc_auc_score(train_label,y_train)))
print("val_auc: "+ str(roc_auc_score(val_label,y_val)))

train_logloss: 0.0835237770446
val_logloss: 0.0809400224873
train_auc: 0.785569159225
val_auc: 0.700480514615
