In [74]:
import os
import zipfile
import time
import pickle
import gc

import pandas as pd
import numpy as np
from tqdm import tqdm

from utils import load_pickle, dump_pickle, get_feature_value, feature_spearmanr, feature_target_spearmanr, addCrossFeature, calibration
from utils import raw_data_path, feature_data_path, cache_pkl_path, analyse

In [75]:
all_data_path = feature_data_path + 'all_data_all_features.pkl'
all_data = load_pickle(all_data_path)

train_data = all_data[(all_data.day >= 19) & (all_data.day <= 24)]

# train_cv = all_data[(all_data.day >= 19) & (all_data.day <= 24)]
# train_cv = train_cv.sort_values(by=['context_timestamp'])
target = 'is_trade'

In [76]:
features = load_pickle(feature_data_path + 'important_features.pkl')
nominal_feats = ['hour',
                 'item_sales_level', 'item_price_level', 
                 'user_star_level', 'user_age_level', 'user_gender_id', 'user_occupation_id', 
                 'context_page_id', 
                 'category2_label', 'category_predict_rank',
                 'user_item_city_id_click_rank',
                 'user_item_id_click_rank',
                 'user_item_brand_id_click_rank',
                 'user_item_city_id_click_rank',
                 'user_shop_id_click_rank',
                 'user_context_page_id_click_rank',
                 'user_category2_label_click_rank',
                 'user_item_sales_level_click_rank',
                 'user_item_price_level_click_rank',
                ]

features = list(set(features + nominal_feats))

len(features)

164

In [71]:
from sklearn.metrics import log_loss
import lightgbm as lgb

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import TimeSeriesSplit

train_data = train_data.reset_index()
train_data_index = train_data[(train_data.day >= 19) & (train_data.day <= 23)].index
val_data_index = train_data[train_data.day == 24].index

lgb_clf = lgb.LGBMClassifier(objective='binary', n_jobs=-1, silent=False)

# 参数的组合
lgb_param_grad = {'n_estimators': (100,), 'learning_rate': (0.05,), 'max_depth': (
    3, ), 'num_leaves': (31, ), 'subsample': (0.8,), 'colsample_bytree': (0.8,), 'subsample_freq': (5,)}

clf = GridSearchCV(lgb_clf, param_grid=lgb_param_grad, scoring='neg_log_loss',
                   cv=((train_data_index, val_data_index),), n_jobs=-1, verbose=1, return_train_score=False)


fit_params = {'feature_name': features, 'categorical_feature': nominal_feats}

clf.fit(train_data[features], train_data[target], feature_name=features, categorical_feature=nominal_feats)


print('=====')
print("Best parameters set found on development set:")
print(clf.best_params_)

print('=====')
print("Best parameters set found on development set:")
print(clf.best_score_)

pd.DataFrame(data=clf.cv_results_)

Fitting 1 folds for each of 1 candidates, totalling 1 fits


[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    7.7s finished


=====
Best parameters set found on development set:
{'colsample_bytree': 0.8, 'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 100, 'num_leaves': 31, 'subsample': 0.8, 'subsample_freq': 5}
=====
Best parameters set found on development set:
-0.0809896066363


Unnamed: 0,mean_fit_time,mean_score_time,mean_test_score,param_colsample_bytree,param_learning_rate,param_max_depth,param_n_estimators,param_num_leaves,param_subsample,param_subsample_freq,params,rank_test_score,split0_test_score,std_fit_time,std_score_time,std_test_score
0,5.9185,0.388915,-0.08099,0.8,0.05,3,100,31,0.8,5,"{'colsample_bytree': 0.8, 'learning_rate': 0.0...",1,-0.08099,0.0,0.0,0.0


In [73]:
train_data = all_data[(all_data.day >= 19) & (all_data.day <= 23)]
test_data = all_data[all_data.day == 24]

loss_train = log_loss(train_data[target], clf.predict_proba(train_data[features]))
loss_test = log_loss(test_data[target], clf.predict_proba(test_data[features]))

loss_train, loss_test

(0.087086087998509573, 0.080475598487123934)