In [25]:
import os
import zipfile
import time
import pickle
import gc

import pandas as pd
import numpy as np
from tqdm import tqdm

from utils import load_pickle, dump_pickle, get_feature_value, feature_spearmanr, feature_target_spearmanr, addCrossFeature, calibration
from utils import raw_data_path, feature_data_path, cache_pkl_path, analyse

In [103]:
all_data_path = feature_data_path + 'all_data_all_features_until_25.pkl'
all_data = load_pickle(all_data_path)

target = 'is_trade'

features = load_pickle(feature_data_path + 'features_0420.pkl')

# features = list(all_data.columns)
# features.remove('is_trade')
# features.remove('instance_id')

len(features)

273

# 原生接口

In [104]:
from sklearn.metrics import log_loss
from lightgbm import LGBMClassifier
import lightgbm as lgb


train_data = all_data[(all_data.day >= 19) & (all_data.day <= 23)]
test_data = all_data[all_data.day == 24]

lgb_train_data = lgb.Dataset(
    train_data[features], label=train_data[target], feature_name=features)
lgb_test_data = lgb_train_data.create_valid(
    test_data[features], label=test_data[target])


param = {'application': 'binary',
         'metric': 'binary_logloss',
         
         'num_iterations': 950,
         'learning_rate': 0.02,
         
         'max_depth': 5,
         'num_leaves': 18,
         
         'min_data_in_leaf': 100,
         'min_sum_hessian_in_leaf': 0.1,
         'min_gain_to_split': 0.1,
         
         'feature_fraction': 1.0,
         'bagging_fraction': 0.7,
         'bagging_freq': 1,
         
         'lambda_l2': 0.02,
         'max_bin': 63,
         
#          'early_stopping_round': 200,
         
         'device': 'gpu',
         'gpu_use_dp': False,
         }

# param = {'application': 'binary',
#          'metric': 'binary_logloss',
         
#          'num_iterations': 1100,
#          'learning_rate': 0.02,
         
#          'max_depth': 4,
#          'num_leaves': 20,
#          'min_data_in_leaf': 200,
         
#          'feature_fraction': 1.0,
#          'bagging_fraction': 0.7,
#          'bagging_freq': 1,
         
#          'lambda_l2': 10,
#          'max_bin': 63,
         
#          'early_stopping_round': 100,
         
#          'device': 'gpu',
#          'gpu_use_dp': False,
#          }

valid_sets = [lgb_train_data, lgb_test_data]

bst = lgb.train(param, lgb_train_data, valid_sets=valid_sets, verbose_eval=20,)

loss_train = log_loss(train_data[target], bst.predict(train_data[features]))
loss_test = log_loss(test_data[target], bst.predict(test_data[features]))

loss_train, loss_test



[20]	training's binary_logloss: 0.428191	valid_1's binary_logloss: 0.427083
[40]	training's binary_logloss: 0.288081	valid_1's binary_logloss: 0.286078
[60]	training's binary_logloss: 0.207227	valid_1's binary_logloss: 0.20444
[80]	training's binary_logloss: 0.158799	valid_1's binary_logloss: 0.155336
[100]	training's binary_logloss: 0.129386	valid_1's binary_logloss: 0.125328
[120]	training's binary_logloss: 0.111507	valid_1's binary_logloss: 0.10698
[140]	training's binary_logloss: 0.100712	valid_1's binary_logloss: 0.0957904
[160]	training's binary_logloss: 0.0942328	valid_1's binary_logloss: 0.088998
[180]	training's binary_logloss: 0.0903608	valid_1's binary_logloss: 0.0849333
[200]	training's binary_logloss: 0.088027	valid_1's binary_logloss: 0.0825407
[220]	training's binary_logloss: 0.0865708	valid_1's binary_logloss: 0.0810731
[240]	training's binary_logloss: 0.0856173	valid_1's binary_logloss: 0.080178
[260]	training's binary_logloss: 0.0849153	valid_1's binary_logloss: 0.079

(0.076035256431205928, 0.077890141922122957)

In [99]:
test_data['predicted_score'] = bst.predict(test_data[features])

test_data[['instance_id', 'predicted_score']].to_csv(
    'combination/day23—features-273-depth-5-with_25.txt', index=False, sep=' ')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [108]:
online_best_predict = pd.read_csv("combination/online_best_8438.txt", sep=' ')['predicted_score']

online_8442_predict = pd.read_csv("combination/online-8442-features-273-with_25.txt", sep=' ')['predicted_score']

depth5_predict = pd.read_csv("combination/features-273-depth-5-with_25.txt", sep=' ')['predicted_score']

qwc_with_25 = pd.read_csv("combination/qwc_with_25.txt", sep=' ')['predicted_score']


test_predict = bst.predict(test_data[features])

total_predict = online_best_predict * 0.4 + depth5_predict * 0.6
loss_test = log_loss(test_data[target], total_predict)
print(loss_test)

# a = 100
# b = None

# for i in range(11):
#     for j in range(i+1):
        
#         w1 = 0.1 * (i - j)
#         w2 = 0.1 * i - w1
#         w3 = 1.0 - w1 - w2
# #         total_predict = test_predict * w1 + online_best_predict * w2 + depth5_predict * w3
#         total_predict = depth5_predict * w1 + online_best_predict * w2
#         loss_test = log_loss(test_data[target], total_predict)
        
#         if loss_test < a:
#             a = loss_test
#             b = (w1,w2)
        
# print(a)
# print(b)

0.0777953384278


In [24]:
importance = pd.DataFrame(bst.feature_importance())
importance.columns = ['importance_20']
features = pd.DataFrame(features)
features.columns = ['features_20']
pd.set_option('max_rows',500)
merge = features.join(importance)
merge = merge.sort_values(by=['importance_20'], ascending=False).reset_index()
merge

Unnamed: 0,index,features_20,importance_20
0,92,time_gap_after_total,377
1,160,item_id_smooth_CTR,293
2,88,time_gap_after,266
3,20,shop_score_delivery,254
4,169,shop_id_smooth_CTR,240
5,178,user_age_level_item_price_level_smooth_CTR,228
6,263,user_age_level_hour_smooth_CTR,166
7,12,context_id,162
8,262,user_gender_id_hour_smooth_CTR,158
9,11,user_star_level,149


In [28]:
from sklearn.metrics import log_loss
from lightgbm import LGBMClassifier
import lightgbm as lgb

train_data = all_data[(all_data.day >= 19) & (all_data.day <= 23)]
test_data = all_data[all_data.day == 24]

lgb_clf = lgb.LGBMClassifier(objective='binary',

                             n_estimators=2000,
                             learning_rate=0.02,

                             max_depth=4,
                             num_leaves=30,
                             min_child_samples=100,
                             min_child_weight=1e-3,

                             colsample_bytree=1.0,
                             subsample=0.7,
                             subsample_freq=1,

                             reg_lambda=15,
                             min_split_gain=0.,
                             
                             max_bin=63,

                             n_jobs=-1,
                             silent=False,
                             
                             device='gpu',
                             gpu_use_dp=True,
                             )


lgb_clf.fit(train_data[features], train_data[target],
            eval_set=[(test_data[features], test_data[target])],
            early_stopping_rounds=100,
            verbose=50,
            feature_name=features,
            )

loss_train = log_loss(train_data[target],
                      lgb_clf.predict_proba(train_data[features]))
loss_test = log_loss(
    test_data[target], lgb_clf.predict_proba(test_data[features]))

loss_train, loss_test

Training until validation scores don't improve for 100 rounds.
[50]	valid_0's binary_logloss: 0.241168
[100]	valid_0's binary_logloss: 0.12659
[150]	valid_0's binary_logloss: 0.0931066
[200]	valid_0's binary_logloss: 0.0836024
[250]	valid_0's binary_logloss: 0.0807791
[300]	valid_0's binary_logloss: 0.0798235
[350]	valid_0's binary_logloss: 0.0793766
[400]	valid_0's binary_logloss: 0.079122
[450]	valid_0's binary_logloss: 0.078976
[500]	valid_0's binary_logloss: 0.0788777
[550]	valid_0's binary_logloss: 0.0788011
[600]	valid_0's binary_logloss: 0.0787411
[650]	valid_0's binary_logloss: 0.0787169
[700]	valid_0's binary_logloss: 0.0786801
[750]	valid_0's binary_logloss: 0.078643
[800]	valid_0's binary_logloss: 0.0786134
[850]	valid_0's binary_logloss: 0.0786117
[900]	valid_0's binary_logloss: 0.0785871
[950]	valid_0's binary_logloss: 0.0785803
[1000]	valid_0's binary_logloss: 0.0785894
Early stopping, best iteration is:
[932]	valid_0's binary_logloss: 0.0785674


(0.080670807666239061, 0.078567408643239092)