In [1]:
import time
import pandas as pd
import numpy as np
import lightgbm as lgb
import random
import config
import pickle
import re
import os
import gc
import hyperopt
import scipy.special as special

from math import log
from numba import jit
from scipy.sparse import csr_matrix, hstack
from sklearn.metrics import log_loss,roc_curve,auc
from sklearn.preprocessing import LabelEncoder,OneHotEncoder,StandardScaler
from sklearn.feature_extraction.text import CountVectorizer
from itertools import chain, combinations
from sklearn.model_selection import train_test_split

from hyperopt import fmin, tpe, hp,space_eval,rand,Trials,partial,STATUS_OK

In [2]:
def ks_metric(true,score):
    fpr, tpr, thresholds = roc_curve(true,score)
    ks = max(tpr-fpr)
    auc_ = auc(fpr, tpr)
    return ks,auc_

def score_change(score,base_rate,real_rate):
    base_change = np.log(base_rate/(1-base_rate)) - np.log(real_rate/(1-real_rate))
    score_adj = np.exp(np.log(score/(1-score)) - base_change)/(np.exp(np.log(score/(1-score)) - base_change)+1)
    return score_adj

def memory_saving(df,del_var=[]):
    for var in df:
        if var in del_var:
            del df[var]
            continue
        if df[var].dtypes == float:
            df[var] = pd.to_numeric(df[var],downcast='float')
        else:
            df[var] = pd.to_numeric(df[var],downcast='signed')
    return df

def map_col(df,drop=False):
    map_dict = {
        'item_price_level':[4,5,6,7,8,9],
        'item_sales_level':[4,6,9,10,11,12,13,14,16],
        'item_pv_level':[6,9,10,11,12,13,14,15,16,17,18,19,20],
        'user_age_level':[1001,1002,1003,1004,1005],
        'context_page_id':[4001,4002,4004,4006,4008,4010,4013,4016,4018],
        'shop_review_num_level':[5,9,14,15,16,17,18,20,21],
        #'hour':[6,9,12,17,20],
        'user_occupation_id':{-1:2003},
        'user_star_level':{-1:3000}
    }
    for key,value in map_dict.items():
        if isinstance(value,list):
            df[key+'_mapped'] = 0
            for i in range(len(value)):
                df.loc[df[key]>value[i],key+'_mapped'] = i+1
        else:
            '''df[key+'_mapped'] = df[key]
            for key_sub,value_sub in value.items():
                df.loc[df[key]==key_sub,key+'_mapped'] = value_sub'''
            df[key+'_mapped'] = df[key].apply(lambda x:value.get(x,x))
        if drop:
            df[key] = df[key+'_mapped']
            del df[key+'_mapped']
    gc.collect()
    return df

In [None]:
featureDtypes = {'cnt_rec': 'int8',
 'context_id': 'int64',
 'context_page_id': 'int16',
 'context_timestamp': 'int32',
 'day': 'int8',
 'hour': 'int8',
 'instance_id': 'int64',
 'is_trade': 'float32',
 'item_brand_id': 'int16',
 'item_category_list_bin1': 'int8',
 'item_category_list_bin2': 'int8',
 'item_city_id': 'int16',
 'item_collected_level': 'int8',
 'item_id': 'int32',
 'item_price_level': 'int8',
 'item_pv_level': 'int8',
 'item_sales_level': 'int8',
 'len_item_property_list': 'int8',
 'len_predict_category_property': 'int8',
 'min': 'int8',
 'shop_id': 'int16',
 'shop_review_num_level': 'int8',
 'shop_review_positive_rate': 'float32',
 'shop_score_delivery': 'float32',
 'shop_score_description': 'float32',
 'shop_score_service': 'float32',
 'shop_star_level': 'int16',
 'user_age_level': 'int16',
 'user_gender_id': 'int8',
 'user_id': 'int32',
 'user_occupation_id': 'int16',
 'user_star_level': 'int16'}

dfAll = pd.read_table(config.FEATURE_SET,sep=' ',dtype=featureDtypes)
dfAll = map_col(dfAll,True); gc.collect()

In [None]:
dataRootDir = '../../Data/advertisment/Cache/'
dataAdded = [
    #'ratio_rank',
    #'ratio_rank_preday',
    #'smooth',
    #'offline_v2',
    #'cross_plus',
    #'trick_userid',
    #'text_base',
    'single_ratio',
    'significant_pro_cate',
    #'text_model_score_train',
    #'text_model_score',
]

feat_del = []
with open(dataRootDir+'del_features_v2.pkl','rb') as f:
    feat_del = pickle.load(f)

for add in dataAdded:
    tmpDf = pd.read_csv(dataRootDir + add +'.csv')
    tmpDf = memory_saving(tmpDf,feat_del)
    dfAll = pd.concat([dfAll,tmpDf],axis=1)
    del tmpDf
    gc.collect()
    print('%s is loaded, shape is %d'%(add,dfAll.shape[1]))

In [None]:
features = [i for i in dfAll.columns.tolist() if not i in config.IGNORE_COLS+['min']+feat_del]
train_idx = dfAll.loc[(dfAll['hour']<10)&(dfAll['hour']>0)].index
valid_idx = dfAll.loc[(dfAll['hour']<12)&(dfAll['hour']>9)].index
Xi_train_, y_train_ = dfAll.loc[list(train_idx),features],dfAll.loc[train_idx,'is_trade']
Xi_valid_, y_valid_ = dfAll.loc[list(valid_idx),features],dfAll.loc[valid_idx,'is_trade']
Xi_test_ = dfAll.loc[(dfAll['hour']>=12),features]
del dfAll
del train_idx
del valid_idx

In [3]:
dataRootDir = '../../Data/advertisment/Cache/'
Xi_finnal_ = pd.read_csv(dataRootDir + 'train_set.csv')
Xi_finnal_ = memory_saving(Xi_finnal_)

In [4]:
y_finnal_ = Xi_finnal_['is_trade'].values
del Xi_finnal_['is_trade']
Xi_train_,Xi_valid_,y_train_,y_valid_ = train_test_split(Xi_finnal_, y_finnal_, test_size=0.18, random_state=42)

In [5]:
features = Xi_train_.columns.tolist()

In [15]:
def lgbtune(argsDict):
    leaf = argsDict['leaf']*5 + 5
    learning_rate = argsDict["learning_rate"] * 0.02 + 0.05
    subsample = argsDict["subsample"] * 0.1 + 0.7
    colsample_bytree = argsDict["colsample_bytree"] * 0.1 + 0.7
    #scale_pos_weight = argsDict["scale_pos_weight"] + 1
    max_bin = argsDict["max_bin"] * 5 + 10
    
    scale_pos_weight=1.5
    
    print(argsDict)
    
    '''print('leaf is %f'%leaf)
    print('learning_rate is %f'%learning_rate)
    print('subsample is %f'%subsample)
    print('colsample_bytree is %f'%colsample_bytree)
    print('scale_pos_weight is %d'%scale_pos_weight)
    print('max_bin is %d'%max_bin)'''
    
    clf = lgb.LGBMClassifier(
        num_leaves=leaf, 
        n_estimators=20000,
        n_jobs=20,
        learning_rate=learning_rate,
        colsample_bytree=colsample_bytree,
        subsample=subsample,
        max_bin=max_bin,
        scale_pos_weight = scale_pos_weight
    )
    clf.fit(Xi_train_[features], y_train_, eval_set=[(Xi_valid_[features], y_valid_)],feature_name = features,early_stopping_rounds=100,verbose =200)
    
    y_score_ = clf.predict_proba(Xi_valid_[features],num_iteration=clf.best_iteration_)[:, 1]
    ks,auc = ks_metric(y_valid_, y_score_)
    print(auc)
    return -1*auc

In [16]:
space = {"leaf":hp.randint("leaf",25),
         "learning_rate":hp.randint("learning_rate",16),
         "subsample":hp.randint("subsample",4),
         "colsample_bytree":hp.randint("colsample_bytree",4),
         #"min_child_weight":hp.randint("min_child_weight",5),
         #"scale_pos_weight":hp.randint("scale_pos_weight",25),
         "max_bin":hp.randint("max_bin",10)
        }

In [17]:
trials = Trials()

algo = partial(tpe.suggest,n_startup_jobs=1)
best = fmin(lgbtune,space,algo=tpe.suggest,max_evals=50,trials=trials)
print(best)

{'colsample_bytree': 3, 'leaf': 0, 'learning_rate': 4, 'max_bin': 0, 'subsample': 2}
Training until validation scores don't improve for 100 rounds.
[200]	valid_0's binary_logloss: 0.171984
[400]	valid_0's binary_logloss: 0.171613
[600]	valid_0's binary_logloss: 0.171427
[800]	valid_0's binary_logloss: 0.171346
[1000]	valid_0's binary_logloss: 0.171306
Early stopping, best iteration is:
[914]	valid_0's binary_logloss: 0.171249
0.736658212713
{'colsample_bytree': 3, 'leaf': 19, 'learning_rate': 7, 'max_bin': 3, 'subsample': 1}
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[34]	valid_0's binary_logloss: 0.173048
0.727721555524
{'colsample_bytree': 1, 'leaf': 7, 'learning_rate': 15, 'max_bin': 0, 'subsample': 2}
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[16]	valid_0's binary_logloss: 0.173535
0.726580698267
{'colsample_bytree': 1, 'leaf': 15, 'learning_rate': 1, 'max_bin': 9, 'subsam

KeyboardInterrupt: 

In [9]:
print(best)

{'colsample_bytree': 1, 'leaf': 0, 'learning_rate': 0, 'max_bin': 0, 'subsample': 3}


In [11]:
trials

<hyperopt.base.Trials at 0x243261bab00>

In [10]:
best = {'colsample_bytree': 1, 'leaf': 0, 'learning_rate': 0, 'max_bin': 0, 'subsample': 3}

In [11]:
def lgbiter(argsDict):
    leaf = argsDict['leaf']*5 + 5
    #leaf = argsDict['max_depth']*5 + 5
    learning_rate = argsDict["learning_rate"] * 0.02 + 0.05
    subsample = argsDict["subsample"] * 0.1 + 0.7
    colsample_bytree = argsDict["colsample_bytree"] * 0.1 + 0.7
    #scale_pos_weight = argsDict["scale_pos_weight"] + 1
    max_bin = argsDict["max_bin"] * 5 + 10
    
    clf = lgb.LGBMClassifier(
        num_leaves=leaf, 
        n_estimators=20000,
        n_jobs=20,
        learning_rate=learning_rate,
        colsample_bytree=colsample_bytree,
        subsample=subsample,
        max_bin=max_bin,
        #scale_pos_weight = scale_pos_weight
    )
    clf.fit(Xi_train_[features], y_train_, eval_set=[(Xi_valid_[features], y_valid_)],feature_name = features,
            categorical_feature=[],early_stopping_rounds=100)
    bstIter = clf.best_iteration_
    return bstIter    

In [12]:
bstIter = lgbiter(best)

Xi_test_ = pd.read_csv(dataRootDir + 'test_set.csv')
Xi_test_ = memory_saving(Xi_test_)



[1]	valid_0's binary_logloss: 0.652655
Training until validation scores don't improve for 100 rounds.
[2]	valid_0's binary_logloss: 0.616019
[3]	valid_0's binary_logloss: 0.582742
[4]	valid_0's binary_logloss: 0.552425
[5]	valid_0's binary_logloss: 0.524707
[6]	valid_0's binary_logloss: 0.499297
[7]	valid_0's binary_logloss: 0.475965
[8]	valid_0's binary_logloss: 0.454486
[9]	valid_0's binary_logloss: 0.434684
[10]	valid_0's binary_logloss: 0.416396
[11]	valid_0's binary_logloss: 0.399481
[12]	valid_0's binary_logloss: 0.383816
[13]	valid_0's binary_logloss: 0.369299
[14]	valid_0's binary_logloss: 0.355822
[15]	valid_0's binary_logloss: 0.343315
[16]	valid_0's binary_logloss: 0.331684
[17]	valid_0's binary_logloss: 0.320861
[18]	valid_0's binary_logloss: 0.310787
[19]	valid_0's binary_logloss: 0.301412
[20]	valid_0's binary_logloss: 0.292677
[21]	valid_0's binary_logloss: 0.284545
[22]	valid_0's binary_logloss: 0.276958
[23]	valid_0's binary_logloss: 0.269878
[24]	valid_0's binary_logl

[203]	valid_0's binary_logloss: 0.168855
[204]	valid_0's binary_logloss: 0.16885
[205]	valid_0's binary_logloss: 0.168841
[206]	valid_0's binary_logloss: 0.168836
[207]	valid_0's binary_logloss: 0.168826
[208]	valid_0's binary_logloss: 0.168818
[209]	valid_0's binary_logloss: 0.168816
[210]	valid_0's binary_logloss: 0.168809
[211]	valid_0's binary_logloss: 0.168806
[212]	valid_0's binary_logloss: 0.168798
[213]	valid_0's binary_logloss: 0.168792
[214]	valid_0's binary_logloss: 0.168787
[215]	valid_0's binary_logloss: 0.168782
[216]	valid_0's binary_logloss: 0.168775
[217]	valid_0's binary_logloss: 0.16877
[218]	valid_0's binary_logloss: 0.168762
[219]	valid_0's binary_logloss: 0.168756
[220]	valid_0's binary_logloss: 0.168748
[221]	valid_0's binary_logloss: 0.168742
[222]	valid_0's binary_logloss: 0.168739
[223]	valid_0's binary_logloss: 0.168733
[224]	valid_0's binary_logloss: 0.168727
[225]	valid_0's binary_logloss: 0.168722
[226]	valid_0's binary_logloss: 0.168718
[227]	valid_0's bi

[404]	valid_0's binary_logloss: 0.168196
[405]	valid_0's binary_logloss: 0.168193
[406]	valid_0's binary_logloss: 0.168189
[407]	valid_0's binary_logloss: 0.168186
[408]	valid_0's binary_logloss: 0.168187
[409]	valid_0's binary_logloss: 0.168184
[410]	valid_0's binary_logloss: 0.168182
[411]	valid_0's binary_logloss: 0.168183
[412]	valid_0's binary_logloss: 0.168181
[413]	valid_0's binary_logloss: 0.16818
[414]	valid_0's binary_logloss: 0.168177
[415]	valid_0's binary_logloss: 0.168176
[416]	valid_0's binary_logloss: 0.168169
[417]	valid_0's binary_logloss: 0.168169
[418]	valid_0's binary_logloss: 0.16817
[419]	valid_0's binary_logloss: 0.168168
[420]	valid_0's binary_logloss: 0.168162
[421]	valid_0's binary_logloss: 0.168158
[422]	valid_0's binary_logloss: 0.168156
[423]	valid_0's binary_logloss: 0.168154
[424]	valid_0's binary_logloss: 0.168153
[425]	valid_0's binary_logloss: 0.168152
[426]	valid_0's binary_logloss: 0.16815
[427]	valid_0's binary_logloss: 0.168151
[428]	valid_0's bin

[605]	valid_0's binary_logloss: 0.167942
[606]	valid_0's binary_logloss: 0.167943
[607]	valid_0's binary_logloss: 0.167939
[608]	valid_0's binary_logloss: 0.167937
[609]	valid_0's binary_logloss: 0.167934
[610]	valid_0's binary_logloss: 0.167931
[611]	valid_0's binary_logloss: 0.167931
[612]	valid_0's binary_logloss: 0.16793
[613]	valid_0's binary_logloss: 0.167929
[614]	valid_0's binary_logloss: 0.16793
[615]	valid_0's binary_logloss: 0.167928
[616]	valid_0's binary_logloss: 0.167928
[617]	valid_0's binary_logloss: 0.167924
[618]	valid_0's binary_logloss: 0.167922
[619]	valid_0's binary_logloss: 0.167919
[620]	valid_0's binary_logloss: 0.167917
[621]	valid_0's binary_logloss: 0.167918
[622]	valid_0's binary_logloss: 0.167915
[623]	valid_0's binary_logloss: 0.167913
[624]	valid_0's binary_logloss: 0.167912
[625]	valid_0's binary_logloss: 0.167911
[626]	valid_0's binary_logloss: 0.167909
[627]	valid_0's binary_logloss: 0.167907
[628]	valid_0's binary_logloss: 0.167905
[629]	valid_0's bi

[806]	valid_0's binary_logloss: 0.167742
[807]	valid_0's binary_logloss: 0.167741
[808]	valid_0's binary_logloss: 0.167738
[809]	valid_0's binary_logloss: 0.167737
[810]	valid_0's binary_logloss: 0.167737
[811]	valid_0's binary_logloss: 0.167737
[812]	valid_0's binary_logloss: 0.167738
[813]	valid_0's binary_logloss: 0.167736
[814]	valid_0's binary_logloss: 0.167734
[815]	valid_0's binary_logloss: 0.167731
[816]	valid_0's binary_logloss: 0.16773
[817]	valid_0's binary_logloss: 0.16773
[818]	valid_0's binary_logloss: 0.167731
[819]	valid_0's binary_logloss: 0.16773
[820]	valid_0's binary_logloss: 0.167728
[821]	valid_0's binary_logloss: 0.167726
[822]	valid_0's binary_logloss: 0.167724
[823]	valid_0's binary_logloss: 0.167724
[824]	valid_0's binary_logloss: 0.167723
[825]	valid_0's binary_logloss: 0.167722
[826]	valid_0's binary_logloss: 0.167719
[827]	valid_0's binary_logloss: 0.167718
[828]	valid_0's binary_logloss: 0.167716
[829]	valid_0's binary_logloss: 0.167716
[830]	valid_0's bin

[1007]	valid_0's binary_logloss: 0.167586
[1008]	valid_0's binary_logloss: 0.167584
[1009]	valid_0's binary_logloss: 0.167583
[1010]	valid_0's binary_logloss: 0.167584
[1011]	valid_0's binary_logloss: 0.167585
[1012]	valid_0's binary_logloss: 0.167584
[1013]	valid_0's binary_logloss: 0.167584
[1014]	valid_0's binary_logloss: 0.167581
[1015]	valid_0's binary_logloss: 0.16758
[1016]	valid_0's binary_logloss: 0.167578
[1017]	valid_0's binary_logloss: 0.167579
[1018]	valid_0's binary_logloss: 0.167579
[1019]	valid_0's binary_logloss: 0.167579
[1020]	valid_0's binary_logloss: 0.16758
[1021]	valid_0's binary_logloss: 0.167579
[1022]	valid_0's binary_logloss: 0.167579
[1023]	valid_0's binary_logloss: 0.167578
[1024]	valid_0's binary_logloss: 0.167577
[1025]	valid_0's binary_logloss: 0.167577
[1026]	valid_0's binary_logloss: 0.167576
[1027]	valid_0's binary_logloss: 0.167575
[1028]	valid_0's binary_logloss: 0.167574
[1029]	valid_0's binary_logloss: 0.167574
[1030]	valid_0's binary_logloss: 0.1

[1203]	valid_0's binary_logloss: 0.167508
[1204]	valid_0's binary_logloss: 0.167508
[1205]	valid_0's binary_logloss: 0.167507
[1206]	valid_0's binary_logloss: 0.167508
[1207]	valid_0's binary_logloss: 0.167504
[1208]	valid_0's binary_logloss: 0.167505
[1209]	valid_0's binary_logloss: 0.167504
[1210]	valid_0's binary_logloss: 0.167503
[1211]	valid_0's binary_logloss: 0.167502
[1212]	valid_0's binary_logloss: 0.167501
[1213]	valid_0's binary_logloss: 0.167499
[1214]	valid_0's binary_logloss: 0.167499
[1215]	valid_0's binary_logloss: 0.167498
[1216]	valid_0's binary_logloss: 0.167496
[1217]	valid_0's binary_logloss: 0.167496
[1218]	valid_0's binary_logloss: 0.167496
[1219]	valid_0's binary_logloss: 0.167496
[1220]	valid_0's binary_logloss: 0.167495
[1221]	valid_0's binary_logloss: 0.167494
[1222]	valid_0's binary_logloss: 0.167492
[1223]	valid_0's binary_logloss: 0.167492
[1224]	valid_0's binary_logloss: 0.16749
[1225]	valid_0's binary_logloss: 0.16749
[1226]	valid_0's binary_logloss: 0.1

[1399]	valid_0's binary_logloss: 0.167435
[1400]	valid_0's binary_logloss: 0.167433
[1401]	valid_0's binary_logloss: 0.167431
[1402]	valid_0's binary_logloss: 0.167431
[1403]	valid_0's binary_logloss: 0.16743
[1404]	valid_0's binary_logloss: 0.167433
[1405]	valid_0's binary_logloss: 0.167432
[1406]	valid_0's binary_logloss: 0.167433
[1407]	valid_0's binary_logloss: 0.167433
[1408]	valid_0's binary_logloss: 0.167433
[1409]	valid_0's binary_logloss: 0.167432
[1410]	valid_0's binary_logloss: 0.167433
[1411]	valid_0's binary_logloss: 0.167434
[1412]	valid_0's binary_logloss: 0.167435
[1413]	valid_0's binary_logloss: 0.167435
[1414]	valid_0's binary_logloss: 0.167434
[1415]	valid_0's binary_logloss: 0.167435
[1416]	valid_0's binary_logloss: 0.167435
[1417]	valid_0's binary_logloss: 0.167435
[1418]	valid_0's binary_logloss: 0.167434
[1419]	valid_0's binary_logloss: 0.167433
[1420]	valid_0's binary_logloss: 0.167433
[1421]	valid_0's binary_logloss: 0.167432
[1422]	valid_0's binary_logloss: 0.

[1595]	valid_0's binary_logloss: 0.167368
[1596]	valid_0's binary_logloss: 0.167366
[1597]	valid_0's binary_logloss: 0.167366
[1598]	valid_0's binary_logloss: 0.167367
[1599]	valid_0's binary_logloss: 0.167367
[1600]	valid_0's binary_logloss: 0.167365
[1601]	valid_0's binary_logloss: 0.167363
[1602]	valid_0's binary_logloss: 0.167364
[1603]	valid_0's binary_logloss: 0.167365
[1604]	valid_0's binary_logloss: 0.167364
[1605]	valid_0's binary_logloss: 0.167364
[1606]	valid_0's binary_logloss: 0.167364
[1607]	valid_0's binary_logloss: 0.167364
[1608]	valid_0's binary_logloss: 0.167365
[1609]	valid_0's binary_logloss: 0.167364
[1610]	valid_0's binary_logloss: 0.167365
[1611]	valid_0's binary_logloss: 0.167363
[1612]	valid_0's binary_logloss: 0.167364
[1613]	valid_0's binary_logloss: 0.167362
[1614]	valid_0's binary_logloss: 0.167362
[1615]	valid_0's binary_logloss: 0.16736
[1616]	valid_0's binary_logloss: 0.16736
[1617]	valid_0's binary_logloss: 0.16736
[1618]	valid_0's binary_logloss: 0.16

[1791]	valid_0's binary_logloss: 0.167293
[1792]	valid_0's binary_logloss: 0.167293
[1793]	valid_0's binary_logloss: 0.167293
[1794]	valid_0's binary_logloss: 0.167291
[1795]	valid_0's binary_logloss: 0.167291
[1796]	valid_0's binary_logloss: 0.16729
[1797]	valid_0's binary_logloss: 0.16729
[1798]	valid_0's binary_logloss: 0.16729
[1799]	valid_0's binary_logloss: 0.16729
[1800]	valid_0's binary_logloss: 0.16729
[1801]	valid_0's binary_logloss: 0.167291
[1802]	valid_0's binary_logloss: 0.16729
[1803]	valid_0's binary_logloss: 0.167286
[1804]	valid_0's binary_logloss: 0.167287
[1805]	valid_0's binary_logloss: 0.167286
[1806]	valid_0's binary_logloss: 0.167286
[1807]	valid_0's binary_logloss: 0.167286
[1808]	valid_0's binary_logloss: 0.167283
[1809]	valid_0's binary_logloss: 0.167282
[1810]	valid_0's binary_logloss: 0.167282
[1811]	valid_0's binary_logloss: 0.167283
[1812]	valid_0's binary_logloss: 0.167284
[1813]	valid_0's binary_logloss: 0.167282
[1814]	valid_0's binary_logloss: 0.16728

[1987]	valid_0's binary_logloss: 0.16727
[1988]	valid_0's binary_logloss: 0.167272
[1989]	valid_0's binary_logloss: 0.167271
[1990]	valid_0's binary_logloss: 0.16727
[1991]	valid_0's binary_logloss: 0.16727
[1992]	valid_0's binary_logloss: 0.16727
[1993]	valid_0's binary_logloss: 0.167269
[1994]	valid_0's binary_logloss: 0.167268
[1995]	valid_0's binary_logloss: 0.167268
[1996]	valid_0's binary_logloss: 0.167266
[1997]	valid_0's binary_logloss: 0.167266
[1998]	valid_0's binary_logloss: 0.167265
[1999]	valid_0's binary_logloss: 0.167266
[2000]	valid_0's binary_logloss: 0.167266
[2001]	valid_0's binary_logloss: 0.167266
[2002]	valid_0's binary_logloss: 0.167267
[2003]	valid_0's binary_logloss: 0.167266
[2004]	valid_0's binary_logloss: 0.167265
[2005]	valid_0's binary_logloss: 0.167265
[2006]	valid_0's binary_logloss: 0.167267
[2007]	valid_0's binary_logloss: 0.167266
[2008]	valid_0's binary_logloss: 0.167266
[2009]	valid_0's binary_logloss: 0.167267
[2010]	valid_0's binary_logloss: 0.167

[2183]	valid_0's binary_logloss: 0.167261
[2184]	valid_0's binary_logloss: 0.167262
[2185]	valid_0's binary_logloss: 0.167263
[2186]	valid_0's binary_logloss: 0.167263
[2187]	valid_0's binary_logloss: 0.167262
[2188]	valid_0's binary_logloss: 0.167262
[2189]	valid_0's binary_logloss: 0.167262
[2190]	valid_0's binary_logloss: 0.167262
[2191]	valid_0's binary_logloss: 0.167262
[2192]	valid_0's binary_logloss: 0.167262
[2193]	valid_0's binary_logloss: 0.167261
[2194]	valid_0's binary_logloss: 0.16726
[2195]	valid_0's binary_logloss: 0.167259
[2196]	valid_0's binary_logloss: 0.167259
[2197]	valid_0's binary_logloss: 0.16726
[2198]	valid_0's binary_logloss: 0.167261
[2199]	valid_0's binary_logloss: 0.167261
[2200]	valid_0's binary_logloss: 0.167261
[2201]	valid_0's binary_logloss: 0.16726
[2202]	valid_0's binary_logloss: 0.16726
[2203]	valid_0's binary_logloss: 0.16726
[2204]	valid_0's binary_logloss: 0.167257
[2205]	valid_0's binary_logloss: 0.167257
[2206]	valid_0's binary_logloss: 0.1672

[2379]	valid_0's binary_logloss: 0.167249
[2380]	valid_0's binary_logloss: 0.16725
[2381]	valid_0's binary_logloss: 0.16725
[2382]	valid_0's binary_logloss: 0.16725
[2383]	valid_0's binary_logloss: 0.16725
[2384]	valid_0's binary_logloss: 0.167251
[2385]	valid_0's binary_logloss: 0.167251
[2386]	valid_0's binary_logloss: 0.167253
[2387]	valid_0's binary_logloss: 0.167253
[2388]	valid_0's binary_logloss: 0.167253
[2389]	valid_0's binary_logloss: 0.167254
[2390]	valid_0's binary_logloss: 0.167253
[2391]	valid_0's binary_logloss: 0.167253
[2392]	valid_0's binary_logloss: 0.167253
[2393]	valid_0's binary_logloss: 0.167253
[2394]	valid_0's binary_logloss: 0.167252
[2395]	valid_0's binary_logloss: 0.167253
[2396]	valid_0's binary_logloss: 0.167254
[2397]	valid_0's binary_logloss: 0.167254
[2398]	valid_0's binary_logloss: 0.167254
[2399]	valid_0's binary_logloss: 0.167254
[2400]	valid_0's binary_logloss: 0.167255
[2401]	valid_0's binary_logloss: 0.167256
[2402]	valid_0's binary_logloss: 0.167

In [13]:
def lgbsubmit(argsDict):
    leaf = argsDict['leaf']*5 + 5
    #leaf = argsDict['max_depth']*5 + 5
    learning_rate = argsDict["learning_rate"] * 0.02 + 0.05
    subsample = argsDict["subsample"] * 0.1 + 0.7
    colsample_bytree = argsDict["colsample_bytree"] * 0.1 + 0.7
    #scale_pos_weight = argsDict["scale_pos_weight"] + 1
    max_bin = argsDict["max_bin"] * 5 + 10
    
    clf = lgb.LGBMClassifier(
        num_leaves=leaf, 
        n_estimators=bstIter,
        n_jobs=20,
        learning_rate=learning_rate,
        colsample_bytree=colsample_bytree,
        subsample=subsample,
        max_bin=max_bin,
        #scale_pos_weight = scale_pos_weight
    )
    clf.fit(Xi_finnal_[features], y_finnal_,feature_name = features,
        categorical_feature=[])
    return clf

In [14]:
clf = lgbsubmit(best)

y_test_meta = np.zeros((Xi_test_.shape[0], 1), dtype=float)
y_test_meta[:,0] += clf.predict_proba(Xi_test_[features])[:,1]
dfinstance = pd.read_table(config.FEATURE_SET,sep=' ',usecols=[4,5])
submit = pd.DataFrame({'instance_id':dfinstance.loc[dfinstance['hour']>=12,'instance_id'],'predicted_score':y_test_meta[:,0]})


submit.to_csv('../../Submission/advertisement/tunning_505.txt', sep=" ", index=False, line_terminator='\n')
print(submit['predicted_score'].mean())

submit['predicted_score'] = score_change(submit['predicted_score'],submit['predicted_score'].mean(),0.0359194123126834)
print(submit['predicted_score'].mean())
submit.to_csv('../../Submission/advertisement/tunning_adj_505.txt', sep=" ", index=False, line_terminator='\n')



0.05084568655260291
0.03632456833923055


In [None]:
submit['predicted_score'] = submit['predicted_score']  - 0.0359194123126834
submit.loc[submit['predicted_score']<0, 'predicted_score']=0
print(submit['predicted_score'].mean())
submit.to_csv('../../Submission/advertisement/tunning_adj2_505.txt', sep=" ", index=False, line_terminator='\n')

In [None]:
submit.describe()

In [None]:
submit = pd.read_table('../../Submission/advertisement/tunning_505.txt', sep=" ")

In [None]:
submit.describe()