In [1]:
import time
import pandas as pd
import numpy as np
import lightgbm as lgb
import random
import config
import pickle
import re
import os
import gc
import scipy.special as special

from math import log
from numba import jit
from scipy.sparse import csr_matrix, hstack
from sklearn.metrics import log_loss,roc_curve,auc
from sklearn.preprocessing import LabelEncoder,OneHotEncoder,StandardScaler
from sklearn.feature_extraction.text import CountVectorizer
from itertools import chain, combinations

from hyperopt import fmin, tpe, hp,space_eval,rand,Trials,partial,STATUS_OK

In [2]:
def ks_metric(true,score):
    fpr, tpr, thresholds = roc_curve(true,score)
    ks = max(tpr-fpr)
    auc_ = auc(fpr, tpr)
    return ks,auc_

def score_change(score,base_rate,real_rate):
    base_change = np.log(base_rate/(1-base_rate)) - np.log(real_rate/(1-real_rate))
    score_adj = np.exp(np.log(score/(1-score)) - base_change)/(np.exp(np.log(score/(1-score)) - base_change)+1)
    return score_adj

def memory_saving(df,del_var=[]):
    for var in df:
        if var in del_var:
            del df[var]
            continue
        if df[var].dtypes == float:
            df[var] = pd.to_numeric(df[var],downcast='float')
        else:
            df[var] = pd.to_numeric(df[var],downcast='signed')
    return df

def map_col(df,drop=False):
    map_dict = {
        'item_price_level':[4,5,6,7,8,9],
        'item_sales_level':[4,6,9,10,11,12,13,14,16],
        'item_pv_level':[6,9,10,11,12,13,14,15,16,17,18,19,20],
        'user_age_level':[1001,1002,1003,1004,1005],
        'context_page_id':[4001,4002,4004,4006,4008,4010,4013,4016,4018],
        'shop_review_num_level':[5,9,14,15,16,17,18,20,21],
        #'hour':[6,9,12,17,20],
        'user_occupation_id':{-1:2003},
        'user_star_level':{-1:3000}
    }
    for key,value in map_dict.items():
        if isinstance(value,list):
            df[key+'_mapped'] = 0
            for i in range(len(value)):
                df.loc[df[key]>value[i],key+'_mapped'] = i+1
        else:
            '''df[key+'_mapped'] = df[key]
            for key_sub,value_sub in value.items():
                df.loc[df[key]==key_sub,key+'_mapped'] = value_sub'''
            df[key+'_mapped'] = df[key].apply(lambda x:value.get(x,x))
        if drop:
            df[key] = df[key+'_mapped']
            del df[key+'_mapped']
    gc.collect()
    return df

In [3]:
def lgbmodel(argsDict,bst_estimators = 20000):
    leaf = argsDict['leaf']*5 + 5
    learning_rate = argsDict['learning_rate']*0.02 + 0.01
    colsample_bytree = argsDict['colsample_bytree']*0.1 +0.7
    subsample = argsDict['subsample']*0.1 +0.7
    max_bin = argsDict['max_bin']*5 + 5
    clf = lgb.LGBMClassifier(
        num_leaves=leaf,
        n_estimators=bst_estimators,
        n_jobs=20,
        learning_rate=learning_rate,
        colsample_bytree=colsample_bytree,
        subsample=subsample,
        max_bin=max_bin
    )
    if bst_estimators == 20000:
        clf.fit(Xi_train_[features], y_train_, eval_set=[(Xi_valid_[features], y_valid_)],feature_name = features,
                early_stopping_rounds=100,verbose = 200)
        return clf,clf.best_iteration_
    else:
        clf.fit(Xi_finnal_[features], y_finnal_,feature_name = features,)
        return clf

def lgbtuning(argsDict):
    print(argsDict)
    clf,bst_estimators = lgbmodel(argsDict)
    y_score_ = clf.predict_proba(Xi_valid_[features],num_iteration=bst_estimators)[:, 1]
    ks,auc = ks_metric(y_valid_, y_score_)
    print(auc)
    print('-------------------------------------------------------------------------------------------------------------------------------\n')
    return -auc

<font color=#0099ff size=5 face="黑体">读取数据</font>

In [4]:
dfAll = pd.read_table(config.FEATURE_SET,sep=' ',dtype=config.featureDtypes)
dfAll = map_col(dfAll,True); gc.collect()

0

In [5]:
dataRootDir = '../../Data/advertisment/Cache/'
dataAdded = [
    'ratio_rank',
    'ratio_rank_preday',
    'smooth',
    'offline_v2',
    'cross_plus',
    'trick_userid',
    'text_base',
    'single_ratio',
    'significant_pro_cate',
    'day_gap_features_part1',
    'day_gap_features_part2',
    #'text_model_score_train',
    #'text_model_score',
]

feat_del = []
with open(dataRootDir+'del_features_v2.pkl','rb') as f:
    feat_del = pickle.load(f)

for add in dataAdded:
    tmpDf = pd.read_csv(dataRootDir + add +'.csv')
    tmpDf = memory_saving(tmpDf,feat_del)
    dfAll = pd.concat([dfAll,tmpDf],axis=1)
    del tmpDf
    gc.collect()
    print('%s is loaded, shape is %d'%(add,dfAll.shape[1]))

ratio_rank is loaded, shape is 98
ratio_rank_preday is loaded, shape is 181
smooth is loaded, shape is 200
offline_v2 is loaded, shape is 263
cross_plus is loaded, shape is 319
trick_userid is loaded, shape is 331
text_base is loaded, shape is 336
single_ratio is loaded, shape is 342
significant_pro_cate is loaded, shape is 343
day_gap_features_part1 is loaded, shape is 418
day_gap_features_part2 is loaded, shape is 501


In [10]:
features = [i for i in dfAll.columns.tolist() if not i in config.IGNORE_COLS+feat_del+['min','tfidf_score','cnt_score']]
toSave = dfAll[features+['is_trade','instance_id']]
toSave.to_csv('../../Data/advertisment/Raw/model_input.csv',index = False)
del toSave

<font color=#0099ff size=5 face="黑体">模型调优</font>

In [12]:
features = [i for i in dfAll.columns.tolist() if not i in config.IGNORE_COLS+feat_del+['min','tfidf_score','cnt_score']]
train_idx = dfAll.loc[(dfAll['hour']<10)&(dfAll['hour']>0)].index
valid_idx = dfAll.loc[(dfAll['hour']<12)&(dfAll['hour']>9)].index
Xi_train_, y_train_ = dfAll.loc[list(train_idx),features],dfAll.loc[train_idx,'is_trade']
Xi_valid_, y_valid_ = dfAll.loc[list(valid_idx),features],dfAll.loc[valid_idx,'is_trade']
Xi_test_ = dfAll.loc[(dfAll['hour']>=12),features]
del dfAll
del train_idx
del valid_idx

In [13]:
space = {"leaf":hp.randint("leaf",20),
         "learning_rate":hp.randint("learning_rate",15),
         "subsample":hp.randint("subsample",4),
         "colsample_bytree":hp.randint("colsample_bytree",4),
         "max_bin":hp.randint("max_bin",20),
        }

In [14]:
algo = partial(tpe.suggest,n_startup_jobs=1)
best = fmin(lgbtuning,space,algo=algo,max_evals=50)

{'colsample_bytree': 2, 'leaf': 5, 'learning_rate': 13, 'max_bin': 5, 'subsample': 1}


  'Please use {0} argument of the Dataset constructor to pass this parameter.'.format(key))


Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[26]	valid_0's binary_logloss: 0.1651
0.729430485277
-------------------------------------------------------------------------------------------------------------------------------

{'colsample_bytree': 2, 'leaf': 5, 'learning_rate': 13, 'max_bin': 5, 'subsample': 1}


  'Please use {0} argument of the Dataset constructor to pass this parameter.'.format(key))


Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[26]	valid_0's binary_logloss: 0.1651
0.729430485277
-------------------------------------------------------------------------------------------------------------------------------

{'colsample_bytree': 1, 'leaf': 17, 'learning_rate': 13, 'max_bin': 13, 'subsample': 1}


  'Please use {0} argument of the Dataset constructor to pass this parameter.'.format(key))


Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[19]	valid_0's binary_logloss: 0.1666
0.724572074989
-------------------------------------------------------------------------------------------------------------------------------

{'colsample_bytree': 2, 'leaf': 5, 'learning_rate': 10, 'max_bin': 5, 'subsample': 0}


  'Please use {0} argument of the Dataset constructor to pass this parameter.'.format(key))


Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[33]	valid_0's binary_logloss: 0.165186
0.72848273711
-------------------------------------------------------------------------------------------------------------------------------

{'colsample_bytree': 3, 'leaf': 7, 'learning_rate': 1, 'max_bin': 7, 'subsample': 3}


  'Please use {0} argument of the Dataset constructor to pass this parameter.'.format(key))


Training until validation scores don't improve for 100 rounds.
[200]	valid_0's binary_logloss: 0.16473
[400]	valid_0's binary_logloss: 0.164142
Early stopping, best iteration is:
[422]	valid_0's binary_logloss: 0.164133
0.734186767632
-------------------------------------------------------------------------------------------------------------------------------

{'colsample_bytree': 3, 'leaf': 7, 'learning_rate': 1, 'max_bin': 7, 'subsample': 3}


  'Please use {0} argument of the Dataset constructor to pass this parameter.'.format(key))


Training until validation scores don't improve for 100 rounds.
[200]	valid_0's binary_logloss: 0.16473
[400]	valid_0's binary_logloss: 0.164142
Early stopping, best iteration is:
[422]	valid_0's binary_logloss: 0.164133
0.734186767632
-------------------------------------------------------------------------------------------------------------------------------

{'colsample_bytree': 0, 'leaf': 7, 'learning_rate': 9, 'max_bin': 7, 'subsample': 2}


  'Please use {0} argument of the Dataset constructor to pass this parameter.'.format(key))


Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[39]	valid_0's binary_logloss: 0.16507
0.729712712166
-------------------------------------------------------------------------------------------------------------------------------

{'colsample_bytree': 3, 'leaf': 10, 'learning_rate': 0, 'max_bin': 2, 'subsample': 3}


  'Please use {0} argument of the Dataset constructor to pass this parameter.'.format(key))


Training until validation scores don't improve for 100 rounds.
[200]	valid_0's binary_logloss: 0.197892
[400]	valid_0's binary_logloss: 0.167196
[600]	valid_0's binary_logloss: 0.164735
[800]	valid_0's binary_logloss: 0.164306
[1000]	valid_0's binary_logloss: 0.164199
[1200]	valid_0's binary_logloss: 0.164157
Early stopping, best iteration is:
[1247]	valid_0's binary_logloss: 0.164148
0.734248608518
-------------------------------------------------------------------------------------------------------------------------------

{'colsample_bytree': 3, 'leaf': 10, 'learning_rate': 0, 'max_bin': 2, 'subsample': 3}


  'Please use {0} argument of the Dataset constructor to pass this parameter.'.format(key))


Training until validation scores don't improve for 100 rounds.
[200]	valid_0's binary_logloss: 0.197892
[400]	valid_0's binary_logloss: 0.167196
[600]	valid_0's binary_logloss: 0.164735
[800]	valid_0's binary_logloss: 0.164306
[1000]	valid_0's binary_logloss: 0.164199
[1200]	valid_0's binary_logloss: 0.164157
Early stopping, best iteration is:
[1247]	valid_0's binary_logloss: 0.164148
0.734248608518
-------------------------------------------------------------------------------------------------------------------------------

{'colsample_bytree': 3, 'leaf': 10, 'learning_rate': 3, 'max_bin': 2, 'subsample': 3}


  'Please use {0} argument of the Dataset constructor to pass this parameter.'.format(key))


Training until validation scores don't improve for 100 rounds.
[200]	valid_0's binary_logloss: 0.164522
Early stopping, best iteration is:
[128]	valid_0's binary_logloss: 0.164413
0.732277835145
-------------------------------------------------------------------------------------------------------------------------------

{'colsample_bytree': 1, 'leaf': 16, 'learning_rate': 12, 'max_bin': 4, 'subsample': 0}


  'Please use {0} argument of the Dataset constructor to pass this parameter.'.format(key))


Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[20]	valid_0's binary_logloss: 0.166508
0.724001208551
-------------------------------------------------------------------------------------------------------------------------------

{'colsample_bytree': 0, 'leaf': 9, 'learning_rate': 0, 'max_bin': 17, 'subsample': 2}


  'Please use {0} argument of the Dataset constructor to pass this parameter.'.format(key))


Training until validation scores don't improve for 100 rounds.


KeyboardInterrupt: 

<font color=#0099ff size=5 face="黑体">最优模型训练</font>

In [None]:
clf,bst_estimators = lgbmodel(best)

In [None]:
Xi_finnal_ ,y_finnal_ = pd.concat([Xi_train_,Xi_valid_]), pd.concat([y_train_,y_valid_])
del Xi_train_
del Xi_valid_

In [None]:
clf = lgbmodel(best,bst_estimators)
y_test_meta = np.zeros((Xi_test_.shape[0], 1), dtype=float)
y_test_meta[:,0] += clf.predict_proba(Xi_test_[features])[:,1]
dfinstance = pd.read_table(config.FEATURE_SET,sep=' ',dtype=featureDtypes,usecols=[4,5])
submit = pd.DataFrame({'instance_id':dfinstance.loc[dfinstance['hour']>=12,'instance_id'],'predicted_score':y_test_meta[:,0]})

<font color=#0099ff size=5 face="黑体">结果输出</font>

In [None]:
submitName = 'gap.txt'
adjRatio = 0.0359194123126834
submit.to_csv('../../Submission/advertisement/'+submitName, sep=" ", index=False, line_terminator='\n')
print(submit['predicted_score'].mean())
submit['predicted_score'] = score_change(submit['predicted_score'],submit['predicted_score'].mean(),adjRatio)
print(submit['predicted_score'].mean())
submit.to_csv('../../Submission/advertisement/adj_'+submitName, sep=" ", index=False, line_terminator='\n')