In [1]:
import time
import pandas as pd
import numpy as np
import lightgbm as lgb
import random
import config
import pickle
import re
import os
import gc
import hyperopt
import scipy.special as special

from math import log
from numba import jit
from scipy.sparse import csr_matrix, hstack
from sklearn.metrics import log_loss,roc_curve,auc
from sklearn.preprocessing import LabelEncoder,OneHotEncoder,StandardScaler
from sklearn.feature_extraction.text import CountVectorizer
from itertools import chain, combinations
from sklearn.model_selection import train_test_split

from hyperopt import fmin, tpe, hp,space_eval,rand,Trials,partial,STATUS_OK

In [2]:
def ks_metric(true,score):
    """Return the Kolmogorov-Smirnov statistic and the ROC AUC of binary scores.

    Parameters
    ----------
    true : array-like
        Ground-truth binary labels, handed unchanged to ``roc_curve``.
    score : array-like
        Predicted scores for the positive class.

    Returns
    -------
    tuple
        ``(ks, auc_)``: the largest gap between the true-positive and
        false-positive rates over the ROC thresholds, and the area under
        that ROC curve.
    """
    false_pos_rate, true_pos_rate, _ = roc_curve(true, score)
    separation = true_pos_rate - false_pos_rate
    return max(separation), auc(false_pos_rate, true_pos_rate)

def score_change(score,base_rate,real_rate):
    """Shift predicted probabilities from one base rate to another in log-odds space.

    The log-odds of ``score`` are moved by ``logit(real_rate) - logit(base_rate)``
    and mapped back to a probability.

    Parameters
    ----------
    score : float or array-like
        Predicted probabilities in ``[0, 1]``.
    base_rate : float
        Positive rate the scores are currently centred on.
    real_rate : float
        Positive rate the adjusted scores should be centred on.

    Returns
    -------
    Same shape as ``score``
        The adjusted probabilities.
    """
    base_change = special.logit(base_rate) - special.logit(real_rate)
    # expit saturates cleanly at 0 and 1; the hand-written exp(z)/(exp(z)+1)
    # evaluates to inf/inf = nan when score == 1 or the exponent overflows.
    return special.expit(special.logit(score) - base_change)

def memory_saving(df,del_var=None):
    """Shrink a DataFrame in place by dropping columns and downcasting the rest.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place and also returned.
    del_var : iterable of column labels, optional
        Columns to delete. ``None`` (the default) deletes nothing.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object. float64 columns are downcast with
        ``downcast='float'``; every other remaining column is passed through
        ``pd.to_numeric(..., downcast='signed')``.

    Raises
    ------
    ValueError
        Propagated from ``pd.to_numeric`` when a remaining column cannot be
        parsed as numeric.
    """
    unwanted = set(del_var) if del_var is not None else set()
    # Iterate over a snapshot of the labels: columns are deleted inside the loop.
    for var in list(df.columns):
        if var in unwanted:
            del df[var]
            continue
        if df[var].dtypes == float:
            df[var] = pd.to_numeric(df[var],downcast='float')
        else:
            df[var] = pd.to_numeric(df[var],downcast='signed')
    return df

def map_col(df,drop=False):
    """Add bucketed / remapped versions of selected level and id columns.

    For columns configured with a list of thresholds, the mapped value is the
    1-based position of the last threshold the raw value strictly exceeds
    (0 when it exceeds none). For columns configured with a dict, raw values
    found in the dict are replaced and all others are kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain every column named in the mapping table. Modified in place.
    drop : bool, default False
        When False the result is stored in ``<column>_mapped``. When True the
        mapped values overwrite the raw column and no ``_mapped`` column remains.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.
    """
    map_dict = {
        'item_price_level':[4,5,6,7,8,9],
        'item_sales_level':[4,6,9,10,11,12,13,14,16],
        'item_pv_level':[6,9,10,11,12,13,14,15,16,17,18,19,20],
        'user_age_level':[1001,1002,1003,1004,1005],
        'context_page_id':[4001,4002,4004,4006,4008,4010,4013,4016,4018],
        'shop_review_num_level':[5,9,14,15,16,17,18,20,21],
        #'hour':[6,9,12,17,20],
        'user_occupation_id':{-1:2003},
        'user_star_level':{-1:3000}
    }
    for column, spec in map_dict.items():
        mapped_name = column+'_mapped'
        if not isinstance(spec,list):
            # Dict spec: substitute the listed values, pass everything else through.
            df[mapped_name] = df[column].apply(lambda raw: spec.get(raw, raw))
        else:
            # List spec: later (larger) thresholds overwrite earlier bucket numbers.
            df[mapped_name] = 0
            for bucket, threshold in enumerate(spec, start=1):
                df.loc[df[column]>threshold, mapped_name] = bucket
        if drop:
            df[column] = df.pop(mapped_name)
    gc.collect()
    return df

In [None]:
# Explicit dtype for every column of the feature file, grouped by width.
featureDtypes = {
    # 8-bit integers
    'cnt_rec': 'int8',
    'day': 'int8',
    'hour': 'int8',
    'item_category_list_bin1': 'int8',
    'item_category_list_bin2': 'int8',
    'item_collected_level': 'int8',
    'item_price_level': 'int8',
    'item_pv_level': 'int8',
    'item_sales_level': 'int8',
    'len_item_property_list': 'int8',
    'len_predict_category_property': 'int8',
    'min': 'int8',
    'shop_review_num_level': 'int8',
    'user_gender_id': 'int8',
    # 16-bit integers
    'context_page_id': 'int16',
    'item_brand_id': 'int16',
    'item_city_id': 'int16',
    'shop_id': 'int16',
    'shop_star_level': 'int16',
    'user_age_level': 'int16',
    'user_occupation_id': 'int16',
    'user_star_level': 'int16',
    # 32-bit integers
    'context_timestamp': 'int32',
    'item_id': 'int32',
    'user_id': 'int32',
    # 64-bit integers
    'context_id': 'int64',
    'instance_id': 'int64',
    # 32-bit floats
    'is_trade': 'float32',
    'shop_review_positive_rate': 'float32',
    'shop_score_delivery': 'float32',
    'shop_score_description': 'float32',
    'shop_score_service': 'float32',
}

# The feature file is space-separated.
dfAll = pd.read_csv(config.FEATURE_SET, sep=' ', dtype=featureDtypes)
# drop=True: the mapped values overwrite the raw level/id columns.
dfAll = map_col(dfAll, True)
gc.collect()

In [None]:
dataRootDir = '../../Data/advertisment/Cache/'

# Cached feature tables to append column-wise to dfAll. Currently disabled:
# ratio_rank, ratio_rank_preday, smooth, offline_v2, cross_plus, trick_userid,
# text_base, text_model_score_train, text_model_score
dataAdded = ['single_ratio', 'significant_pro_cate']

# Column names to discard, read from a pickle in the cache directory.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
feat_del = []
with open(dataRootDir+'del_features_v2.pkl','rb') as pkl_file:
    feat_del = pickle.load(pkl_file)

for table_name in dataAdded:
    # Drop the unwanted columns and downcast before concatenating.
    extra = memory_saving(pd.read_csv(dataRootDir + table_name +'.csv'), feat_del)
    dfAll = pd.concat([dfAll, extra], axis=1)
    del extra
    gc.collect()
    print('%s is loaded, shape is %d'%(table_name,dfAll.shape[1]))

In [None]:
# Every column except the ignored ones, 'min', and the discarded features.
excluded_cols = config.IGNORE_COLS+['min']+feat_del
features = [col for col in dfAll.columns.tolist() if col not in excluded_cols]

# Split by hour: 1-9 -> train, 10-11 -> validation, >=12 -> test.
# NOTE(review): rows with hour == 0 fall into none of the three splits - confirm intended.
hour_col = dfAll['hour']
train_idx = dfAll.loc[(hour_col < 10) & (hour_col > 0)].index
valid_idx = dfAll.loc[(hour_col < 12) & (hour_col > 9)].index

Xi_train_ = dfAll.loc[list(train_idx), features]
y_train_ = dfAll.loc[train_idx, 'is_trade']
Xi_valid_ = dfAll.loc[list(valid_idx), features]
y_valid_ = dfAll.loc[valid_idx, 'is_trade']
Xi_test_ = dfAll.loc[(hour_col >= 12), features]

# Release the full frame and the scratch objects.
del dfAll, train_idx, valid_idx, hour_col, excluded_cols

In [3]:
dataRootDir = '../../Data/advertisment/Cache/'
# Reload the cached training frame and downcast its columns in one step.
Xi_finnal_ = memory_saving(pd.read_csv(dataRootDir + 'train_set.csv'))

In [4]:
# Detach the label column as a NumPy array, leaving only features in the frame.
y_finnal_ = Xi_finnal_.pop('is_trade').values
# Hold out 18% of the rows for validation with a fixed seed.
Xi_train_, Xi_valid_, y_train_, y_valid_ = train_test_split(
    Xi_finnal_,
    y_finnal_,
    test_size=0.18,
    random_state=42,
)

In [5]:
# Every column of the training frame is used as a model feature.
features = list(Xi_train_.columns)

In [6]:
def lgbtune(argsDict):
    """Hyperopt objective: train LightGBM for one sampled setting and return -AUC.

    Parameters
    ----------
    argsDict : dict
        Integer draws keyed by ``'leaf'``, ``'learning_rate'``, ``'subsample'``,
        ``'colsample_bytree'`` and ``'max_bin'``; each is rescaled below into
        the actual hyper-parameter value.

    Returns
    -------
    float
        The negated validation AUC, so that minimising it maximises AUC.

    Notes
    -----
    Reads the module-level ``Xi_train_``, ``y_train_``, ``Xi_valid_``,
    ``y_valid_`` and ``features``. Training stops once the validation score
    has not improved for 100 rounds.
    """
    # Rescale the integer draws onto the real hyper-parameter grid.
    decoded = {
        'num_leaves': argsDict['leaf']*5 + 5,
        'learning_rate': argsDict["learning_rate"] * 0.02 + 0.05,
        'subsample': argsDict["subsample"] * 0.1 + 0.7,
        'colsample_bytree': argsDict["colsample_bytree"] * 0.1 + 0.7,
        'max_bin': argsDict["max_bin"] * 5 + 10,
    }

    print(argsDict)

    model = lgb.LGBMClassifier(n_estimators=20000, n_jobs=20, **decoded)
    model.fit(
        Xi_train_[features],
        y_train_,
        eval_set=[(Xi_valid_[features], y_valid_)],
        feature_name=features,
        early_stopping_rounds=100,
        verbose=200,
    )

    # Score the validation rows with the best iteration found by early stopping.
    valid_scores = model.predict_proba(
        Xi_valid_[features], num_iteration=model.best_iteration_
    )[:, 1]
    _, valid_auc = ks_metric(y_valid_, valid_scores)
    print(valid_auc)
    return -1*valid_auc

In [None]:
# Search space: one integer draw per hyper-parameter, rescaled inside lgbtune.
# min_child_weight (upper 5) and scale_pos_weight (upper 25) are currently left out.
space = {
    label: hp.randint(label, upper)
    for label, upper in (
        ("leaf", 25),
        ("learning_rate", 16),
        ("subsample", 4),
        ("colsample_bytree", 4),
        ("max_bin", 10),
    )
}

In [None]:
trials = Trials()

# NOTE(review): `algo` is built here but never handed to fmin, which runs the
# plain tpe.suggest - confirm whether algo=algo was intended.
algo = partial(tpe.suggest, n_startup_jobs=1)
best = fmin(
    fn=lgbtune,
    space=space,
    algo=tpe.suggest,
    max_evals=50,
    trials=trials,
)

{'colsample_bytree': 2, 'leaf': 15, 'learning_rate': 15, 'max_bin': 2, 'subsample': 3}
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[13]	valid_0's binary_logloss: 0.169878
0.725127345453
{'colsample_bytree': 2, 'leaf': 20, 'learning_rate': 8, 'max_bin': 8, 'subsample': 0}
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[27]	valid_0's binary_logloss: 0.170126
0.722933768594
{'colsample_bytree': 1, 'leaf': 5, 'learning_rate': 15, 'max_bin': 4, 'subsample': 0}
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[20]	valid_0's binary_logloss: 0.169321
0.727815710161
{'colsample_bytree': 3, 'leaf': 23, 'learning_rate': 1, 'max_bin': 7, 'subsample': 3}
Training until validation scores don't improve for 100 rounds.
[200]	valid_0's binary_logloss: 0.168615
Early stopping, best iteration is:
[104]	valid_0's binary_logloss: 0.168279
0.73172402589
{'

Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[59]	valid_0's binary_logloss: 0.168106
0.732034790072
{'colsample_bytree': 2, 'leaf': 20, 'learning_rate': 10, 'max_bin': 9, 'subsample': 3}
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[21]	valid_0's binary_logloss: 0.169598
0.726329057349
{'colsample_bytree': 2, 'leaf': 0, 'learning_rate': 3, 'max_bin': 1, 'subsample': 3}
Training until validation scores don't improve for 100 rounds.
[200]	valid_0's binary_logloss: 0.168118
[400]	valid_0's binary_logloss: 0.167727
[600]	valid_0's binary_logloss: 0.167568
[800]	valid_0's binary_logloss: 0.167483
Early stopping, best iteration is:
[859]	valid_0's binary_logloss: 0.167455
0.736187264661
{'colsample_bytree': 1, 'leaf': 0, 'learning_rate': 3, 'max_bin': 1, 'subsample': 3}
Training until validation scores don't improve for 100 rounds.
[200]	valid_0's binary_logloss: 0.168139
[400]	valid_0's

In [None]:
# Stack the train and validation splits back into one frame and one label vector.
Xi_finnal_, y_finnal_ = (
    pd.concat([Xi_train_, Xi_valid_]),
    pd.concat([y_train_, y_valid_]),
)
del Xi_train_, Xi_valid_

In [None]:
# Store the labels on the feature frame as the `is_trade` column.
Xi_finnal_['is_trade'] = y_finnal_

In [None]:
# Write the frame to train_set.csv in the cache directory, without the index.
Xi_finnal_.to_csv(dataRootDir + 'train_set.csv',index = False)

In [None]:
# Write the test features to test_set.csv in the cache directory, without the index.
Xi_test_.to_csv(dataRootDir + 'test_set.csv',index = False)

In [None]:
# Show the hyper-parameter assignment returned by fmin.
print(best)

In [None]:
def lgbiter(argsDict):
    """Train with early stopping and return the best boosting round.

    Parameters
    ----------
    argsDict : dict
        Integer-encoded hyper-parameters, decoded the same way as in
        ``lgbtune``: ``'leaf'``, ``'learning_rate'``, ``'subsample'``,
        ``'colsample_bytree'`` and ``'max_bin'``. The older keys are still
        accepted: ``'max_depth'`` in place of ``'leaf'``, and
        ``'scale_pos_weight'`` (optional), which also supplies ``max_bin``
        when ``'max_bin'`` itself is absent.

    Returns
    -------
    int
        ``best_iteration_`` of the fitted classifier.

    Raises
    ------
    KeyError
        If a required key is missing from ``argsDict``.

    Notes
    -----
    Reads the module-level ``Xi_train_``, ``y_train_``, ``Xi_valid_``,
    ``y_valid_`` and ``features``.
    """
    # The search space emits 'leaf'; 'max_depth' is kept as a legacy alias.
    leaf_code = argsDict['leaf'] if 'leaf' in argsDict else argsDict['max_depth']
    params = {
        'num_leaves': leaf_code*5 + 5,
        'learning_rate': argsDict["learning_rate"] * 0.02 + 0.05,
        'subsample': argsDict["subsample"] * 0.1 + 0.7,
        'colsample_bytree': argsDict["colsample_bytree"] * 0.1 + 0.7,
    }
    if 'max_bin' in argsDict:
        # Same decoding as the tuning objective.
        params['max_bin'] = argsDict["max_bin"] * 5 + 10
    elif 'scale_pos_weight' in argsDict:
        # Legacy behaviour for dicts without 'max_bin'.
        params['max_bin'] = argsDict["scale_pos_weight"] * 5 + 1
    if 'scale_pos_weight' in argsDict:
        params['scale_pos_weight'] = argsDict["scale_pos_weight"] + 1

    clf = lgb.LGBMClassifier(n_estimators=20000, n_jobs=20, **params)
    clf.fit(Xi_train_[features], y_train_, eval_set=[(Xi_valid_[features], y_valid_)],feature_name = features,
            categorical_feature=[],early_stopping_rounds=100)
    return clf.best_iteration_

In [None]:
# Number of boosting rounds selected by early stopping on the validation split.
bstIter = lgbiter(best)

# Rebuild the full training set from the two splits.
# The labels may be pandas Series or NumPy arrays (train_test_split on `.values`
# yields arrays, which pd.concat rejects); arrays are wrapped in a Series that
# carries the index of the matching feature rows.
label_parts = [
    part if isinstance(part, pd.Series) else pd.Series(np.asarray(part), index=rows.index)
    for part, rows in ((y_train_, Xi_train_), (y_valid_, Xi_valid_))
]
Xi_finnal_ ,y_finnal_ = pd.concat([Xi_train_,Xi_valid_]), pd.concat(label_parts)
del label_parts
del Xi_train_
del Xi_valid_

# Load the cached test features and downcast them.
Xi_test_ = pd.read_csv(dataRootDir + 'test_set.csv')
Xi_test_ = memory_saving(Xi_test_)

In [None]:
# Load the cached test features and downcast them in one step.
Xi_test_ = memory_saving(pd.read_csv(dataRootDir + 'test_set.csv'))

In [None]:
def lgbsubmit(argsDict):
    """Fit the final classifier on the full training set and return it.

    Parameters
    ----------
    argsDict : dict
        Integer-encoded hyper-parameters, decoded the same way as in
        ``lgbtune``: ``'leaf'``, ``'learning_rate'``, ``'subsample'``,
        ``'colsample_bytree'`` and ``'max_bin'``. The older keys are still
        accepted: ``'max_depth'`` in place of ``'leaf'``, and
        ``'scale_pos_weight'`` (optional), which also supplies ``max_bin``
        when ``'max_bin'`` itself is absent.

    Returns
    -------
    lightgbm.LGBMClassifier
        The classifier fitted on ``Xi_finnal_[features]`` / ``y_finnal_``
        with ``n_estimators`` set to the module-level ``bstIter``.

    Raises
    ------
    KeyError
        If a required key is missing from ``argsDict``.

    Notes
    -----
    Reads the module-level ``bstIter``, ``Xi_finnal_``, ``y_finnal_`` and
    ``features``. The statements that used to follow the ``return`` were
    unreachable and have been removed.
    """
    # The search space emits 'leaf'; 'max_depth' is kept as a legacy alias.
    leaf_code = argsDict['leaf'] if 'leaf' in argsDict else argsDict['max_depth']
    params = {
        'num_leaves': leaf_code*5 + 5,
        'learning_rate': argsDict["learning_rate"] * 0.02 + 0.05,
        'subsample': argsDict["subsample"] * 0.1 + 0.7,
        'colsample_bytree': argsDict["colsample_bytree"] * 0.1 + 0.7,
    }
    if 'max_bin' in argsDict:
        # Same decoding as the tuning objective.
        params['max_bin'] = argsDict["max_bin"] * 5 + 10
    elif 'scale_pos_weight' in argsDict:
        # Legacy behaviour for dicts without 'max_bin'.
        params['max_bin'] = argsDict["scale_pos_weight"] * 5 + 1
    if 'scale_pos_weight' in argsDict:
        params['scale_pos_weight'] = argsDict["scale_pos_weight"] + 1

    clf = lgb.LGBMClassifier(n_estimators=bstIter, n_jobs=20, **params)
    clf.fit(Xi_finnal_[features], y_finnal_,feature_name = features,
        categorical_feature=[])
    return clf

In [None]:
clf = lgbsubmit(best)

# Positive-class probability for every test row, kept as an (n, 1) array.
y_test_meta = np.zeros((Xi_test_.shape[0], 1), dtype=float)
y_test_meta[:,0] += clf.predict_proba(Xi_test_[features])[:,1]

# Select the id and hour columns by name: the code below indexes them by name,
# so a positional usecols would silently depend on the file's column order.
dfinstance = pd.read_table(config.FEATURE_SET,sep=' ',usecols=['instance_id','hour'])
# Test rows are those with hour >= 12, matching how Xi_test_ was built.
submit = pd.DataFrame({'instance_id':dfinstance.loc[dfinstance['hour']>=12,'instance_id'],'predicted_score':y_test_meta[:,0]})


# Raw predictions.
submit.to_csv('../../Submission/advertisement/tunning_505.txt', sep=" ", index=False, line_terminator='\n')
print(submit['predicted_score'].mean())

# Recalibrated predictions: move the mean predicted rate towards the target rate.
real_rate = 0.0359194123126834
submit['predicted_score'] = score_change(submit['predicted_score'],submit['predicted_score'].mean(),real_rate)
print(submit['predicted_score'].mean())
submit.to_csv('../../Submission/advertisement/tunning_adj_505.txt', sep=" ", index=False, line_terminator='\n')

In [None]:
# Shift every score down by the constant and floor the result at zero.
submit['predicted_score'] = (submit['predicted_score'] - 0.0359194123126834).clip(lower=0)
print(submit['predicted_score'].mean())
submit.to_csv(
    '../../Submission/advertisement/tunning_adj2_505.txt',
    sep=" ",
    index=False,
    line_terminator='\n',
)

In [None]:
# Summary statistics of the submission frame's columns.
submit.describe()

In [None]:
# Reload the raw (unadjusted) submission file; it is space-separated.
submit = pd.read_csv('../../Submission/advertisement/tunning_505.txt', sep=" ")

In [None]:
# Summary statistics of the reloaded submission frame.
submit.describe()