In [None]:
import math
import os
import random
import re
import time
from collections import Counter
from itertools import chain, combinations
from math import log

import lightgbm as lgb
import numpy as np
import pandas as pd
import scipy.special as special
from numba import jit
from scipy.sparse import csr_matrix, hstack
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import log_loss,roc_curve
from sklearn.preprocessing import LabelEncoder,OneHotEncoder,StandardScaler

import config

In [None]:
def get_cosine(vec1, vec2):
    """Cosine similarity between two token sequences, compared as bags of tokens.

    Args:
        vec1, vec2: iterables of hashable tokens; each is turned into a
            token -> occurrence-count mapping.

    Returns:
        float: dot product of the two count vectors divided by the product of
        their Euclidean norms, or 0.0 when either sequence is empty.
    """
    counts1 = Counter(vec1)
    counts2 = Counter(vec2)
    # Only tokens present on both sides contribute to the dot product.
    shared = counts1.keys() & counts2.keys()
    numerator = sum(counts1[tok] * counts2[tok] for tok in shared)
    norm1 = math.sqrt(sum(cnt ** 2 for cnt in counts1.values()))
    norm2 = math.sqrt(sum(cnt ** 2 for cnt in counts2.values()))
    denominator = norm1 * norm2

    if not denominator:
        return 0.0
    return float(numerator) / denominator
            
def timestamp_datetime(value):
    """Render a Unix timestamp (seconds) as 'YYYY-MM-DD HH:MM:SS' in local time."""
    local_struct = time.localtime(value)
    return time.strftime('%Y-%m-%d %H:%M:%S', local_struct)

def time_feat(df,featList,featName):
    """Add `featName`: the rank of `context_timestamp` within each `featList` group.

    Ties are broken by order of appearance (rank method 'first'). The column is
    written into `df`, which is also returned.
    """
    grouped_ts = df.groupby(featList)['context_timestamp']
    df[featName] = grouped_ts.rank(method='first')
    return df

def powerset(iterable):
    """Return every subset of `iterable` as a list of tuples, ordered by subset size."""
    items = list(iterable)
    subsets = []
    for size in range(len(items) + 1):
        subsets.extend(combinations(items, size))
    return subsets

def del_na(lst):
    """Rebuild a 'key:value;key:value' string from a flat [key, value, ...] list.

    Pairs whose value is the string '-1' are dropped. Lists with fewer than two
    elements yield ''. A trailing key without a value (odd-length list) is
    ignored instead of raising IndexError.

    Args:
        lst: list of strings alternating key, value, key, value, ...

    Returns:
        str: the surviving pairs joined as 'key:value' with ';' separators.
    """
    if len(lst) < 2:
        return ''
    # zip() stops at the shorter slice, so an unpaired trailing key is skipped.
    kept = [key + ':' + val for key, val in zip(lst[::2], lst[1::2]) if val != '-1']
    return ';'.join(kept)

def ks_metric(true,score):
    """KS statistic: the largest gap between TPR and FPR over the thresholds from roc_curve."""
    false_pos_rate, true_pos_rate, _ = roc_curve(true, score)
    return max(true_pos_rate - false_pos_rate)

def score_change(score,base_rate,real_rate):
    """Shift predicted probabilities from one base rate to another in log-odds space.

    The log-odds of `score` are moved by logit(base_rate) - logit(real_rate)
    and mapped back to a probability.
    """
    prior_shift = np.log(base_rate/(1-base_rate)) - np.log(real_rate/(1-real_rate))
    shifted_odds = np.exp(np.log(score/(1-score)) - prior_shift)
    return shifted_odds/(shifted_odds+1)

In [None]:
class HyperParam(object):
    '''Beta(alpha, beta) prior parameters estimated from per-group (tries, success) counts.

    `alpha` and `beta` are plain attributes; the update methods overwrite them in place.
    '''
    def __init__(self, alpha, beta):
        self.alpha = alpha
        self.beta = beta

    def update_from_data_by_FPI(self, tries, success, iter_num, epsilon):
        '''estimate alpha, beta using fixed point iteration

        tries / success: equal-length sequences of per-group trial and success counts.
        iter_num: maximum number of iterations.
        epsilon: stop once both parameters move by less than this in one step.
        '''
        # Convert once so every iteration works on numpy arrays.
        tries = np.asarray(tries, dtype=float)
        success = np.asarray(success, dtype=float)
        for _ in range(iter_num):
            new_alpha, new_beta = self.__fixed_point_iteration(tries, success, self.alpha, self.beta)
            if abs(new_alpha-self.alpha)<epsilon and abs(new_beta-self.beta)<epsilon:
                break
            self.alpha = new_alpha
            self.beta = new_beta

    def __fixed_point_iteration(self, tries, success, alpha, beta):
        '''one fixed point iteration step; returns the updated (alpha, beta)'''
        tries = np.asarray(tries, dtype=float)
        success = np.asarray(success, dtype=float)
        # Vectorised digamma sums replace a per-element Python loop.
        numer_alpha = (special.digamma(success + alpha) - special.digamma(alpha)).sum()
        numer_beta = (special.digamma(tries - success + beta) - special.digamma(beta)).sum()
        denom = (special.digamma(tries + alpha + beta) - special.digamma(alpha + beta)).sum()
        if denom == 0:
            # No trials at all (empty input or all-zero tries): nothing to learn,
            # keep the current parameters instead of dividing by zero.
            return alpha, beta
        return alpha*(numer_alpha/denom), beta*(numer_beta/denom)

    def update_from_data_by_moment(self, tries, success):
        '''estimate alpha, beta using moment estimation'''
        mean, var = self.__compute_moment(tries, success)
        # The 1e-6 offsets keep the expressions finite when mean is 0 or 1 or var is 0.
        self.alpha = (mean+0.000001) * ((mean+0.000001) * (1.000001 - mean) / (var+0.000001) - 1)
        self.beta = (1.000001 - mean) * ((mean+0.000001) * (1.000001 - mean) / (var+0.000001) - 1)

    def __compute_moment(self, tries, success):
        '''moment estimation: mean and sample variance of the per-group success rates

        Groups with zero tries have no rate and are skipped. With a single usable
        group the variance is reported as 0.0 (the original divided by zero).
        Raises ValueError when no group has any tries.
        '''
        tries = np.asarray(tries, dtype=float)
        success = np.asarray(success, dtype=float)
        observed = tries > 0
        if not observed.any():
            raise ValueError('moment estimation needs at least one group with tries > 0')
        rates = success[observed] / tries[observed]
        mean = float(rates.mean())
        var = float(rates.var(ddof=1)) if rates.size > 1 else 0.0
        return mean, var

In [None]:




def process(df):
    """Derive time, list-length and label-encoded list-position columns.

    Adds `time`, `day`, `hour`; normalises `item_property_list` and
    `predict_category_property` (de-duplicated, sorted, '-1' pairs removed from
    the latter); adds the two length columns; and label-encodes positions 1-2 of
    `item_category_list` and positions 0-9 of `predict_category_property`.
    Columns are written into `df`, which is returned.
    """
    def _dedupe_sorted(raw):
        # ';'-separated values -> unique values, sorted, re-joined
        return ';'.join(sorted(set(str(raw).split(';'))))

    def _count_parts(raw):
        return len(str(raw).split(';'))

    def _part_or_blank(text, pos):
        parts = text.split(';')
        return parts[pos] if len(parts) > pos else ''

    df['time'] = df['context_timestamp'].apply(timestamp_datetime)
    # 'YYYY-MM-DD HH:MM:SS' -> day of month at [8:10], hour at [11:13]
    df['day'] = df['time'].apply(lambda stamp: int(stamp[8:10]))
    df['hour'] = df['time'].apply(lambda stamp: int(stamp[11:13]))

    df['item_property_list'] = df['item_property_list'].apply(_dedupe_sorted)
    df['predict_category_property'] = df['predict_category_property'].apply(_dedupe_sorted)
    df['predict_category_property'] = df['predict_category_property'].apply(lambda text: list(re.split('[:;]', text)))
    df['predict_category_property'] = df['predict_category_property'].map(del_na)

    df['len_item_property_list'] = df['item_property_list'].apply(_count_parts)
    df['len_predict_category_property'] = df['predict_category_property'].apply(_count_parts)

    encoder = LabelEncoder()
    for pos in (1, 2):
        picked = df['item_category_list'].apply(lambda text, pos=pos: _part_or_blank(text, pos))
        df['item_category_list_bin%d' % pos] = encoder.fit_transform(picked)
    for pos in range(10):
        picked = df['predict_category_property'].apply(lambda text, pos=pos: _part_or_blank(text, pos))
        df['predict_category_property%d' % pos] = encoder.fit_transform(picked)

    return df

def labelencoder(df):
    """Replace the five id columns with integer codes from LabelEncoder (refit per column)."""
    id_columns = ('user_id','item_id','shop_id','item_brand_id','item_city_id')
    encoder = LabelEncoder()
    for column in id_columns:
        df[column] = encoder.fit_transform(df[column])
    return df
        

def text_cosine(df):
    """Add category-match and property-overlap features between item and prediction.

    Columns written into `df` (which is also returned):
        cate_predict_chk  - 1 if the item's deepest category (third level when
                            present, otherwise the last one) occurs as a substring
                            of `predict_category_property`, else 0.
        property_join_cnt - |P & I| / |P | I|
        property_gap1_cnt - |P - I| / |P | I|
        property_gap2_cnt - |I - P| / |P | I|
    where P is the set of value tokens of `predict_category_property` (every
    second token after splitting on ':' and ';') and I is the set of
    ';'-separated entries of `item_property_list`.
    """
    def _deepest_category(categories):
        parts = categories.split(';')
        # parts[-1] equals parts[1] for two levels and also covers a single level,
        # which previously raised IndexError.
        return parts[2] if len(parts) > 2 else parts[-1]

    deepest = df['item_category_list'].apply(_deepest_category)
    df['cate_predict_chk'] = [
        1 if cate in predicted else 0
        for cate, predicted in zip(deepest, df['predict_category_property'])
    ]

    # Build the sets once and iterate in plain Python: this avoids three row-wise
    # DataFrame.apply passes and positional x[0]/x[1] lookups on a labelled row.
    predicted_sets = [set(re.split('[:;]', text)[1::2]) for text in df['predict_category_property']]
    item_sets = [set(re.split('[;]', text)) for text in df['item_property_list']]

    join_ratio, gap1_ratio, gap2_ratio = [], [], []
    for predicted, owned in zip(predicted_sets, item_sets):
        # `owned` always has at least one element (str.split never returns []),
        # so the union is never empty.
        union_size = len(predicted | owned)
        join_ratio.append(len(predicted & owned) * 1.0 / union_size)
        gap1_ratio.append(len(predicted - owned) * 1.0 / union_size)
        gap2_ratio.append(len(owned - predicted) * 1.0 / union_size)

    df['property_join_cnt'] = join_ratio
    df['property_gap1_cnt'] = gap1_ratio
    df['property_gap2_cnt'] = gap2_ratio

    return df

def smooth_ctr(df,base_list,days=range(19,26)):
    """Add Bayesian-smoothed conversion-rate features, one per entry of `base_list`.

    For every key (a column name or list of column names) and every day in
    `days`, labelled rows from strictly earlier days are grouped by the key, a
    Beta prior is fitted with HyperParam.update_from_data_by_FPI, and
    (sum + alpha) / (count + alpha + beta) is stored per group. The per-day
    tables are left-joined onto `df` by key + 'day' as '<key>_smooth_ctr'; rows
    of those days without a matching group receive that day's prior mean
    alpha / (alpha + beta).

    Args:
        df: frame with 'is_trade' (NaN for unlabelled rows), 'day' and the key columns.
        base_list: iterable of column names or lists of column names.
        days: days to build features for; defaults to 19..25 as before.

    Returns:
        DataFrame: a new frame (result of merge) with the added feature columns.
    """
    dfTrain = df.loc[df['is_trade'].notnull()]
    for var in base_list:
        if not isinstance(var,list):
            var = [var]
        feat_name = '_'.join(var) + '_smooth_ctr'
        daily_tables = []
        prior_mean = {}
        for day in days:
            hyper = HyperParam(1,1)
            history = dfTrain.loc[dfTrain['day']<day, var + ['is_trade']]
            # A list of function names works on every pandas version; the
            # dict-renaming form of SeriesGroupBy.agg was removed from pandas.
            stats = history.groupby(var)['is_trade'].agg(['sum','count']).reset_index()
            hyper.update_from_data_by_FPI(stats['count'].tolist(), stats['sum'].tolist(), 1000, 0.00000001)
            stats[feat_name] = (stats['sum'] + hyper.alpha)/(stats['count'] + hyper.alpha + hyper.beta)
            # .copy() so the 'day' assignment below does not write into a slice.
            stats = stats[var + [feat_name]].copy()
            stats['day'] = day
            daily_tables.append(stats)
            prior_mean[day] = hyper.alpha/(hyper.alpha+hyper.beta)
        df = df.merge(pd.concat(daily_tables), how='left', on=var+['day'])
        for day, fallback in prior_mean.items():
            on_day = df['day']==day
            # Assign the filled values back: fillna(inplace=True) on the result of
            # df.loc[...] only modified a temporary copy and left the NaNs in df.
            df.loc[on_day, feat_name] = df.loc[on_day, feat_name].fillna(fallback)
    return df
    
    
    
def same_day_trick(df,key_var=[]):
    """Flag whether other records of the same key exist on the same day.

    Within each (key_var..., day) group, by `context_timestamp`:
        <key>_before_exist   - 1 if some record has an earlier timestamp
        <key>_after_exist    - 1 if some record has a later timestamp
        <key>_sametime_exist - 1 if another record shares the timestamp
    Column names use the key columns joined with '~'. Columns are written into
    `df`, which is returned.
    """
    keys = key_var if isinstance(key_var, list) else [key_var]
    prefix = '~'.join(keys)
    # Records earlier / later on the same day
    per_day_ts = df.groupby(keys + ['day'])['context_timestamp']
    first_rank = per_day_ts.rank(method='min')
    last_rank = per_day_ts.rank(method='max')
    reverse_rank = per_day_ts.rank(method='min', ascending=False)
    df[prefix+'_before_exist'] = ((first_rank - 1) > 0).astype(int)
    df[prefix+'_after_exist'] = ((reverse_rank - 1) > 0).astype(int)
    df[prefix+'_sametime_exist'] = ((last_rank - first_rank) > 0).astype(int)
    return df

def focus_one_record(df,key_var=[],time_diff=False):
    """Add per-day position ratios (and optionally time gaps) for each key.

    Within each (key_var..., day) group, ordered by `context_timestamp`, this
    computes how many records come before / after / at the same timestamp,
    joins the group's record count as '<key>_day_cnt' and stores each count
    divided by it as '<key>_{before,after,sametime}_cnt_ratio'. With
    `time_diff=True` it also adds '<key>_next_time_dur' and
    '<key>_last_time_dur' from self-joins on the count columns. The three raw
    count columns are deleted before returning.

    Args:
        df: frame with 'context_timestamp', 'day', 'time' and the key columns.
        key_var: column name or list of column names (joined with '_' in output names).
        time_diff: whether to compute the two duration columns.

    Returns:
        DataFrame: a new frame produced by the merges.
    """
    if not isinstance(key_var,list):
        key_var = [key_var]
    nameBase = '_'.join(key_var)
    ### Records before / after on the same day
    # rank(method='min') - 1 == number of records with a strictly earlier timestamp
    df[nameBase+'_before_cnt'] = df.groupby(key_var+['day'])['context_timestamp'].rank(method='min') - 1
    df[nameBase+'_after_cnt'] = df.groupby(key_var+['day'])['context_timestamp'].rank(method='min',ascending=False)- 1
    # max-rank minus min-rank plus one == number of records sharing this timestamp (self included)
    df[nameBase+'_sametime_cnt'] = df.groupby(key_var+['day'])['context_timestamp'].rank(method='max') - df.groupby(key_var+['day'])['context_timestamp'].rank(method='min')+1
    # NOTE(review): dict-renaming in SeriesGroupBy.agg is not accepted by newer pandas releases — confirm against the installed version.
    df = df.merge(df.groupby(key_var+['day'],as_index=False)['context_timestamp'].agg({nameBase+'_day_cnt':'count'}),'inner',key_var+['day'])
    for feat in ['_before_cnt','_after_cnt','_sametime_cnt']:
        df[nameBase+feat+'_ratio'] = df[nameBase+feat]*1.0/df[nameBase+'_day_cnt']
                      
    if time_diff:
        ### Time gap to the neighbouring ad impressions
        dfTmp = df[[nameBase+'_before_cnt',nameBase+'_after_cnt',nameBase+'_sametime_cnt','time']+key_var+['day']]
        # NOTE(review): dfTmp is a column slice of df; the in-place rename and the
        # two assignments below may trigger SettingWithCopyWarning.
        dfTmp.rename(columns={'time':'new_time'},inplace=True)
        # NOTE(review): these keys are matched against another row's before_cnt / after_cnt;
        # confirm that the "+ 1" offset selects the intended adjacent record and not the one after it.
        dfTmp['next_record'] = dfTmp[nameBase+'_before_cnt'] + dfTmp[nameBase+'_sametime_cnt'] + 1
        dfTmp['last_record'] = dfTmp[nameBase+'_after_cnt'] + dfTmp[nameBase+'_sametime_cnt'] + 1
        # NOTE(review): when several dfTmp rows share a next_record value this left join can multiply rows of df.
        df = df.merge(dfTmp[key_var+['day','next_record','new_time']],'left',left_on = key_var+['day',nameBase+'_before_cnt'],right_on = key_var+['day','next_record'])
        # .dt.seconds is the seconds component of the timedelta, not total_seconds()
        df[nameBase + '_next_time_dur'] = (pd.to_datetime(df['time'])-pd.to_datetime(df['new_time'])).dt.seconds
        # rows without a matching record get the sentinel 999999
        df[nameBase + '_next_time_dur'].fillna(999999,inplace=True)
        # records that share their timestamp with another record get a gap of 0
        df.loc[df[nameBase+'_sametime_cnt']>1,nameBase + '_next_time_dur'] = 0
        del df['new_time']
        del df['next_record']

        df = df.merge(dfTmp[key_var+['day','last_record','new_time']],'left',left_on = key_var+['day',nameBase+'_after_cnt'],right_on = key_var+['day','last_record'])
        df[nameBase + '_last_time_dur'] = (pd.to_datetime(df['new_time'])-pd.to_datetime(df['time'])).dt.seconds
        df[nameBase + '_last_time_dur'].fillna(999999,inplace=True)
        df.loc[df[nameBase+'_sametime_cnt']>1,nameBase + '_last_time_dur'] = 0
        del df['new_time']
        del df['last_record']
    # only the ratio / duration / day_cnt columns are kept
    for feat in ['_before_cnt','_after_cnt','_sametime_cnt']:
        del df[nameBase+feat]
    return df    
    

def _offline_feat(df,key_var='user_id',stat_var=[],part_var=[],mean_var=[],train_feat_col=None):
    if not isinstance(key_var,list):
        key_var = [key_var]
    left_key = key_var.copy()
    base_name = '~'.join(key_var)
    if train_feat_col:
        key_var.append(train_feat_col[1])
        left_key.append(train_feat_col[0])
    df = df.merge(df.groupby(key_var,as_index=False)['instance_id'].agg({base_name+'_cnt':'count'}).rename(columns={train_feat_col[1]:train_feat_col[0]}),'left',left_key)
    df = df.merge(df.groupby(key_var,as_index=False)['is_trade'].agg({base_name+'_trade_cnt':'sum',base_name+'_trade_ratio':'mean'}).rename(columns={train_feat_col[1]:train_feat_col[0]}),'left',left_key)
    df[base_name+'_notrade_cnt'] = df[base_name+'_cnt']-df[base_name+'_trade_cnt']
    dfTmp = df.loc[df['is_trade']==1]
    for stat in stat_var:
        df = df.merge(df.groupby(key_var,as_index=False)[stat].agg({base_name+'_'+stat+'_min':'min',base_name+'_'+stat+'_max':'max'}).rename(columns={train_feat_col[1]:train_feat_col[0]}),'left',left_key)    
    for part in part_var:
        df = df.merge(df.groupby(key_var,as_index=False)[part].agg({base_name+'_'+part+'_cnt':'nunique'}).rename(columns={train_feat_col[1]:train_feat_col[0]}),'left',left_key)
        df = df.merge(dfTmp.groupby(key_var,as_index=False)[part].agg({base_name+'_'+part+'_trade_cnt':'nunique'}).rename(columns={train_feat_col[1]:train_feat_col[0]}),'left',left_key)
        df[base_name+'_'+part+'_trade_cnt'].fillna(0,inplace=True)
        df[base_name+'_'+part+'_trade_ratio'] = 1.0*df[base_name+'_'+part+'_trade_cnt']/df[base_name+'_'+part+'_cnt']
    for var in mean_var:
        df = df.merge(df.groupby(key_var+[var],as_index=False)['is_trade'].sum().groupby(key_var,as_index=False)['is_trade'].agg({base_name+'_'+var+'_avg_trade':'mean'}).rename(columns={train_feat_col[1]:train_feat_col[0]}),'left',left_key)
    return df








def map_col(df,drop=False):
    """Bucket or remap a fixed set of columns into '<col>_mapped'.

    List specs are thresholds: the mapped value is the 1-based position of the
    last threshold the value exceeds (0 when it exceeds none). Dict specs
    replace the listed values and leave every other value unchanged. With
    `drop=True` the mapped values overwrite the source column and the
    '_mapped' column is removed.
    """
    map_dict = {
        'item_price_level':[4,5,6,7,8,9],
        'item_sales_level':[4,6,9,10,11,12,13,14,16],
        'item_pv_level':[6,9,10,11,12,13,14,15,16,17,18,19,20],
        'user_age_level':[1001,1002,1003,1004,1005],
        'context_page_id':[4001,4002,4004,4006,4008,4010,4013,4016,4018],
        'shop_review_num_level':[5,9,14,15,16,17,18,20,21],
        'hour':[6,9,12,17,20],
        'user_occupation_id':{-1:2003},
        'user_star_level':{-1:3000}
    }
    for column, spec in map_dict.items():
        mapped = column + '_mapped'
        if isinstance(spec, list):
            df[mapped] = 0
            # later (larger) thresholds overwrite earlier ones
            for level, edge in enumerate(spec, start=1):
                df.loc[df[column] > edge, mapped] = level
        else:
            df[mapped] = df[column].apply(lambda raw: spec.get(raw, raw))
        if drop:
            df[column] = df[mapped]
            del df[mapped]
    return df



def cross_feat_plus(df,base_list,order=2):
    """Add the row-wise sum of every `order`-sized combination of `base_list` columns.

    Each new column is named by joining the combined column names with '~' and
    appending '_plus'. `order` below 2 returns `df` unchanged.

    The combinations are generated directly instead of filtering the full
    powerset, which grew as 2**len(base_list).
    """
    if order<2:
        return df
    for combo in combinations(base_list, order):
        cols = list(combo)
        df['~'.join(cols)+'_plus'] = df[cols].sum(axis=1)
    return df


def interaction_ratio(df,base_list=[],cal_list=[],rank_list = []):
    """Add share and rank-ratio features of `cal_list` / `rank_list` within each base group.

    For every base key B (column name or list of names):
      * for every C in `cal_list` (C != B, and B != ['cnt_rec']):
        '<B>~<C>_ratio' = rows sharing (B, C) / rows sharing B
      * for every R in `rank_list` (R != B):
        '<B>~<R>_rank_ratio' = min-rank of R within B / rows sharing B
    Row counts are counts of 'instance_id'. If a '<B>_cnt' column already exists
    it is used as the denominator and is left in place; a count column created
    here is removed again.

    Raises:
        ValueError: if a `rank_list` entry names more than one column.

    Returns:
        DataFrame: a new frame with the added columns.
    """
    def _as_list(value):
        return value if isinstance(value, list) else [value]

    def _with_count(frame, keys, name):
        # Portable replacement for the dict-renaming agg form.
        counts = frame.groupby(keys)['instance_id'].count().rename(name).reset_index()
        return frame.merge(counts, how='left', on=keys)

    for base_var in base_list:
        base_var = _as_list(base_var)
        base_cnt = '_'.join(base_var)+'_cnt'
        created_cnt = base_cnt not in df.columns
        if created_cnt:
            df = _with_count(df, base_var, base_cnt)
        print('ratio part')
        for cal_var in cal_list:
            cal_var = _as_list(cal_var)
            if cal_var==base_var or base_var==['cnt_rec']:
                continue
            nameBase = '_'.join(base_var)+'~'+'_'.join(cal_var)
            print(nameBase)
            df = _with_count(df, base_var+cal_var, nameBase+'_cnt')
            df[nameBase+'_ratio'] = df[nameBase+'_cnt']*1.0/df[base_cnt]
            del df[nameBase+'_cnt']

        print('rank part')
        for rank_var in rank_list:
            rank_var = _as_list(rank_var)
            if rank_var==base_var:
                continue
            if len(rank_var) != 1:
                raise ValueError('rank_list entries must name exactly one column')
            nameBase = '_'.join(base_var)+'~'+'_'.join(rank_var)
            print(nameBase)
            # Rank within the frame being processed; the original read the
            # notebook-level `dfAll` here instead of `df`.
            ranks = df.groupby(base_var)[rank_var[0]].rank(method='min')
            df[nameBase+'_rank_ratio'] = ranks*1.0/df[base_cnt]
        if created_cnt:
            # Only remove the helper column if this call added it.
            del df[base_cnt]
    return df
            


<font color=#0099ff size=5 face="黑体">读取数据</font>

In [None]:
# Load the raw train / test tables; exact duplicate training rows are dropped.
dfTrain = pd.read_table(config.TRAIN_FILE,sep=' ')
dfTrain = dfTrain.drop_duplicates().reset_index(drop=True)
dfTest = pd.read_table(config.TEST_FILE,sep=' ')
trainNum = dfTrain.shape[0]
trade_dtype = dfTrain['is_trade'].dtype

# instance ids that must appear in the final submission
idSubmit = pd.read_table(config.TEST_FILE_NEW,sep=' ')['instance_id'].tolist()

# process() fits LabelEncoders internally. Running it once on the stacked
# train+test frame gives both splits the same integer codes; encoding the two
# frames separately assigned different codes to the same category string.
dfAll = pd.concat([dfTrain,dfTest],axis=0).reset_index(drop=True)
dfAll = process(dfAll)

# Processed per-split frames for later cells (day-based split, labels, ids).
# Train rows come first in dfAll, so dfTrain keeps index 0..trainNum-1.
dfTrain = dfAll.iloc[:trainNum].copy()
dfTrain['is_trade'] = dfTrain['is_trade'].astype(trade_dtype)  # concat turned it into float
dfTest = dfAll.iloc[trainNum:].reset_index(drop=True)

dfAll['cnt_rec'] = 1
print(dfAll.shape)

dfAll = labelencoder(dfAll)
dfAll = text_cosine(dfAll)

dfAll.shape

<font color=#0099ff size=5 face="黑体">特征工程</font>

In [7]:
### Single-feature bucketing / remapping
dfAll = map_col(dfAll, drop=True)
print(dfAll.shape)
featBase = [col for col in dfAll.columns.tolist() if col not in config.IGNORE_COLS]

(539370, 49)


In [8]:
### Smoothed CTR features
# (use keyList = ['item_id'] for a quick single-key run)
keyList = config.CATEGORICAL_COLS

if not os.path.exists('../../Data/advertisment/Cache/smooth_new.csv'):
    # No cache on disk: compute the features. This cell never writes the cache
    # file itself, so it has to be exported separately.
    dfAll = smooth_ctr(dfAll,keyList)
else:
    # Cached feature columns are appended side by side (row order must match dfAll).
    dfAll = pd.concat([dfAll,pd.read_csv('../../Data/advertisment/Cache/smooth_new.csv')],axis=1)

print(dfAll.shape)

(539370, 67)


In [11]:
### Offline feature set
# feat_set = day + 1: statistics grouped by feat_set are joined back on 'day',
# so each row receives aggregates computed from the previous day's rows.
dfAll['feat_set'] = dfAll['day'] + 1
keyList = ['user_id','shop_id','item_id','hour','item_category_list_bin1']
partList = [
    ['item_id','shop_id'],
    ['user_id','item_id'],
    ['user_id','shop_id'],
    ['user_id','item_id','shop_id'],
    ['user_id','item_id','shop_id']
]
meanList = [
    ['shop_id'],
    ['item_id'],
    [],
    ['user_id','shop_id','item_id'],
    ['user_id','shop_id','item_id']
]
for i, (keyVar, partVar, meanVar) in enumerate(zip(keyList, partList, meanList)):
    statVar = []
    if isinstance(keyVar,str):
        # min/max statistics over every STAT_DICT entry except the key's own
        for key,value in config.STAT_DICT.items():
            if key!=keyVar:
                statVar += value
    dfAll = _offline_feat(dfAll,keyVar,statVar,partVar,meanVar,['day','feat_set'])
del dfAll['feat_set']
print(dfAll.shape)

(539370, 237)


In [12]:
### Cross features over continuous (level-type) variables
conList = [
    'user_gender_id','user_age_level', 'user_star_level',
    'item_price_level', 'item_sales_level','item_collected_level', 'item_pv_level',
    'context_page_id',
    'shop_review_num_level','shop_star_level'
]
# sums over every pair, then every triple, of the columns above
for order in (2, 3):
    dfAll = cross_feat_plus(dfAll,conList,order=order)
print(dfAll.shape)

(539370, 402)


In [13]:
### Same-day information "trick" features
# For each key, flag whether the same key has earlier / later / simultaneous
# records on the same day (see same_day_trick).
keyList = ['user_id',['user_id','shop_id'],['user_id','item_category_list_bin1']]
# other candidate keys, currently unused: 'shop_id','item_id','item_city_id','item_brand_id'
#,'shop_id','item_id','item_city_id','item_brand_id'
for keyVar in keyList:
    dfAll = same_day_trick(dfAll,keyVar)
print(dfAll.shape)

(539370, 411)


In [14]:
### Pairwise ratio / rank-order features between categorical variables
# The triple-quoted block below is a disabled (string-literal) copy of the
# interaction_ratio call; it is not executed. This cell only loads a cached CSV.
'''baseList = [
    'cnt_rec',
    'user_id','user_gender_id', 'user_occupation_id','user_age_level', 'user_star_level',
    'item_id', 'item_brand_id', 'item_city_id', 'item_price_level', 'item_sales_level','item_collected_level', 'item_pv_level',
    'item_category_list_bin1','item_category_list_bin2',
    'shop_id', 'shop_review_num_level','shop_star_level'
    
]

calList = [
    'user_id','user_gender_id', 'user_occupation_id','item_id', 'item_brand_id', 'item_city_id',
    'item_category_list_bin1','item_category_list_bin2','shop_id'
]
rankList = [
    'user_age_level', 'user_star_level','item_price_level', 'item_sales_level','item_collected_level', 'item_pv_level','shop_review_num_level','shop_star_level'
]

dfAll = interaction_ratio(dfAll,baseList,calList,rankList)'''
# NOTE(review): the cache is read unconditionally (no os.path.exists guard as in
# the smoothed-CTR cell) and is appended column-wise, which presumably relies on
# its rows being in the same order as dfAll — confirm how the file was produced.
dfCross = pd.read_csv('../../Data/advertisment/Cache/ratio_rank_new.csv')
dfAll = pd.concat([dfAll,dfCross],axis=1)
del dfCross

print(dfAll.shape)

(539370, 691)


<font color=#0099ff size=5 face="黑体">拆分样本</font>

In [15]:
# Model inputs: every column not listed in config.IGNORE_COLS.
features = [col for col in dfAll.columns.tolist() if col not in config.IGNORE_COLS]

# Time-based split on the training rows: days 19-23 train, day 24 validation.
train_idx = dfTrain.loc[(dfTrain['day']>18)&(dfTrain['day']<24)].index
valid_idx = dfTrain.loc[dfTrain['day']==24].index

Xi_train_ = dfAll.loc[list(train_idx),features]
y_train_ = dfTrain.loc[train_idx,'is_trade']
Xi_valid_ = dfAll.loc[list(valid_idx),features]
y_valid_ = dfTrain.loc[valid_idx,'is_trade']
# rows from label trainNum onwards are the test rows appended after training data
Xi_test_ = dfAll.loc[trainNum:,features]

del dfAll

<font color=#0099ff size=5 face="黑体">模型</font>

In [43]:
# Validation run: fit with early stopping on the day-24 hold-out and keep the
# best iteration count for the final refit.
gbm_params = dict(
    boosting_type='gbdt',
    num_leaves=40,
    max_depth=8,
    n_estimators=20000,
    n_jobs=20,
    learning_rate=0.05,
    colsample_bytree=0.8,
    subsample=0.9,
    max_bin=20,
)
clf = lgb.LGBMClassifier(**gbm_params)
clf.fit(
    Xi_train_[features], y_train_,
    eval_set=[(Xi_valid_[features], y_valid_)],
    feature_name=features,
    categorical_feature=[],
    early_stopping_rounds=100,
)
# candidate categorical features, currently not passed:
#[i for i in ['item_category_list_bin1','item_category_list_bin2'] if i in features]
y_score_ = clf.predict_proba(Xi_valid_[features])[:, 1]

importance_ranking = pd.Series(clf.feature_importances_, features).sort_values(ascending=False).reset_index()
print(importance_ranking)
print(log_loss(y_valid_, y_score_))
print(ks_metric(y_valid_, y_score_))
bstIter = clf.best_iteration_



[1]	valid_0's binary_logloss: 0.647714
Training until validation scores don't improve for 100 rounds.
[2]	valid_0's binary_logloss: 0.606577
[3]	valid_0's binary_logloss: 0.569204
[4]	valid_0's binary_logloss: 0.535128
[5]	valid_0's binary_logloss: 0.503918
[6]	valid_0's binary_logloss: 0.475229
[7]	valid_0's binary_logloss: 0.448834
[8]	valid_0's binary_logloss: 0.4245
[9]	valid_0's binary_logloss: 0.401966
[10]	valid_0's binary_logloss: 0.381156
[11]	valid_0's binary_logloss: 0.361807
[12]	valid_0's binary_logloss: 0.343846
[13]	valid_0's binary_logloss: 0.327149
[14]	valid_0's binary_logloss: 0.311554
[15]	valid_0's binary_logloss: 0.297049
[16]	valid_0's binary_logloss: 0.283525
[17]	valid_0's binary_logloss: 0.270897
[18]	valid_0's binary_logloss: 0.259056
[19]	valid_0's binary_logloss: 0.247996
[20]	valid_0's binary_logloss: 0.237636
[21]	valid_0's binary_logloss: 0.227932
[22]	valid_0's binary_logloss: 0.218848
[23]	valid_0's binary_logloss: 0.210353
[24]	valid_0's binary_loglos

[199]	valid_0's binary_logloss: 0.0795885
[200]	valid_0's binary_logloss: 0.0795977
[201]	valid_0's binary_logloss: 0.0796024
[202]	valid_0's binary_logloss: 0.0796086
[203]	valid_0's binary_logloss: 0.0796056
[204]	valid_0's binary_logloss: 0.0796014
[205]	valid_0's binary_logloss: 0.0795957
[206]	valid_0's binary_logloss: 0.0795982
[207]	valid_0's binary_logloss: 0.0796017
[208]	valid_0's binary_logloss: 0.0796019
[209]	valid_0's binary_logloss: 0.0795985
[210]	valid_0's binary_logloss: 0.079608
[211]	valid_0's binary_logloss: 0.0796156
[212]	valid_0's binary_logloss: 0.0796131
[213]	valid_0's binary_logloss: 0.0796208
[214]	valid_0's binary_logloss: 0.0796296
[215]	valid_0's binary_logloss: 0.0796382
[216]	valid_0's binary_logloss: 0.0796335
[217]	valid_0's binary_logloss: 0.079641
[218]	valid_0's binary_logloss: 0.0796395
[219]	valid_0's binary_logloss: 0.0796424
[220]	valid_0's binary_logloss: 0.0796603
[221]	valid_0's binary_logloss: 0.0796638
[222]	valid_0's binary_logloss: 0.07

In [41]:
# Feature importances sorted descending; after reset_index the frame has the
# feature name in column 'index' and the importance in column 0.
xx = pd.Series(clf.feature_importances_, features).sort_values(ascending=False).reset_index()
# number of features with zero importance
(xx[0]==0).sum()

3

In [42]:
# Keep only features with non-zero importance.
# NOTE(review): this overwrites `features`, and the recorded execution counts
# show it ran before the training cell above — the displayed training log was
# presumably produced with the pruned list; confirm the intended cell order.
features = xx.loc[xx[0]>0,'index'].tolist()

In [None]:
def score_change(score,base_rate,real_rate):
    """Re-calibrate probabilities from `base_rate` to `real_rate` via a log-odds shift.

    Note: this redefines the helper of the same name from the utility cell near
    the top of the notebook with the same formula.
    """
    logit_gap = np.log(base_rate/(1-base_rate)) - np.log(real_rate/(1-real_rate))
    adjusted_odds = np.exp(np.log(score/(1-score)) - logit_gap)
    score_adj = adjusted_odds/(adjusted_odds+1)
    return score_adj

In [None]:
# Xi_train_ and Xi_valid_ are intentionally NOT deleted here: the final-fit cell
# below concatenates them into the full training set and releases them itself.
# Deleting them at this point made that cell fail with NameError on a
# top-to-bottom run.

In [None]:
# NOTE(review): Xi_finnal_ is first assigned in the following cell, so on a
# fresh top-to-bottom run this cell raises NameError; it only works when
# executed after the final-fit cell.
Xi_finnal_.shape

In [None]:
# Final model: refit on train + validation rows with the tree count found by
# early stopping (bstIter), then score the test rows.
Xi_finnal_ = pd.concat([Xi_train_,Xi_valid_])
y_finnal_ = pd.concat([y_train_,y_valid_])
del Xi_train_, Xi_valid_

clf = lgb.LGBMClassifier(
    num_leaves=40,
    max_depth=8,
    n_estimators=bstIter,
    n_jobs=20,
    learning_rate=0.05,
    colsample_bytree=0.8,
    subsample=0.9,
    max_bin=20,
)
clf.fit(
    Xi_finnal_[features], y_finnal_,
    feature_name=features,
    categorical_feature=[],
)
# candidate categorical features, currently not passed:
#[i for i in ['item_category_list_bin1','item_category_list_bin2'] if i in features]
y_test_meta = np.zeros((dfTest.shape[0], 1), dtype=float)
y_test_meta[:,0] += clf.predict_proba(Xi_test_[features])[:,1]
submit = pd.DataFrame({'instance_id':dfTest['instance_id'],'predicted_score':y_test_meta[:,0]})
#submit.to_csv('../../Submission/advertisement/gbm_trick_0330.txt', sep=" ", index=False, line_terminator='\n')

In [None]:
# Assign rather than accumulate: y_test_meta already holds these predictions
# after the final-fit cell, so `+=` doubled every score on a top-to-bottom run.
y_test_meta[:,0] = clf.predict_proba(Xi_test_[features])[:,1]
submit = pd.DataFrame({'instance_id':dfTest['instance_id'],'predicted_score':y_test_meta[:,0]})

In [None]:
# Keep only the instance ids required in the submission. The explicit copy makes
# `submit` an independent frame, so later cells can assign to its columns
# without SettingWithCopyWarning.
submit = submit.loc[submit['instance_id'].isin(idSubmit)].copy()
# mean predicted score of the rows being submitted
submit['predicted_score'].mean()

In [None]:
# mean of the labels used for the final fit, shown for comparison with the
# mean predicted score in the previous cell
y_finnal_.mean()

In [None]:
# Write the space-separated submission file.
# NOTE(review): newer pandas releases spell this keyword `lineterminator`;
# confirm against the installed pandas version.
submit.to_csv('../../Submission/advertisement/gbm_trick_testb_419.txt', sep=" ", index=False, line_terminator='\n')

In [None]:
# All-zero baseline submission. It is written from a temporary frame so that
# `submit` keeps the model scores: zeroing it in place made the calibration cell
# below operate on zeros and produce NaN.
submit.assign(predicted_score=0).to_csv('../../Submission/advertisement/gbm_trick_testb_418.txt', sep=" ", index=False, line_terminator='\n')

In [None]:
# Shift the scores so that their base rate moves from the current mean
# prediction to 0.018116956 (presumably the expected positive rate of the
# target set — TODO confirm the source of this constant).
# NOTE(review): this overwrites submit['predicted_score'], so re-running the
# cell applies the shift again.
submit['predicted_score'] = score_change(submit['predicted_score'],submit['predicted_score'].mean(),0.018116956)
print(submit['predicted_score'].mean())
submit.to_csv('../../Submission/advertisement/gbm_trick_testb_adj_419.txt', sep=" ", index=False, line_terminator='\n')

In [None]:
# NOTE(review): replaces `submit` with a previously saved file that no cell in
# this notebook writes ('gbm_trick_text_417.txt'); confirm the file name and
# whether this cell is still needed.
submit = pd.read_csv('../../Submission/advertisement/gbm_trick_text_417.txt',sep=" ")