In [None]:
import gc
import math
import os
import random
import re
import time
from collections import Counter
from itertools import chain, combinations
from math import log

import lightgbm as lgb
import numpy as np
import pandas as pd
import scipy.special as special
from numba import jit
from scipy.sparse import csr_matrix, hstack
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import log_loss,roc_curve
from sklearn.preprocessing import LabelEncoder,OneHotEncoder,StandardScaler

import config

In [None]:
def get_cosine(vec1, vec2):
    """Return the cosine similarity of two token sequences.

    Each argument is an iterable of hashable tokens; it is converted to a
    bag-of-words ``Counter`` and the cosine of the two count vectors is
    returned.  Relies on ``math`` and ``collections.Counter`` being imported
    at the top of the notebook.

    Returns:
        float: similarity in [0, 1]; 0.0 when either input is empty.
    """
    counts1 = Counter(vec1)
    counts2 = Counter(vec2)
    # Only tokens present in both bags contribute to the dot product.
    shared = counts1.keys() & counts2.keys()
    numerator = sum(counts1[tok] * counts2[tok] for tok in shared)
    norm1 = math.sqrt(sum(cnt * cnt for cnt in counts1.values()))
    norm2 = math.sqrt(sum(cnt * cnt for cnt in counts2.values()))
    denominator = norm1 * norm2

    # A zero norm means one side had no tokens: similarity is defined as 0.
    if not denominator:
        return 0.0
    return float(numerator) / denominator
            
def timestamp_datetime(value):
    """Format a Unix timestamp (seconds) as 'YYYY-MM-DD HH:MM:SS' in local time."""
    local_struct = time.localtime(value)
    return time.strftime('%Y-%m-%d %H:%M:%S', local_struct)

def time_feat(df,featList,featName):
    """Add `featName`: the rank of `context_timestamp` within each `featList` group.

    Ties are broken by order of appearance (``method='first'``).  The frame is
    modified in place and also returned.
    """
    grouped_ts = df.groupby(featList)['context_timestamp']
    df[featName] = grouped_ts.rank(method='first')
    return df

def powerset(iterable):
    """Return every subset of `iterable` as a list of tuples, smallest subsets first."""
    items = list(iterable)
    subsets = []
    for size in range(len(items) + 1):
        subsets.extend(combinations(items, size))
    return subsets

def del_na(lst):
    """Re-join a flat ``[key, value, key, value, ...]`` list, dropping missing values.

    Pairs whose value is the string ``'-1'`` are removed; the remaining pairs
    are rendered as ``'key:value'`` and joined with ``';'``.

    Args:
        lst: flat list of strings alternating key and value.

    Returns:
        str: the joined pairs, or ``''`` when nothing survives (including
        inputs with fewer than two elements).
    """
    # Zipping the even and odd slices pairs each key with its value and simply
    # ignores a trailing key that has no value, instead of raising IndexError.
    pairs = zip(lst[0::2], lst[1::2])
    return ';'.join(key + ':' + val for key, val in pairs if val != '-1')

def ks_metric(true,score):
    """Return the KS statistic: the largest gap between TPR and FPR on the ROC curve."""
    false_pos_rate, true_pos_rate, _ = roc_curve(true,score)
    separation = true_pos_rate - false_pos_rate
    return max(separation)

def score_change(score,base_rate,real_rate):
    """Shift predicted probabilities from one base rate to another in log-odds space.

    The difference between the log-odds of `base_rate` and `real_rate` is
    subtracted from the log-odds of `score`, and the result is mapped back to
    a probability.
    """
    logit_shift = np.log(base_rate/(1-base_rate)) - np.log(real_rate/(1-real_rate))
    shifted_odds = np.exp(np.log(score/(1-score)) - logit_shift)
    return shifted_odds/(shifted_odds+1)

In [None]:
class HyperParam(object):
    """Beta(alpha, beta) prior estimated from (tries, success) counts.

    The fitted ``alpha`` and ``beta`` are read by callers to smooth observed
    rates as ``(success + alpha) / (tries + alpha + beta)``.
    """

    def __init__(self, alpha, beta):
        # Initial guess for the prior; both estimators overwrite these.
        self.alpha = alpha
        self.beta = beta

    def update_from_data_by_FPI(self, tries, success, iter_num, epsilon):
        '''Estimate alpha, beta using fixed point iteration.

        Args:
            tries: sequence of trial counts, one per group.
            success: sequence of success counts, aligned with `tries`.
            iter_num: maximum number of iterations.
            epsilon: stop once both parameters move by less than this.

        With no usable data (empty input or a zero denominator) the current
        alpha/beta are left unchanged.
        '''
        for _ in range(iter_num):
            new_alpha, new_beta = self.__fixed_point_iteration(tries, success, self.alpha, self.beta)
            converged = abs(new_alpha-self.alpha)<epsilon and abs(new_beta-self.beta)<epsilon
            # Keep the newest iterate even on the converging step.
            self.alpha = new_alpha
            self.beta = new_beta
            if converged:
                break

    def __fixed_point_iteration(self, tries, success, alpha, beta):
        '''One fixed point iteration step; returns the updated (alpha, beta).'''
        tries_arr = np.asarray(tries, dtype=float)
        success_arr = np.asarray(success, dtype=float)
        if tries_arr.size == 0:
            # Nothing to learn from: keep the prior as is.
            return alpha, beta

        # digamma is vectorised, so each sum is one array call instead of a
        # Python-level loop over every group.
        sumfenzialpha = float(np.sum(special.digamma(success_arr+alpha) - special.digamma(alpha)))
        sumfenzibeta = float(np.sum(special.digamma(tries_arr-success_arr+beta) - special.digamma(beta)))
        sumfenmu = float(np.sum(special.digamma(tries_arr+alpha+beta) - special.digamma(alpha+beta)))

        if sumfenmu == 0.0:
            # Happens when every group has zero tries; the update is undefined.
            return alpha, beta
        return alpha*(sumfenzialpha/sumfenmu), beta*(sumfenzibeta/sumfenmu)

    def update_from_data_by_moment(self, tries, success):
        '''Estimate alpha, beta using moment estimation.

        Groups with zero tries are ignored.  When fewer than two usable groups
        remain the sample variance is undefined and alpha/beta are left
        unchanged.
        '''
        moments = self.__compute_moment(tries, success)
        if moments is None:
            return
        mean, var = moments
        # The 0.000001 offsets keep the estimate finite when mean is 0 or 1
        # or the variance is 0.
        self.alpha = (mean+0.000001) * ((mean+0.000001) * (1.000001 - mean) / (var+0.000001) - 1)
        self.beta = (1.000001 - mean) * ((mean+0.000001) * (1.000001 - mean) / (var+0.000001) - 1)

    def __compute_moment(self, tries, success):
        '''Return (mean, unbiased variance) of per-group rates, or None if undefined.'''
        ctr_list = [float(hit)/total for hit, total in zip(success, tries) if total]
        if len(ctr_list) < 2:
            return None
        mean = sum(ctr_list)/len(ctr_list)
        var = sum(pow(ctr-mean, 2) for ctr in ctr_list)

        return mean, var/(len(ctr_list)-1)

In [None]:
def process(df):
    """Derive time, list-length and category-level columns from a raw frame.

    Adds `day`, `hour`, `min` (sliced out of the formatted timestamp),
    `len_item_property_list`, `len_predict_category_property` and the
    label-encoded `item_category_list_bin1` / `item_category_list_bin2`, then
    drops the raw text columns.  The frame is modified in place and returned.
    """
    def _dedup_sorted(raw):
        # Normalise a ';'-separated list: unique entries in sorted order.
        return ';'.join(sorted(set(str(raw).split(';'))))

    def _count_parts(text):
        return len(str(text).split(';'))

    df['time'] = df['context_timestamp'].apply(timestamp_datetime)
    # The formatted string is 'YYYY-MM-DD HH:MM:SS'; slice the fields out.
    df['day'] = df['time'].apply(lambda stamp: int(stamp[8:10]))
    df['hour'] = df['time'].apply(lambda stamp: int(stamp[11:13]))
    df['min'] = df['time'].apply(lambda stamp: int(stamp[14:16]))

    df['item_property_list'] = df['item_property_list'].apply(_dedup_sorted)
    # Normalise, split into a flat key/value token list, then drop '-1' values.
    df['predict_category_property'] = df['predict_category_property'].apply(
        lambda raw: del_na(list(re.split('[:;]', _dedup_sorted(raw)))))
    df['len_item_property_list'] = df['item_property_list'].apply(_count_parts)
    df['len_predict_category_property'] = df['predict_category_property'].apply(_count_parts)

    encoder = LabelEncoder()
    for level in (1, 2):
        # Category at position `level`, or '' when the list is shorter.
        nth_category = df['item_category_list'].apply(
            lambda cats: cats.split(';')[level] if len(cats.split(';'))>level else '')
        df['item_category_list_bin%d'%level] = encoder.fit_transform(nth_category)

    for raw_col in ('time','predict_category_property','item_property_list','item_category_list'):
        del df[raw_col]
    return df

def labelencoder(df):
    """Replace the five id columns with integer label codes, in place; returns `df`."""
    id_columns = ('user_id','item_id','shop_id','item_brand_id','item_city_id')
    encoder = LabelEncoder()
    for column in id_columns:
        # fit_transform refits the encoder, so each column gets its own code space.
        encoded = encoder.fit_transform(df[column])
        df[column] = encoded
    return df

def text_cosine(df):
    """Add category-match and property-overlap features, in place.

    Columns added:
        cate_predict_chk   1 if the item's category (third level when present,
                           otherwise the deepest available) occurs as a
                           substring of `predict_category_property`, else 0.
        property_join_cnt  |P & I| / |P | I|
        property_gap1_cnt  |P - I| / |P | I|
        property_gap2_cnt  |I - P| / |P | I|
    where P is the set of value tokens of `predict_category_property` (every
    second token after splitting on ':' and ';') and I is the set of entries
    of `item_property_list`.

    Returns:
        The same DataFrame.
    """
    def _leaf_category(cat_list):
        cats = cat_list.split(';')
        # Index 2 when present, else the last level; never out of range.
        return cats[min(2, len(cats) - 1)]

    leaf_cates = df['item_category_list'].apply(_leaf_category)
    df['cate_predict_chk'] = [
        1 if cate in predicted else 0
        for cate, predicted in zip(leaf_cates, df['predict_category_property'])
    ]

    predicted_sets = [set(re.split('[:;]', text)[1::2]) for text in df['predict_category_property']]
    item_sets = [set(re.split('[;]', text)) for text in df['item_property_list']]

    join_ratio, gap1_ratio, gap2_ratio = [], [], []
    # Plain iteration avoids positional x[0]/x[1] access on a label-indexed
    # row, and the temporary set-valued columns are no longer needed.
    for predicted, actual in zip(predicted_sets, item_sets):
        union_size = len(predicted | actual)
        if union_size == 0:
            join_ratio.append(0.0)
            gap1_ratio.append(0.0)
            gap2_ratio.append(0.0)
            continue
        join_ratio.append(len(predicted & actual)*1.0/union_size)
        gap1_ratio.append(len(predicted - actual)*1.0/union_size)
        gap2_ratio.append(len(actual - predicted)*1.0/union_size)

    df['property_join_cnt'] = join_ratio
    df['property_gap1_cnt'] = gap1_ratio
    df['property_gap2_cnt'] = gap2_ratio
    return df

def _fit_smoothed_ctr(dfTrainTmp, var, ctr_col):
    """Fit a Beta prior on per-group trade counts and return smoothed rates.

    Returns a tuple ``(group_frame, prior_mean)``: `group_frame` holds the key
    columns `var` plus `ctr_col`; `prior_mean` is alpha / (alpha + beta), the
    value to use for keys that have no history.
    """
    # List-style agg + rename works on old and new pandas alike; the
    # dict-renaming form of agg was removed in pandas 1.0.
    stats = (dfTrainTmp.groupby(var)['is_trade']
             .agg(['sum','count'])
             .rename(columns={'count':'size'})
             .reset_index())
    hyper = HyperParam(1,1)
    hyper.update_from_data_by_FPI(stats['size'].tolist(), stats['sum'].tolist(), 100, 0.00000001)
    stats[ctr_col] = (stats['sum'] + hyper.alpha)/(stats['size'] + hyper.alpha + hyper.beta)
    return stats[var+[ctr_col]], hyper.alpha/(hyper.alpha+hyper.beta)


def smooth_ctr(df,dfBase=None,base_list=[]):
    """Add Bayesian-smoothed trade-rate features for each key in `base_list`.

    Args:
        df: frame to receive the features.
        dfBase: history frame.  When None, the history is `df` itself,
            restricted hour by hour to strictly earlier hours (hours >= 12
            are pooled into key 12) and the columns are prefixed
            'pre_hour_'; otherwise the whole of `dfBase` is used and the
            prefix is 'pre_days_'.
        base_list: keys to group by; each entry is a column name or a list
            of column names.

    Returns:
        A new DataFrame with one '<prefix><keys>_smooth_ctr' column per key.
        Keys with no history receive the prior mean.
    """
    namePre = 'pre_days_'
    if dfBase is None:
        namePre = 'pre_hour_'
        df['same_day_key'] = df['hour'].apply(lambda x: 12 if x>=12 else x)
        dfBase = df.copy()
        # Shift by one so that "key <= h" selects rows from hours before h.
        dfBase['same_day_key'] = dfBase['same_day_key'] + 1
    dfTrain = dfBase.loc[dfBase['is_trade'].notnull()]
    for var in base_list:
        if not isinstance(var,list):
            var = [var]
        nameBase = namePre + '~'.join(var)
        ctr_col = nameBase + '_smooth_ctr'
        if 'pre_hour_' in nameBase:
            if 'hour' in var:
                continue
            with open('log.txt','w') as f:
                f.write(nameBase)
                f.write('\n')
            naFill = []
            hourly_groups = []
            for hour_key in range(1,13):
                dfTrainTmp = dfTrain.loc[dfTrain['same_day_key']<=hour_key,var + ['is_trade']]
                dfTrainGroup, prior_mean = _fit_smoothed_ctr(dfTrainTmp, var, ctr_col)
                hourly_groups.append(dfTrainGroup.assign(same_day_key=hour_key))
                naFill.append(prior_mean)
            # One concat at the end instead of growing a frame inside the loop.
            dfGroup = pd.concat(hourly_groups)
            df = df.merge(dfGroup,how='left',on=var+['same_day_key'])
            for hour_key in range(1,13):
                # Assign the filled values back: fillna(inplace=True) on a
                # .loc selection only touches a temporary copy.
                hour_mask = df['same_day_key']==hour_key
                df.loc[hour_mask,ctr_col] = df.loc[hour_mask,ctr_col].fillna(naFill[hour_key-1])
        else:
            dfTrainTmp = dfTrain[var + ['is_trade']]
            dfGroup, naFill = _fit_smoothed_ctr(dfTrainTmp, var, ctr_col)
            df = df.merge(dfGroup,how='left',on=var)
            df[ctr_col] = df[ctr_col].fillna(naFill)
    if 'same_day_key' in df.columns:
        del df['same_day_key']
    else:
        print('Pre_days version finished')
    gc.collect()
    return df


    
def same_day_trick(df,key_var=[]):
    """Flag whether the same key has earlier / later / simultaneous rows that day.

    For each (key_var, day) group, three 0/1 columns are added in place:
    '<key>_before_exist', '<key>_after_exist' and '<key>_sametime_exist',
    derived from ranks of `context_timestamp`.  Returns `df`.
    """
    keys = key_var if isinstance(key_var,list) else [key_var]
    nameBase = '~'.join(keys)
    ts_by_key_day = df.groupby(keys+['day'])['context_timestamp']

    # Compute every rank before adding any column to df.
    rank_asc_min = ts_by_key_day.rank(method='min')
    rank_desc_min = ts_by_key_day.rank(method='min',ascending=False)
    rank_asc_max = ts_by_key_day.rank(method='max')

    # Same-day information: min-rank above 1 means a strictly earlier (or
    # later, for the descending rank) row exists; max-rank != min-rank means a tie.
    df[nameBase+'_before_exist'] = ((rank_asc_min - 1) > 0).astype(int)
    df[nameBase+'_after_exist'] = ((rank_desc_min - 1) > 0).astype(int)
    df[nameBase+'_sametime_exist'] = ((rank_asc_max - rank_asc_min) > 0).astype(int)
    gc.collect()
    return df

   
def _offline_feat(df,dfBase=None,key_var='user_id',stat_var=[],part_var=[],mean_var=[]):
    if not isinstance(key_var,list):
        key_var = [key_var]
    base_name = 'pre_days_' + '~'.join(key_var)
    if type(dfBase) == type(None):
        print('test day features')
        base_name = 'pre_hour_' + '~'.join(key_var)
        dfBase = df.copy()
        dfBase['hour'] = dfBase['hour'] + 1
        key_var.append('hour')
    df = df.merge(dfBase.groupby(key_var,as_index=False)['instance_id'].agg({base_name+'_cnt':'count'}),'left',key_var)
    df = df.merge(dfBase.groupby(key_var,as_index=False)['is_trade'].agg({base_name+'_trade_cnt':'sum',base_name+'_trade_ratio':'mean'}),'left',key_var)
    df.fillna({x:0 for x in [base_name+'_cnt',base_name+'_trade_cnt',base_name+'_trade_ratio']}, inplace=True)
    df[base_name+'_notrade_cnt'] = df[base_name+'_cnt']-df[base_name+'_trade_cnt']
    dfTmp = dfBase.loc[dfBase['is_trade']==1]
    for stat in stat_var:
        df = df.merge(dfBase.groupby(key_var,as_index=False)[stat].agg({base_name+'_'+stat+'_min':'min',base_name+'_'+stat+'_max':'max'}),'left',key_var)    
        df.fillna({x:0 for x in [base_name+'_'+stat+'_min',base_name+'_'+stat+'_max']}, inplace=True)
    for part in part_var:
        df = df.merge(dfBase.groupby(key_var,as_index=False)[part].agg({base_name+'_'+part+'_cnt':'nunique'}),'left',key_var)
        df[base_name+'_'+part+'_cnt'].fillna(0.000001,inplace=True)
        df = df.merge(dfTmp.groupby(key_var,as_index=False)[part].agg({base_name+'_'+part+'_trade_cnt':'nunique'}),'left',key_var)
        df[base_name+'_'+part+'_trade_cnt'].fillna(0,inplace=True)
        df[base_name+'_'+part+'_trade_ratio'] = 1.0*df[base_name+'_'+part+'_trade_cnt']/df[base_name+'_'+part+'_cnt']
    for var in mean_var:
        df = df.merge(dfBase.groupby(key_var+[var],as_index=False)['is_trade'].sum().groupby(key_var,as_index=False)['is_trade'].agg({base_name+'_'+var+'_avg_trade':'mean'}),'left',key_var)
        df[base_name+'_'+var+'_avg_trade'].fillna(0,inplace=True)
    gc.collect()
    return df

def _offline_feat_new(df,dfBase=None,key_var='user_id',stat_var=[],part_var=[],mean_var=[]):
    if not isinstance(key_var,list):
        key_var = [key_var]
    base_name = 'pre_days_' + '~'.join(key_var)
    if type(dfBase) == type(None):
        print('test day features')
        base_name = 'pre_hour_' + '~'.join(key_var)
        df['same_day_key'] = df['hour'].apply(lambda x: 12 if x>=12 else x)
        dfBase = df.copy()
        dfBase['same_day_key'] = dfBase['same_day_key'] + 1
        key_var.append('same_day_key')
    for delVar in [base_name+'_cnt',base_name+'_trade_cnt',base_name+'_trade_ratio',base_name+'_notrade_cnt']:
        del df[delVar]
    
    df = df.merge(dfBase.groupby(key_var,as_index=False)['instance_id'].agg({base_name+'_cnt':'count'}),'left',key_var)
    df = df.merge(dfBase.groupby(key_var,as_index=False)['is_trade'].agg({base_name+'_trade_cnt':'sum',base_name+'_trade_ratio':'mean'}),'left',key_var)
    df.fillna({x:0 for x in [base_name+'_cnt',base_name+'_trade_cnt',base_name+'_trade_ratio']}, inplace=True)
    df[base_name+'_notrade_cnt'] = df[base_name+'_cnt']-df[base_name+'_trade_cnt']
    dfTmp = dfBase.loc[dfBase['is_trade']==1]
    for part in part_var:
        for delVar in [base_name+'_'+part+'_cnt',base_name+'_'+part+'_trade_cnt',base_name+'_'+part+'_trade_ratio']:
            del df[delVar]
        df = df.merge(dfBase.groupby(key_var,as_index=False)[part].agg({base_name+'_'+part+'_cnt':'nunique'}),'left',key_var)
        df[base_name+'_'+part+'_cnt'].fillna(0.000001,inplace=True)
        df = df.merge(dfTmp.groupby(key_var,as_index=False)[part].agg({base_name+'_'+part+'_trade_cnt':'nunique'}),'left',key_var)
        df[base_name+'_'+part+'_trade_cnt'].fillna(0,inplace=True)
        df[base_name+'_'+part+'_trade_ratio'] = 1.0*df[base_name+'_'+part+'_trade_cnt']/df[base_name+'_'+part+'_cnt']
    for var in mean_var:
        del df[base_name+'_'+var+'_avg_trade']
        df = df.merge(dfBase.groupby(key_var+[var],as_index=False)['is_trade'].sum().groupby(key_var,as_index=False)['is_trade'].agg({base_name+'_'+var+'_avg_trade':'mean'}),'left',key_var)
        df[base_name+'_'+var+'_avg_trade'].fillna(0,inplace=True)
    if 'pre_hour_' in base_name:
        del df['same_day_key']
    gc.collect()
    return df


def map_col(df,drop=False):
    """Bucketise level columns and recode -1 placeholders, in place.

    For list specs the mapped value is the number of thresholds the raw value
    strictly exceeds; for dict specs the raw value is looked up and left
    unchanged when absent.  Results go to '<column>_mapped', or overwrite the
    raw column when `drop` is True.  Returns `df`.
    """
    map_dict = {
        'item_price_level':[4,5,6,7,8,9],
        'item_sales_level':[4,6,9,10,11,12,13,14,16],
        'item_pv_level':[6,9,10,11,12,13,14,15,16,17,18,19,20],
        'user_age_level':[1001,1002,1003,1004,1005],
        'context_page_id':[4001,4002,4004,4006,4008,4010,4013,4016,4018],
        'shop_review_num_level':[5,9,14,15,16,17,18,20,21],
        #'hour':[6,9,12,17,20],
        'user_occupation_id':{-1:2003},
        'user_star_level':{-1:3000}
    }
    for column, spec in map_dict.items():
        mapped_col = column+'_mapped'
        if isinstance(spec,list):
            df[mapped_col] = 0
            # Thresholds are ascending, so the last one exceeded wins.
            for bucket, threshold in enumerate(spec, 1):
                df.loc[df[column]>threshold,mapped_col] = bucket
        else:
            df[mapped_col] = df[column].apply(lambda raw: spec.get(raw,raw))
        if drop:
            df[column] = df[mapped_col]
            del df[mapped_col]
    gc.collect()
    return df



def cross_feat_plus(df,base_list,order=2):
    """Add the row-wise sum of every `order`-sized combination of `base_list` columns.

    Each new column is named '<col1>~<col2>~..._plus'.  Nothing is added when
    `order` is below 2 or larger than the number of columns.  The frame is
    modified in place and returned.
    """
    if order<2:
        return df
    # Enumerate only the combinations of the requested size rather than
    # building the full power set (2**n subsets) and filtering it.
    for sub in combinations(list(base_list), order):
        sub = list(sub)
        df['~'.join(sub)+'_plus'] = df[sub].sum(axis=1)
    gc.collect()
    return df

def interaction_ratio(df,dfBase=None,base_list=[],cal_list=[],rank_list =[]):
    """Add share and rank-share features of one variable within another.

    For every base key B, with N = number of reference rows sharing B:
      * for every C in `cal_list`:  '<B>~<C>_ratio' = count(B, C) / N
      * for every R in `rank_list`: '<B>~<R>_rank_ratio' = min-rank of R within B / N

    Args:
        df: frame to receive the features.
        dfBase: optional history frame.  When None the reference population
            is `df` itself (prefix 'same_day_'); otherwise it is `df` stacked
            on top of `dfBase` (prefix 'pre_days_').
        base_list: grouping keys; each entry is a column name or list of names.
        cal_list: columns for the share features (same entry format).
        rank_list: columns for the rank features; each entry must name
            exactly one column.

    Returns:
        A new DataFrame with the ratio columns appended (float-downcast).

    Raises:
        ValueError: if a `rank_list` entry names more than one column.
    """
    titlePre = 'pre_days_'
    if dfBase is None:
        titlePre = 'same_day_'
        dfBase = df.copy()
    else:
        dfBase = pd.concat([df,dfBase],axis=0)
        dfBase.reset_index(inplace=True,drop =True)
    # df's rows are the first n_rows rows of dfBase, and left merges keep that
    # order, so ranks computed on dfBase are mapped back by position.
    n_rows = df.shape[0]
    for base_var in base_list:
        if not isinstance(base_var,list):
            base_var = [base_var]
        titleBase = titlePre+ '_'.join(base_var)
        cnt_col = titleBase+'_cnt'

        if cnt_col not in df.columns:
            # Series.rename replaces the dict-renaming agg form, which pandas
            # removed in 1.0.
            base_cnt = dfBase.groupby(base_var)['instance_id'].count().rename(cnt_col).reset_index()
            df = df.merge(base_cnt,how='left',on=base_var)
        print('ratio part')
        for cal_var in cal_list:
            if not isinstance(cal_var,list):
                cal_var = [cal_var]
            if cal_var==base_var or base_var==['cnt_rec']:
                continue
            nameBase = titleBase+'~'+'_'.join(cal_var)
            print(nameBase)
            pair_cnt = (dfBase.groupby(base_var+cal_var)['instance_id'].count()
                        .rename(nameBase+'_cnt').reset_index())
            df = df.merge(pair_cnt,how='left',on=base_var+cal_var)
            df[nameBase+'_ratio'] = pd.to_numeric(df[nameBase+'_cnt']*1.0/df[cnt_col],downcast='float')
            del df[nameBase+'_cnt']

        print('rank part')
        for rank_var in rank_list:
            if not isinstance(rank_var,list):
                rank_var = [rank_var]
            if rank_var==base_var:
                continue
            if len(rank_var) != 1:
                raise ValueError('rank_list entries must name exactly one column: %r' % (rank_var,))
            nameBase = titleBase+'~'+'_'.join(rank_var)
            print(nameBase)

            ranks = dfBase.groupby(base_var)[rank_var[0]].rank(method='min')
            # .values strips the index: after the merges above df carries a
            # fresh RangeIndex that need not match dfBase's labels.
            df[nameBase+'_rank'] = ranks.values[:n_rows]

            df[nameBase+'_rank_ratio'] = pd.to_numeric(df[nameBase+'_rank']*1.0/df[cnt_col],downcast='float')
            del df[nameBase+'_rank']
        del df[cnt_col]
        gc.collect()
    return df

<font color=#0099ff size=5 face="黑体">读取数据</font>

In [None]:
# Column -> dtype map passed to pd.read_table when the cached feature files
# are reloaded in this cell, so each column is read with a compact explicit type.
featureDtypes = {'cnt_rec': 'int8',
 'context_id': 'int64',
 'context_page_id': 'int16',
 'context_timestamp': 'int32',
 'day': 'int8',
 'hour': 'int8',
 'instance_id': 'int64',
 'is_trade': 'float32',
 'item_brand_id': 'int16',
 'item_category_list_bin1': 'int8',
 'item_category_list_bin2': 'int8',
 'item_city_id': 'int16',
 'item_collected_level': 'int8',
 'item_id': 'int32',
 'item_price_level': 'int8',
 'item_pv_level': 'int8',
 'item_sales_level': 'int8',
 'len_item_property_list': 'int8',
 'len_predict_category_property': 'int8',
 'min': 'int8',
 'shop_id': 'int16',
 'shop_review_num_level': 'int8',
 'shop_review_positive_rate': 'float32',
 'shop_score_delivery': 'float32',
 'shop_score_description': 'float32',
 'shop_score_service': 'float32',
 'shop_star_level': 'int16',
 'user_age_level': 'int16',
 'user_gender_id': 'int8',
 'user_id': 'int32',
 'user_occupation_id': 'int16',
 'user_star_level': 'int16'}

# Build the day-7 working set (dfSet) and the history set (dfBase) once, then
# reuse the cached files on later runs.
# NOTE(review): `line_terminator` was renamed `lineterminator` in pandas 1.5
# and removed in 2.0 -- adjust the two to_csv calls when upgrading pandas.
if not os.path.exists(config.FEATURE_SET):
    dfTrain = pd.read_table(config.TRAIN_FILE,sep=' ')
    dfTrain.drop_duplicates(inplace=True)
    dfTrain.reset_index(inplace=True,drop =True)
    dfTrain = process(dfTrain)
    dfTest = pd.read_table(config.TEST_FILE,sep=' ')
    dfTest = process(dfTest)
    # Day 31 is renumbered to 0 in the training rows.
    dfTrain.loc[dfTrain['day'] == 31,'day'] = 0
    dfAll = pd.concat([dfTrain,dfTest],axis=0)
    dfAll.reset_index(inplace=True,drop=True)
    del dfTrain
    del dfTest
    dfAll['cnt_rec'] = 1
    dfAll = labelencoder(dfAll)
    # reset_index gives independent frames with a 0..n-1 index, exactly what
    # the cached branch below produces; the per-feature cache files are later
    # concatenated column-wise and rely on that index.
    dfSet = dfAll.loc[dfAll['day']==7].reset_index(drop=True)
    dfBase = dfAll.loc[dfAll['day']!=7].reset_index(drop=True)
    dfSet.to_csv(config.FEATURE_SET,sep=' ',index=False, line_terminator='\n')
    dfBase.to_csv(config.FEATURE_BASE,sep=' ',index=False, line_terminator='\n')
    del dfAll
else:
    dfSet = pd.read_table(config.FEATURE_SET,sep=' ',dtype=featureDtypes)
    dfBase = pd.read_table(config.FEATURE_BASE,sep=' ',dtype=featureDtypes)


# Disabled manual downcast, kept for reference (dtypes are now set through
# featureDtypes when the cached files are read):
# for var in dfSet:
#     if var not in ['shop_review_positive_rate','shop_score_delivery','shop_score_description','shop_score_service']:
#         print(var)
#         dfSet[var] = pd.to_numeric(dfSet[var],downcast='signed')
#
# for var in dfSet:
#     if var in ['shop_review_positive_rate','shop_score_delivery','shop_score_description','shop_score_service']:
#         dfSet[var] = pd.to_numeric(dfSet[var],downcast='float')

<font color=#0099ff size=5 face="黑体">特征工程</font>

In [None]:
# Directory holding the per-feature-group cache CSVs read and written by the
# feature-engineering cells below.
#dataRootDir = '/data/5/data/maoli/learn/advertisement/Cache/'
dataRootDir = '../../Data/advertisment/Cache/'
# dfAll starts as another name for dfSet (no copy is made here).
dfAll = dfSet

In [None]:
### Single-feature mapping: map_col with drop=True overwrites the raw level
### columns with their mapped values.
dfAll = map_col(dfAll,True); gc.collect()
print(dfAll.shape)
# Number of columns before any engineered feature is added; the cache cells
# below slice dfAll from this position.
featBound = dfAll.shape[1]
#featBase = [i for i in dfAll.columns.tolist() if not i in config.IGNORE_COLS]

In [None]:
### Smoothed CTR, single-column keys
#keyList = ['item_id']
keyList = config.CATEGORICAL_COLS

smoothCache = dataRootDir + 'smooth.csv'
if os.path.exists(smoothCache):
    dfSmooth = pd.read_csv(smoothCache)
    # The cache was written in dfAll's row order, so align by position; a
    # length mismatch raises here instead of silently producing NaN rows.
    dfSmooth.index = dfAll.index
    dfAll = pd.concat([dfAll,dfSmooth],axis=1)
    del dfSmooth
else:
    colsBefore = dfAll.shape[1]
    dfAll = smooth_ctr(dfAll,None,keyList); gc.collect()
    dfAll = smooth_ctr(dfAll,dfBase,keyList); gc.collect()
    # Save only the columns added by this cell, so the cache never overlaps
    # with another feature group's cache file.
    dfAll.iloc[:,colsBefore:].to_csv(smoothCache,index=False)

print(dfAll.shape)
#dfAll = dfAll.iloc[:,:featBound]

In [None]:
### Smoothed CTR, pairs of categorical columns (pairs containing user_id excluded)
#keyList = ['item_id']
keyList = [list(i) for i in powerset(config.CATEGORICAL_COLS) if len(i)==2 and not 'user_id' in i]

smooth2Cache = dataRootDir + 'smooth_2order.csv'
if os.path.exists(smooth2Cache):
    dfSmooth = pd.read_csv(smooth2Cache)
    # The cache was written in dfAll's row order, so align by position.
    dfSmooth.index = dfAll.index
    dfAll = pd.concat([dfAll,dfSmooth],axis=1)
    del dfSmooth
else:
    colsBefore = dfAll.shape[1]
    dfAll = smooth_ctr(dfAll,None,keyList); gc.collect()
    dfAll = smooth_ctr(dfAll,dfBase,keyList); gc.collect()
    # Slicing from featBound would also save the features added by earlier
    # cells; reloading both caches would then duplicate those columns.
    dfAll.iloc[:,colsBefore:].to_csv(smooth2Cache,index=False)

print(dfAll.shape)
#dfAll = dfAll.iloc[:,:featBound]

In [None]:
### Offline (historical) feature set; keyList, partList and meanList are
### parallel lists indexed together.
keyList = ['user_id','shop_id','item_id','hour','item_category_list_bin1']
partList = [
    ['item_id','shop_id'],
    ['user_id','item_id'],
    ['user_id','shop_id'],
    ['user_id','item_id','shop_id'],
    ['user_id','item_id','shop_id']
]
meanList = [
    ['shop_id'],
    ['item_id'],
    [],
    ['user_id','shop_id','item_id'],
    ['user_id','shop_id','item_id']
]

offlineCache = dataRootDir + 'offline.csv'
if os.path.exists(offlineCache):
    dfOffline = pd.read_csv(offlineCache)
    # The cache was written in dfAll's row order, so align by position.
    dfOffline.index = dfAll.index
    dfAll = pd.concat([dfAll,dfOffline],axis=1)
    del dfOffline
else:
    colsBefore = dfAll.shape[1]
    for keyVar, partVar, meanVar in zip(keyList, partList, meanList):
        statVar = []
        if isinstance(keyVar,str):
            # Collect the stat columns of every other entity.
            for key,value in config.STAT_DICT.items():
                if key==keyVar:
                    continue
                statVar += value
        # The same-day variant keys on hour itself, so skip it for 'hour'.
        if not 'hour' in keyVar:
            dfAll = _offline_feat(dfAll,None,keyVar,statVar,partVar,meanVar); gc.collect()
        dfAll = _offline_feat(dfAll,dfBase,keyVar,statVar,partVar,meanVar); gc.collect()
    # Save only this cell's columns; slicing from featBound would also store
    # the features added by earlier cells and duplicate them on reload.
    dfAll.iloc[:,colsBefore:].to_csv(offlineCache,index=False)

print(dfAll.shape)
#dfAll = dfAll.iloc[:,:featBound]


In [None]:
### Cross features of continuous (level) variables: sums over pairs and triples
conList = [
    'user_gender_id','user_age_level', 'user_star_level',
    'item_price_level', 'item_sales_level','item_collected_level', 'item_pv_level',
    'context_page_id',
    'shop_review_num_level','shop_star_level'
]

crossCache = dataRootDir + 'cross_plus.csv'
if os.path.exists(crossCache):
    dfCrossPlus = pd.read_csv(crossCache)
    # The cache was written in dfAll's row order, so align by position.
    dfCrossPlus.index = dfAll.index
    dfAll = pd.concat([dfAll,dfCrossPlus],axis=1)
    del dfCrossPlus
else:
    colsBefore = dfAll.shape[1]
    dfAll = cross_feat_plus(dfAll,conList,order=2); gc.collect()
    dfAll = cross_feat_plus(dfAll,conList,order=3); gc.collect()
    # Save only this cell's columns; slicing from featBound would also store
    # the features added by earlier cells and duplicate them on reload.
    dfAll.iloc[:,colsBefore:].to_csv(crossCache,index=False)
print(dfAll.shape)
#dfAll = dfAll.iloc[:,:featBound]

In [None]:
### Same-day "trick" features: earlier / later / simultaneous rows per user key
keyList = ['user_id'] + [['user_id',i] for i in config.CATEGORICAL_COLS if i!='user_id']

trickCache = dataRootDir + 'trick_userid.csv'
if os.path.exists(trickCache):
    dfTrick = pd.read_csv(trickCache)
    # The cache was written in dfAll's row order, so align by position.
    dfTrick.index = dfAll.index
    dfAll = pd.concat([dfAll,dfTrick],axis=1)
    del dfTrick
else:
    colsBefore = dfAll.shape[1]
    for keyVar in keyList:
        dfAll = same_day_trick(dfAll,keyVar); gc.collect()
    # Save only this cell's columns; slicing from featBound would also store
    # the features added by earlier cells and duplicate them on reload.
    dfAll.iloc[:,colsBefore:].to_csv(trickCache,index=False)
print(dfAll.shape)
#dfAll = dfAll.iloc[:,:featBound]

In [None]:
### Pairwise share / rank-order features between categorical variables
baseList = [
    'cnt_rec',
    'user_id','user_gender_id', 'user_occupation_id','user_age_level', 'user_star_level',
    'item_id', 'item_brand_id', 'item_city_id', 'item_price_level', 'item_sales_level','item_collected_level', 'item_pv_level',
    'item_category_list_bin1','item_category_list_bin2',
    'shop_id', 'shop_review_num_level','shop_star_level'

]

calList = [
    'user_id','user_gender_id', 'user_occupation_id','item_id', 'item_brand_id', 'item_city_id',
    'item_category_list_bin1','item_category_list_bin2','shop_id'
]
rankList = [
    'user_age_level', 'user_star_level','item_price_level', 'item_sales_level','item_collected_level', 'item_pv_level','shop_review_num_level','shop_star_level'
]

# Same-day variant (reference population is dfAll itself).
rankCache = dataRootDir + 'ratio_rank.csv'
if os.path.exists(rankCache):
    dfRank = pd.read_csv(rankCache)
    # The cache was written in dfAll's row order, so align by position.
    dfRank.index = dfAll.index
    dfAll = pd.concat([dfAll,dfRank],axis=1)
    del dfRank
else:
    colsBefore = dfAll.shape[1]
    dfAll = interaction_ratio(dfAll,None,baseList,calList,rankList); gc.collect()
    # Save only the columns added here; slicing from featBound would also
    # store earlier feature groups and duplicate them on reload.
    dfAll.iloc[:,colsBefore:].to_csv(rankCache,index=False)
print(dfAll.shape)
#dfAll = dfAll.iloc[:,:featBound]


# Previous-days variant (reference population is dfAll stacked on dfBase).
rankPredayCache = dataRootDir + 'ratio_rank_preday.csv'
if os.path.exists(rankPredayCache):
    dfRank = pd.read_csv(rankPredayCache)
    dfRank.index = dfAll.index
    dfAll = pd.concat([dfAll,dfRank],axis=1)
    del dfRank
else:
    colsBefore = dfAll.shape[1]
    dfAll = interaction_ratio(dfAll,dfBase,baseList,calList,rankList); gc.collect()
    dfAll.iloc[:,colsBefore:].to_csv(rankPredayCache,index=False)
print(dfAll.shape)
#dfAll = dfAll.iloc[:,:featBound]

<font color=#0099ff size=5 face="黑体">拆分样本</font>

In [None]:
features = [i for i in dfAll.columns.tolist() if not i in config.IGNORE_COLS]

# Split by hour: 1-9 for training, 10-11 for validation, 12 and later for
# prediction.  Hour 0 is in none of the three sets.
train_mask = (dfAll['hour']<10)&(dfAll['hour']>0)
valid_mask = (dfAll['hour']<12)&(dfAll['hour']>9)
train_idx = dfAll.index[train_mask]
valid_idx = dfAll.index[valid_mask]
# Labels are read from dfAll: dfTrain was deleted when the feature set was
# first built and is never created when the cached files are loaded.
Xi_train_, y_train_ = dfAll.loc[train_idx,features],dfAll.loc[train_idx,'is_trade']
Xi_valid_, y_valid_ = dfAll.loc[valid_idx,features],dfAll.loc[valid_idx,'is_trade']
Xi_test_ = dfAll.loc[(dfAll['hour']>=12),features]

del dfAll

<font color=#0099ff size=5 face="黑体">模型</font>

In [None]:
# Fit a LightGBM classifier on the training hours, monitoring the validation
# hours; n_estimators is only an upper bound because fit() is called with
# early_stopping_rounds=100.
clf = lgb.LGBMClassifier(
    boosting_type = 'gbdt',
    num_leaves=40, 
    max_depth=8,
    n_estimators=20000,
    n_jobs=20,
    learning_rate=0.05,
    colsample_bytree=0.8,
    subsample=0.9,
    max_bin=20
)
# categorical_feature=[] passes an empty list, i.e. no column is declared
# categorical here.
clf.fit(Xi_train_[features], y_train_, eval_set=[(Xi_valid_[features], y_valid_)],feature_name = features,
        categorical_feature=[],early_stopping_rounds=100)
#[i for i in ['item_category_list_bin1','item_category_list_bin2'] if i in features]
# Positive-class probability on the validation rows.
y_score_ = clf.predict_proba(Xi_valid_[features],)[:, 1]

print(pd.Series(clf.feature_importances_, features).sort_values(ascending=False).reset_index())
print(log_loss(y_valid_, y_score_))
print(ks_metric(y_valid_, y_score_))
# Best iteration found on the validation set; reused as n_estimators for the
# final refit.
bstIter = clf.best_iteration_

In [None]:
# Feature importances sorted descending; after reset_index the feature name is
# in column 'index' and the importance in column 0.
xx = pd.Series(clf.feature_importances_, features).sort_values(ascending=False).reset_index()
# Number of features with zero importance.
(xx[0]==0).sum()

In [None]:
# Keep only the features with non-zero importance for the final refit.
features = xx.loc[xx[0]>0,'index'].tolist()

In [None]:
def score_change(score,base_rate,real_rate):
    """Recalibrate probabilities from `base_rate` to `real_rate` via a log-odds shift.

    This repeats the definition of `score_change` given earlier in the
    notebook with the same behaviour.
    """
    prior_logit = np.log(base_rate/(1-base_rate))
    target_logit = np.log(real_rate/(1-real_rate))
    base_change = prior_logit - target_logit
    adj_odds = np.exp(np.log(score/(1-score)) - base_change)
    score_adj = adj_odds/(adj_odds+1)
    return score_adj

In [None]:
# Xi_train_ and Xi_valid_ are deliberately NOT deleted here: the final-refit
# cell further down concatenates them (and deletes them itself afterwards), so
# freeing them at this point breaks a top-to-bottom run with a NameError.
gc.collect()

In [None]:
# Xi_finnal_ is only created by the final-refit cell below; guard the lookup so
# a fresh top-to-bottom run does not stop here with a NameError.
Xi_finnal_.shape if 'Xi_finnal_' in globals() else None

In [None]:
# Refit on training + validation rows using the iteration count found by
# early stopping, then score the prediction rows.
#Xi_finnal_ ,y_finnal_ = np.vstack((Xi_train_,Xi_valid_),np.hstack((y_train_,y_valid_))
Xi_finnal_ ,y_finnal_ = pd.concat([Xi_train_,Xi_valid_]), pd.concat([y_train_,y_valid_])
del Xi_train_
del Xi_valid_

clf = lgb.LGBMClassifier(
    num_leaves=40, 
    max_depth=8,
    n_estimators=bstIter,
    n_jobs=20,
    learning_rate=0.05,
    colsample_bytree=0.8,
    subsample=0.9,
    max_bin=20
)
clf.fit(Xi_finnal_[features], y_finnal_,feature_name = features,
        categorical_feature=[])
#[i for i in ['item_category_list_bin1','item_category_list_bin2'] if i in features]
# dfTest no longer exists at this point (deleted when the feature set was
# built, never created when the cache is loaded).  Xi_test_ holds the hour>=12
# rows of the feature frame, whose row order is dfSet's, so the matching ids
# are taken from dfSet with the same hour filter, by position.
test_instance_id = dfSet.loc[dfSet['hour']>=12,'instance_id'].values
assert len(test_instance_id) == Xi_test_.shape[0], 'test ids and test features are misaligned'
y_test_meta = np.zeros((Xi_test_.shape[0], 1), dtype=float)
y_test_meta[:,0] += clf.predict_proba(Xi_test_[features])[:,1]
submit = pd.DataFrame({'instance_id':test_instance_id,'predicted_score':y_test_meta[:,0]})
#submit.to_csv('../../Submission/advertisement/gbm_trick_0330.txt', sep=" ", index=False, line_terminator='\n')

In [None]:
# Assign (not accumulate): y_test_meta already holds these predictions from
# the refit cell, so `+=` would double every score.
y_test_meta[:,0] = clf.predict_proba(Xi_test_[features])[:,1]
# dfTest is not defined at this point; take the ids of the hour>=12 rows from
# dfSet, which shares its row order with Xi_test_.
submit = pd.DataFrame({'instance_id':dfSet.loc[dfSet['hour']>=12,'instance_id'].values,'predicted_score':y_test_meta[:,0]})

In [None]:
# Keep only the rows whose instance_id appears in idSubmit.
# NOTE(review): idSubmit is not defined anywhere in the visible notebook; on a
# fresh kernel this cell raises NameError -- confirm where it should come from.
submit = submit.loc[submit['instance_id'].isin(idSubmit)]
# Mean predicted score of the retained rows.
submit['predicted_score'].mean()

In [None]:
# Mean of is_trade over the rows used for the final refit.
y_finnal_.mean()

In [None]:
# Write the submission as a space-separated file without the index column.
submit.to_csv('../../Submission/advertisement/gbm_trick_testb_419.txt', sep=" ", index=False, line_terminator='\n')

In [None]:
# Write an all-zero baseline submission from a temporary copy, so the real
# predictions in `submit` stay available to the following cells (zeroing
# `submit` in place would feed zeros into the calibration cell).
submit.assign(predicted_score=0).to_csv('../../Submission/advertisement/gbm_trick_testb_418.txt', sep=" ", index=False, line_terminator='\n')

In [None]:
# Shift the predictions so that their base rate moves from the current mean
# prediction to TARGET_TRADE_RATE (passed as score_change's real_rate).
# NOTE(review): the provenance of 0.018116956 is not shown in the notebook.
TARGET_TRADE_RATE = 0.018116956
# The adjusted scores go to a new frame so this cell can be re-run without
# applying the shift a second time.
submit_adj = submit.assign(
    predicted_score=score_change(submit['predicted_score'],submit['predicted_score'].mean(),TARGET_TRADE_RATE)
)
print(submit_adj['predicted_score'].mean())
submit_adj.to_csv('../../Submission/advertisement/gbm_trick_testb_adj_419.txt', sep=" ", index=False, line_terminator='\n')

In [None]:
# Load a space-separated submission file from disk into `submit`, replacing
# the in-memory frame.
submit = pd.read_csv('../../Submission/advertisement/gbm_trick_text_417.txt',sep=" ")