In [None]:
import time
import pandas as pd
import numpy as np
import lightgbm as lgb
import random
import config
import re
import os
import gc
import scipy.special as special

from math import log
from numba import jit
from scipy.sparse import csr_matrix, hstack
from sklearn.metrics import log_loss,roc_curve
from sklearn.preprocessing import LabelEncoder,OneHotEncoder,StandardScaler
from sklearn.feature_extraction.text import CountVectorizer
from itertools import chain, combinations

In [None]:
def ks_metric(true,score):
    """Return the Kolmogorov-Smirnov statistic of a binary scorer.

    The value is the largest gap between the true-positive rate and the
    false-positive rate over the thresholds produced by `roc_curve`.

    Parameters
    ----------
    true : labels passed straight through to `roc_curve`.
    score : predicted scores passed straight through to `roc_curve`.
    """
    false_pos_rate, true_pos_rate, _ = roc_curve(true, score)
    gap = true_pos_rate - false_pos_rate
    return max(gap)

def score_change(score,base_rate,real_rate):
    """Shift predicted probabilities from one base rate to another.

    The correction is applied in log-odds space: the difference between the
    log-odds of `base_rate` and `real_rate` is subtracted from the log-odds of
    every score, and the result is mapped back to a probability.

    Parameters
    ----------
    score : scalar, array or Series of probabilities in [0, 1].
    base_rate : positive rate the scores are currently calibrated to.
    real_rate : positive rate the scores should be calibrated to.

    Returns
    -------
    Adjusted probabilities, same shape (and index, for a Series) as `score`.

    Notes
    -----
    `scipy.special.logit` / `expit` are used instead of hand-written
    `log(p/(1-p))` and `exp(x)/(exp(x)+1)` so that the end points stay finite:
    a score of exactly 1.0 maps to 1.0 rather than inf/(inf+1) = NaN.
    """
    base_change = special.logit(base_rate) - special.logit(real_rate)
    return special.expit(special.logit(score) - base_change)

def memory_saving(df):
    """Downcast the numeric columns of `df` in place to smaller dtypes.

    Float columns (any float width, not only float64) are downcast with
    ``downcast='float'``; every other column is downcast with
    ``downcast='signed'``. Columns that `pd.to_numeric` cannot convert
    (e.g. free-text columns) are left untouched instead of aborting the
    whole pass.

    Parameters
    ----------
    df : pandas.DataFrame, modified in place.

    Returns
    -------
    The same DataFrame object, for call chaining.
    """
    for var in df:
        column = df[var]
        target = 'float' if pd.api.types.is_float_dtype(column) else 'signed'
        try:
            df[var] = pd.to_numeric(column, downcast=target)
        except (ValueError, TypeError):
            # Not convertible to a number: there is nothing to downcast.
            continue
    return df

def map_col(df,drop=False):
    """Add coarse `<col>_mapped` versions of selected level/id columns.

    Two kinds of mapping are applied, in this order:

    * threshold columns: the mapped value is the 1-based position of the last
      listed threshold that the raw value strictly exceeds (0 if none);
    * recode columns: listed raw values are replaced, all others pass through.

    Parameters
    ----------
    df : pandas.DataFrame containing every column named below; modified in place.
    drop : when True the raw column is overwritten with the mapped values and
        the temporary `<col>_mapped` column is removed.

    Returns
    -------
    The same DataFrame object.
    """
    thresholds_by_col = {
        'item_price_level':[4,5,6,7,8,9],
        'item_sales_level':[4,6,9,10,11,12,13,14,16],
        'item_pv_level':[6,9,10,11,12,13,14,15,16,17,18,19,20],
        'user_age_level':[1001,1002,1003,1004,1005],
        'context_page_id':[4001,4002,4004,4006,4008,4010,4013,4016,4018],
        'shop_review_num_level':[5,9,14,15,16,17,18,20,21],
        #'hour':[6,9,12,17,20],
    }
    recodes_by_col = {
        'user_occupation_id':{-1:2003},
        'user_star_level':{-1:3000},
    }

    for col, cuts in thresholds_by_col.items():
        mapped = col + '_mapped'
        df[mapped] = 0
        # Later (larger) thresholds overwrite earlier ones, so each row ends up
        # with the index of the last threshold it exceeds.
        for level, cut in enumerate(cuts, start=1):
            df.loc[df[col] > cut, mapped] = level
        if drop:
            df[col] = df.pop(mapped)

    for col, recode in recodes_by_col.items():
        mapped = col + '_mapped'
        df[mapped] = df[col].apply(lambda x, table=recode: table.get(x, x))
        if drop:
            df[col] = df.pop(mapped)

    gc.collect()
    return df

<font color=#0099ff size=5 face="黑体">读取数据</font>

In [None]:
# Explicit, compact dtypes for the base feature file, grouped by storage width
# so the whole table is read without a later downcasting pass.
featureDtypes = {
    column: dtype
    for dtype, columns in (
        ('int8', [
            'cnt_rec', 'day', 'hour',
            'item_category_list_bin1', 'item_category_list_bin2',
            'item_collected_level', 'item_price_level', 'item_pv_level',
            'item_sales_level', 'len_item_property_list',
            'len_predict_category_property', 'min',
            'shop_review_num_level', 'user_gender_id',
        ]),
        ('int16', [
            'context_page_id', 'item_brand_id', 'item_city_id', 'shop_id',
            'shop_star_level', 'user_age_level', 'user_occupation_id',
            'user_star_level',
        ]),
        ('int32', ['context_timestamp', 'item_id', 'user_id']),
        ('int64', ['context_id', 'instance_id']),
        ('float32', [
            'is_trade', 'shop_review_positive_rate', 'shop_score_delivery',
            'shop_score_description', 'shop_score_service',
        ]),
    )
    for column in columns
}

# Space-separated base feature set; path comes from the project config module.
dfAll = pd.read_table(config.FEATURE_SET, sep=' ', dtype=featureDtypes)
# drop=True: overwrite the raw level/id columns with their mapped versions.
dfAll = map_col(dfAll, True)
gc.collect()

In [None]:
dataRootDir = '../../Data/advertisment/Cache/'
# Cached feature blocks (one CSV each) appended column-wise to the base table.
dataAdded = [
    'ratio_rank',
    #'ratio_rank_preday',
    'smooth',
    'offline',
    'cross_plus',
    'trick_userid',
]

# Read every block first and concatenate once: concatenating inside the loop
# re-copies the whole (growing) table for each file.
# NOTE: this cell is not idempotent - re-running it appends the same columns again.
extraFrames = []
nCols = dfAll.shape[1]
for add in dataAdded:
    tmpDf = memory_saving(pd.read_csv(dataRootDir + add +'.csv'))
    # Column-wise concat aligns on the index; a block with a different number
    # of rows would silently introduce NaN-padded rows, so fail loudly instead.
    if len(tmpDf) != len(dfAll):
        raise ValueError('%s has %d rows, expected %d' % (add, len(tmpDf), len(dfAll)))
    extraFrames.append(tmpDf)
    nCols += tmpDf.shape[1]
    del tmpDf
    print('%s is loaded, shape is %d'%(add,nCols))

dfAll = pd.concat([dfAll] + extraFrames, axis=1)
del extraFrames
gc.collect()

<font color=#0099ff size=5 face="黑体">拆分样本</font>

In [None]:
# Model inputs: every column except the configured ignore list and 'min'.
ignoredCols = config.IGNORE_COLS + ['min']
features = [col for col in dfAll.columns if col not in ignoredCols]

# Split on the `hour` column: 1-9 -> train, 10-11 -> validation, >= 12 -> test.
# NOTE(review): rows with hour == 0 fall into none of the three sets - confirm
# that excluding them is intentional.
# Boolean masks are used directly; materialising the selected index as a
# Python list only adds a slow label lookup on a large frame.
trainMask = (dfAll['hour'] > 0) & (dfAll['hour'] < 10)
validMask = (dfAll['hour'] > 9) & (dfAll['hour'] < 12)
Xi_train_, y_train_ = dfAll.loc[trainMask, features], dfAll.loc[trainMask, 'is_trade']
Xi_valid_, y_valid_ = dfAll.loc[validMask, features], dfAll.loc[validMask, 'is_trade']
Xi_test_ = dfAll.loc[(dfAll['hour'] >= 12), features]

# Free the full table and the temporaries; only the three splits are used below.
del dfAll
del trainMask
del validMask
del ignoredCols

In [None]:
# Rebuild the feature list from the training frame's columns, again leaving
# out the configured ignore list and 'min'.
features = [
    col
    for col in Xi_train_.columns.tolist()
    if col not in config.IGNORE_COLS + ['min']
]


<font color=#0099ff size=5 face="黑体">模型</font>

In [None]:
# Fit with a very large tree budget and let early stopping on the validation
# window pick the effective number of boosting rounds (kept in `bstIter`).
# NOTE(review): LightGBM documents that row subsampling is only active when
# `subsample_freq` > 0, so `subsample=0.9` may have no effect here - confirm.
# NOTE(review): passing `early_stopping_rounds` to fit() is not accepted by
# LightGBM >= 4.0 - confirm the pinned version before upgrading.
clf = lgb.LGBMClassifier(
    boosting_type='gbdt',
    num_leaves=40,
    max_depth=8,
    n_estimators=20000,
    n_jobs=20,
    learning_rate=0.05,
    colsample_bytree=0.8,
    subsample=0.9,
    max_bin=20,
)
clf.fit(
    Xi_train_[features],
    y_train_,
    eval_set=[(Xi_valid_[features], y_valid_)],
    feature_name=features,
    categorical_feature=[],
    early_stopping_rounds=100,
)
# Alternative for `categorical_feature`, currently disabled:
#[i for i in ['item_category_list_bin1','item_category_list_bin2'] if i in features]
y_score_ = clf.predict_proba(Xi_valid_[features])[:, 1]

# Feature importances (most important first), then validation log-loss and KS.
print(
    pd.Series(clf.feature_importances_, index=features)
    .sort_values(ascending=False)
    .reset_index()
)
print(log_loss(y_valid_, y_score_))
print(ks_metric(y_valid_, y_score_))
bstIter = clf.best_iteration_

In [None]:
# Accumulator for the names of features pruned from the model.
feat_del = []

In [None]:
# Scratch log of numbers pasted from earlier runs. They have no effect on the
# notebook state. Presumably each pair is (validation log-loss, KS) in the
# order the training cell prints them - TODO confirm and record which
# experiment each pair belongs to.
0.16444608803
0.342700836673
0.164416256943
0.343129542574



0.164356482036
0.343516388482

0.164336989298
0.34542592653

0.164476294674
0.340895520415

0.164404493982
0.342978068874

In [None]:
# Importance table: column 'index' holds the feature name, column 0 its
# importance, sorted from most to least important.
xx = (
    pd.Series(clf.feature_importances_, index=features)
    .sort_values(ascending=False)
    .reset_index()
)
# Number of features the fitted model never used.
xx[0].eq(0).sum()

In [None]:
# Keep only the features with non-zero importance for the next fit.
features = xx.loc[xx[0] > 0, 'index'].tolist()
# Record the pruned names. Names already recorded are skipped so that
# re-running this cell does not append duplicates to `feat_del`.
feat_del += [name for name in xx.loc[xx[0] == 0, 'index'].tolist() if name not in feat_del]

In [None]:
import pickle

# Persist the list of pruned feature names next to the cached feature files.
with open(dataRootDir + 'del_features.pkl', mode='wb') as f:
    pickle.dump(feat_del, f)

In [None]:
# Final model: refit on train + validation with the number of rounds found by
# early stopping, then score the test window.
# The combined frames are built here; previously this step sat inside a
# string literal, so `Xi_finnal_` / `y_finnal_` were undefined on a fresh
# kernel. The source frames are kept so the cell can be re-run.
Xi_finnal_ = pd.concat([Xi_train_, Xi_valid_])
y_finnal_ = pd.concat([y_train_, y_valid_])

clf = lgb.LGBMClassifier(
    num_leaves=40,
    max_depth=8,
    n_estimators=bstIter,
    n_jobs=20,
    learning_rate=0.05,
    colsample_bytree=0.8,
    subsample=0.9,
    max_bin=20
)
clf.fit(Xi_finnal_[features], y_finnal_,feature_name = features,
        categorical_feature=[])
# Alternative for `categorical_feature`, currently disabled:
#[i for i in ['item_category_list_bin1','item_category_list_bin2'] if i in features]
y_test_meta = np.zeros((Xi_test_.shape[0], 1), dtype=float)
y_test_meta[:,0] += clf.predict_proba(Xi_test_[features])[:,1]
# Select the two needed columns by name: the code below reads 'hour' and
# 'instance_id', and positional indices break if the file's column order changes.
dfinstance = pd.read_table(config.FEATURE_SET,sep=' ',dtype=featureDtypes,usecols=['hour','instance_id'])
submit = pd.DataFrame({'instance_id':dfinstance.loc[dfinstance['hour']>=12,'instance_id'],'predicted_score':y_test_meta[:,0]})
#submit.to_csv('../../Submission/advertisement/gbm_trick_0330.txt', sep=" ", index=False, line_terminator='\n')

In [None]:
# Score the test window with the current `clf`; predictions are kept as an
# (n_rows, 1) float array in `y_test_meta`.
y_test_meta = clf.predict_proba(Xi_test_[features])[:, 1].astype(float).reshape(-1, 1)
# Select the two needed columns by name: the code below reads 'hour' and
# 'instance_id', and positional indices break if the file's column order changes.
dfinstance = pd.read_table(config.FEATURE_SET, sep=' ', dtype=featureDtypes, usecols=['hour', 'instance_id'])
# Test rows are those with hour >= 12, in file order, matching `Xi_test_`.
submit = pd.DataFrame({
    'instance_id': dfinstance.loc[dfinstance['hour'] >= 12, 'instance_id'],
    'predicted_score': y_test_meta[:, 0],
})

In [None]:
# Sanity check: observed positive rate of the training labels vs. the mean
# predicted score on the test window, one value per line.
print(y_finnal_.mean(), submit['predicted_score'].mean(), sep='\n')

In [None]:
# Write the unadjusted submission as a space-separated file without the index.
# NOTE(review): newer pandas releases spell this keyword `lineterminator` -
# confirm against the installed pandas version.
submit.to_csv(
    '../../Submission/advertisement/lgb_trick_cross_offline_430.txt',
    sep=" ",
    index=False,
    line_terminator='\n',
)

In [None]:
# Shift the predictions from their own mean to a fixed target rate, print the
# new mean, and write the adjusted submission.
# NOTE: `submit` is overwritten in place, so re-running this cell applies the
# shift again to already-adjusted scores.
# NOTE(review): 0.0359194123126834 is presumably the expected positive rate of
# the test window - confirm its source.
submit['predicted_score'] = score_change(
    submit['predicted_score'],
    submit['predicted_score'].mean(),
    0.0359194123126834,
)
print(submit['predicted_score'].mean())
submit.to_csv(
    '../../Submission/advertisement/lgb_trick_cross_offline_adj_430.txt',
    sep=" ",
    index=False,
    line_terminator='\n',
)

In [None]:
# Load a previously written submission file; this replaces the in-memory
# `submit` frame built above.
submit = pd.read_csv(
    '../../Submission/advertisement/gbm_trick_text_417.txt',
    sep=" ",
)