In [1]:
import time
import pandas as pd
import numpy as np
import lightgbm as lgb
import random
import config
import re
import os
import scipy.special as special

from math import log
from numba import jit
from scipy.sparse import csr_matrix, hstack
from sklearn.metrics import log_loss,roc_curve
from sklearn.preprocessing import LabelEncoder,OneHotEncoder,StandardScaler
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from itertools import chain, combinations

In [2]:
def timestamp_datetime(value):
    """Format a Unix timestamp (seconds) as 'YYYY-MM-DD HH:MM:SS' in the machine's local time zone."""
    local_struct = time.localtime(value)
    return time.strftime('%Y-%m-%d %H:%M:%S', local_struct)

def time_feat(df,featList,featName):
    """Add column ``featName`` holding each row's rank by ``context_timestamp`` within its ``featList`` group.

    Ties are broken by row order (``method='first'``). ``df`` is modified in place
    and also returned.
    """
    grouped_ts = df.groupby(featList)['context_timestamp']
    df[featName] = grouped_ts.rank(method='first')
    return df

def del_na(lst):
    """Rebuild a 'key:value;key:value' string from a flat ``[key, value, key, value, ...]`` list.

    Pairs whose value is the missing marker ``'-1'`` are dropped.

    Args:
        lst: flat list of strings alternating key, value.

    Returns:
        The surviving pairs joined as ``'k1:v1;k2:v2'``; ``''`` when the list has
        fewer than two items or no pair survives.
    """
    if len(lst) < 2:
        return ''
    # Pairing the even and odd slices ignores a trailing key that has no value;
    # the previous index-based loop raised IndexError on odd-length lists.
    kept = [key + ':' + val for key, val in zip(lst[::2], lst[1::2]) if val != '-1']
    return ';'.join(kept)

def ks_metric(true,score):
    """Return the KS statistic: the largest gap between TPR and FPR over the ROC curve of ``score`` against ``true``."""
    fpr, tpr, _ = roc_curve(true, score)
    gaps = tpr - fpr
    return max(gaps)

def score_change(score,base_rate,real_rate):
    """Shift predicted probabilities from one base rate to another in log-odds space.

    Computes ``sigmoid(logit(score) - (logit(base_rate) - logit(real_rate)))``.

    Args:
        score: predicted probability or array/Series of probabilities.
        base_rate: positive rate the scores are currently calibrated to.
        real_rate: positive rate the scores should be calibrated to.

    Returns:
        The adjusted probabilities, same shape as ``score``.
    """
    # Offset between the two priors, expressed in log-odds.
    base_change = special.logit(base_rate) - special.logit(real_rate)
    # expit(x) equals exp(x) / (exp(x) + 1) but is evaluated once and stays finite at
    # the extremes; the hand-written ratio produced inf/inf = NaN for score == 1.
    return special.expit(special.logit(score) - base_change)

In [3]:
def process(df):
    """Add time-derived columns and normalise the two ';'-separated property columns.

    ``df`` is modified in place and also returned. Columns written: ``time``,
    ``day``, ``hour``, ``item_property_list``, ``predict_category_property``,
    ``len_item_property_list`` and ``len_predict_category_property``.
    """
    def _sorted_unique(raw):
        # De-duplicate the ';'-separated tokens and put them in sorted order.
        return ';'.join(sorted(set(str(raw).split(';'))))

    def _token_count(raw):
        return len(str(raw).split(';'))

    df['time'] = df['context_timestamp'].apply(timestamp_datetime)
    # 'time' looks like 'YYYY-MM-DD HH:MM:SS': chars 8-9 are the day, 11-12 the hour.
    df['day'] = df['time'].apply(lambda stamp: int(stamp[8:10]))
    df['hour'] = df['time'].apply(lambda stamp: int(stamp[11:13]))

    df['item_property_list'] = df['item_property_list'].apply(_sorted_unique)

    # Flatten each string on ':' and ';' into a flat token list, then let del_na
    # rebuild it without the pairs whose value is '-1'.
    df['predict_category_property'] = (
        df['predict_category_property']
        .apply(_sorted_unique)
        .apply(lambda text: list(re.split('[:;]', text)))
        .map(del_na)
    )

    df['len_item_property_list'] = df['item_property_list'].apply(_token_count)
    df['len_predict_category_property'] = df['predict_category_property'].apply(_token_count)
    return df

<font color=#0099ff size=5 face="黑体">读取数据</font>

In [11]:
# Training log: drop exact duplicate rows and renumber so row labels equal row positions.
dfTrain = pd.read_table(config.TRAIN_FILE,sep=' ')
dfTrain = dfTrain.drop_duplicates().reset_index(drop=True)
dfTest = pd.read_table(config.TEST_FILE,sep=' ')

dfTrain = process(dfTrain)
dfTest = process(dfTest)

# Only the instance ids of the second test file are kept; the frame itself is not retained.
idSubmit = pd.read_table(config.TEST_FILE_NEW,sep=' ')['instance_id'].tolist()

# Stack train on top of test: rows [0, trainNum) are train, the remainder is test.
trainNum = dfTrain.shape[0]
dfAll = pd.concat([dfTrain,dfTest],axis=0).reset_index(drop=True)

dfAll.shape

(539370, 32)

In [12]:
# Unigram count matrix over the item_property_list strings of the stacked train+test frame.
count = CountVectorizer(ngram_range=(1,1))
sparse_merge = count.fit_transform(dfAll['item_property_list'])
#sparse_merge = hstack((sparse_merge,count.fit_transform(dfAll['predict_category_property']))).tocsr()
sparse_merge.shape

(539370, 63890)

In [13]:
cutoff = 5
# Keep only the columns that are non-zero in more than `cutoff` of the training rows.
sparse_merge = sparse_merge[:, sparse_merge[:trainNum,:].getnnz(axis=0) > cutoff]
sparse_merge.shape

(539370, 37805)

In [14]:
# Time-based split of the training rows: days 19-23 are used for fitting, day 24 for validation.
train_idx = dfTrain.loc[(dfTrain['day']<24)&(dfTrain['day']>18)].index
valid_idx = dfTrain.loc[dfTrain['day']==24].index
# dfTrain's index was reset after de-duplication, so its labels double as row positions in sparse_merge.
Xi_train_, y_train_ = sparse_merge[list(train_idx),:],dfTrain.loc[train_idx,'is_trade']
Xi_valid_, y_valid_ = sparse_merge[list(valid_idx),:],dfTrain.loc[valid_idx,'is_trade']
# Rows from trainNum onward in the stacked matrix are the test rows.
Xi_test_ = sparse_merge[trainNum:,:]
# The sparse columns have no names; features are identified by column position.
features = range(sparse_merge.shape[1])

In [15]:
# Display the summary (shape, dtype, stored elements) of the sparse training matrix.
Xi_train_

<342432x37805 sparse matrix of type '<class 'numpy.int64'>'
	with 11704887 stored elements in Compressed Sparse Row format>

In [16]:
# Cast the integer count matrices to float before fitting.
# NOTE(review): Xi_test_ is not cast here — confirm whether that is intentional.
Xi_train_ = Xi_train_.astype(float)
Xi_valid_ = Xi_valid_.astype(float)

In [17]:
# Baseline LightGBM classifier fitted on the sparse item-property counts only.
# n_estimators is an upper bound; early stopping on the validation set picks the actual count.
# NOTE(review): in recent LightGBM releases `subsample` only takes effect when
# `subsample_freq` > 0 — confirm against the installed version.
clf = lgb.LGBMClassifier(
    boosting_type = 'gbdt',
    num_leaves=40, 
    max_depth=8,
    n_estimators=20000,
    n_jobs=20,
    learning_rate=0.05,
    colsample_bytree=0.8,
    subsample=0.9,
    max_bin=20
)
# NOTE(review): newer LightGBM versions replace the `early_stopping_rounds` fit argument
# with the `lgb.early_stopping` callback — confirm before upgrading.
clf.fit(Xi_train_, y_train_, eval_set=[(Xi_valid_, y_valid_)],
        categorical_feature=[],early_stopping_rounds=100)
#[i for i in ['item_category_list_bin1','item_category_list_bin2'] if i in features]
# Predicted probability of the positive class on the validation rows.
y_score_ = clf.predict_proba(Xi_valid_,)[:, 1]

# Feature importances (most important first), then validation log-loss and KS.
print(pd.Series(clf.feature_importances_, features).sort_values(ascending=False).reset_index())
print(log_loss(y_valid_, y_score_))
print(ks_metric(y_valid_, y_score_))
# Best iteration found by early stopping; reused as n_estimators for the final fit.
bstIter = clf.best_iteration_



[1]	valid_0's binary_logloss: 0.64775
Training until validation scores don't improve for 100 rounds.
[2]	valid_0's binary_logloss: 0.606674
[3]	valid_0's binary_logloss: 0.569328
[4]	valid_0's binary_logloss: 0.535257
[5]	valid_0's binary_logloss: 0.504079
[6]	valid_0's binary_logloss: 0.475443
[7]	valid_0's binary_logloss: 0.449097
[8]	valid_0's binary_logloss: 0.424776
[9]	valid_0's binary_logloss: 0.402315
[10]	valid_0's binary_logloss: 0.381503
[11]	valid_0's binary_logloss: 0.362212
[12]	valid_0's binary_logloss: 0.344286
[13]	valid_0's binary_logloss: 0.327612
[14]	valid_0's binary_logloss: 0.312076
[15]	valid_0's binary_logloss: 0.2976
[16]	valid_0's binary_logloss: 0.2841
[17]	valid_0's binary_logloss: 0.271487
[18]	valid_0's binary_logloss: 0.259705
[19]	valid_0's binary_logloss: 0.248675
[20]	valid_0's binary_logloss: 0.238354
[21]	valid_0's binary_logloss: 0.228687
[22]	valid_0's binary_logloss: 0.219634
[23]	valid_0's binary_logloss: 0.211154
[24]	valid_0's binary_logloss: 

[200]	valid_0's binary_logloss: 0.0827287
[201]	valid_0's binary_logloss: 0.0827244
[202]	valid_0's binary_logloss: 0.0827269
[203]	valid_0's binary_logloss: 0.08273
[204]	valid_0's binary_logloss: 0.0827279
[205]	valid_0's binary_logloss: 0.0827289
[206]	valid_0's binary_logloss: 0.0827256
[207]	valid_0's binary_logloss: 0.0827333
[208]	valid_0's binary_logloss: 0.0827262
[209]	valid_0's binary_logloss: 0.0827236
[210]	valid_0's binary_logloss: 0.0827185
[211]	valid_0's binary_logloss: 0.0827213
[212]	valid_0's binary_logloss: 0.0827235
[213]	valid_0's binary_logloss: 0.0827188
[214]	valid_0's binary_logloss: 0.0827153
[215]	valid_0's binary_logloss: 0.0827123
[216]	valid_0's binary_logloss: 0.0827065
[217]	valid_0's binary_logloss: 0.0827028
[218]	valid_0's binary_logloss: 0.082701
[219]	valid_0's binary_logloss: 0.082703
[220]	valid_0's binary_logloss: 0.0827007
[221]	valid_0's binary_logloss: 0.0827025
[222]	valid_0's binary_logloss: 0.082702
[223]	valid_0's binary_logloss: 0.08270

In [None]:
## NOTE(review): presumably the validation log-loss and KS printed by the baseline cell, pasted here for reference — confirm
0.0827499242483
0.237778550462

In [None]:
# Concatenate every non-empty predict_category_property string into one ';'-separated string.
tt = ';'.join(dfAll.loc[dfAll['predict_category_property']!='','predict_category_property'].tolist())

In [None]:
# Split on ':' and ';' and keep every second token (even positions), i.e. the key side of each 'key:value' pair.
cate = set(re.split('[:;]',tt)[::2])

In [None]:
# Break the collected keys further on ',' to get the set of individual comma-separated tokens.
tmp = set(','.join(list(cate)).split(','))

In [None]:
# Number of distinct comma-separated tokens.
len(tmp)

In [None]:
# Tokens that occur both as a whole key and as a single comma-separated token.
cate&tmp

<font color=#0099ff size=5 face="黑体">特征工程</font>

In [None]:
### Single-feature mapping
# NOTE(review): map_col is not defined in the visible part of this notebook — confirm where it comes from.
dfAll = map_col(dfAll,True)
print(dfAll.shape)
# Base feature list: every column that is not listed in config.IGNORE_COLS.
featBase = [i for i in dfAll.columns.tolist() if not i in config.IGNORE_COLS]

In [None]:
### Smoothed CTR
#keyList = ['item_id']
keyList = config.CATEGORICAL_COLS

# Use the cached smoothed-CTR columns when the cache file exists, otherwise compute them.
# The cached columns are appended side by side (axis=1), so they rely on the cache
# having the same row order as dfAll.
if os.path.exists('../../Data/advertisment/Cache/smooth_new.csv'):
    dfSmooth = pd.read_csv('../../Data/advertisment/Cache/smooth_new.csv')
    dfAll = pd.concat([dfAll,dfSmooth],axis=1)
    del dfSmooth
else:
    # NOTE(review): smooth_ctr is not defined in the visible part of this notebook.
    dfAll = smooth_ctr(dfAll,keyList)
    # The cache-writing code below is disabled (it sits inside a string literal), so
    # this branch never refreshes the cache file.
    '''toSave = dfAll.iloc[:,49:]
    toSave.head()
    toSave.to_csv('../../Data/advertisment/Cache/smooth_new.csv',index=False)'''

print(dfAll.shape)

In [None]:
### Offline feature set
# feat_set is the day shifted by one; it is passed to _offline_feat together with 'day'
# and removed again once the loop is done.
# NOTE(review): _offline_feat is not defined in the visible part of this notebook.
dfAll['feat_set'] = dfAll['day'] + 1

# The three lists are parallel: entry k of partList / meanList belongs to entry k of keyList.
keyList = ['user_id','shop_id','item_id','hour','item_category_list_bin1']
partList = [
    ['item_id','shop_id'],
    ['user_id','item_id'],
    ['user_id','shop_id'],
    ['user_id','item_id','shop_id'],
    ['user_id','item_id','shop_id'],
]
meanList = [
    ['shop_id'],
    ['item_id'],
    [],
    ['user_id','shop_id','item_id'],
    ['user_id','shop_id','item_id'],
]

for keyVar, partVar, meanVar in zip(keyList, partList, meanList):
    # Collect the stat columns of every config.STAT_DICT entry except the key's own.
    statVar = []
    if isinstance(keyVar,str):
        for key,value in config.STAT_DICT.items():
            if key!=keyVar:
                statVar.extend(value)
    dfAll = _offline_feat(dfAll,keyVar,statVar,partVar,meanVar,['day','feat_set'])

del dfAll['feat_set']
print(dfAll.shape)

In [None]:
### Cross features of continuous variables
# Columns that are combined with each other.
conList = [
    'user_gender_id','user_age_level', 'user_star_level',
    'item_price_level', 'item_sales_level','item_collected_level', 'item_pv_level',
    'context_page_id',
    'shop_review_num_level','shop_star_level'
]
# NOTE(review): cross_feat_plus is not defined in the visible part of this notebook;
# presumably `order` is the number of columns combined per cross feature — confirm.
dfAll = cross_feat_plus(dfAll,conList,order=2)
dfAll = cross_feat_plus(dfAll,conList,order=3)
print(dfAll.shape)

In [None]:
### Same-day information trick
# Each entry is a single key column or a list of key columns handed to same_day_trick.
keyList = ['user_id',['user_id','shop_id'],['user_id','item_category_list_bin1']]
#,'shop_id','item_id','item_city_id','item_brand_id'
# NOTE(review): same_day_trick is not defined in the visible part of this notebook.
for keyVar in keyList:
    dfAll = same_day_trick(dfAll,keyVar)
print(dfAll.shape)

In [None]:
### Pairwise categorical-variable ratios / rank order
# The string literal below is disabled code that would compute these features via
# interaction_ratio; the cell instead loads them from a cache file unconditionally.
'''baseList = [
    'cnt_rec',
    'user_id','user_gender_id', 'user_occupation_id','user_age_level', 'user_star_level',
    'item_id', 'item_brand_id', 'item_city_id', 'item_price_level', 'item_sales_level','item_collected_level', 'item_pv_level',
    'item_category_list_bin1','item_category_list_bin2',
    'shop_id', 'shop_review_num_level','shop_star_level'
    
]

calList = [
    'user_id','user_gender_id', 'user_occupation_id','item_id', 'item_brand_id', 'item_city_id',
    'item_category_list_bin1','item_category_list_bin2','shop_id'
]
rankList = [
    'user_age_level', 'user_star_level','item_price_level', 'item_sales_level','item_collected_level', 'item_pv_level','shop_review_num_level','shop_star_level'
]

dfAll = interaction_ratio(dfAll,baseList,calList,rankList)'''
# The cached columns are appended side by side (axis=1), so they rely on the cache
# having the same row order as dfAll.
dfCross = pd.read_csv('../../Data/advertisment/Cache/ratio_rank_new.csv')
dfAll = pd.concat([dfAll,dfCross],axis=1)
del dfCross

print(dfAll.shape)

<font color=#0099ff size=5 face="黑体">拆分样本</font>

In [None]:
# Model features: every column that is not listed in config.IGNORE_COLS.
features = [i for i in dfAll.columns.tolist() if not i in config.IGNORE_COLS]

# Same time-based split as for the sparse baseline: days 19-23 for fitting, day 24 for validation.
train_idx = dfTrain.loc[(dfTrain['day']<24)&(dfTrain['day']>18)].index
valid_idx = dfTrain.loc[dfTrain['day']==24].index
# dfTrain and dfAll were both re-indexed from 0, so dfTrain's labels select the same rows in dfAll.
Xi_train_, y_train_ = dfAll.loc[list(train_idx),features],dfTrain.loc[train_idx,'is_trade']
Xi_valid_, y_valid_ = dfAll.loc[list(valid_idx),features],dfTrain.loc[valid_idx,'is_trade']
# Label slice starting at trainNum: the test rows stacked below the training rows.
Xi_test_ = dfAll.loc[trainNum:,features]

# Release the full frame; only the three splits are used from here on.
del dfAll

<font color=#0099ff size=5 face="黑体">模型</font>

In [None]:
# LightGBM classifier on the engineered feature set, with the same hyper-parameters as the sparse baseline.
# n_estimators is an upper bound; early stopping on the validation set picks the actual count.
# NOTE(review): in recent LightGBM releases `subsample` only takes effect when
# `subsample_freq` > 0 — confirm against the installed version.
clf = lgb.LGBMClassifier(
    boosting_type = 'gbdt',
    num_leaves=40, 
    max_depth=8,
    n_estimators=20000,
    n_jobs=20,
    learning_rate=0.05,
    colsample_bytree=0.8,
    subsample=0.9,
    max_bin=20
)
# NOTE(review): newer LightGBM versions replace the `early_stopping_rounds` fit argument
# with the `lgb.early_stopping` callback — confirm before upgrading.
clf.fit(Xi_train_[features], y_train_, eval_set=[(Xi_valid_[features], y_valid_)],feature_name = features,
        categorical_feature=[],early_stopping_rounds=100)
#[i for i in ['item_category_list_bin1','item_category_list_bin2'] if i in features]
# Predicted probability of the positive class on the validation rows.
y_score_ = clf.predict_proba(Xi_valid_[features],)[:, 1]

# Feature importances (most important first), then validation log-loss and KS.
print(pd.Series(clf.feature_importances_, features).sort_values(ascending=False).reset_index())
print(log_loss(y_valid_, y_score_))
print(ks_metric(y_valid_, y_score_))
# Best iteration found by early stopping; reused as n_estimators for the final fit.
bstIter = clf.best_iteration_

In [None]:
# Importance table: column 'index' holds the feature name, column 0 its importance, sorted descending.
xx = pd.Series(clf.feature_importances_, features).sort_values(ascending=False).reset_index()
# Number of features the fitted model gave zero importance.
(xx[0]==0).sum()

In [None]:
# Keep only the features with non-zero importance; this overwrites `features` for the final fit.
features = xx.loc[xx[0]>0,'index'].tolist()

In [None]:
def score_change(score,base_rate,real_rate):
    """Shift predicted probabilities from one base rate to another in log-odds space.

    Computes ``sigmoid(logit(score) - (logit(base_rate) - logit(real_rate)))``.
    This cell redefines the helper of the same name from the helper cell near the
    top of the notebook.

    Args:
        score: predicted probability or array/Series of probabilities.
        base_rate: positive rate the scores are currently calibrated to.
        real_rate: positive rate the scores should be calibrated to.

    Returns:
        The adjusted probabilities, same shape as ``score``.
    """
    # Offset between the two priors, expressed in log-odds.
    base_change = special.logit(base_rate) - special.logit(real_rate)
    # expit(x) equals exp(x) / (exp(x) + 1) but is evaluated once and stays finite at
    # the extremes; the hand-written ratio produced inf/inf = NaN for score == 1.
    return special.expit(special.logit(score) - base_change)

In [None]:
# Xi_train_ and Xi_valid_ must stay alive here: the final-fit cell below concatenates
# them into Xi_finnal_ and deletes them itself right afterwards. Deleting them at this
# point made that cell fail with NameError on a top-to-bottom run.

In [None]:
# Xi_finnal_ is only created by the final-fit cell below, so on a fresh top-to-bottom
# run it does not exist yet; guard the lookup instead of raising NameError.
Xi_finnal_.shape if 'Xi_finnal_' in globals() else None

In [None]:
#Xi_finnal_ ,y_finnal_ = np.vstack((Xi_train_,Xi_valid_),np.hstack((y_train_,y_valid_))
# Final training set: the fitting and validation rows combined; the two parts are freed afterwards.
Xi_finnal_ ,y_finnal_ = pd.concat([Xi_train_,Xi_valid_]), pd.concat([y_train_,y_valid_])
del Xi_train_
del Xi_valid_

# Refit with the tree count fixed to the best iteration found by early stopping;
# no validation set is available any more, so there is no early stopping here.
clf = lgb.LGBMClassifier(
    num_leaves=40, 
    max_depth=8,
    n_estimators=bstIter,
    n_jobs=20,
    learning_rate=0.05,
    colsample_bytree=0.8,
    subsample=0.9,
    max_bin=20
)
clf.fit(Xi_finnal_[features], y_finnal_,feature_name = features,
        categorical_feature=[])
#[i for i in ['item_category_list_bin1','item_category_list_bin2'] if i in features]
# One-column buffer for the test predictions; it starts at zero, so `+=` stores the scores as-is.
y_test_meta = np.zeros((dfTest.shape[0], 1), dtype=float)
y_test_meta[:,0] += clf.predict_proba(Xi_test_[features])[:,1]
# Submission frame: one predicted positive-class probability per test instance.
submit = pd.DataFrame({'instance_id':dfTest['instance_id'],'predicted_score':y_test_meta[:,0]})
#submit.to_csv('../../Submission/advertisement/gbm_trick_0330.txt', sep=" ", index=False, line_terminator='\n')
In [None]:
# Assign rather than accumulate: y_test_meta already holds these predictions from the
# final-fit cell, so `+=` doubled every score whenever both cells were run.
y_test_meta[:,0] = clf.predict_proba(Xi_test_[features])[:,1]
submit = pd.DataFrame({'instance_id':dfTest['instance_id'],'predicted_score':y_test_meta[:,0]})

In [None]:
# Keep only the instances listed in idSubmit. The explicit .copy() makes the filtered
# frame independent, so later column assignments on `submit` do not trigger
# SettingWithCopyWarning.
submit = submit.loc[submit['instance_id'].isin(idSubmit)].copy()
# Mean predicted score of the rows that will be submitted.
submit['predicted_score'].mean()

In [None]:
# Mean of the is_trade label over the final training set, for comparison with the mean predicted score.
y_finnal_.mean()

In [None]:
# Write the raw model scores as a space-separated submission file.
# NOTE(review): newer pandas versions spell this argument `lineterminator` — confirm against the installed version.
submit.to_csv('../../Submission/advertisement/gbm_trick_testb_419.txt', sep=" ", index=False, line_terminator='\n')

In [None]:
# All-zero baseline submission. It is written from a modified copy so the model scores
# in `submit` survive; overwriting the column in place left only zeros for the
# calibrated submission written in the following cell.
submit.assign(predicted_score=0).to_csv('../../Submission/advertisement/gbm_trick_testb_418.txt', sep=" ", index=False, line_terminator='\n')

In [None]:
# Re-calibrate the scores from their current mean to a target rate of 0.018116956.
# NOTE(review): the origin of 0.018116956 is not shown in this notebook — confirm its source.
# The column is overwritten in place, so re-running this cell adjusts already-adjusted scores.
submit['predicted_score'] = score_change(submit['predicted_score'],submit['predicted_score'].mean(),0.018116956)
print(submit['predicted_score'].mean())
submit.to_csv('../../Submission/advertisement/gbm_trick_testb_adj_419.txt', sep=" ", index=False, line_terminator='\n')

In [None]:
# Load a previously written submission file; this replaces the in-memory `submit` frame.
# NOTE(review): 'gbm_trick_text_417.txt' is not written anywhere in this notebook — confirm the file name.
submit = pd.read_csv('../../Submission/advertisement/gbm_trick_text_417.txt',sep=" ")