In [1]:
import time
import pandas as pd
import numpy as np
import lightgbm as lgb
import config
from scipy.sparse import csr_matrix, hstack
from sklearn.metrics import log_loss
from sklearn.preprocessing import LabelEncoder,OneHotEncoder,StandardScaler
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
###组合信息
timeFeatList = [
    ['user_id','day'],
    ['user_id','day','hour'],
    ['item_id','day'],
    ['item_id','day','hour'],
    ['shop_id','day'],
    ['shop_id','day','hour'],
    ['item_brand_id','day'],
    ['item_brand_id','day','hour'],
    ['item_city_id','day'],
    ['item_city_id','day','hour'],
]
keyVarList = [
    ['user_id'],
    ['item_id'],
    ['shop_id'],
    ['user_id','item_id'],
    ['user_id','shop_id']
]

In [3]:
def timestamp_datetime(value):
    return time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(value))

def time_feat(df,featList,featName):
    '''scaler = StandardScaler()
    tmp = df.groupby(featList).size().reset_index().rename(columns={0:featName})
    tmp[featName] = scaler.fit_transform(tmp[featName].values.reshape(-1,1))
    df = df.merge(tmp,'left',on=featList)'''
    df[featName] = df.groupby(featList)['context_timestamp'].rank(method='first')   
    return df

def process(df):
    df['time'] = df.context_timestamp.apply(timestamp_datetime)
    df['day'] = df.time.apply(lambda x: int(x[8:10]))
    df['hour'] = df.time.apply(lambda x: int(x[11:13]))
    for lst in timeFeatList:
        df = time_feat(df,lst,'_'.join(lst))
    df['item_property_list_clean'] = df['item_property_list'].apply(lambda x:';'.join(sorted(set(str(x).split(';')))))
    for i in range(3):
        df['item_category_list_bin%d'%i] = df['item_category_list'].apply(lambda x: x.split(';')[i] if len(x.split(';'))>i else -1)
    df["missing_feat"] = np.sum((df == -1).values, axis=1)
    return df
def get_onehot(df,field):
    one = OneHotEncoder()
    lb = LabelEncoder()
    tmp = lb.fit_transform((list(df[field])))
    one.fit(tmp.reshape(-1,1))
    oneOut = one.transform(lb.transform(df[field]).reshape(-1,1))
    return oneOut

def focus_one_record(df,key_var=[]):
    nameBase = '_'.join(key_var)
    ###当天前后的数据情况
    df[nameBase+'_before_cnt'] = df.groupby(key_var+['day'])['context_timestamp'].rank(method='min') - 1
    df[nameBase+'_after_cnt'] = df.groupby(key_var+['day'])['context_timestamp'].rank(method='min',ascending=False)- 1
    df[nameBase+'_sametime_cnt'] = df.groupby(key_var+['day'])['context_timestamp'].rank(method='max') - df.groupby(key_var+['day'])['context_timestamp'].rank(method='min')+1
    df = df.merge(df.groupby(key_var+['day'],as_index=False)['context_timestamp'].agg({nameBase+'_day_cnt':'count'}),'inner',key_var+['day'])
    for feat in ['_before_cnt','_after_cnt','_sametime_cnt']:
        df[nameBase+feat+'_ratio'] = df[nameBase+feat]*1.0/df[nameBase+'_day_cnt']
        
    ###前一天购买/浏览的数量
    dfTmp =  df.groupby(key_var+['day'],as_index=False)['is_trade'].agg({nameBase+'_preday_trade_cnt':'sum',nameBase+'_preday_cnt':'count'})
    dfTmp['day'] = dfTmp['day']+1
    df = df.merge(dfTmp,'left',key_var+['day'])
    df[nameBase+'_preday_trade_ratio'] = df[nameBase+'_preday_trade_cnt']*1.0/df[nameBase+'_preday_cnt']
    for feat in ['_preday_trade_cnt','_preday_cnt','_preday_trade_ratio']:
        df[nameBase+feat].fillna(0,inplace=True)
        
    ###广告展示上下间隔
    dfTmp = df[[nameBase+'_before_cnt',nameBase+'_after_cnt',nameBase+'_sametime_cnt','time']+key_var+['day']]
    dfTmp.drop_duplicates(inplace=True)
    dfTmp.rename(columns={'time':'new_time'},inplace=True)
    dfTmp['next_record'] = dfTmp[nameBase+'_before_cnt'] + dfTmp[nameBase+'_sametime_cnt'] + 1
    dfTmp['last_record'] = dfTmp[nameBase+'_after_cnt'] + dfTmp[nameBase+'_sametime_cnt'] + 1
    df = df.merge(dfTmp[key_var+['day','next_record','new_time']],'left',left_on = key_var+['day',nameBase+'_before_cnt'],right_on = key_var+['day','next_record'])
    df[nameBase+'next_time_dur'] = (pd.to_datetime(df['time'])-pd.to_datetime(df['new_time'])).dt.seconds
    df[nameBase+'next_time_dur'].fillna(999999,inplace=True)
    df.loc[df[nameBase+'_sametime_cnt']>1,nameBase+'next_time_dur'] = 0
    del df['new_time']
    del df['next_record']
    
    df = df.merge(dfTmp[key_var+['day','last_record','new_time']],'left',left_on = key_var+['day',nameBase+'_after_cnt'],right_on = key_var+['day','last_record'])
    df[nameBase+'last_time_dur'] = (pd.to_datetime(df['new_time'])-pd.to_datetime(df['time'])).dt.seconds
    df[nameBase+'last_time_dur'].fillna(999999,inplace=True)
    df.loc[df[nameBase+'_sametime_cnt']>1,nameBase+'last_time_dur'] = 0
    del df['new_time']
    del df['last_record']
    return df

In [5]:
dfTrain = pd.read_table(config.TRAIN_FILE,sep=' ')
dfTrain.drop_duplicates(inplace=True)
dfTrain.reset_index(inplace=True,drop =True)
dfTest = pd.read_table(config.TEST_FILE,sep=' ')

dfTrain = process(dfTrain)
dfTest = process(dfTest)

dfAll = pd.concat([dfTrain,dfTest],axis=0)
dfAll.reset_index(inplace=True,drop=True)
trainNum = dfTrain.shape[0]
dfAll.shape

(496482, 45)

In [6]:
print(dfAll.shape)
for key_var in keyVarList:
    dfAll = focus_one_record(dfAll,key_var)
    print(dfAll.shape)

(496482, 45)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  return super(DataFrame, self).rename(**kwargs)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


(496482, 57)
(496482, 69)
(496482, 81)
(496482, 93)
(496482, 105)


In [9]:
for day in range(18,26):
    tmp = dfAll.loc[dfAll['day']==day]
    print(tmp.iloc[:,45:].describe())
    tmp.iloc[:,45:].describe().to_csv('../../Data/advertisment/Cache/describe_%d.csv'%day)

       user_id_before_cnt  user_id_after_cnt  user_id_sametime_cnt  \
count        78261.000000       78261.000000          78261.000000   
mean             1.243633           1.243633              1.162686   
std              2.176036           2.149533              0.416235   
min              0.000000           0.000000              1.000000   
25%              0.000000           0.000000              1.000000   
50%              0.000000           0.000000              1.000000   
75%              2.000000           2.000000              1.000000   
max             37.000000          37.000000              3.000000   

       user_id_day_cnt  user_id_before_cnt_ratio  user_id_after_cnt_ratio  \
count     78261.000000              78261.000000             78261.000000   
mean          3.649953                  0.236034                 0.236034   
std           3.347964                  0.284540                 0.277005   
min           1.000000                  0.000000             

       user_id_before_cnt  user_id_after_cnt  user_id_sametime_cnt  \
count        68384.000000       68384.000000          68384.000000   
mean             1.225711           1.225711              1.161968   
std              2.068045           2.039643              0.415119   
min              0.000000           0.000000              1.000000   
25%              0.000000           0.000000              1.000000   
50%              0.000000           0.000000              1.000000   
75%              2.000000           2.000000              1.000000   
max             32.000000          32.000000              3.000000   

       user_id_day_cnt  user_id_before_cnt_ratio  user_id_after_cnt_ratio  \
count     68384.000000              68384.000000             68384.000000   
mean          3.613389                  0.236146                 0.236146   
std           3.143234                  0.284853                 0.276697   
min           1.000000                  0.000000             

       user_id_before_cnt  user_id_after_cnt  user_id_sametime_cnt  \
count        68315.000000       68315.000000          68315.000000   
mean             1.241836           1.241836              1.160141   
std              2.114778           2.087152              0.413138   
min              0.000000           0.000000              1.000000   
25%              0.000000           0.000000              1.000000   
50%              0.000000           0.000000              1.000000   
75%              2.000000           2.000000              1.000000   
max             28.000000          28.000000              3.000000   

       user_id_day_cnt  user_id_before_cnt_ratio  user_id_after_cnt_ratio  \
count     68315.000000              68315.000000             68315.000000   
mean          3.643812                  0.236597                 0.236597   
std           3.229204                  0.284911                 0.276790   
min           1.000000                  0.000000             

       user_id_before_cnt  user_id_after_cnt  user_id_sametime_cnt  \
count        57418.000000       57418.000000          57418.000000   
mean             1.248372           1.248372              1.166742   
std              2.298762           2.271231              0.425628   
min              0.000000           0.000000              1.000000   
25%              0.000000           0.000000              1.000000   
50%              0.000000           0.000000              1.000000   
75%              2.000000           2.000000              1.000000   
max             41.000000          41.000000              3.000000   

       user_id_day_cnt  user_id_before_cnt_ratio  user_id_after_cnt_ratio  \
count     57418.000000              57418.000000             57418.000000   
mean          3.663485                  0.234680                 0.234680   
std           3.577656                  0.284316                 0.276276   
min           1.000000                  0.000000             

<font color=#0099ff size=5 face="黑体">OneHot稀疏矩阵</font>

In [None]:
cutoff = 5
count = CountVectorizer(ngram_range=(1,1))
sparse_merge = count.fit_transform(dfAll['item_property_list_clean'])
for field in config.CATEGORICAL_COLS:
    tmp = get_onehot(dfAll,field)
    try:
        sparse_merge = hstack((sparse_merge,tmp)).tocsr()
    except:
        sparse_merge = tmp
           
sparse_merge = sparse_merge[:, np.array(np.clip(sparse_merge[:dfTrain.shape[0],:].getnnz(axis=0) -cutoff, 0, 1), dtype=bool)]

In [None]:
sparse_merge = sparse_merge[:, np.array(np.clip(sparse_merge[:dfTrain.shape[0],:].getnnz(axis=0) -cutoff, 0, 1), dtype=bool)]

In [None]:
denseFeatList = config.NUMERIC_COLS
#+['_'.join(lst) for lst in timeFeatList]
denseFeat = dfAll[denseFeatList].values
sparse_merge = hstack((sparse_merge,denseFeat)).tocsr()

In [None]:
train_idx = dfTrain.loc[dfTrain['day']<24].index
valid_idx = dfTrain.loc[dfTrain['day']==24].index
Xi_train_, y_train_ = sparse_merge[list(train_idx),:],dfTrain.loc[train_idx,'is_trade']
Xi_valid_, y_valid_ = sparse_merge[list(valid_idx),:],dfTrain.loc[valid_idx,'is_trade']
Xi_test_ = sparse_merge[trainNum:,:]

<font color=#0099ff size=5 face="黑体">WOE筛选变量</font>

In [None]:
from woe import calc_nominal_woe
from itertools import chain, combinations
import pickle

In [None]:
def powerset(iterable):
    "powerset([1,2,3]) --> () (1,) (2,) (3,) (1,2) (1,3) (2,3) (1,2,3)"
    s = list(iterable)
    return list(chain.from_iterable(combinations(s, r) for r in range(len(s)+1)))

def feat_select(df,featList,order=[1,3],cutoff=0.1):
    dfTrain = df.copy()
    subset = powerset(featList)
    selected = {}
    for sub in subset:
        sub=list(sub)
        if len(sub)<order[0] or len(sub)>order[1]:
            continue
        if len(sub)==1:
            tmp_woe = calc_nominal_woe(df,sub[0],'is_trade',bins=10,small=0.001)
        else:
            df['test'] = df[sub[0]]
            for i in range(1,len(sub)):
                df['test'] = df[['test',sub[i]]].apply(lambda x:str(x[0])+'_'+str(x[1]),axis=1)
            tmp_woe = calc_nominal_woe(df,'test','is_trade',bins=10,small=0.001)
        if tmp_woe[-1]>=cutoff:
            print(sub,tmp_woe[-1])
            selected['_'.join(sub)] = tmp_woe[0]
    return selected
            


In [None]:
tt = feat_select(dfTrain,config.CATEGORICAL_COLS,[3,3])

In [None]:
with open('../../Data/advertisment/Cache/WOE.pkl','rb') as f:
    woe_dict = f.load()
woe_dict=dict(woe_dict.items()+tt.items())


In [None]:
with open('../../Data/advertisment/Cache/WOE.pkl','wb') as f:
    pickle.dump(woe_dict,f)

<font color=#0099ff size=5 face="黑体">根据比值生成特征</font>

In [None]:
for var in dfTrain.columns:
    if var =='is_trade':
        continue
    else:
        dfAll = dfAll.merge(dfTrain.groupby(var)['is_trade'].mean().reset_index().rename(columns={'is_trade':var+'_rate'}),'left',left_on=var,right_on=var)
        dfAll[var+'_rate'].fillna(dfTrain['is_trade'].mean(),inplace=True)

In [16]:
scale = StandardScaler()

['user_id_before_cnt',
 'user_id_after_cnt',
 'user_id_sametime_cnt',
 'user_id_day_cnt',
 'user_id_before_cnt_ratio',
 'user_id_after_cnt_ratio',
 'user_id_sametime_cnt_ratio',
 'user_id_preday_trade_cnt',
 'user_id_preday_cnt',
 'user_id_preday_trade_ratio',
 'user_idnext_time_dur',
 'user_idlast_time_dur',
 'item_id_before_cnt',
 'item_id_after_cnt',
 'item_id_sametime_cnt',
 'item_id_day_cnt',
 'item_id_before_cnt_ratio',
 'item_id_after_cnt_ratio',
 'item_id_sametime_cnt_ratio',
 'item_id_preday_trade_cnt',
 'item_id_preday_cnt',
 'item_id_preday_trade_ratio',
 'item_idnext_time_dur',
 'item_idlast_time_dur',
 'shop_id_before_cnt',
 'shop_id_after_cnt',
 'shop_id_sametime_cnt',
 'shop_id_day_cnt',
 'shop_id_before_cnt_ratio',
 'shop_id_after_cnt_ratio',
 'shop_id_sametime_cnt_ratio',
 'shop_id_preday_trade_cnt',
 'shop_id_preday_cnt',
 'shop_id_preday_trade_ratio',
 'shop_idnext_time_dur',
 'shop_idlast_time_dur',
 'user_id_item_id_before_cnt',
 'user_id_item_id_after_cnt',
 'user

In [17]:
features = ['item_id', 'item_brand_id', 'item_city_id', 'item_price_level', 'item_sales_level',
                'item_collected_level', 'item_pv_level', 'user_gender_id', 'user_occupation_id',
                'user_age_level', 'user_star_level', 'user_query_day', 'user_query_day_hour',
                'context_page_id', 'hour', 'shop_id', 'shop_review_num_level', 'shop_star_level',
                'shop_review_positive_rate', 'shop_score_service', 'shop_score_delivery', 'shop_score_description'
                ]+dfAll.columns.tolist()[45:]
features = dfAll.columns.tolist()[45:]
#['_'.join(lst) for lst in timeFeatList]

#features = [var+'_rate' for var in features]

In [18]:
train_idx = dfTrain.loc[(dfTrain['day']<24)&(dfTrain['day']>18)].index
valid_idx = dfTrain.loc[dfTrain['day']==24].index
Xi_train_, y_train_ = dfAll.loc[list(train_idx),features],dfTrain.loc[train_idx,'is_trade']
Xi_valid_, y_valid_ = dfAll.loc[list(valid_idx),features],dfTrain.loc[valid_idx,'is_trade']
Xi_test_ = dfAll.loc[trainNum:,features]

In [19]:
Xi_train_.shape

(342432, 60)

<font color=#0099ff size=5 face="黑体">模型</font>

In [20]:
clf = lgb.LGBMClassifier(num_leaves=63, max_depth=7, n_estimators=80, n_jobs=20)
clf.fit(Xi_train_, y_train_,
        categorical_feature=[])
y_score_ = clf.predict_proba(Xi_valid_,)[:, 1]
print(log_loss(y_valid_, y_score_))



0.085962163782


In [33]:
Xi_finnal_ ,y_finnal_ = np.vstack((Xi_train_,Xi_valid_)),np.hstack((y_train_,y_valid_))
clf.fit(Xi_finnal_, y_finnal_,
        categorical_feature=[])

y_test_meta = np.zeros((dfTest.shape[0], 1), dtype=float)
y_test_meta[:,0] += clf.predict_proba(Xi_test_)[:,1]
submit = pd.DataFrame({'instance_id':dfTest['instance_id'],'predicted_score':y_test_meta[:,0]})
submit.to_csv('../../Submission/advertisement/gbm_trick_0330.txt', sep=" ", index=False, line_terminator='\n')



In [32]:
np.hstack((y_train_,y_valid_))

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [None]:
y_test_meta = np.zeros((dfTest.shape[0], 1), dtype=float)

In [None]:
submit = pd.DataFrame({'instance_id':dfTest['instance_id'],'predicted_score':y_test_meta[:,0]})

In [None]:
submit['predicted_score'] = 1.0
submit.to_csv('../../Submission/advertisement/test_0330.txt', sep=" ", index=False, line_terminator='\n')