In [1]:
import time
import pandas as pd
import numpy as np
import lightgbm as lgb
import config
from numba import jit
from scipy.sparse import csr_matrix, hstack
from sklearn.metrics import log_loss
from sklearn.preprocessing import LabelEncoder,OneHotEncoder,StandardScaler
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
def timestamp_datetime(value):
    return time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(value))

def time_feat(df,featList,featName):
    '''scaler = StandardScaler()
    tmp = df.groupby(featList).size().reset_index().rename(columns={0:featName})
    tmp[featName] = scaler.fit_transform(tmp[featName].values.reshape(-1,1))
    df = df.merge(tmp,'left',on=featList)'''
    df[featName] = df.groupby(featList)['context_timestamp'].rank(method='first')   
    return df

def process(df):
    df['time'] = df.context_timestamp.apply(timestamp_datetime)
    df['day'] = df.time.apply(lambda x: int(x[8:10]))
    df['hour'] = df.time.apply(lambda x: int(x[11:13]))
    '''for lst in timeFeatList:
        df = time_feat(df,lst,'_'.join(lst))'''
    df['item_property_list_clean'] = df['item_property_list'].apply(lambda x:';'.join(sorted(set(str(x).split(';')))))
    for i in range(3):
        df['item_category_list_bin%d'%i] = df['item_category_list'].apply(lambda x: x.split(';')[i] if len(x.split(';'))>i else -1)
    del df['item_category_list_bin0']
    df["missing_feat"] = np.sum((df == -1).values, axis=1)
    return df

def labelencoder(df):
    lbl = LabelEncoder()
    for var in ['user_id','item_id','shop_id','item_brand_id','item_city_id']:
        df[var] = lbl.fit_transform(df[var])
    return df
        

def focus_one_record(df,key_var=[],time_diff=False):
    if not isinstance(key_var,list):
        key_var = [key_var]
    nameBase = '_'.join(key_var)
    ###当天前后的数据情况
    df[nameBase+'_before_cnt'] = df.groupby(key_var+['day'])['context_timestamp'].rank(method='min') - 1
    df[nameBase+'_after_cnt'] = df.groupby(key_var+['day'])['context_timestamp'].rank(method='min',ascending=False)- 1
    df[nameBase+'_sametime_cnt'] = df.groupby(key_var+['day'])['context_timestamp'].rank(method='max') - df.groupby(key_var+['day'])['context_timestamp'].rank(method='min')+1
    df = df.merge(df.groupby(key_var+['day'],as_index=False)['context_timestamp'].agg({nameBase+'_day_cnt':'count'}),'inner',key_var+['day'])
    for feat in ['_before_cnt','_after_cnt','_sametime_cnt']:
        df[nameBase+feat+'_ratio'] = df[nameBase+feat]*1.0/df[nameBase+'_day_cnt']
                
    ###前一天购买/浏览的数量
    '''dfTmp =  df.groupby(key_var+['day'],as_index=False)['is_trade'].agg({nameBase+'_preday_trade_cnt':'sum',nameBase+'_preday_cnt':'count'})
    dfTmp['day'] = dfTmp['day']+1
    df = df.merge(dfTmp,'left',key_var+['day'])
    df[nameBase+'_preday_trade_ratio'] = df[nameBase+'_preday_trade_cnt']*1.0/df[nameBase+'_preday_cnt']
    for feat in ['_preday_trade_cnt','_preday_cnt','_preday_trade_ratio']:
        df[nameBase+feat].fillna(0,inplace=True)'''
        
    if time_diff:
        ###广告展示上下间隔
        dfTmp = df[[nameBase+'_before_cnt',nameBase+'_after_cnt',nameBase+'_sametime_cnt','time']+key_var+['day']]
        dfTmp.rename(columns={'time':'new_time'},inplace=True)
        dfTmp['next_record'] = dfTmp[nameBase+'_before_cnt'] + dfTmp[nameBase+'_sametime_cnt'] + 1
        dfTmp['last_record'] = dfTmp[nameBase+'_after_cnt'] + dfTmp[nameBase+'_sametime_cnt'] + 1
        df = df.merge(dfTmp[key_var+['day','next_record','new_time']],'left',left_on = key_var+['day',nameBase+'_before_cnt'],right_on = key_var+['day','next_record'])
        df[nameBase + '_next_time_dur'] = (pd.to_datetime(df['time'])-pd.to_datetime(df['new_time'])).dt.seconds
        df[nameBase + '_next_time_dur'].fillna(999999,inplace=True)
        df.loc[df[nameBase+'_sametime_cnt']>1,nameBase + '_next_time_dur'] = 0
        del df['new_time']
        del df['next_record']

        df = df.merge(dfTmp[key_var+['day','last_record','new_time']],'left',left_on = key_var+['day',nameBase+'_after_cnt'],right_on = key_var+['day','last_record'])
        df[nameBase + '_last_time_dur'] = (pd.to_datetime(df['new_time'])-pd.to_datetime(df['time'])).dt.seconds
        df[nameBase + '_last_time_dur'].fillna(999999,inplace=True)
        df.loc[df[nameBase+'_sametime_cnt']>1,nameBase + '_last_time_dur'] = 0
        del df['new_time']
        del df['last_record']
    for feat in ['_before_cnt','_after_cnt','_sametime_cnt']:
        del df[nameBase+feat]
    return df    
    
def Discretize(vlu, list_cat, greedy_f=False):
    if len(list_cat) == 0:
        return set([str(vlu)])
    cut_l = ['-Inf'] + [str(i) for i in list_cat if i < vlu]
    cut_h = [str(i) for i in list_cat if i >= vlu] + ['Inf']
    if greedy_f:
        return set([i + '_' + j for i in cut_l for j in cut_h])
    else:
        return set([cut_l[-1] + '_' + cut_h[0]])
    
def Discretization(rec, dict_cat, greedy_f=False):
    return (dict([(key, Discretize(rec[key], dict_cat[key], greedy_f)) for key in dict_cat]))

def cat_str(rec, dict_cat, greedy_f=False):
    cat_tag = Discretization(rec, dict_cat, greedy_f=False)
    flat_list = ';'.join([key + '_' + str(item) for key in cat_tag.keys() for item in cat_tag[key]])
    return (flat_list)

@jit
def _agg_df(df, grp_key, sum_var, cnt_var, stat_var):
    grouped = df.groupby(grp_key)
    agg_sum = grouped[sum_var].agg('sum').reset_index().melt(id_vars=grp_key, value_vars=sum_var)
    agg_sum['method'] = 'sum'
    agg_uniq = grouped[cnt_var].agg('nunique').reset_index().melt(id_vars=grp_key, value_vars=cnt_var)
    agg_uniq['method'] = 'cnt'
    agg_max = grouped[stat_var].agg('max').reset_index().melt(id_vars=grp_key, value_vars=stat_var)
    agg_max['method'] = 'max'
    agg_min = grouped[stat_var].agg('min').reset_index().melt(id_vars=grp_key, value_vars=stat_var)
    agg_min['method'] = 'min'
    agg_mst = pd.concat([agg_sum, agg_uniq, agg_max, agg_min])
    return (agg_mst)

def _sig_calc(cdr_rec, key_var, TS_cat, dict_cat,sum_var, cnt_var, stat_var):
    cdr_dict = cdr_rec.to_dict(orient='records')
    cdr_dict = [dict(list(x.items()) + [('TS', '~'.join(list(Discretization(x, TS_cat).values())[0]))] + [
        ('cat_str', cat_str(x, dict_cat))]) for x in cdr_dict]
    cdr_rec = pd.DataFrame(cdr_dict).reset_index()
    # one to many
    cat_driver = pd.concat([pd.Series(row['index'], row['cat_str'].split(';'))
                            for _, row in cdr_rec.iterrows()]).reset_index()
    cat_driver.columns = ['cat', 'index']
    cdr_rec_m = pd.merge(cdr_rec, cat_driver, on='index', how='inner')
    # Aggregation
    if key_var=='':
        key_var_tmp = ['TS']
    else:
        key_var_tmp = key_var+['TS']
    agg_mst_m = _agg_df(cdr_rec_m, key_var_tmp+['cat'], sum_var, cnt_var, stat_var)
    agg_mst_m['var_name'] = agg_mst_m[['method', 'variable', 'cat']].apply(
        lambda x: x[0] + '_' + x[1] + '_cat_' + str(x[2]), axis=1)
    if key_var=='':
        agg_mst_m['key'] = agg_mst_m[key_var_tmp]
    else:
        agg_mst_m['key'] = agg_mst_m[key_var_tmp].apply(lambda x: '{0}:{1}'.format(x[0], x[1]), axis=1)
    agg_df_m = agg_mst_m.pivot(index='key', columns='var_name', values='value').reset_index()

    agg_mst = _agg_df(cdr_rec, key_var_tmp, sum_var, cnt_var, stat_var)
    agg_mst['var_name'] = agg_mst[['method', 'variable']].apply(lambda x: x[0] + '_' + x[1] + '_cat_total_all', axis=1)
    if key_var=='':
        agg_mst['key'] = agg_mst[key_var_tmp]
    else:
        agg_mst['key'] = agg_mst[key_var_tmp].apply(lambda x: '{0}:{1}'.format(x[0], x[1]), axis=1)
    agg_df = agg_mst.pivot(index='key', columns='var_name', values='value').reset_index()

    agg_df = pd.merge(agg_df, agg_df_m, on='key', how='left')
    
    sig_ratio = [sig for sig in agg_df.columns if
                 (('sum_' in sig) or ('cnt_' in sig)) and ('cat_total_all' not in sig)]
    for sig in sig_ratio:
        agg_df['rto_' + sig] = agg_df[[sig, sig.split('cat')[0] + 'cat_total_all']].apply(
            lambda x: (x[0] + 0.0) / x[1], axis=1)
        del agg_df[sig]
    return agg_df

def _offline_feat(df,key_var='user_id',stat_var=[],part_var=[],mean_var=[],train_feat_col=None):
    if not isinstance(key_var,list):
        key_var = [key_var]
    left_key = key_var.copy()
    if not train_feat_col:
        key_var.append(train_feat_col[1])
        left_key.append(train_feat_col[0])
        
    base_name = '~'.join(key_var)
    df = df.merge(df.groupby(key_var,as_index=False)['instance_id'].agg({base_name+'_cnt':'count'}),'left',left_on =left_key, right_on = key_var)
    df = df.merge(df.groupby(key_var,as_index=False)['is_trade'].agg({base_name+'_trade_cnt':'sum',base_name+'_trade_ratio':'mean'}),'left',left_on =left_key, right_on = key_var)
    df[base_name+'_notrade_cnt'] = df[base_name+'_cnt']-df[base_name+'_trade_cnt']
    dfTmp = df.loc[df['is_trade']==1]
    for stat in stat_var:
        df = df.merge(df.groupby(key_var,as_index=False)[stat].agg({base_name+'_'+stat+'_min':'min',base_name+'_'+stat+'_max':'max'}),'left',left_on =left_key, right_on = key_var)    
    for part in part_var:
        df = df.merge(df.groupby(key_var,as_index=False)[part].agg({base_name+'_'+part+'_cnt':'nunique'}),'left',left_on =left_key, right_on = key_var)
        df = df.merge(dfTmp.groupby(key_var,as_index=False)[part].agg({base_name+'_'+part+'_trade_cnt':'nunique'}),'left',left_on =left_key, right_on = key_var)
        df[base_name+'_'+part+'_trade_cnt'].fillna(0,inplace=True)
        df[base_name+'_'+part+'_trade_ratio'] = 1.0*df[base_name+'_'+part+'_trade_cnt']/df[base_name+'_'+part+'_cnt']
    for var in mean_var:
        df = df.merge(df.groupby(key_var+[var],as_index=False)['is_trade'].sum().groupby(key_var,as_index=False)['is_trade'].agg({base_name+'_'+var+'_avg_trade':'mean'}),'left',left_on =left_key, right_on = key_var)
    return df

def map_col(df,drop=False):
    map_dict = {
        'item_price_level':[4,5,6,7,8,9],
        'item_sales_level':[4,6,9,10,11,12,13,14,16],
        'item_pv_level':[6,9,10,11,12,13,14,15,16,17,18,19,20],
        'user_age_level':[1001,1002,1003,1004,1005],
        'context_page_id':[4001,4002,4004,4006,4008,4010,4013,4016,4018],
        'shop_review_num_level':[5,9,14,15,16,17,18,20,21],
        'hour':[6,9,12,17,20],
        'user_occupation_id':{-1:2003},
        'user_star_level':{-1:3000}
    }
    for key,value in map_dict.items():
        if isinstance(value,list):
            df[key+'_mapped'] = 0
            for i in range(len(value)):
                df.loc[df[key]>value[i],key+'_mapped'] = i+1
        else:
            '''df[key+'_mapped'] = df[key]
            for key_sub,value_sub in value.items():
                df.loc[df[key]==key_sub,key+'_mapped'] = value_sub'''
            df[key+'_mapped'] = df[key].apply(lambda x:value.get(x,x))
        if drop:
            df[key] = df[key+'_mapped']
            del df[key+'_mapped']
    return df

def interaction_ratio(df,base_list=[],cal_list=[],rank_list = []):
    for base_var in base_list:
        if not isinstance(base_var,list):
            base_var = [base_var]
        if not '_'.join(base_var)+'_cnt' in df.columns:
            df = df.merge(df.groupby(base_var,as_index=False)['instance_id'].agg({'_'.join(base_var)+'_cnt':'count'}),'left',base_var)
        print('ratio part')
        for cal_var in cal_list:
            if not isinstance(cal_var,list):
                cal_var = [cal_var]
            if cal_var==base_var or base_var==['cnt_rec']:
                continue
            nameBase = '_'.join(base_var)+'~'+'_'.join(cal_var)
            print(nameBase)
            df = df.merge(df.groupby(base_var+cal_var,as_index=False)['instance_id'].agg({nameBase+'_cnt':'count'}),'left',base_var+cal_var)
            df[nameBase+'_ratio'] = df[nameBase+'_cnt']*1.0/df['_'.join(base_var)+'_cnt']
            del df[nameBase+'_cnt']
        
        print('rank part')
        for rank_var in rank_list:
            if not isinstance(rank_var,list):
                rank_var = [rank_var]
            if rank_var==base_var:
                continue
            nameBase = '_'.join(base_var)+'~'+'_'.join(rank_var)
            print(nameBase)
            df[nameBase+'_rank'] = dfAll.groupby(base_var)[rank_var].rank(method='min')
            df[nameBase+'_rank_ratio'] = df[nameBase+'_rank']*1.0/df['_'.join(base_var)+'_cnt']
            del df[nameBase+'_rank']
        del df['_'.join(base_var)+'_cnt']
    return df
            
            

<font color=#0099ff size=5 face="黑体">读取数据</font>

In [3]:
dfTrain = pd.read_table(config.TRAIN_FILE,sep=' ')
dfTrain.drop_duplicates(inplace=True)
dfTrain.reset_index(inplace=True,drop =True)
dfTest = pd.read_table(config.TEST_FILE,sep=' ')

dfTrain = process(dfTrain)
dfTest = process(dfTest)

dfAll = pd.concat([dfTrain,dfTest],axis=0)
dfAll.reset_index(inplace=True,drop=True)
trainNum = dfTrain.shape[0]
dfAll['cnt_rec'] = 1
dfAll = labelencoder(dfAll)
dfAll.shape

(496482, 35)

<font color=#0099ff size=5 face="黑体">特征工程</font>

In [4]:
###单特征map
dfAll = map_col(dfAll,True)
print(dfAll.shape)
featBase = [i for i in dfAll.columns.tolist() if not i in config.IGNORE_COLS]

(496482, 35)


In [5]:
###线下特征集合
dfAll['feat_set'] = dfAll['day'] + 1
keyList = ['user_id','shop_id','item_id']
partList = [
    ['item_id','shop_id'],
    ['user_id','item_id'],
    ['user_id','shop_id']
]
meanList = [
    ['shop_id'],
    ['item_id'],
    []
]
for i in range(3):
    keyVar = keyList[i]
    partVar = partList[i]
    meanVar = meanList[i]
    statVar = []
    if isinstance(keyVar,str):
        for key,value in config.STAT_DICT.items():
            if key==keyVar:
                continue
            statVar += value
    dfAll = _offline_feat(dfAll,keyVar,statVar,partVar,meanVar,['day','feat_set'])
del dfAll['feat_set']
print(dfAll.shape)

(496482, 121)


In [None]:
###当天信息的trick
keyList = ['user_id','shop_id','item_id','item_city_id','item_brand_id']
for keyVar in keyList:
    timeDiff = False if keyVar=='user_id' else True
    dfAll = focus_one_record(dfAll,keyVar,timeDiff)
print(dfAll.shape)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  return super(DataFrame, self).rename(**kwargs)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


(2283561, 149)


In [None]:
###两两类别变量的比例/rank 顺序
baseList = [
    'cnt_rec',
    'user_id','user_gender_id', 'user_occupation_id','user_age_level', 'user_star_level',
    'item_id', 'item_brand_id', 'item_city_id', 'item_price_level', 'item_sales_level','item_collected_level', 'item_pv_level',
    'item_category_list_bin1','item_category_list_bin2',
    'shop_id', 'shop_review_num_level','shop_star_level'
]
calList = [
    'user_id','user_gender_id', 'user_occupation_id','item_id', 'item_brand_id', 'item_city_id',
    'item_category_list_bin1','item_category_list_bin2','shop_id'
]
rankList = [
    'user_age_level', 'user_star_level','item_price_level', 'item_sales_level','item_collected_level', 'item_pv_level','shop_review_num_level','shop_star_level'
]

dfAll = interaction_ratio(dfAll,baseList,calList,rankList)
print(dfAll.shape)

ratio part
rank part
cnt_rec~user_age_level
cnt_rec~user_star_level
cnt_rec~item_price_level
cnt_rec~item_sales_level
cnt_rec~item_collected_level
cnt_rec~item_pv_level
cnt_rec~shop_review_num_level
cnt_rec~shop_star_level
ratio part
user_id~user_gender_id
user_id~user_occupation_id
user_id~item_id
user_id~item_brand_id
user_id~item_city_id
user_id~item_category_list_bin1
user_id~item_category_list_bin2
user_id~shop_id
rank part
user_id~user_age_level
user_id~user_star_level
user_id~item_price_level
user_id~item_sales_level
user_id~item_collected_level
user_id~item_pv_level
user_id~shop_review_num_level
user_id~shop_star_level
ratio part
user_gender_id~user_id
user_gender_id~user_occupation_id
user_gender_id~item_id
user_gender_id~item_brand_id
user_gender_id~item_city_id
user_gender_id~item_category_list_bin1
user_gender_id~item_category_list_bin2
user_gender_id~shop_id
rank part
user_gender_id~user_age_level
user_gender_id~user_star_level
user_gender_id~item_price_level
user_gender_i

In [None]:
toSave = dfAll.iloc[:,149:]
toSave.to_csv('../../Data/advertisment/Cache/ratio_rank_result.csv',index=False)

In [None]:
###变量的多组合
featCorr = dfAll[featBase].corr('spearman')
for i in range(len(featBase)-1):
    for j in range(i+1,len(featBase)):
        try: tmpBound = featCorr.loc[featBase[i],featBase[j]]
        except: continue
        if abs(tmpBound)>=0.5:
            continue
        baseName = '~'.join([featBase[i],featBase[j]])+'_com'
        dfAll[baseName] = dfAll[[featBase[i],featBase[j]]].apply(lambda x: str(x[0])+'_'+str(x[1]),axis=1)
print(dfAll.shape)

#dfAll.to_csv("../../Data/advertisment/Cache/All_multi_combination.csv",index=False)

In [None]:
features = [i for i in dfAll.columns.tolist() if not i in config.IGNORE_COLS]

In [None]:
lbl = LabelEncoder()
for var in ['item_category_list_bin2']:
    dfAll[var] = lbl.fit_transform(dfAll[var])

In [None]:
dfAll.loc[dfAll[var]==-1,var] = '-1'

<font color=#0099ff size=5 face="黑体">拆分样本</font>

In [None]:
train_idx = dfTrain.loc[(dfTrain['day']<24)&(dfTrain['day']>18)].index
valid_idx = dfTrain.loc[dfTrain['day']==24].index
Xi_train_, y_train_ = dfAll.loc[list(train_idx),features],dfTrain.loc[train_idx,'is_trade']
Xi_valid_, y_valid_ = dfAll.loc[list(valid_idx),features],dfTrain.loc[valid_idx,'is_trade']
Xi_test_ = dfAll.loc[trainNum:,features]

In [None]:
Xi_train_.shape

<font color=#0099ff size=5 face="黑体">模型</font>

In [None]:
clf = lgb.LGBMClassifier(
    num_leaves=63, 
    max_depth=8,
    n_estimators=20000,
    n_jobs=20,
    learning_rate=0.05,
    colsample_bytree=0.8,
    subsample=0.9,
    max_bin=20
)
clf.fit(Xi_train_, y_train_, eval_set=[(Xi_valid_, y_valid_)],
        categorical_feature=['item_category_list_bin1','item_category_list_bin2'],early_stopping_rounds=200)
y_score_ = clf.predict_proba(Xi_valid_,)[:, 1]
print(log_loss(y_valid_, y_score_))
bstIter = clf.best_iteration_

In [None]:
Xi_finnal_ ,y_finnal_ = np.vstack((Xi_train_,Xi_valid_)),np.hstack((y_train_,y_valid_))
clf = lgb.LGBMClassifier(
    num_leaves=63, 
    max_depth=8,
    n_estimators=bstIter,
    n_jobs=20,
    learning_rate=0.05,
    colsample_bytree=0.8,
    subsample=0.9,
    max_bin=20
)
clf.fit(Xi_finnal_, y_finnal_,
        categorical_feature=[])
y_test_meta = np.zeros((dfTest.shape[0], 1), dtype=float)
y_test_meta[:,0] += clf.predict_proba(Xi_test_)[:,1]
submit = pd.DataFrame({'instance_id':dfTest['instance_id'],'predicted_score':y_test_meta[:,0]})
submit.to_csv('../../Submission/advertisement/gbm_trick_0330.txt', sep=" ", index=False, line_terminator='\n')

In [None]:
np.hstack((y_train_,y_valid_))

In [None]:
y_test_meta = np.zeros((dfTest.shape[0], 1), dtype=float)

In [None]:
submit = pd.DataFrame({'instance_id':dfTest['instance_id'],'predicted_score':y_test_meta[:,0]})

In [None]:
submit['predicted_score'] = 1.0
submit.to_csv('../../Submission/advertisement/test_0330.txt', sep=" ", index=False, line_terminator='\n')