In [2]:
import pandas as pd
from tqdm import tqdm
import warnings
import gc
import os
import lightgbm as lgb
from sklearn.model_selection import GroupKFold, KFold
from sklearn.preprocessing import LabelEncoder
from gensim.models import Word2Vec
from collections import OrderedDict
from sklearn.feature_extraction.text import TfidfVectorizer
from pandarallel import pandarallel
import time

pandarallel.initialize()

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

warnings.filterwarnings('ignore')

INFO: Pandarallel will run on 4 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [3]:
# !pip install pandarallel

In [4]:
seed = 2020

In [5]:
train_feature = pd.read_pickle('data/train_data.pkl')
test_feature = pd.read_pickle('data/test_data.pkl')
train_log = pd.read_pickle('data/train_log.pkl')
test_log = pd.read_pickle('data/test_log.pkl')

In [6]:
df_feature = train_feature.append(test_feature)
df_feature.reset_index(drop=True, inplace=True)
df_feature = df_feature[['userid', 'productId', 'target']]

df_log = train_log.append(test_log)
df_log.reset_index(drop=True, inplace=True)
df_log.sort_values(['eventtime'], inplace=True)

In [7]:
print('训练集正样本数', df_feature[df_feature['target'] == 1].shape[0])
print('训练集负样本数', df_feature[df_feature['target'] == 0].shape[0])
print('测试集样本数', df_feature['target'].isnull().sum())

训练集正样本数 19870
训练集负样本数 407000
测试集样本数 292268


In [8]:
df_feature.shape

(719138, 3)

# 特征工程

In [9]:
# embedding
def emb_mean(df, f1, f2, emb_size=16):
    tmp = df.groupby(f1, as_index=False)[f2].agg(
        {'{}_{}_list'.format(f1, f2): list})
    sentences = tmp['{}_{}_list'.format(f1, f2)].values.tolist()
    del tmp['{}_{}_list'.format(f1, f2)]
    for i in range(len(sentences)):
        sentences[i] = [str(x) for x in sentences[i]]
    
    if os.path.exists('model/w2v_{}_{}.m'.format(f2, emb_size)):
        model = Word2Vec.load('model/w2v_{}_{}.m'.format(f2, emb_size))
    else:
        model = Word2Vec(sentences, size=emb_size, window=5,
                     min_count=1, sg=0, hs=1, seed=seed)
        model.save('model/w2v_{}_{}.m'.format(f2, emb_size))

    emb_matrix = []
    for seq in sentences:
        vec = []
        for w in seq:
            if w in model:
                vec.append(model[w])
        if len(vec) > 0:
            emb_matrix.append(np.mean(vec, axis=0))
        else:
            emb_matrix.append([0] * emb_size)

    df_emb = pd.DataFrame(emb_matrix)
    df_emb.columns = ['{}_{}_emb_{}'.format(
        f1, f2, i) for i in range(emb_size)]

    tmp = pd.concat([tmp, df_emb], axis=1)

    del model, emb_matrix, sentences
    return tmp

In [10]:
df_log['lat'] = df_log['lat'].replace('\\N', 0)
df_log['lgt'] = df_log['lgt'].replace('\\N', 0)
df_log['lat_lgt'] = df_log['lat'].astype('str') + '_' + df_log['lgt'].astype('str')
df_log['lat'] = df_log['lat'].astype('float')
df_log['lgt'] = df_log['lgt'].astype('float')

df_log['url_startTime'] = df_log['url_startTime'].astype('float')
df_log['url_startTime_eventtime_diff'] = df_log['eventtime'] - df_log['url_startTime']

In [11]:
df_log['eventtime_date'] = pd.to_datetime(df_log['eventtime'],unit='ms')
df_log['hour'] = df_log['eventtime_date'].dt.hour

In [12]:
df_log['time_diff'] = df_log.groupby(['userid'])['eventtime'].diff().fillna(0)

In [13]:
# df_log['time_category'] = 0
# df_log.loc[(df_log['hour'] > 1) & (df_log['hour'] <= 5), 'time_category'] = 1
# df_log.loc[(df_log['hour'] > 5) & (df_log['hour'] <= 11), 'time_category'] = 2
# df_log.loc[(df_log['hour'] > 11) & (df_log['hour'] <= 14), 'time_category'] = 3
# df_log.loc[(df_log['hour'] > 14) & (df_log['hour'] <= 18), 'time_category'] = 4
# df_log.loc[(df_log['hour'] > 18) & (df_log['hour'] <= 21), 'time_category'] = 5

In [14]:
# 众数
for f in ['uaos', 'nettype', 'uadevice', 'region', 'city', 'hour']:
    df_temp = df_log.groupby(['userid', f]).size().reset_index()
    df_temp.drop([0], axis=1, inplace=True)
    df_temp = df_temp.sort_values(by = ['userid', f] ,ascending =['asc','asc'])
    df_temp.drop_duplicates('userid', keep='last', inplace=True)
    
    df_feature = df_feature.merge(df_temp, how='left')

In [15]:
# dict_total = {
#     '20200309387999': 12266, '20200210741736': 6129, '20190815856774': 15024, '20190815675296': 20917,
#     '20180531263676': 35095, '20180531194127': 15055, '20180531390867': 410239, '20180531211243': 237453,
#     '20200218174314': 203667, '20180531184230': 65452, '2020022815278': 55022, '20180530244125': 50775,
#     '20190810782447': 29750, '20190423119297': 28599, '20180531685516': 28215, '20180806735689': 26649,
#     '20180531157314': 26101, '20200301448260': 25694, '20181106538615': 18774, '20190829958079': 17865,
#     '20190508189069': 16765, '2020022175979': 15103, '20200220160007': 14301, '20180614751254': 13450,
#     '20200301801446': 12441, '20200210696720': 10835, '20180531810706': 10833, '20191114199289': 10631,
#     '20200407796159': 9243, '2020031948380': 5854, '20200316644689': 5854, '20191121595608': 5794,
#     '20191126774471': 5030, '2019123176289': 4355, '20200219418368': 3648, '20181106547495': 3266,
#     '20190809905752': 3001, '20200304394802': 2761, '20180531237016': 2752, '20190829850245': 2442,
#     '20191129999329': 2063, '20190129511511': 1846, '20200327987123': 1213, '20200407795677': 353,
#     '20200403495571': 299, '20190222546822': 269, '20191231160884': 255, '20190115978886': 214}
# df_feature['productId_total'] = df_feature['productId'].map(dict_total)


In [16]:
# dict_money = {
#     '20200309387999': 175, '20200210741736': 100, '20190815856774': 78, '20190815675296': 1,
#     '20180531263676': 68, '20180531194127': 19.6, '20180531390867': 0.7, '20180531211243': 100,
#     '20200218174314': 18, '20180531184230': 70, '2020022815278': 12, '20180530244125': 6.9,
#     '20190810782447': 25, '20190423119297': 0.01, '20180531685516': 38, '20180806735689': 13,
#     '20180531157314': 3, '20200301448260': 0.01, '20181106538615': 12, '20190829958079': 52.8,
#     '20190508189069': 5.8, '2020022175979': 19.9, '20200220160007': 19, '20180614751254': 1,
#     '20200301801446': 12, '20200210696720': 71, '20180531810706': 5, '20191114199289': 1.2,
#     '20200407796159': 2, '2020031948380': 25, '20200316644689': 120, '20191121595608': 1,
#     '20191126774471': 19.9, '2019123176289': 100, '20200219418368': 80, '20181106547495': 25,
#     '20190809905752': 109, '20200304394802': 128, '20180531237016': 10, '20190829850245': 390,
#     '20191129999329': 12, '20190129511511': 300, '20200327987123': 295, '20200407795677': 0.01,
#     '20200403495571': 0.01, '20190222546822': 365, '20191231160884': 6.8, '20190115978886': 220}
# df_feature['productId_money'] = df_feature['productId'].map(dict_money)

In [17]:
# df_feature['productId_year'] = df_feature['productId'].apply(
#     lambda x: int(x[:4]))
# df_feature['productId_month'] = df_feature['productId'].apply(
#     lambda x: int(x[4:6]))
# df_feature['productId_day'] = df_feature['productId'].apply(
#     lambda x: int(x[6:8]))
# df_feature['productId_time'] = df_feature['productId_year'] .astype(
#     str) + '-' + df_feature['productId_month'].astype(str) + '-' + df_feature['productId_day'].astype(str)
# df_feature['productId_time'] = pd.to_datetime(df_feature['productId_time'])
# df_feature['productId_week'] = df_feature['productId_time'].dt.dayofweek
# del df_feature['productId_time']


## 最后7天

In [18]:
df_last_7day_log = df_log[df_log['date'] >= 20200414].copy()
df_last_7day_log.head()

Unnamed: 0,region,city,eventname,eventtime,lat,lgt,nettype,title,uadevice,uaos,userid,date,hzcModule,productId,productName,url_startTime,lat_lgt,url_startTime_eventtime_diff,eventtime_date,hour,time_diff
910052,吉林,Guangdong,32010091,1586793627507,42.7683,129.3364,Wifi,\N,iPhone 7,iOS,74390763,20200414,,,,,42.7683_129.3364,,2020-04-13 16:00:27.507,16,284667421.0
910031,吉林,Guangdong,HCZ_O00005786,1586793627673,42.7683,129.3364,Wifi,\N,iPhone 7,iOS,74390763,20200414,,,,,42.7683_129.3364,,2020-04-13 16:00:27.673,16,166.0
910097,吉林,Guangdong,32010091,1586793628009,42.7683,129.3364,Wifi,非车险首页,iPhone 7,iOS,74390763,20200414,,,,,42.7683_129.3364,,2020-04-13 16:00:28.009,16,336.0
909925,吉林,Guangdong,HCZ_O00005785,1586793628012,42.7683,129.3364,Wifi,非车险首页,iPhone 7,iOS,74390763,20200414,,,,,42.7683_129.3364,,2020-04-13 16:00:28.012,16,3.0
909954,吉林,Guangdong,HCZ_O00013865,1586793628974,42.7683,129.3364,Wifi,非车险首页,iPhone 7,iOS,74390763,20200414,,,周五保险抢购日,,42.7683_129.3364,,2020-04-13 16:00:28.974,16,962.0


In [19]:
# 计数
for f in [['productId'], ['userid', 'productId']]:
    df_temp = df_last_7day_log.groupby(f).size().reset_index()
    df_temp.columns = f + ['last_7day_{}_count'.format('_'.join(f))]
    df_feature = df_feature.merge(df_temp, how='left')

In [20]:
# # mean
# for f in ['hour']:
#     df_temp = df_last_7day_log.groupby('userid')[f].mean().reset_index()
#     df_temp.columns = ['userid'] + ['last_7day_{}_mean'.format(f)]
#     df_feature = df_feature.merge(df_temp, how='left')

In [21]:
df_feature.head()

Unnamed: 0,userid,productId,target,uaos,nettype,uadevice,region,city,hour,last_7day_productId_count,last_7day_userid_productId_count
0,18403,20180530244125,0.0,iOS,\N,"iPhone12,1",广东,广州,9,275.0,
1,18403,20180531263676,0.0,iOS,\N,"iPhone12,1",广东,广州,9,5469.0,
2,18403,20181010361410,0.0,iOS,\N,"iPhone12,1",广东,广州,9,33610.0,10.0
3,18403,20200221398853,0.0,iOS,\N,"iPhone12,1",广东,广州,9,4.0,
4,18403,2019123176289,0.0,iOS,\N,"iPhone12,1",广东,广州,9,10247.0,


## 全量统计

In [22]:
df_log.head()

Unnamed: 0,region,city,eventname,eventtime,lat,lgt,nettype,title,uadevice,uaos,userid,date,hzcModule,productId,productName,url_startTime,lat_lgt,url_startTime_eventtime_diff,eventtime_date,hour,time_diff
1118942,广东,广州,11010479,1585670400279,23.1167,113.25,wifi,平安好车主,LIO-AN00,Android,73959474,20200401,,,,,23.1167_113.25,,2020-03-31 16:00:00.279,16,0.0
1119349,广东,广州,32010091,1585670400300,23.1167,113.25,wifi,平安好车主,LIO-AN00,Android,73959474,20200401,,,,,23.1167_113.25,,2020-03-31 16:00:00.300,16,21.0
1118894,广东,广州,HCZ_O00005786,1585670400305,23.1167,113.25,wifi,平安好车主,LIO-AN00,Android,73959474,20200401,,,,,23.1167_113.25,,2020-03-31 16:00:00.305,16,5.0
1119320,广东,广州,32010091,1585670400407,23.1167,113.25,wifi,非车险首页,LIO-AN00,Android,73959474,20200401,,,,,23.1167_113.25,,2020-03-31 16:00:00.407,16,102.0
1119272,广东,广州,HCZ_O00005785,1585670400413,23.1167,113.25,wifi,非车险首页,LIO-AN00,Android,73959474,20200401,,,,,23.1167_113.25,,2020-03-31 16:00:00.413,16,6.0


In [23]:
df_temp = df_log[df_log['eventname'] == 'h320057'][['userid', 'productId']].copy()
df_temp = df_temp.groupby(['userid'])['productId'].agg({'productId_click_list': set}).reset_index()
df_feature = df_feature.merge(df_temp, how='left')


def func(x):
    productId = x['productId']
    productId_click_list = x['productId_click_list']
    
    if type(productId_click_list) == float:
        return 0
    
    return productId in productId_click_list

df_feature['is_in_click_history'] = df_feature[['productId', 'productId_click_list']].apply(func, axis=1)
del df_feature['productId_click_list']
df_feature.head()

Unnamed: 0,userid,productId,target,uaos,nettype,uadevice,region,city,hour,last_7day_productId_count,last_7day_userid_productId_count,is_in_click_history
0,18403,20180530244125,0.0,iOS,\N,"iPhone12,1",广东,广州,9,275.0,,False
1,18403,20180531263676,0.0,iOS,\N,"iPhone12,1",广东,广州,9,5469.0,,False
2,18403,20181010361410,0.0,iOS,\N,"iPhone12,1",广东,广州,9,33610.0,10.0,True
3,18403,20200221398853,0.0,iOS,\N,"iPhone12,1",广东,广州,9,4.0,,False
4,18403,2019123176289,0.0,iOS,\N,"iPhone12,1",广东,广州,9,10247.0,,False


In [24]:
# 计数
for f in [['productId'], ['userid', 'productId'], ['uaos']]:
    df_temp = df_log.groupby(f).size().reset_index()
    df_temp.columns = f + ['{}_count'.format('_'.join(f))]
    df_feature = df_feature.merge(df_temp, how='left')

In [25]:
# mean
for f in ['lat', 'url_startTime_eventtime_diff']:
    df_temp = df_log.groupby('userid')[f].mean().reset_index()
    df_temp.columns = ['userid'] + ['{}_mean'.format(f)]
    df_feature = df_feature.merge(df_temp, how='left')
    
# # mean
# for f in ['time_diff']:
#     df_temp = df_log.groupby(['userid', 'productId'])[f].mean().reset_index()
#     df_temp.columns = ['userid', 'productId'] + ['userid_productId_{}_mean'.format(f)]
#     df_feature = df_feature.merge(df_temp, how='left')

In [26]:
# std
for f in ['hour']:
    df_temp = df_log.groupby('userid')[f].std().reset_index()
    df_temp.columns = ['userid'] + ['{}_std'.format(f)]
    df_feature = df_feature.merge(df_temp, how='left')

In [27]:
# max
for f in ['eventtime']:
    df_temp = df_log.groupby('userid')[f].max().reset_index()
    df_temp.columns = ['userid'] + ['{}_max'.format(f)]
    df_feature = df_feature.merge(df_temp, how='left')

In [28]:
# # min
# for f in ['eventtime']:
#     df_temp = df_log.groupby('userid')[f].min().reset_index()
#     df_temp.columns = ['userid'] + ['{}_min'.format(f)]
#     df_feature = df_feature.merge(df_temp, how='left')

In [29]:
# df_feature['eventtime_max_min_diff'] = df_feature['eventtime_max'] - df_feature['eventtime_min']
# del df_feature['eventtime_min']

In [30]:
kvs = {
    'eventname': ['11010479', '329000048', 'HCZ_O00013865'],
}
kvs = OrderedDict(kvs)

for k in kvs:
    for v in kvs[k]:
        df_t = df_log.groupby(['userid', k]).size().reset_index()
        df_t = df_t[df_t[k] == v]
        del df_t[k]
        df_t.columns = ['userid', '{}_{}_count'.format(k, v)]
        
        df_feature = df_feature.merge(df_t, how='left')

In [31]:
for f1, f2, dim in tqdm([['userid', 'title', 16], ['userid', 'productId', 16], ['userid', 'hzcModule', 16], 
                    ['userid', 'eventname', 16]]):
    df_feature = df_feature.merge(emb_mean(df_log, f1, f2, dim), on=f1, how='left')

100%|██████████| 4/4 [00:40<00:00, 10.20s/it]


In [32]:
def get_tfidf(data_, colname, max_features):
    data = data_.copy()

    def f(x):
        x = [i for i in x if type(i) == str]
        return ','.join(x)

    df_tmp = df_log.groupby(['userid'], as_index=False)[
        colname].agg({'list': f})
    text = list(df_tmp['list'].fillna('nan').values)

    tf = TfidfVectorizer(min_df=0,
                         ngram_range=(1, 1),
                         max_features=max_features)
    tf.fit(text)
    X = tf.transform(text)

    df_tfidf = pd.DataFrame(X.todense())
    df_tfidf.columns = [f'{colname}_tfidf{i}' for i in range(max_features)]
    df_tfidf['userid'] = df_tmp['userid']

    return df_tfidf

In [33]:
for f, dim in [['eventname', 16]]:
    df_tfidf = get_tfidf(df_log, f, dim)
    df_feature = df_feature.merge(df_tfidf, how='left')

In [34]:
# label encoder
def stat(df, df_merge, group_by, agg):
    group = df.groupby(group_by).agg(agg)

    columns = []
    for on, methods in agg.items():
        for method in methods:
            columns.append('{}_{}_{}'.format('_'.join(group_by), on, method))
    group.columns = columns
    group.reset_index(inplace=True)
    df_merge = df_merge.merge(group, on=group_by, how='left')

    del (group)
    gc.collect()
    return df_merge
    
def statis_feat(df_know, df_unknow):
    df_unknow = stat(df_know, df_unknow, ['productId'], {'target': ['mean', 'sum']})
    df_unknow = stat(df_know, df_unknow, ['region'], {'target': ['mean']})
    df_unknow = stat(df_know, df_unknow, ['uadevice'], {'target': ['mean']})
    df_unknow = stat(df_know, df_unknow, ['uadevice', 'productId'], {'target': ['mean']})

    return df_unknow


df_train = df_feature[~df_feature['target'].isnull()]
df_train = df_train.reset_index(drop=True)
df_test = df_feature[df_feature['target'].isnull()]
df_test = df_test.reset_index(drop=True)
   
df_stas_feat = None
kf = GroupKFold(n_splits=5)
for train_index, val_index in kf.split(
        df_train, df_train, df_train['userid']):
    df_fold_train = df_train.iloc[train_index]
    df_fold_val = df_train.iloc[val_index]
    
    df_fold_val = statis_feat(df_fold_train, df_fold_val)
    df_stas_feat = pd.concat([df_stas_feat, df_fold_val], axis=0)
    
    del(df_fold_train)
    del(df_fold_val)
    gc.collect()

df_test = statis_feat(df_train, df_test)
df_feature = pd.concat([df_stas_feat, df_test], axis=0)

del(df_stas_feat)
del(df_train)
del(df_test)
gc.collect()

43

In [35]:
start_time = time.time()

df_temp = df_log[df_log['eventname'] ==
                 'h320057'][['userid', 'productId']].copy()
df_temp = df_temp.groupby(['userid'])['productId'].agg(
    {'list': list}).reset_index()
df_temp.columns = ['userid', 'click_productIds']
user_click_product_dict = dict(
    zip(df_temp['userid'].values, df_temp['click_productIds'].values))


def func_sim_last(x):
    productId = x['productId']
    userid = x['userid']

    emb_size = 16
    if os.path.exists('model/w2v_productId_{}.m'.format(emb_size)):
        model = Word2Vec.load('model/w2v_productId_{}.m'.format(emb_size))
    else:
        raise Exception('no model!')

    sim = 0
    try:
        click_products = user_click_product_dict[userid]
        sim = model.similarity(click_products[-1], productId)
    except Exception as _:
        pass

    return sim


def func_sim_rolling_mean(x, window):
    productId = x['productId']
    userid = x['userid']

    emb_size = 16
    if os.path.exists('model/w2v_productId_{}.m'.format(emb_size)):
        model = Word2Vec.load('model/w2v_productId_{}.m'.format(emb_size))
    else:
        raise Exception('no model!')

    sim_list = []
    if userid in user_click_product_dict:
        click_products = user_click_product_dict[userid][-window:]
        for p in click_products:
            try:
                sim = model.similarity(p, productId)
                sim_list.append(sim)
            except Exception as _:
                pass
    else:
        return 0

    return np.mean(sim_list)


if os.path.exists('data/last_product_sim.pkl'):
    df = pd.read_pickle('data/last_product_sim.pkl')
    df_feature = df_feature.merge(df, how='left')
else:
    df_feature['last_product_sim'] = df_feature[[
        'userid', 'productId']].parallel_apply(func_sim_last, axis=1)
    df_feature[['userid', 'productId', 'last_product_sim']
               ].to_pickle('data/last_product_sim.pkl')


# if os.path.exists('data/product_sim_rolling3_mean.pkl'):
#     df = pd.read_pickle('data/product_sim_rolling3_mean.pkl')
#     df_feature = df_feature.merge(df, how='left')
# else:
#     df_feature['product_sim_rolling3_mean'] = df_feature[[
#         'userid', 'productId']].parallel_apply(lambda x: func_sim_rolling_mean(x, 3), axis=1)
#     df_feature[['userid', 'productId', 'product_sim_rolling3_mean']
#               ].to_pickle('data/product_sim_rolling3_mean.pkl')

end_time = time.time()
print(end_time - start_time)

del df_temp
gc.collect()

1.5381317138671875


84

In [36]:
df_feature.head()

Unnamed: 0,userid,productId,target,uaos,nettype,uadevice,region,city,hour,last_7day_productId_count,last_7day_userid_productId_count,is_in_click_history,productId_count,userid_productId_count,uaos_count,lat_mean,url_startTime_eventtime_diff_mean,hour_std,eventtime_max,eventname_11010479_count,eventname_329000048_count,eventname_HCZ_O00013865_count,userid_title_emb_0,userid_title_emb_1,userid_title_emb_2,userid_title_emb_3,userid_title_emb_4,userid_title_emb_5,userid_title_emb_6,userid_title_emb_7,userid_title_emb_8,userid_title_emb_9,userid_title_emb_10,userid_title_emb_11,userid_title_emb_12,userid_title_emb_13,userid_title_emb_14,userid_title_emb_15,userid_productId_emb_0,userid_productId_emb_1,userid_productId_emb_2,userid_productId_emb_3,userid_productId_emb_4,userid_productId_emb_5,userid_productId_emb_6,userid_productId_emb_7,userid_productId_emb_8,userid_productId_emb_9,userid_productId_emb_10,userid_productId_emb_11,userid_productId_emb_12,userid_productId_emb_13,userid_productId_emb_14,userid_productId_emb_15,userid_hzcModule_emb_0,userid_hzcModule_emb_1,userid_hzcModule_emb_2,userid_hzcModule_emb_3,userid_hzcModule_emb_4,userid_hzcModule_emb_5,userid_hzcModule_emb_6,userid_hzcModule_emb_7,userid_hzcModule_emb_8,userid_hzcModule_emb_9,userid_hzcModule_emb_10,userid_hzcModule_emb_11,userid_hzcModule_emb_12,userid_hzcModule_emb_13,userid_hzcModule_emb_14,userid_hzcModule_emb_15,userid_eventname_emb_0,userid_eventname_emb_1,userid_eventname_emb_2,userid_eventname_emb_3,userid_eventname_emb_4,userid_eventname_emb_5,userid_eventname_emb_6,userid_eventname_emb_7,userid_eventname_emb_8,userid_eventname_emb_9,userid_eventname_emb_10,userid_eventname_emb_11,userid_eventname_emb_12,userid_eventname_emb_13,userid_eventname_emb_14,userid_eventname_emb_15,eventname_tfidf0,eventname_tfidf1,eventname_tfidf2,eventname_tfidf3,eventname_tfidf4,eventname_tfidf5,eventname_tfidf6,eventname_tfidf7,eventname_tfidf8,eventname_tfidf9,eventname_tfidf10,eventname_tfidf11,eventname_tfidf12,eventname_tfidf13,eventname_tfidf14,eventname_tfidf15,productId_target_mean,productId_target_sum,region_target_mean,uadevice_target_mean,uadevice_productId_target_mean,last_product_sim
0,19823,20180530244125,0.0,iOS,Wifi,iPhone 7,浙江省,宁波,10,275.0,,False,753.0,,372465,29.8782,32077.133333,1.410759,1587363640328,8.0,,4.0,-0.169623,-0.87515,1.442965,0.283776,-1.290443,1.338506,0.07655,-0.496715,0.021581,0.403609,-0.241506,0.919572,-0.595706,0.458613,0.405169,0.877267,0.02558,0.554176,-0.775753,-0.637584,-0.27416,0.71421,-0.878604,-1.512585,-0.195261,0.242496,0.304706,0.280001,0.014035,0.535602,-0.934907,1.104328,0.398499,1.030108,-0.60033,-0.206224,-0.404548,0.730864,0.277349,-0.571467,-0.179171,0.322232,0.608569,-1.916789,1.02683,0.378506,-1.535189,-0.616795,0.405953,0.805352,0.462744,-0.576125,-0.057167,0.498681,0.069567,0.647471,0.040253,0.294924,0.424642,-0.364401,0.111598,0.166745,0.611307,0.491611,0.206251,0.32708,0.565596,0.152609,0.113527,0.369029,0.046452,0.156539,0.440848,0.151847,0.0,0.039327,0.152035,0.151649,0.23168,0.11322,0.011983,66.0,0.046137,0.043245,0.0,-0.170382
1,19823,20180531263676,0.0,iOS,Wifi,iPhone 7,浙江省,宁波,10,5469.0,,False,13733.0,,372465,29.8782,32077.133333,1.410759,1587363640328,8.0,,4.0,-0.169623,-0.87515,1.442965,0.283776,-1.290443,1.338506,0.07655,-0.496715,0.021581,0.403609,-0.241506,0.919572,-0.595706,0.458613,0.405169,0.877267,0.02558,0.554176,-0.775753,-0.637584,-0.27416,0.71421,-0.878604,-1.512585,-0.195261,0.242496,0.304706,0.280001,0.014035,0.535602,-0.934907,1.104328,0.398499,1.030108,-0.60033,-0.206224,-0.404548,0.730864,0.277349,-0.571467,-0.179171,0.322232,0.608569,-1.916789,1.02683,0.378506,-1.535189,-0.616795,0.405953,0.805352,0.462744,-0.576125,-0.057167,0.498681,0.069567,0.647471,0.040253,0.294924,0.424642,-0.364401,0.111598,0.166745,0.611307,0.491611,0.206251,0.32708,0.565596,0.152609,0.113527,0.369029,0.046452,0.156539,0.440848,0.151847,0.0,0.039327,0.152035,0.151649,0.23168,0.11322,0.054285,299.0,0.046137,0.043245,0.050725,-0.058608
2,19823,20181010361410,0.0,iOS,Wifi,iPhone 7,浙江省,宁波,10,33610.0,,True,76338.0,6.0,372465,29.8782,32077.133333,1.410759,1587363640328,8.0,,4.0,-0.169623,-0.87515,1.442965,0.283776,-1.290443,1.338506,0.07655,-0.496715,0.021581,0.403609,-0.241506,0.919572,-0.595706,0.458613,0.405169,0.877267,0.02558,0.554176,-0.775753,-0.637584,-0.27416,0.71421,-0.878604,-1.512585,-0.195261,0.242496,0.304706,0.280001,0.014035,0.535602,-0.934907,1.104328,0.398499,1.030108,-0.60033,-0.206224,-0.404548,0.730864,0.277349,-0.571467,-0.179171,0.322232,0.608569,-1.916789,1.02683,0.378506,-1.535189,-0.616795,0.405953,0.805352,0.462744,-0.576125,-0.057167,0.498681,0.069567,0.647471,0.040253,0.294924,0.424642,-0.364401,0.111598,0.166745,0.611307,0.491611,0.206251,0.32708,0.565596,0.152609,0.113527,0.369029,0.046452,0.156539,0.440848,0.151847,0.0,0.039327,0.152035,0.151649,0.23168,0.11322,0.217683,1199.0,0.046137,0.043245,0.181159,-0.110004
3,19823,20200221398853,0.0,iOS,Wifi,iPhone 7,浙江省,宁波,10,4.0,,False,8.0,,372465,29.8782,32077.133333,1.410759,1587363640328,8.0,,4.0,-0.169623,-0.87515,1.442965,0.283776,-1.290443,1.338506,0.07655,-0.496715,0.021581,0.403609,-0.241506,0.919572,-0.595706,0.458613,0.405169,0.877267,0.02558,0.554176,-0.775753,-0.637584,-0.27416,0.71421,-0.878604,-1.512585,-0.195261,0.242496,0.304706,0.280001,0.014035,0.535602,-0.934907,1.104328,0.398499,1.030108,-0.60033,-0.206224,-0.404548,0.730864,0.277349,-0.571467,-0.179171,0.322232,0.608569,-1.916789,1.02683,0.378506,-1.535189,-0.616795,0.405953,0.805352,0.462744,-0.576125,-0.057167,0.498681,0.069567,0.647471,0.040253,0.294924,0.424642,-0.364401,0.111598,0.166745,0.611307,0.491611,0.206251,0.32708,0.565596,0.152609,0.113527,0.369029,0.046452,0.156539,0.440848,0.151847,0.0,0.039327,0.152035,0.151649,0.23168,0.11322,0.000545,3.0,0.046137,0.043245,0.0,-0.056762
4,19823,2019123176289,0.0,iOS,Wifi,iPhone 7,浙江省,宁波,10,10247.0,,False,34824.0,,372465,29.8782,32077.133333,1.410759,1587363640328,8.0,,4.0,-0.169623,-0.87515,1.442965,0.283776,-1.290443,1.338506,0.07655,-0.496715,0.021581,0.403609,-0.241506,0.919572,-0.595706,0.458613,0.405169,0.877267,0.02558,0.554176,-0.775753,-0.637584,-0.27416,0.71421,-0.878604,-1.512585,-0.195261,0.242496,0.304706,0.280001,0.014035,0.535602,-0.934907,1.104328,0.398499,1.030108,-0.60033,-0.206224,-0.404548,0.730864,0.277349,-0.571467,-0.179171,0.322232,0.608569,-1.916789,1.02683,0.378506,-1.535189,-0.616795,0.405953,0.805352,0.462744,-0.576125,-0.057167,0.498681,0.069567,0.647471,0.040253,0.294924,0.424642,-0.364401,0.111598,0.166745,0.611307,0.491611,0.206251,0.32708,0.565596,0.152609,0.113527,0.369029,0.046452,0.156539,0.440848,0.151847,0.0,0.039327,0.152035,0.151649,0.23168,0.11322,0.068083,375.0,0.046137,0.043245,0.043478,0.141887


In [37]:
df_feature.shape

(719138, 108)

# 模型训练

In [38]:
le_productId = LabelEncoder()
df_feature['productId'] = le_productId.fit_transform(
    df_feature['productId'].astype('str')).astype('int')
    
for f in list(df_feature.select_dtypes('object')):
    le = LabelEncoder()
    df_feature[f] = le.fit_transform(
        df_feature[f].astype('str')).astype('int')


In [39]:
df_feature.to_pickle('data/feature.pkl')

In [40]:
df_train = df_feature[df_feature['target'].notnull()]
df_test = df_feature[df_feature['target'].isnull()]

gc.collect()

df_train['weight'] = 1
df_train['weight'] = df_train['userid'].map(
    df_train.loc[df_train['target'] == 1].groupby(['userid'])['productId'].nunique())
df_train['weight'] = 1 / df_train['weight']

In [41]:
ycol = 'target'
feature_names = list(
    filter(lambda x: x not in [ycol, 'weight'], df_train.columns))

model = lgb.LGBMClassifier(num_leaves=64,
                           max_depth=10,
                           learning_rate=0.01,
                           n_estimators=100000,
                           subsample=0.8,
                           feature_fraction=0.8,
                           reg_alpha=0.5,
                           reg_lambda=0.5,
                           random_state=seed,
                           metric=None)



oof = []
prediction = df_test[['userid', 'productId']]
prediction['pred'] = 0
df_importance_list = []

fold_num = 5

kfold = GroupKFold(n_splits=fold_num)
for fold_id, (trn_idx, val_idx) in enumerate(kfold.split(
        df_train[feature_names], df_train[ycol], df_train['userid'])):
    X_train = df_train.iloc[trn_idx][feature_names]
    Y_train = df_train.iloc[trn_idx][ycol]

    X_val = df_train.iloc[val_idx][feature_names]
    Y_val = df_train.iloc[val_idx][ycol]

    print('\nFold_{} Training ================================\n'.format(fold_id+1))

    lgb_model = model.fit(X_train,
                          Y_train,
                          eval_names=['train', 'valid'],
                          eval_set=[(X_train, Y_train), (X_val, Y_val)],
                          verbose=500,
                          eval_metric='auc',
                          sample_weight=df_train.iloc[trn_idx]['weight'],
                          early_stopping_rounds=50)

    pred_val = lgb_model.predict_proba(
        X_val, num_iteration=lgb_model.best_iteration_)[:, 1]
    df_oof = df_train.iloc[val_idx][[
        'userid', 'productId', ycol]].copy()
    df_oof['pred'] = pred_val
    oof.append(df_oof)

    pred_test = lgb_model.predict_proba(
        df_test[feature_names], num_iteration=lgb_model.best_iteration_)[:, 1]
    prediction['pred'] += pred_test / fold_num

    df_importance = pd.DataFrame({
        'column': feature_names,
        'importance': lgb_model.feature_importances_,
    })
    df_importance_list.append(df_importance)

    del lgb_model, pred_val, pred_test, X_train, Y_train, X_val, Y_val
    gc.collect()




Training until validation scores don't improve for 50 rounds.
Early stopping, best iteration is:
[355]	train's auc: 0.890241	valid's auc: 0.842665


Training until validation scores don't improve for 50 rounds.
Early stopping, best iteration is:
[295]	train's auc: 0.887252	valid's auc: 0.842548


Training until validation scores don't improve for 50 rounds.
[500]	train's auc: 0.899651	valid's auc: 0.843369
Early stopping, best iteration is:
[722]	train's auc: 0.91059	valid's auc: 0.844063


Training until validation scores don't improve for 50 rounds.
[500]	train's auc: 0.901586	valid's auc: 0.841461
Early stopping, best iteration is:
[513]	train's auc: 0.902216	valid's auc: 0.841522


Training until validation scores don't improve for 50 rounds.
Early stopping, best iteration is:
[359]	train's auc: 0.89041	valid's auc: 0.841132


# 模型训练

In [42]:
df_importance = pd.concat(df_importance_list)
df_importance = df_importance.groupby(['column'])['importance'].agg(
    'mean').sort_values(ascending=False).reset_index()
df_importance

Unnamed: 0,column,importance
0,productId_target_mean,1716.8
1,userid_productId_count,994.2
2,last_product_sim,887.4
3,productId,732.8
4,last_7day_productId_count,678.4
5,uadevice_productId_target_mean,628.2
6,userid,489.2
7,eventtime_max,465.0
8,url_startTime_eventtime_diff_mean,461.4
9,uadevice_target_mean,442.0


In [43]:
# 计分
def cal_score(df_):
    df = df_.copy()
    
    df.sort_values(['userid', 'pred'], ascending = [True, False], inplace = True)
    
    gg = df.groupby(['userid'])
    scores = []
    for _, g in tqdm(gg):
        true_answers = set(g[g['target'] == 1]['productId'].values.tolist())
        if len(true_answers) == 0:
            continue
        
        top3 = set(g.head(3)['productId'].values.tolist())
        true_num = len(true_answers & top3)
        
        if len(true_answers) > 3:
            scores += [true_num / 3]
        else:
            scores += [true_num / len(true_answers)]
            
    return np.mean(scores)

In [44]:
df_oof = pd.concat(oof)
score = cal_score(df_oof)
print(score)

100%|██████████| 6885/6885 [00:06<00:00, 1074.60it/s]

0.422076978939724





In [45]:
df_oof.sort_values(['userid', 'pred'], ascending = [True, False], inplace = True)
df_oof.head()

Unnamed: 0,userid,productId,target,pred
341498,18403,12,0.0,0.358166
341547,18403,49,0.0,0.117307
341504,18403,58,0.0,0.086702
341522,18403,4,0.0,0.054798
341510,18403,8,0.0,0.048134


In [46]:
prediction = prediction.sort_values(['userid', 'pred'], ascending = [True, False])
sub = prediction.groupby(['userid']).head(3)[['userid', 'productId']]
sub['productId'] = le_productId.inverse_transform(sub['productId'])
sub_ = sub.copy()
sub = sub.groupby('userid')['productId'].apply(list).reset_index()[['userid']]
sub[['pred1', 'pred2', 'pred3']] = pd.DataFrame(list(sub_.groupby('userid')['productId'].apply(list).values))

In [47]:
df_oof['productId'] = le_productId.inverse_transform(df_oof['productId'])
df_oof.to_csv('prob/oof_{}.csv'.format(score), index=False)
prediction['productId'] = le_productId.inverse_transform(prediction['productId'])
prediction.to_csv('prob/sub_{}.csv'.format(score), index=False)

In [48]:
sub.head()

Unnamed: 0,userid,pred1,pred2,pred3
0,25863,20180531390867,20200407796159,20200301448260
1,27379,20200407796159,20200301448260,20181010361410
2,28060,20200407796159,20200301448260,20181010361410
3,30799,20200407796159,20181010361410,20200301448260
4,30960,2019123176289,20200407796159,2020010375676


In [49]:
sub.to_csv('sub/{}.csv'.format(score),index=False)
sub.to_csv('sub/sub.csv',index=False)

In [54]:
# !wget -nv -O kesci_submit https://cdn.kesci.com/submit_tool/v4/kesci_submit&&chmod +x kesci_submit
# !./kesci_submit -token fab409df432da998 -file /home/kesci/work/sub/sub.csv

wget: /opt/conda/lib/libcrypto.so.1.0.0: no version information available (required by wget)
wget: /opt/conda/lib/libssl.so.1.0.0: no version information available (required by wget)
wget: /opt/conda/lib/libssl.so.1.0.0: no version information available (required by wget)
2020-06-25 16:29:32 URL:https://cdn.kesci.com/submit_tool/v4/kesci_submit [7357446/7357446] -> "kesci_submit" [1]
Kesci Submit Tool 4.0.0

> 已验证Token
> 提交文件 /home/kesci/work/sub/sub.csv (247.98 KiB), Target Qiniu
> 已上传 100 %
> 文件已上传        
> 服务器响应: 200 提交成功，请等待评审完成
> 提交完成


In [51]:
# !wget -nv -O kesci_submit https://cdn.kesci.com/submit_tool/v4/kesci_submit&&chmod +x kesci_submit
# !./kesci_submit -token fab409df432da998 -file /home/kesci/work/prob/oof_0.42297264584846284.csv

In [52]:
prediction.head()

Unnamed: 0,userid,productId,pred
426912,25863,20180531390867,0.213187
426878,25863,20200407796159,0.141481
426921,25863,20200301448260,0.108893
426872,25863,20181010361410,0.085165
426927,25863,20200218174314,0.082472


In [53]:
prediction.shape

(292268, 3)