In [1]:
%run utils.ipynb

import pandas as pd
import warnings
import lightgbm as lgb
from sklearn.model_selection import GroupKFold
import gc
from tqdm import tqdm
import numpy as np
from collections import defaultdict
import math
from pandarallel import pandarallel

# Never truncate DataFrame displays: show every column and every row.
pd.options.display.max_columns = None
pd.options.display.max_rows = None

# Start the pandarallel worker pool (prints its own INFO lines).
pandarallel.initialize()

# Silence all Python warnings for the remainder of the session.
warnings.simplefilter('ignore')

INFO: Pandarallel will run on 32 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [2]:
# Phase index for this run; not referenced by any other cell visible in this
# notebook.
current_phase = 6

# Random seed; passed to LightGBM as `random_state` in the training cell.
seed = 2020

In [3]:
# Load the pre-computed ranking feature table and report its (rows, columns)
# shape as a quick sanity check.
df_feature = pd.read_pickle('../user_data/data/rank_feature.pkl')
print(df_feature.shape)

(2496846, 90)


In [4]:
# Load the train and test parts of the first NN output and stack them.
df_nn_train = pd.read_pickle('../user_data/data/nn/nn_trn.pkl')
df_nn_test = pd.read_pickle('../user_data/data/nn/nn_test.pkl')
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and
# removed in pandas 2.0. Like append, concat keeps the original row indexes.
df_nn = pd.concat([df_nn_train, df_nn_test])
print(df_nn.shape)

# Left-join the `nn_prob` column onto the feature table. No `on=` is given, so
# pandas joins on every column name the two frames share.
df_feature = df_feature.merge(
    df_nn[['user_id', 'phase', 'item_id', 'nn_prob']], how='left')

# Release the stacked frame; it is not needed once merged.
del df_nn
gc.collect()

In [5]:
# Attach the second NN output (`pred` column) to the feature table with a left
# join; the join keys are whatever column names the two frames have in common.
df_nn2 = pd.read_pickle('../user_data/data/nn/nn2.pkl')
nn2_columns = ['user_id', 'phase', 'item_id', 'pred']
df_feature = pd.merge(df_feature, df_nn2[nn2_columns], how='left')

# Drop the temporaries so the notebook namespace and memory stay clean.
del df_nn2, nn2_columns
gc.collect()

In [6]:
# Preview the first five rows of the merged feature table.
df_feature.head(n=5)

Unnamed: 0,user_id,phase,query_time,item_id,label,txt_vec10_pca0,txt_vec10_pca1,txt_vec10_pca2,txt_vec10_pca3,txt_vec10_pca4,txt_vec10_pca5,txt_vec10_pca6,txt_vec10_pca7,txt_vec10_pca8,txt_vec10_pca9,img_vec10_pca0,img_vec10_pca1,img_vec10_pca2,img_vec10_pca3,img_vec10_pca4,img_vec10_pca5,img_vec10_pca6,img_vec10_pca7,img_vec10_pca8,img_vec10_pca9,phase_item_clickd_count,phase_item_click_time_diff_mean,item_id_phase_user_age_level_mean,item_id_phase_user_age_level_min,item_id_phase_user_age_level_max,item_id_phase_user_age_level_std,phase_item_click_gender_mean,user_age_level,user_gender,user_city_level,phase_user_click_count,phase_user_age_level_click_count,user_id_phase_time_std,user_id_phase_time_max_min_diff,user_id_phase_query_lastbuy_time_diff,user_click_item_if_sim_sum,user_click_item_if_sim_max,user_last_click_item_if_sim,user_click_item_if_sim_rolling2_sum,user_click_item_tc_sim_sum,user_click_item_tc_sim_max,user_click_item_tc_sim_rolling2_sum,user_click_item_bn_sim_sum,item_w2v_0,item_w2v_1,item_w2v_2,item_w2v_3,item_w2v_4,item_w2v_5,item_w2v_6,item_w2v_7,item_w2v_8,item_w2v_9,item_w2v_10,item_w2v_11,item_w2v_12,item_w2v_13,item_w2v_14,item_w2v_15,item_w2v_16,item_w2v_17,item_w2v_18,item_w2v_19,item_w2v_20,item_w2v_21,item_w2v_22,item_w2v_23,item_w2v_24,item_w2v_25,item_w2v_26,item_w2v_27,item_w2v_28,item_w2v_29,item_w2v_30,item_w2v_31,user_last_click_item_w2w_sim,user_click_item_w2w_sim_sum_2,user_item_txt_sim,user_click_item_txt_sim_sum,user_click_item_txt_sim_max,user_last_click_item_txt_sim,user_click_item_txt_sim_rolling2_sum,user_click_item_txt_sim_rolling2_mean,user_click_item_txt_sim_rolling3_mean,user_item_img_sim
0,7,0,0.98394,591,0.0,,,,,,,,,,,,,,,,,,,,,98.0,2.324581e-06,4.25,1.0,8.0,1.900391,0.125,,,,4,,3.9e-05,6.8e-05,9.4e-05,0.012207,0.012207,0.012207,0.012207,0.013268,0.012352,0.013268,3.066406,0.557617,0.383301,-0.276855,-0.020309,0.281738,0.266846,-0.254639,-1.088867,0.54248,0.127197,0.030289,0.11676,0.413818,0.43335,-0.105347,0.318604,-0.17749,0.324951,0.070679,0.141724,1.375,-0.703125,0.449219,-0.600586,0.117676,0.730469,0.049347,0.347168,-0.49585,-0.4375,0.262939,0.01252,0.961914,1.739258,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,7,0,0.98394,20201,0.0,7.316406,2.445312,-2.457031,-0.265137,1.541992,-1.857422,-0.661133,1.368164,3.947266,1.537109,-3.902344,7.722656,1.712891,1.263672,-4.515625,1.919922,5.910156,-2.21875,1.426758,-5.519531,82.0,5.364418e-07,4.609375,2.0,7.0,1.269531,0.086975,,,,4,,3.9e-05,6.8e-05,9.4e-05,0.008041,0.011482,0.0,0.011482,0.009499,0.009499,0.009499,2.021484,0.777832,0.311035,-0.586426,0.419189,0.186768,-0.55127,0.460205,-1.097656,1.075195,0.95752,-0.011795,0.471924,0.365723,0.365723,-0.513672,-0.847168,-0.648438,0.738281,0.541992,-0.211792,1.394531,-0.319336,1.464844,-1.367188,-0.37085,0.998535,0.380615,0.174194,-0.910645,-0.997559,0.798828,0.014984,0.763672,1.75,0.470703,0.319336,0.465332,0.0,0.0,0.0,0.465332,-0.074768
2,7,0,0.98394,7709,0.0,,,,,,,,,,,,,,,,,,,,,89.0,6.020069e-06,4.609375,2.0,7.0,1.644531,0.086975,,,,4,,3.9e-05,6.8e-05,9.4e-05,0.00732,0.009956,0.000351,0.010307,0.011177,0.010094,0.010895,1.720703,0.357178,0.60498,-0.930176,0.57373,0.099182,-0.130615,-0.207642,-1.37793,0.694824,0.570801,0.018768,0.269043,0.474609,0.216797,-0.281738,-0.456299,-0.503418,0.54541,-0.074646,0.224976,1.435547,-0.242432,1.091797,-0.79541,-0.231934,0.993652,0.156616,-0.445312,-0.432129,-0.684082,0.436279,-0.29126,0.807617,1.706055,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,7,0,0.98394,4340,0.0,,,,,,,,,,,,,,,,,,,,,83.0,1.251698e-06,4.230469,2.0,8.0,2.177734,0.115356,,,,4,,3.9e-05,6.8e-05,9.4e-05,0.007843,0.007843,0.007843,0.007843,0.009254,0.007942,0.008759,1.928711,0.232666,0.606445,-0.684082,0.323242,-0.064392,0.851074,-0.243286,-1.154297,0.116272,-0.51709,-0.181274,0.134766,0.441162,0.509277,0.102173,0.543457,-0.009499,0.095398,-0.371094,0.037048,1.189453,-0.51416,0.18811,-0.59668,-0.103516,0.555664,-0.24292,0.166748,-0.252197,0.18335,-0.355225,-0.834473,0.744141,1.163086,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,7,0,0.98394,9338,0.0,-0.939453,4.632812,-4.4375,-0.88916,-1.173828,-4.121094,2.271484,5.328125,-1.557617,4.300781,-3.982422,9.820312,1.646484,-2.675781,-5.347656,2.044922,7.449219,-6.210938,-1.30957,-1.486328,95.0,2.980232e-06,4.421875,2.0,7.0,1.386719,0.0,,,,4,,3.9e-05,6.8e-05,9.4e-05,0.005604,0.007599,0.000285,0.007881,0.006947,0.006947,0.006947,2.136719,1.0,0.300293,-0.73877,0.388672,0.354736,-0.486328,0.543945,-1.198242,1.09375,0.745117,-0.105774,0.648438,0.333008,0.422607,-0.356201,-0.759766,-0.694824,0.838867,0.48584,-0.002966,1.365234,-0.403076,1.412109,-1.28125,-0.202148,0.942383,0.461182,0.060638,-0.975098,-1.021484,0.794434,-0.228271,0.788574,1.77832,0.570312,0.412598,0.546387,0.0,0.0,0.0,0.459717,-0.09668


In [7]:
# Build the "<user_id>_<phase>" key that is later handed to GroupKFold as the
# grouping variable.
user_part = df_feature['user_id'].astype('str')
phase_part = df_feature['phase'].astype('str')
df_feature['group'] = user_part + '_' + phase_part

# Rows with a label form the training set; rows without one form the test set.
has_label = df_feature['label'].notnull()
df_train = df_feature[has_label]
df_test = df_feature[~has_label]

# Free the full table and the temporaries used above.
del df_feature, user_part, phase_part, has_label
gc.collect()

0

In [8]:
ycol = 'label'

# Every column except the target and the CV grouping key is used as a feature.
# NOTE(review): this keeps raw identifiers such as user_id / item_id / phase in
# the feature set - confirm that is intended.
feature_names = [col for col in df_train.columns if col not in (ycol, 'group')]

# NOTE(review): per the LightGBM docs, `subsample` (bagging_fraction) is only
# applied when `subsample_freq` > 0, and the sklearn wrapper defaults that to
# 0, so `subsample=0.8` currently has no effect. It is left unchanged here
# because enabling bagging would change the trained models.
model = lgb.LGBMClassifier(num_leaves=64,
                           max_depth=10,
                           learning_rate=0.01,
                           n_estimators=10000,
                           subsample=0.8,
                           feature_fraction=0.8,
                           reg_alpha=0.5,
                           reg_lambda=0.5,
                           random_state=seed,
                           metric=None)

# Out-of-fold predictions, one frame per fold.
oof = []

# Test-set predictions, averaged over the folds. `.copy()` makes this an
# independent frame, so adding/updating 'pred' never writes into a slice of
# df_test (chained assignment).
prediction = df_test[['user_id', 'phase', 'item_id']].copy()
prediction['pred'] = 0.0
df_importance_list = []

# Folds are split by the 'group' column so that all rows sharing a group value
# land in the same fold.
kfold = GroupKFold(n_splits=5)
n_folds = kfold.get_n_splits()

for fold_id, (trn_idx, val_idx) in enumerate(
        kfold.split(df_train[feature_names], df_train[ycol],
                    df_train['group'])):
    X_train = df_train.iloc[trn_idx][feature_names]
    Y_train = df_train.iloc[trn_idx][ycol]

    X_val = df_train.iloc[val_idx][feature_names]
    Y_val = df_train.iloc[val_idx][ycol]

    print('\nFold_{} Training ================================\n'.format(
        fold_id + 1))

    # Only the validation fold is evaluated, so it gets exactly one name.
    # The previous eval_names=['train', 'valid'] paired 'train' with the
    # validation set, which made the log report validation AUC as "train's auc".
    # NOTE(review): `verbose` and `early_stopping_rounds` were removed from
    # fit() in LightGBM 4.0; switch to the lgb.early_stopping /
    # lgb.log_evaluation callbacks when upgrading.
    lgb_model = model.fit(X_train,
                          Y_train,
                          eval_names=['valid'],
                          eval_set=[(X_val, Y_val)],
                          verbose=500,
                          eval_metric='auc',
                          early_stopping_rounds=50)

    # Out-of-fold probabilities of the positive class at the best iteration.
    pred_val = lgb_model.predict_proba(
        X_val, num_iteration=lgb_model.best_iteration_)[:, 1]
    df_oof = df_train.iloc[val_idx][['user_id', 'item_id', 'phase',
                                     ycol]].copy()
    df_oof['pred'] = pred_val
    oof.append(df_oof)

    # Accumulate the fold's share of the averaged test prediction; the divisor
    # comes from the splitter rather than a hard-coded 5.
    pred_test = lgb_model.predict_proba(
        df_test[feature_names], num_iteration=lgb_model.best_iteration_)[:, 1]
    prediction['pred'] += pred_test / n_folds

    df_importance = pd.DataFrame({
        'column': feature_names,
        'importance': lgb_model.feature_importances_,
    })
    df_importance_list.append(df_importance)

    # Release per-fold objects before the next fold to limit peak memory.
    del lgb_model, pred_val, pred_test, X_train, Y_train, X_val, Y_val
    gc.collect()



Training until validation scores don't improve for 50 rounds
[500]	train's auc: 0.921983
Early stopping, best iteration is:
[886]	train's auc: 0.923365


Training until validation scores don't improve for 50 rounds
[500]	train's auc: 0.923251
Early stopping, best iteration is:
[753]	train's auc: 0.92401


Training until validation scores don't improve for 50 rounds
[500]	train's auc: 0.928198
Early stopping, best iteration is:
[801]	train's auc: 0.929782


Training until validation scores don't improve for 50 rounds
[500]	train's auc: 0.928499
[1000]	train's auc: 0.930006
Early stopping, best iteration is:
[970]	train's auc: 0.930031


Training until validation scores don't improve for 50 rounds
[500]	train's auc: 0.923051
Early stopping, best iteration is:
[617]	train's auc: 0.923601


In [9]:
# Average each feature's importance over the folds and rank from most to least
# important.
df_importance = (
    pd.concat(df_importance_list)
    .groupby(['column'])['importance']
    .mean()
    .sort_values(ascending=False)
    .reset_index()
)
df_importance

Unnamed: 0,column,importance
0,phase_item_clickd_count,1773.2
1,user_click_item_txt_sim_max,1583.0
2,user_click_item_w2w_sim_sum_2,1458.0
3,user_last_click_item_w2w_sim,1434.2
4,phase_item_click_time_diff_mean,1100.4
5,user_id_phase_query_lastbuy_time_diff,1010.4
6,user_last_click_item_txt_sim,958.0
7,user_click_item_tc_sim_max,924.2
8,query_time,848.0
9,txt_vec10_pca0,840.0


In [10]:
# Stack the per-fold out-of-fold frames and order them by user, phase and
# prediction, all descending, so each user's highest-scored rows come first.
df_oof = pd.concat(oof).sort_values(by=['user_id', 'phase', 'pred'],
                                    ascending=False)
df_oof.head(n=5)

Unnamed: 0,user_id,item_id,phase,label,pred
2496243,35391,39113,0,0.0,0.036631
2496352,35391,41476,0,0.0,0.022769
2496278,35391,53767,0,0.0,0.019141
2496098,35391,96441,0,0.0,0.018784
2496239,35391,105073,0,0.0,0.018634


In [11]:
# Evaluate the raw out-of-fold predictions phase by phase and accumulate the
# four values returned by evaluate_scores (defined outside this cell's view,
# presumably by the `%run utils.ipynb` line) into a running total.
val_score = np.zeros(4)
phases = sorted(df_oof['phase'].unique())

for phase in phases:
    df_oof_phase = df_oof.loc[df_oof['phase'].eq(phase)]
    score = evaluate_scores(df_oof_phase, phase)
    val_score += score
    print(score)
val_score

100%|██████████| 4093/4093 [00:05<00:00, 758.52it/s]

(0.08666254767501695, 0.19291058069112932, 0.08709900577284518, 0.18988248781885927)





array([0.08666255, 0.19291058, 0.08709901, 0.18988249])

In [12]:
# Count click rows per (item_id, phase) pair; the result has the columns
# item_id, phase and count.
df_click = pd.read_pickle('../user_data/data/click.pkl')
df_count = (df_click.groupby(['item_id', 'phase'])
            .size()
            .reset_index(name='count'))

In [13]:
# OOF post-processing: divide each prediction by the square root of the item's
# click count in that phase, then re-rank and re-evaluate per phase.
df_oof_b = (
    df_oof.merge(df_count, how='left')
    .assign(count=lambda frame: frame['count'].pow(0.5))
    .assign(pred=lambda frame: frame['pred'] / frame['count'])
    .sort_values(['user_id', 'phase', 'pred'], ascending=False)
)

val_score = np.zeros(4)
phases = sorted(df_oof_b['phase'].unique())

for phase in phases:
    df_oof_phase = df_oof_b.loc[df_oof_b['phase'].eq(phase)]
    score = evaluate_scores(df_oof_phase, phase)
    val_score += score
    print(score)
print(val_score)

100%|██████████| 4093/4093 [00:05<00:00, 750.82it/s]

(0.08181367422540728, 0.18679491746823418, 0.0963093690873841, 0.20020063055316709)
[0.08181367 0.18679492 0.09630937 0.20020063]





In [14]:
# Apply the same post-processing to the test predictions: attach the per
# (item_id, phase) click count, take its square root, and divide the predicted
# score by it.
prediction_b = prediction.merge(df_count, how='left')
prediction_b['count'] = prediction_b['count'].pow(0.5)
prediction_b['pred'] = prediction_b['pred'].div(prediction_b['count'])

In [15]:
from random import Random

# Build one submission row per (user_id, phase): the user id followed by 50
# item ids ranked by the post-processed score.
#
# The ranking must be applied to the post-processed frame itself. Previously
# `prediction` was sorted while `prediction_b` (unsorted) was consumed, so
# head(50) returned rows in merge order rather than the top-50 by score.
# sort_values places NaN scores last, so rows whose score is undefined are
# only used when nothing better is available.
# NOTE(review): the phase filter is hard-coded to `> 6`; the recorded run of
# this cell iterated over 0 groups, i.e. the filter selected no rows there -
# confirm the threshold matches the phases present in the data.
final_prediction = prediction_b[prediction_b['phase'] > 6].sort_values(
    ['phase', 'user_id', 'pred'], ascending=False)

gg = final_prediction.groupby(['user_id', 'phase'])
all_items = set(final_prediction['item_id'].values)

# Dedicated, seeded generator so the random padding below is reproducible.
rng = Random(seed)

lines = []

for _, g in tqdm(gg):
    # Rows keep their sorted order within each group, so this is the top 50.
    g = g.head(50)

    user_id = g['user_id'].values[0]
    items = g['item_id'].values.tolist()

    # Pad users that have fewer than 50 distinct candidates with random items
    # drawn from the other candidates in the final predictions.
    n_missing = 50 - len(set(items))
    if n_missing > 0:
        # random.sample needs a sequence: passing a set is deprecated since
        # Python 3.9 and raises TypeError from 3.11. Sorting also fixes the
        # population order, which keeps the seeded draw deterministic.
        buchong = sorted(all_items - set(items))
        items += rng.sample(buchong, n_missing)

    assert len(set(items)) == 50

    lines.append([user_id] + items)

df_sub = pd.DataFrame(lines)

0it [00:00, ?it/s]


In [16]:
# Preview the first five submission rows.
df_sub.head(n=5)

In [17]:
# Write the submission as a bare CSV: no header row and no index column.
df_sub.to_csv('../prediction_result/result.csv', header=False, index=False)