In [1]:
import numpy as np
import pandas as pd
import pickle
from tqdm import tqdm
import gc, os
import time
from datetime import datetime
import lightgbm as lgb
from sklearn.preprocessing import MinMaxScaler
import warnings
warnings.filterwarnings('ignore')




In [2]:
data_path = './'
save_path = './'
offline = False


In [11]:
# 重新读取数据的时候，发现click_article_id是一个浮点数，所以将其转换成int类型
trn_user_item_feats_df = pd.read_csv(save_path + 'trn_user_item_feats_df.csv')
trn_user_item_feats_df['click_article_id'] = trn_user_item_feats_df['click_article_id'].astype(int)

if offline:
    val_user_item_feats_df = pd.read_csv(save_path + 'val_user_item_feats_df.csv')
    val_user_item_feats_df['click_article_id'] = val_user_item_feats_df['click_article_id'].astype(int)
else:
    val_user_item_feats_df = None
    
tst_user_item_feats_df = pd.read_csv(save_path + 'tst_user_item_feats_df.csv')
tst_user_item_feats_df['click_article_id'] = tst_user_item_feats_df['click_article_id'].astype(int)

# 做特征的时候为了方便，给测试集也打上了一个无效的标签，这里直接删掉就行
del tst_user_item_feats_df['label']


In [4]:
def submit(recall_df, topk=5, model_name=None):
    recall_df = recall_df.sort_values(by=['user_id', 'pred_score'])
    recall_df['rank'] = recall_df.groupby(['user_id'])['pred_score'].rank(ascending=False, method='first')
    
    # 判断是不是每个用户都有5篇文章及以上
    tmp = recall_df.groupby('user_id').apply(lambda x: x['rank'].max())
    assert tmp.min() >= topk
    
    del recall_df['pred_score']
    submit = recall_df[recall_df['rank'] <= topk].set_index(['user_id', 'rank']).unstack(-1).reset_index()
    
    submit.columns = [int(col) if isinstance(col, int) else col for col in submit.columns.droplevel(0)]
    # 按照提交格式定义列名
    submit = submit.rename(columns={'': 'user_id', 1: 'article_1', 2: 'article_2', 
                                                  3: 'article_3', 4: 'article_4', 5: 'article_5'})
    
    save_name = save_path + model_name + '_' + datetime.today().strftime('%m-%d') + '.csv'
    submit.to_csv(save_name, index=False, header=True)


In [5]:
# 排序结果归一化
def norm_sim(sim_df, weight=0.0):
    # print(sim_df.head())
    min_sim = sim_df.min()
    max_sim = sim_df.max()
    if max_sim == min_sim:
        sim_df = sim_df.apply(lambda sim: 1.0)
    else:
        sim_df = sim_df.apply(lambda sim: 1.0 * (sim - min_sim) / (max_sim - min_sim))

    sim_df = sim_df.apply(lambda sim: sim + weight)  # plus one
    return sim_df


In [16]:
# 防止中间出错之后重新读取数据
trn_user_item_feats_df_rank_model = trn_user_item_feats_df.copy()

if offline:
    val_user_item_feats_df_rank_model = val_user_item_feats_df.copy()
    
tst_user_item_feats_df_rank_model = tst_user_item_feats_df.copy()


In [17]:
# 定义特征列
lgb_cols = ['sim0', 'time_diff0', 'word_diff0','sim_max', 'sim_min', 'sim_sum', 
            'sim_mean', 'score','click_size', 'time_diff_mean', 'active_level',
            'click_environment','click_deviceGroup', 'click_os', 'click_country', 
            'click_region','click_referrer_type', 'user_time_hob1', 'user_time_hob2',
            'words_hbo', 'category_id', 'created_at_ts','words_count']


In [18]:
# 排序模型分组
trn_user_item_feats_df_rank_model.sort_values(by=['user_id'], inplace=True)
g_train = trn_user_item_feats_df_rank_model.groupby(['user_id'], as_index=False).count()["label"].values

if offline:
    val_user_item_feats_df_rank_model.sort_values(by=['user_id'], inplace=True)
    g_val = val_user_item_feats_df_rank_model.groupby(['user_id'], as_index=False).count()["label"].values


In [19]:
# 排序模型定义
lgb_ranker = lgb.LGBMRanker(boosting_type='gbdt', num_leaves=31, reg_alpha=0.0, reg_lambda=1,
                            max_depth=-1, n_estimators=100, subsample=0.7, colsample_bytree=0.7, subsample_freq=1,
                            learning_rate=0.01, min_child_weight=50, random_state=2018, n_jobs= 16)  


In [20]:
# 排序模型训练
if offline:
    lgb_ranker.fit(trn_user_item_feats_df_rank_model[lgb_cols], trn_user_item_feats_df_rank_model['label'], group=g_train,
                eval_set=[(val_user_item_feats_df_rank_model[lgb_cols], val_user_item_feats_df_rank_model['label'])], 
                eval_group= [g_val], eval_at=[1, 2, 3, 4, 5], eval_metric=['ndcg', ], early_stopping_rounds=50, )
else:
    lgb_ranker.fit(trn_user_item_feats_df[lgb_cols], trn_user_item_feats_df['label'], group=g_train)


In [21]:
# 模型预测
tst_user_item_feats_df['pred_score'] = lgb_ranker.predict(tst_user_item_feats_df[lgb_cols], num_iteration=lgb_ranker.best_iteration_)

# 将这里的排序结果保存一份，用户后面的模型融合
tst_user_item_feats_df[['user_id', 'click_article_id', 'pred_score']].to_csv(save_path + 'lgb_ranker_score.csv', index=False)


In [22]:
# 预测结果重新排序, 及生成提交结果
rank_results = tst_user_item_feats_df[['user_id', 'click_article_id', 'pred_score']]
rank_results['click_article_id'] = rank_results['click_article_id'].astype(int)
submit(rank_results, topk=5, model_name='lgb_ranker')


In [23]:
# 五折交叉验证，这里的五折交叉是以用户为目标进行五折划分
#  这一部分与前面的单独训练和验证是分开的
def get_kfold_users(trn_df, n=5):
    user_ids = trn_df['user_id'].unique()
    user_set = [user_ids[i::n] for i in range(n)]
    return user_set

k_fold = 5
trn_df = trn_user_item_feats_df_rank_model
user_set = get_kfold_users(trn_df, n=k_fold)

score_list = []
score_df = trn_df[['user_id', 'click_article_id','label']]
sub_preds = np.zeros(tst_user_item_feats_df_rank_model.shape[0])

# 五折交叉验证，并将中间结果保存用于staking
for n_fold, valid_user in enumerate(user_set):
    train_idx = trn_df[~trn_df['user_id'].isin(valid_user)] # add slide user
    valid_idx = trn_df[trn_df['user_id'].isin(valid_user)]
    
    # 训练集与验证集的用户分组
    train_idx.sort_values(by=['user_id'], inplace=True)
    g_train = train_idx.groupby(['user_id'], as_index=False).count()["label"].values
    
    valid_idx.sort_values(by=['user_id'], inplace=True)
    g_val = valid_idx.groupby(['user_id'], as_index=False).count()["label"].values
    
    # 定义模型
    lgb_ranker = lgb.LGBMRanker(boosting_type='gbdt', num_leaves=31, reg_alpha=0.0, reg_lambda=1,
                            max_depth=-1, n_estimators=100, subsample=0.7, colsample_bytree=0.7, subsample_freq=1,
                            learning_rate=0.01, min_child_weight=50, random_state=2018, n_jobs= 16)  
    # 训练模型
    lgb_ranker.fit(train_idx[lgb_cols], train_idx['label'], group=g_train,
                   eval_set=[(valid_idx[lgb_cols], valid_idx['label'])], eval_group= [g_val], 
                   eval_at=[1, 2, 3, 4, 5], eval_metric=['ndcg', ], early_stopping_rounds=50, )
    
    # 预测验证集结果
    valid_idx['pred_score'] = lgb_ranker.predict(valid_idx[lgb_cols], num_iteration=lgb_ranker.best_iteration_)
    
    # 对输出结果进行归一化
    valid_idx['pred_score'] = valid_idx[['pred_score']].transform(lambda x: norm_sim(x))
    
    valid_idx.sort_values(by=['user_id', 'pred_score'])
    valid_idx['pred_rank'] = valid_idx.groupby(['user_id'])['pred_score'].rank(ascending=False, method='first')
    
    # 将验证集的预测结果放到一个列表中，后面进行拼接
    score_list.append(valid_idx[['user_id', 'click_article_id', 'pred_score', 'pred_rank']])
    
    # 如果是线上测试，需要计算每次交叉验证的结果相加，最后求平均
    if not offline:
        sub_preds += lgb_ranker.predict(tst_user_item_feats_df_rank_model[lgb_cols], lgb_ranker.best_iteration_)
    
score_df_ = pd.concat(score_list, axis=0)
score_df = score_df.merge(score_df_, how='left', on=['user_id', 'click_article_id'])
# 保存训练集交叉验证产生的新特征
score_df[['user_id', 'click_article_id', 'pred_score', 'pred_rank', 'label']].to_csv(save_path + 'trn_lgb_ranker_feats.csv', index=False)
    
# 测试集的预测结果，多次交叉验证求平均,将预测的score和对应的rank特征保存，可以用于后面的staking，这里还可以构造其他更多的特征
tst_user_item_feats_df_rank_model['pred_score'] = sub_preds / k_fold
tst_user_item_feats_df_rank_model['pred_score'] = tst_user_item_feats_df_rank_model['pred_score'].transform(lambda x: norm_sim(x))
tst_user_item_feats_df_rank_model.sort_values(by=['user_id', 'pred_score'])
tst_user_item_feats_df_rank_model['pred_rank'] = tst_user_item_feats_df_rank_model.groupby(['user_id'])['pred_score'].rank(ascending=False, method='first')

# 保存测试集交叉验证的新特征
tst_user_item_feats_df_rank_model[['user_id', 'click_article_id', 'pred_score', 'pred_rank']].to_csv(save_path + 'tst_lgb_ranker_feats.csv', index=False)


[1]	valid_0's ndcg@1: 1	valid_0's ndcg@2: 1	valid_0's ndcg@3: 1	valid_0's ndcg@4: 1	valid_0's ndcg@5: 1
[2]	valid_0's ndcg@1: 1	valid_0's ndcg@2: 1	valid_0's ndcg@3: 1	valid_0's ndcg@4: 1	valid_0's ndcg@5: 1
[3]	valid_0's ndcg@1: 1	valid_0's ndcg@2: 1	valid_0's ndcg@3: 1	valid_0's ndcg@4: 1	valid_0's ndcg@5: 1
[4]	valid_0's ndcg@1: 1	valid_0's ndcg@2: 1	valid_0's ndcg@3: 1	valid_0's ndcg@4: 1	valid_0's ndcg@5: 1
[5]	valid_0's ndcg@1: 1	valid_0's ndcg@2: 1	valid_0's ndcg@3: 1	valid_0's ndcg@4: 1	valid_0's ndcg@5: 1
[6]	valid_0's ndcg@1: 1	valid_0's ndcg@2: 1	valid_0's ndcg@3: 1	valid_0's ndcg@4: 1	valid_0's ndcg@5: 1
[7]	valid_0's ndcg@1: 1	valid_0's ndcg@2: 1	valid_0's ndcg@3: 1	valid_0's ndcg@4: 1	valid_0's ndcg@5: 1
[8]	valid_0's ndcg@1: 1	valid_0's ndcg@2: 1	valid_0's ndcg@3: 1	valid_0's ndcg@4: 1	valid_0's ndcg@5: 1
[9]	valid_0's ndcg@1: 1	valid_0's ndcg@2: 1	valid_0's ndcg@3: 1	valid_0's ndcg@4: 1	valid_0's ndcg@5: 1
[10]	valid_0's ndcg@1: 1	valid_0's ndcg@2: 1	valid_0's ndcg@3: 1

[47]	valid_0's ndcg@1: 1	valid_0's ndcg@2: 1	valid_0's ndcg@3: 1	valid_0's ndcg@4: 1	valid_0's ndcg@5: 1
[48]	valid_0's ndcg@1: 1	valid_0's ndcg@2: 1	valid_0's ndcg@3: 1	valid_0's ndcg@4: 1	valid_0's ndcg@5: 1
[49]	valid_0's ndcg@1: 1	valid_0's ndcg@2: 1	valid_0's ndcg@3: 1	valid_0's ndcg@4: 1	valid_0's ndcg@5: 1
[50]	valid_0's ndcg@1: 1	valid_0's ndcg@2: 1	valid_0's ndcg@3: 1	valid_0's ndcg@4: 1	valid_0's ndcg@5: 1
[51]	valid_0's ndcg@1: 1	valid_0's ndcg@2: 1	valid_0's ndcg@3: 1	valid_0's ndcg@4: 1	valid_0's ndcg@5: 1
[1]	valid_0's ndcg@1: 1	valid_0's ndcg@2: 1	valid_0's ndcg@3: 1	valid_0's ndcg@4: 1	valid_0's ndcg@5: 1
[2]	valid_0's ndcg@1: 1	valid_0's ndcg@2: 1	valid_0's ndcg@3: 1	valid_0's ndcg@4: 1	valid_0's ndcg@5: 1
[3]	valid_0's ndcg@1: 1	valid_0's ndcg@2: 1	valid_0's ndcg@3: 1	valid_0's ndcg@4: 1	valid_0's ndcg@5: 1
[4]	valid_0's ndcg@1: 1	valid_0's ndcg@2: 1	valid_0's ndcg@3: 1	valid_0's ndcg@4: 1	valid_0's ndcg@5: 1
[5]	valid_0's ndcg@1: 1	valid_0's ndcg@2: 1	valid_0's ndcg@

[1]	valid_0's ndcg@1: 1	valid_0's ndcg@2: 1	valid_0's ndcg@3: 1	valid_0's ndcg@4: 1	valid_0's ndcg@5: 1
[2]	valid_0's ndcg@1: 1	valid_0's ndcg@2: 1	valid_0's ndcg@3: 1	valid_0's ndcg@4: 1	valid_0's ndcg@5: 1
[3]	valid_0's ndcg@1: 1	valid_0's ndcg@2: 1	valid_0's ndcg@3: 1	valid_0's ndcg@4: 1	valid_0's ndcg@5: 1
[4]	valid_0's ndcg@1: 1	valid_0's ndcg@2: 1	valid_0's ndcg@3: 1	valid_0's ndcg@4: 1	valid_0's ndcg@5: 1
[5]	valid_0's ndcg@1: 1	valid_0's ndcg@2: 1	valid_0's ndcg@3: 1	valid_0's ndcg@4: 1	valid_0's ndcg@5: 1
[6]	valid_0's ndcg@1: 1	valid_0's ndcg@2: 1	valid_0's ndcg@3: 1	valid_0's ndcg@4: 1	valid_0's ndcg@5: 1
[7]	valid_0's ndcg@1: 1	valid_0's ndcg@2: 1	valid_0's ndcg@3: 1	valid_0's ndcg@4: 1	valid_0's ndcg@5: 1
[8]	valid_0's ndcg@1: 1	valid_0's ndcg@2: 1	valid_0's ndcg@3: 1	valid_0's ndcg@4: 1	valid_0's ndcg@5: 1
[9]	valid_0's ndcg@1: 1	valid_0's ndcg@2: 1	valid_0's ndcg@3: 1	valid_0's ndcg@4: 1	valid_0's ndcg@5: 1
[10]	valid_0's ndcg@1: 1	valid_0's ndcg@2: 1	valid_0's ndcg@3: 1

In [24]:
# 模型及参数的定义
lgb_Classfication = lgb.LGBMClassifier(boosting_type='gbdt', num_leaves=31, reg_alpha=0.0, reg_lambda=1,
                            max_depth=-1, n_estimators=500, subsample=0.7, colsample_bytree=0.7, subsample_freq=1,
                            learning_rate=0.01, min_child_weight=50, random_state=2018, n_jobs= 16, verbose=10)  


In [25]:
# 模型训练
if offline:
    lgb_Classfication.fit(trn_user_item_feats_df_rank_model[lgb_cols], trn_user_item_feats_df_rank_model['label'],
                    eval_set=[(val_user_item_feats_df_rank_model[lgb_cols], val_user_item_feats_df_rank_model['label'])], 
                    eval_metric=['auc', ],early_stopping_rounds=50, )
else:
    lgb_Classfication.fit(trn_user_item_feats_df_rank_model[lgb_cols], trn_user_item_feats_df_rank_model['label'])


[LightGBM] [Info] Number of positive: 3, number of negative: 228878
[LightGBM] [Debug] Dataset::GetMultiBinFromAllFeatures: sparse rate 0.042851
[LightGBM] [Debug] init for col-wise cost 0.000035 seconds, init for row-wise cost 0.008118 seconds
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4032
[LightGBM] [Info] Number of data points in the train set: 228881, number of used features: 23
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.000013 -> initscore=-11.242332
[LightGBM] [Info] Start training from score -11.242332
[LightGBM] [Debug] Re-bagging, using 160407 data to train
[LightGBM] [Debug] Trained a tree with leaves = 1 and depth = 1
[LightGBM] [Debug] Re-bagging, using 159966 data to train
[LightGBM] [Debug] Trained a tree with leaves = 1 and depth = 1
[LightGBM] [Debug] Re-bagging, using 159958 data to train
[LightGBM] [Debug] Trained a tree with leaves = 1 and depth = 1
[LightGBM] [Debug] Re-bagging, using 159992 data to train
[LightGBM

[LightGBM] [Debug] Re-bagging, using 160042 data to train
[LightGBM] [Debug] Trained a tree with leaves = 1 and depth = 1
[LightGBM] [Debug] Re-bagging, using 160294 data to train
[LightGBM] [Debug] Trained a tree with leaves = 1 and depth = 1
[LightGBM] [Debug] Re-bagging, using 159918 data to train
[LightGBM] [Debug] Trained a tree with leaves = 1 and depth = 1
[LightGBM] [Debug] Re-bagging, using 159964 data to train
[LightGBM] [Debug] Trained a tree with leaves = 1 and depth = 1
[LightGBM] [Debug] Re-bagging, using 159848 data to train
[LightGBM] [Debug] Trained a tree with leaves = 1 and depth = 1
[LightGBM] [Debug] Re-bagging, using 160043 data to train
[LightGBM] [Debug] Trained a tree with leaves = 1 and depth = 1
[LightGBM] [Debug] Re-bagging, using 160184 data to train
[LightGBM] [Debug] Trained a tree with leaves = 1 and depth = 1
[LightGBM] [Debug] Re-bagging, using 159989 data to train
[LightGBM] [Debug] Trained a tree with leaves = 1 and depth = 1
[LightGBM] [Debug] Re-ba

[LightGBM] [Debug] Re-bagging, using 159976 data to train
[LightGBM] [Debug] Trained a tree with leaves = 1 and depth = 1
[LightGBM] [Debug] Re-bagging, using 160120 data to train
[LightGBM] [Debug] Trained a tree with leaves = 1 and depth = 1
[LightGBM] [Debug] Re-bagging, using 159808 data to train
[LightGBM] [Debug] Trained a tree with leaves = 1 and depth = 1
[LightGBM] [Debug] Re-bagging, using 160479 data to train
[LightGBM] [Debug] Trained a tree with leaves = 1 and depth = 1
[LightGBM] [Debug] Re-bagging, using 160160 data to train
[LightGBM] [Debug] Trained a tree with leaves = 1 and depth = 1
[LightGBM] [Debug] Re-bagging, using 160142 data to train
[LightGBM] [Debug] Trained a tree with leaves = 1 and depth = 1
[LightGBM] [Debug] Re-bagging, using 160478 data to train
[LightGBM] [Debug] Trained a tree with leaves = 1 and depth = 1
[LightGBM] [Debug] Re-bagging, using 160378 data to train
[LightGBM] [Debug] Trained a tree with leaves = 1 and depth = 1
[LightGBM] [Debug] Re-ba

[LightGBM] [Debug] Trained a tree with leaves = 1 and depth = 1
[LightGBM] [Debug] Re-bagging, using 160064 data to train
[LightGBM] [Debug] Trained a tree with leaves = 1 and depth = 1
[LightGBM] [Debug] Re-bagging, using 160460 data to train
[LightGBM] [Debug] Trained a tree with leaves = 1 and depth = 1
[LightGBM] [Debug] Re-bagging, using 160033 data to train
[LightGBM] [Debug] Trained a tree with leaves = 1 and depth = 1
[LightGBM] [Debug] Re-bagging, using 160112 data to train
[LightGBM] [Debug] Trained a tree with leaves = 1 and depth = 1
[LightGBM] [Debug] Re-bagging, using 160073 data to train
[LightGBM] [Debug] Trained a tree with leaves = 1 and depth = 1
[LightGBM] [Debug] Re-bagging, using 160414 data to train
[LightGBM] [Debug] Trained a tree with leaves = 1 and depth = 1
[LightGBM] [Debug] Re-bagging, using 160135 data to train
[LightGBM] [Debug] Trained a tree with leaves = 1 and depth = 1
[LightGBM] [Debug] Re-bagging, using 160361 data to train
[LightGBM] [Debug] Train

[LightGBM] [Debug] Re-bagging, using 160336 data to train
[LightGBM] [Debug] Trained a tree with leaves = 1 and depth = 1
[LightGBM] [Debug] Re-bagging, using 160491 data to train
[LightGBM] [Debug] Trained a tree with leaves = 1 and depth = 1
[LightGBM] [Debug] Re-bagging, using 160370 data to train
[LightGBM] [Debug] Trained a tree with leaves = 1 and depth = 1
[LightGBM] [Debug] Re-bagging, using 160084 data to train
[LightGBM] [Debug] Trained a tree with leaves = 1 and depth = 1
[LightGBM] [Debug] Re-bagging, using 160338 data to train
[LightGBM] [Debug] Trained a tree with leaves = 1 and depth = 1
[LightGBM] [Debug] Re-bagging, using 160336 data to train
[LightGBM] [Debug] Trained a tree with leaves = 1 and depth = 1
[LightGBM] [Debug] Re-bagging, using 159942 data to train
[LightGBM] [Debug] Trained a tree with leaves = 1 and depth = 1
[LightGBM] [Debug] Re-bagging, using 159955 data to train
[LightGBM] [Debug] Trained a tree with leaves = 1 and depth = 1
[LightGBM] [Debug] Re-ba

[LightGBM] [Debug] Re-bagging, using 160169 data to train
[LightGBM] [Debug] Trained a tree with leaves = 1 and depth = 1
[LightGBM] [Debug] Re-bagging, using 159904 data to train
[LightGBM] [Debug] Trained a tree with leaves = 1 and depth = 1
[LightGBM] [Debug] Re-bagging, using 160176 data to train
[LightGBM] [Debug] Trained a tree with leaves = 1 and depth = 1
[LightGBM] [Debug] Re-bagging, using 160438 data to train
[LightGBM] [Debug] Trained a tree with leaves = 1 and depth = 1
[LightGBM] [Debug] Re-bagging, using 159768 data to train
[LightGBM] [Debug] Trained a tree with leaves = 1 and depth = 1
[LightGBM] [Debug] Re-bagging, using 160383 data to train
[LightGBM] [Debug] Trained a tree with leaves = 1 and depth = 1
[LightGBM] [Debug] Re-bagging, using 160159 data to train
[LightGBM] [Debug] Trained a tree with leaves = 1 and depth = 1
[LightGBM] [Debug] Re-bagging, using 160475 data to train
[LightGBM] [Debug] Trained a tree with leaves = 1 and depth = 1
[LightGBM] [Debug] Re-ba

[LightGBM] [Debug] Re-bagging, using 160346 data to train
[LightGBM] [Debug] Trained a tree with leaves = 1 and depth = 1
[LightGBM] [Debug] Re-bagging, using 160140 data to train
[LightGBM] [Debug] Trained a tree with leaves = 1 and depth = 1
[LightGBM] [Debug] Re-bagging, using 160077 data to train
[LightGBM] [Debug] Trained a tree with leaves = 1 and depth = 1
[LightGBM] [Debug] Re-bagging, using 160285 data to train
[LightGBM] [Debug] Trained a tree with leaves = 1 and depth = 1
[LightGBM] [Debug] Re-bagging, using 160155 data to train
[LightGBM] [Debug] Trained a tree with leaves = 1 and depth = 1
[LightGBM] [Debug] Re-bagging, using 160117 data to train
[LightGBM] [Debug] Trained a tree with leaves = 1 and depth = 1
[LightGBM] [Debug] Re-bagging, using 160452 data to train
[LightGBM] [Debug] Trained a tree with leaves = 1 and depth = 1
[LightGBM] [Debug] Re-bagging, using 160351 data to train
[LightGBM] [Debug] Trained a tree with leaves = 1 and depth = 1
[LightGBM] [Debug] Re-ba

[LightGBM] [Debug] Trained a tree with leaves = 1 and depth = 1
[LightGBM] [Debug] Re-bagging, using 160008 data to train
[LightGBM] [Debug] Trained a tree with leaves = 1 and depth = 1
[LightGBM] [Debug] Re-bagging, using 159843 data to train
[LightGBM] [Debug] Trained a tree with leaves = 1 and depth = 1
[LightGBM] [Debug] Re-bagging, using 160183 data to train
[LightGBM] [Debug] Trained a tree with leaves = 1 and depth = 1
[LightGBM] [Debug] Re-bagging, using 159820 data to train
[LightGBM] [Debug] Trained a tree with leaves = 1 and depth = 1
[LightGBM] [Debug] Re-bagging, using 159892 data to train
[LightGBM] [Debug] Trained a tree with leaves = 1 and depth = 1
[LightGBM] [Debug] Re-bagging, using 160111 data to train
[LightGBM] [Debug] Trained a tree with leaves = 1 and depth = 1
[LightGBM] [Debug] Re-bagging, using 160832 data to train
[LightGBM] [Debug] Trained a tree with leaves = 1 and depth = 1
[LightGBM] [Debug] Re-bagging, using 159960 data to train
[LightGBM] [Debug] Train

In [26]:
# 模型预测
tst_user_item_feats_df['pred_score'] = lgb_Classfication.predict_proba(tst_user_item_feats_df[lgb_cols])[:,1]

# 将这里的排序结果保存一份，用户后面的模型融合
tst_user_item_feats_df[['user_id', 'click_article_id', 'pred_score']].to_csv(save_path + 'lgb_cls_score.csv', index=False)


In [27]:
# 预测结果重新排序, 及生成提交结果
rank_results = tst_user_item_feats_df[['user_id', 'click_article_id', 'pred_score']]
rank_results['click_article_id'] = rank_results['click_article_id'].astype(int)
submit(rank_results, topk=5, model_name='lgb_cls')


In [28]:
# 五折交叉验证，这里的五折交叉是以用户为目标进行五折划分
#  这一部分与前面的单独训练和验证是分开的
def get_kfold_users(trn_df, n=5):
    user_ids = trn_df['user_id'].unique()
    user_set = [user_ids[i::n] for i in range(n)]
    return user_set

k_fold = 5
trn_df = trn_user_item_feats_df_rank_model
user_set = get_kfold_users(trn_df, n=k_fold)

score_list = []
score_df = trn_df[['user_id', 'click_article_id', 'label']]
sub_preds = np.zeros(tst_user_item_feats_df_rank_model.shape[0])

# 五折交叉验证，并将中间结果保存用于staking
for n_fold, valid_user in enumerate(user_set):
    train_idx = trn_df[~trn_df['user_id'].isin(valid_user)] # add slide user
    valid_idx = trn_df[trn_df['user_id'].isin(valid_user)]
    
    # 模型及参数的定义
    lgb_Classfication = lgb.LGBMClassifier(boosting_type='gbdt', num_leaves=31, reg_alpha=0.0, reg_lambda=1,
                            max_depth=-1, n_estimators=100, subsample=0.7, colsample_bytree=0.7, subsample_freq=1,
                            learning_rate=0.01, min_child_weight=50, random_state=2018, n_jobs= 16, verbose=10)  
    # 训练模型
    lgb_Classfication.fit(train_idx[lgb_cols], train_idx['label'],eval_set=[(valid_idx[lgb_cols], valid_idx['label'])], 
                          eval_metric=['auc', ],early_stopping_rounds=50, )
    
    # 预测验证集结果
    valid_idx['pred_score'] = lgb_Classfication.predict_proba(valid_idx[lgb_cols], 
                                                              num_iteration=lgb_Classfication.best_iteration_)[:,1]
    
    # 对输出结果进行归一化 分类模型输出的值本身就是一个概率值不需要进行归一化
    # valid_idx['pred_score'] = valid_idx[['pred_score']].transform(lambda x: norm_sim(x))
    
    valid_idx.sort_values(by=['user_id', 'pred_score'])
    valid_idx['pred_rank'] = valid_idx.groupby(['user_id'])['pred_score'].rank(ascending=False, method='first')
    
    # 将验证集的预测结果放到一个列表中，后面进行拼接
    score_list.append(valid_idx[['user_id', 'click_article_id', 'pred_score', 'pred_rank']])
    
    # 如果是线上测试，需要计算每次交叉验证的结果相加，最后求平均
    if not offline:
        sub_preds += lgb_Classfication.predict_proba(tst_user_item_feats_df_rank_model[lgb_cols], 
                                                     num_iteration=lgb_Classfication.best_iteration_)[:,1]
    
score_df_ = pd.concat(score_list, axis=0)
score_df = score_df.merge(score_df_, how='left', on=['user_id', 'click_article_id'])
# 保存训练集交叉验证产生的新特征
score_df[['user_id', 'click_article_id', 'pred_score', 'pred_rank', 'label']].to_csv(save_path + 'trn_lgb_cls_feats.csv', index=False)
    
# 测试集的预测结果，多次交叉验证求平均,将预测的score和对应的rank特征保存，可以用于后面的staking，这里还可以构造其他更多的特征
tst_user_item_feats_df_rank_model['pred_score'] = sub_preds / k_fold
tst_user_item_feats_df_rank_model['pred_score'] = tst_user_item_feats_df_rank_model['pred_score'].transform(lambda x: norm_sim(x))
tst_user_item_feats_df_rank_model.sort_values(by=['user_id', 'pred_score'])
tst_user_item_feats_df_rank_model['pred_rank'] = tst_user_item_feats_df_rank_model.groupby(['user_id'])['pred_score'].rank(ascending=False, method='first')

# 保存测试集交叉验证的新特征
tst_user_item_feats_df_rank_model[['user_id', 'click_article_id', 'pred_score', 'pred_rank']].to_csv(save_path + 'tst_lgb_cls_feats.csv', index=False)


[LightGBM] [Info] Number of positive: 2, number of negative: 182958
[LightGBM] [Debug] Dataset::GetMultiBinFromAllFeatures: sparse rate 0.042842
[LightGBM] [Debug] init for col-wise cost 0.000046 seconds, init for row-wise cost 0.007156 seconds
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Debug] Using Dense Multi-Val Bin
[LightGBM] [Info] Total Bins 4030
[LightGBM] [Info] Number of data points in the train set: 182960, number of used features: 23
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.000011 -> initscore=-11.423865
[LightGBM] [Info] Start training from score -11.423865
[LightGBM] [Debug] Re-bagging, using 128244 data to train
[LightGBM] [Debug] Trained a tree with leaves = 1 and depth = 1
[1]	valid_0's auc: 0.5	valid_0's binary_logloss: 0.000497544
[LightGBM] [Debug] Re-bagging, using 127975 data to train
[LightGBM] [Debug] Trained a tree with leaves = 1 and depth = 1
[2]	valid_0's auc: 

[LightGBM] [Debug] Re-bagging, using 127702 data to train
[LightGBM] [Debug] Trained a tree with leaves = 1 and depth = 1
[38]	valid_0's auc: 0.5	valid_0's binary_logloss: 0.000497544
[LightGBM] [Debug] Re-bagging, using 128190 data to train
[LightGBM] [Debug] Trained a tree with leaves = 1 and depth = 1
[39]	valid_0's auc: 0.5	valid_0's binary_logloss: 0.000497544
[LightGBM] [Debug] Re-bagging, using 127992 data to train
[LightGBM] [Debug] Trained a tree with leaves = 1 and depth = 1
[40]	valid_0's auc: 0.5	valid_0's binary_logloss: 0.000497544
[LightGBM] [Debug] Re-bagging, using 128571 data to train
[LightGBM] [Debug] Trained a tree with leaves = 1 and depth = 1
[41]	valid_0's auc: 0.5	valid_0's binary_logloss: 0.000497544
[LightGBM] [Debug] Re-bagging, using 128186 data to train
[LightGBM] [Debug] Trained a tree with leaves = 1 and depth = 1
[42]	valid_0's auc: 0.5	valid_0's binary_logloss: 0.000497544
[LightGBM] [Debug] Re-bagging, using 128034 data to train
[LightGBM] [Debug] Tra

[LightGBM] [Debug] Trained a tree with leaves = 1 and depth = 1
[44]	valid_0's auc: 1	valid_0's binary_logloss: 2.68619e-10
[LightGBM] [Debug] Re-bagging, using 128341 data to train
[LightGBM] [Debug] Trained a tree with leaves = 1 and depth = 1
[45]	valid_0's auc: 1	valid_0's binary_logloss: 2.68619e-10
[LightGBM] [Debug] Re-bagging, using 128139 data to train
[LightGBM] [Debug] Trained a tree with leaves = 1 and depth = 1
[46]	valid_0's auc: 1	valid_0's binary_logloss: 2.68619e-10
[LightGBM] [Debug] Re-bagging, using 128096 data to train
[LightGBM] [Debug] Trained a tree with leaves = 1 and depth = 1
[47]	valid_0's auc: 1	valid_0's binary_logloss: 2.68619e-10
[LightGBM] [Debug] Re-bagging, using 127724 data to train
[LightGBM] [Debug] Trained a tree with leaves = 1 and depth = 1
[48]	valid_0's auc: 1	valid_0's binary_logloss: 2.68619e-10
[LightGBM] [Debug] Re-bagging, using 128233 data to train
[LightGBM] [Debug] Trained a tree with leaves = 1 and depth = 1
[49]	valid_0's auc: 1	vali

[LightGBM] [Debug] Trained a tree with leaves = 1 and depth = 1
[50]	valid_0's auc: 1	valid_0's binary_logloss: 2.68208e-10
[LightGBM] [Debug] Re-bagging, using 128035 data to train
[LightGBM] [Debug] Trained a tree with leaves = 1 and depth = 1
[51]	valid_0's auc: 1	valid_0's binary_logloss: 2.68208e-10
[LightGBM] [Info] Number of positive: 1, number of negative: 183152
[LightGBM] [Debug] Dataset::GetMultiBinFromAllFeatures: sparse rate 0.042863
[LightGBM] [Debug] init for col-wise cost 0.000043 seconds, init for row-wise cost 0.008055 seconds
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4028
[LightGBM] [Info] Number of data points in the train set: 183153, number of used features: 23
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.000005 -> initscore=-12.118072
[LightGBM] [Info] Start training from score -12.118072
[LightGBM] [Debug] Re-bagging, using 128374 data to train
[LightGBM] [Debug] Trained a tree with leaves = 1 and depth = 1
[1]	v

[LightGBM] [Debug] Re-bagging, using 128229 data to train
[LightGBM] [Debug] Trained a tree with leaves = 1 and depth = 1
[46]	valid_0's auc: 0.5	valid_0's binary_logloss: 0.00106001
[LightGBM] [Debug] Re-bagging, using 128169 data to train
[LightGBM] [Debug] Trained a tree with leaves = 1 and depth = 1
[47]	valid_0's auc: 0.5	valid_0's binary_logloss: 0.00106001
[LightGBM] [Debug] Re-bagging, using 127823 data to train
[LightGBM] [Debug] Trained a tree with leaves = 1 and depth = 1
[48]	valid_0's auc: 0.5	valid_0's binary_logloss: 0.00106001
[LightGBM] [Debug] Re-bagging, using 128322 data to train
[LightGBM] [Debug] Trained a tree with leaves = 1 and depth = 1
[49]	valid_0's auc: 0.5	valid_0's binary_logloss: 0.00106001
[LightGBM] [Debug] Re-bagging, using 128217 data to train
[LightGBM] [Debug] Trained a tree with leaves = 1 and depth = 1
[50]	valid_0's auc: 0.5	valid_0's binary_logloss: 0.00106001
[LightGBM] [Debug] Re-bagging, using 127957 data to train
[LightGBM] [Debug] Trained 

[47]	valid_0's auc: 1	valid_0's binary_logloss: 2.68229e-10
[LightGBM] [Debug] Re-bagging, using 127851 data to train
[LightGBM] [Debug] Trained a tree with leaves = 1 and depth = 1
[48]	valid_0's auc: 1	valid_0's binary_logloss: 2.68229e-10
[LightGBM] [Debug] Re-bagging, using 128322 data to train
[LightGBM] [Debug] Trained a tree with leaves = 1 and depth = 1
[49]	valid_0's auc: 1	valid_0's binary_logloss: 2.68229e-10
[LightGBM] [Debug] Re-bagging, using 128167 data to train
[LightGBM] [Debug] Trained a tree with leaves = 1 and depth = 1
[50]	valid_0's auc: 1	valid_0's binary_logloss: 2.68229e-10
[LightGBM] [Debug] Re-bagging, using 128020 data to train
[LightGBM] [Debug] Trained a tree with leaves = 1 and depth = 1
[51]	valid_0's auc: 1	valid_0's binary_logloss: 2.68229e-10


In [30]:
if offline:
    all_data = pd.read_csv('./train_click_log.csv')
else:
    trn_data = pd.read_csv('./train_click_log.csv')
    tst_data = pd.read_csv('./testA_click_log.csv')
    all_data = trn_data.append(tst_data)


In [31]:
hist_click =all_data[['user_id', 'click_article_id']].groupby('user_id').agg({list}).reset_index()
his_behavior_df = pd.DataFrame()
his_behavior_df['user_id'] = hist_click['user_id']
his_behavior_df['hist_click_article_id'] = hist_click['click_article_id']


In [32]:
trn_user_item_feats_df_din_model = trn_user_item_feats_df.copy()

if offline:
    val_user_item_feats_df_din_model = val_user_item_feats_df.copy()
else: 
    val_user_item_feats_df_din_model = None
    
tst_user_item_feats_df_din_model = tst_user_item_feats_df.copy()


In [40]:
trn_user_item_feats_df_din_model = trn_user_item_feats_df_din_model.merge(his_behavior_df, on='user_id')

if offline:
    val_user_item_feats_df_din_model = val_user_item_feats_df_din_model.merge(his_behavior_df, on='user_id')
else:
    val_user_item_feats_df_din_model = None

tst_user_item_feats_df_din_model = tst_user_item_feats_df_din_model.merge(his_behavior_df, on='user_id')

In [33]:
# 导入deepctr
from deepctr.models import DIN
from deepctr.feature_column import SparseFeat, VarLenSparseFeat, DenseFeat, get_feature_names
from tensorflow.keras.preprocessing.sequence import pad_sequences

from tensorflow.keras import backend as K
from tensorflow.keras.layers import *
from tensorflow.keras.models import *
from tensorflow.keras.callbacks import * 
import tensorflow as tf

import os
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "2"


In [56]:
# 数据准备函数
def get_din_feats_columns(df, dense_fea, sparse_fea, behavior_fea, his_behavior_fea, emb_dim=32, max_len=100):
    """
    数据准备函数:
    df: 数据集
    dense_fea: 数值型特征列
    sparse_fea: 离散型特征列
    behavior_fea: 用户的候选行为特征列
    his_behavior_fea: 用户的历史行为特征列
    embedding_dim: embedding的维度， 这里为了简单， 统一把离散型特征列采用一样的隐向量维度
    max_len: 用户序列的最大长度
    """
    
    for feat in sparse_fea:
        vocabulary_size=df[feat].nunique() 
        print(feat)
        print(vocabulary_size)
    sparse_feature_columns = [SparseFeat(feat, vocabulary_size=df[feat].nunique() + 2, embedding_dim=emb_dim) for feat in sparse_fea]
    
    dense_feature_columns = [DenseFeat(feat, 1, ) for feat in dense_fea]
    
    var_feature_columns = [VarLenSparseFeat(SparseFeat(feat, vocabulary_size=df['click_article_id'].nunique() + 1,
                                    embedding_dim=emb_dim, embedding_name='click_article_id'), maxlen=max_len) for feat in hist_behavior_fea]
    
    dnn_feature_columns = sparse_feature_columns + dense_feature_columns + var_feature_columns
    
    # 建立x, x是一个字典的形式
    x = {}
    for name in get_feature_names(dnn_feature_columns):
        if name in his_behavior_fea:
            # 这是历史行为序列
            his_list = [l for l in df[name]]
            x[name] = pad_sequences(his_list, maxlen=max_len, padding='post')      # 二维数组
        else:
            x[name] = df[name].values
    
    return x, dnn_feature_columns


In [57]:
# 把特征分开
sparse_fea = ['user_id', 'click_article_id', 'category_id', 'click_environment', 'click_deviceGroup', 
              'click_os', 'click_country', 'click_region', 'click_referrer_type', 'is_cat_hab']

behavior_fea = ['click_article_id']

hist_behavior_fea = ['hist_click_article_id']

dense_fea = ['sim0', 'time_diff0', 'word_diff0', 'sim_max', 'sim_min', 'sim_sum', 'sim_mean', 'score',
             'rank','click_size','time_diff_mean','active_level','user_time_hob1','user_time_hob2',
             'words_hbo','words_count']


In [58]:
# dense特征进行归一化, 神经网络训练都需要将数值进行归一化处理
mm = MinMaxScaler()

# 下面是做一些特殊处理，当在其他的地方出现无效值的时候，不处理无法进行归一化，刚开始可以先把他注释掉，在运行了下面的代码
# 之后如果发现报错，应该先去想办法处理如何不出现inf之类的值
# trn_user_item_feats_df_din_model.replace([np.inf, -np.inf], 0, inplace=True)
# tst_user_item_feats_df_din_model.replace([np.inf, -np.inf], 0, inplace=True)

for feat in dense_fea:
    trn_user_item_feats_df_din_model[feat] = mm.fit_transform(trn_user_item_feats_df_din_model[[feat]])
    
    if val_user_item_feats_df_din_model is not None:
        val_user_item_feats_df_din_model[feat] = mm.fit_transform(val_user_item_feats_df_din_model[[feat]])
    
    tst_user_item_feats_df_din_model[feat] = mm.fit_transform(tst_user_item_feats_df_din_model[[feat]])


In [61]:
# 准备训练数据
x_trn, dnn_feature_columns = get_din_feats_columns(trn_user_item_feats_df_din_model, dense_fea, 
                                               sparse_fea, behavior_fea, hist_behavior_fea, max_len=50)
y_trn = trn_user_item_feats_df_din_model['label'].values

# if offline:
#     # 准备验证数据
#     x_val, dnn_feature_columns = get_din_feats_columns(val_user_item_feats_df_din_model, dense_fea, 
#                                                    sparse_fea, behavior_fea, hist_behavior_fea, max_len=50)
#     y_val = val_user_item_feats_df_din_model['label'].values
    
# dense_fea = [x for x in dense_fea if x != 'label']
# x_tst, dnn_feature_columns = get_din_feats_columns(tst_user_item_feats_df_din_model, dense_fea, 
#                                                sparse_fea, behavior_fea, hist_behavior_fea, max_len=50)


user_id
200000
click_article_id
31517
category_id
44
click_environment
3
click_deviceGroup
4
click_os
8
click_country
11
click_region
28
click_referrer_type
7
is_cat_hab
1
user_id
50000
click_article_id
16764
category_id
40
click_environment
3
click_deviceGroup
4
click_os
8
click_country
11
click_region
28
click_referrer_type
7
is_cat_hab
1


In [62]:
# 建立模型
model = DIN(dnn_feature_columns, behavior_fea)

# 查看模型结构
model.summary()

# 模型编译
model.compile('adam', 'binary_crossentropy',metrics=['binary_crossentropy', tf.keras.metrics.AUC()])


Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
user_id (InputLayer)            [(None, 1)]          0                                            
__________________________________________________________________________________________________
click_article_id (InputLayer)   [(None, 1)]          0                                            
__________________________________________________________________________________________________
category_id (InputLayer)        [(None, 1)]          0                                            
__________________________________________________________________________________________________
click_environment (InputLayer)  [(None, 1)]          0                                            
____________________________________________________________________________________________

In [63]:
# 模型训练
if offline:
    history = model.fit(x_trn, y_trn, verbose=1, epochs=10, validation_data=(x_val, y_val) , batch_size=256)
else:
    # 也可以使用上面的语句用自己采样出来的验证集
    # history = model.fit(x_trn, y_trn, verbose=1, epochs=3, validation_split=0.3, batch_size=256)
    history = model.fit(x_trn, y_trn, verbose=1, epochs=2, batch_size=256)


Epoch 1/2


InvalidArgumentError:  indices[1,0] = 17 is not in [0, 10)
	 [[node model_2/sparse_emb_click_os/embedding_lookup (defined at var/folders/rj/kh58t7hj5s99k8krgn4brl1c0000gn/T/ipykernel_18352/492187509.py:7) ]] [Op:__inference_train_function_14845]

Errors may have originated from an input operation.
Input Source operations connected to node model_2/sparse_emb_click_os/embedding_lookup:
 model_2/sparse_emb_click_os/embedding_lookup/13084 (defined at Users/lhc456/opt/anaconda3/lib/python3.9/contextlib.py:119)

Function call stack:
train_function
