# 数据预处理

In [1]:
import numpy as np
import pandas as pd
import pickle
from tqdm import tqdm
import gc, os
import time
from datetime import datetime
import lightgbm as lgb
from sklearn.preprocessing import MinMaxScaler
import warnings
warnings.filterwarnings('ignore')

## 读取特征

In [2]:
data_path = './data/'
save_path = './temp_results/'
offline = False

In [3]:
# 重新读取数据的时候，发现click_article_id是一个浮点数，所以将其转换成int类型
trn_user_item_feats_df = pd.read_csv(save_path + 'trn_user_item_feats_df.csv')
trn_user_item_feats_df['click_article_id'] = trn_user_item_feats_df['click_article_id'].astype(int)

if offline:
    val_user_item_feats_df = pd.read_csv(save_path + 'val_user_item_feats_df.csv')
    val_user_item_feats_df['click_article_id'] = val_user_item_feats_df['click_article_id'].astype(int)
else:
    val_user_item_feats_df = None
    
tst_user_item_feats_df = pd.read_csv(save_path + 'tst_user_item_feats_df.csv')
tst_user_item_feats_df['click_article_id'] = tst_user_item_feats_df['click_article_id'].astype(int)

# 做特征的时候为了方便，给测试集也打上了一个无效的标签，这里直接删掉就行
del tst_user_item_feats_df['label']
    

In [4]:
trn_user_item_feats_df

Unnamed: 0,user_id,click_article_id,sim0,time_diff0,word_diff0,sim_max,sim_min,sim_sum,sim_mean,score,...,click_country,click_region,click_referrer_type,user_time_hob1,user_time_hob2,word_hbo,category_id,created_at_ts,words_count,is_cat_hab
0,0,191890,0.215748,1603305000,80,0.215748,0.215748,0.215748,0.215748,0.996221,...,1,25,2,0.343715,0.992865,266.000000,309,1506581786000,242,0
1,11,191890,0.068161,1600533000,54,0.068161,0.068161,0.068161,0.068161,0.996075,...,1,25,2,0.343551,0.992781,200.000000,309,1506581786000,242,0
2,31,191890,-0.023481,1595980000,28,-0.023481,-0.023481,-0.023481,-0.023481,0.995851,...,1,25,1,0.343456,0.992715,218.000000,309,1506581786000,242,0
3,86,191890,0.263226,1599786000,30,0.263226,0.263226,0.263226,0.263226,0.996370,...,1,25,2,0.343011,0.992780,213.500000,309,1506581786000,242,0
4,94,191890,-0.002702,1605934000,2,-0.002702,-0.002702,-0.002702,-0.002702,0.995544,...,1,25,2,0.342910,0.992766,244.500000,309,1506581786000,242,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
180281,190410,255667,0.248752,12916156000,66,0.248752,0.248752,0.248752,0.248752,0.878177,...,1,21,7,0.024216,0.968344,206.333333,389,1507532356000,143,0
180282,190814,223086,0.142165,49740944000,172,0.142165,0.142165,0.142165,0.142165,0.949023,...,1,25,5,0.251930,0.959082,189.375000,354,1457628889000,275,0
180283,191418,337052,0.359244,166246000,12,0.359244,0.359244,0.359244,0.359244,0.985636,...,1,22,5,0.130341,0.969127,240.000000,437,1507731534000,182,0
180284,193649,99177,0.637963,7058577000,7,0.637963,0.637963,0.637963,0.637963,0.973198,...,1,21,1,0.198660,0.950548,82.975000,223,1507297762000,41,0


## 评估

In [5]:
def submit(recall_df, topk=5, model_name=None):
    """变换为提交的格式"""
    
    recall_df = rank_results.sort_values(by=["user_id", "pred_score"])
    # 排序（由大到小）
    recall_df["rank"] = recall_df.groupby("user_id")["pred_score"].rank(ascending=False, method="first")
    # 为每一个用户推荐5篇新闻文章
    tmp = recall_df.groupby("user_id").apply(lambda x:x["rank"].max())

    assert tmp.min() > topk

    # 获取排名前五的文章
    submit = recall_df[recall_df["rank"] <=topk].set_index(["user_id", "rank"]).unstack(-1).reset_index()
    submit.columns = [int(col) if isinstance(col, int) else col for col in submit.columns.droplevel(0)]
    submit = submit.rename(columns={'': 'user_id', 1: 'article_1', 2: 'article_2', 
                                                  3: 'article_3', 4: 'article_4', 5: 'article_5'})

    save_name = save_path + model_name + "_" + datetime.today().strftime("%m-%d") + ".csv"
    submit.to_csv(save_name, index=False, header=True)

In [6]:
# 排序结果归一化
def norm_sim(sim_df, weight=0.0):

    min_sim = sim_df.min()
    max_sim = sim_df.max()
    if max_sim == min_sim:
        sim_df = sim_df.apply(lambda sim: 1.0)
    else:
        sim_df = sim_df.apply(lambda sim: 1.0 * (sim - min_sim) / (max_sim - min_sim))

    sim_df = sim_df.apply(lambda sim: sim + weight)  # plus one
    return sim_df

# 传统机器学习模型

## LGB排序模型

In [7]:
# rank数据
trn_user_item_feats_df_rank_model = trn_user_item_feats_df.copy()

if offline:
    val_user_item_feats_df_rank_model = val_user_item_feats_df.copy()

tst_user_item_feats_df_rank_model = tst_user_item_feats_df.copy()

In [8]:
trn_user_item_feats_df_rank_model.head()

Unnamed: 0,user_id,click_article_id,sim0,time_diff0,word_diff0,sim_max,sim_min,sim_sum,sim_mean,score,...,click_country,click_region,click_referrer_type,user_time_hob1,user_time_hob2,word_hbo,category_id,created_at_ts,words_count,is_cat_hab
0,0,191890,0.215748,1603305000,80,0.215748,0.215748,0.215748,0.215748,0.996221,...,1,25,2,0.343715,0.992865,266.0,309,1506581786000,242,0
1,11,191890,0.068161,1600533000,54,0.068161,0.068161,0.068161,0.068161,0.996075,...,1,25,2,0.343551,0.992781,200.0,309,1506581786000,242,0
2,31,191890,-0.023481,1595980000,28,-0.023481,-0.023481,-0.023481,-0.023481,0.995851,...,1,25,1,0.343456,0.992715,218.0,309,1506581786000,242,0
3,86,191890,0.263226,1599786000,30,0.263226,0.263226,0.263226,0.263226,0.99637,...,1,25,2,0.343011,0.99278,213.5,309,1506581786000,242,0
4,94,191890,-0.002702,1605934000,2,-0.002702,-0.002702,-0.002702,-0.002702,0.995544,...,1,25,2,0.34291,0.992766,244.5,309,1506581786000,242,0


In [9]:
# 定义特征列
lgb_cols = ['sim0', 'time_diff0', 'word_diff0','sim_max', 'sim_min', 'sim_sum', 
            'sim_mean', 'score','click_size', 'time_diff_mean', 'active_level',
            'click_environment','click_deviceGroup', 'click_os', 'click_country', 
            'click_region','click_referrer_type', 'user_time_hob1', 'user_time_hob2',
            'word_hbo', 'category_id', 'created_at_ts','words_count']

In [10]:
# 先排序
trn_user_item_feats_df_rank_model.sort_values(by="user_id", inplace=True)
# 排序模型独有，优化只在所在空间内
g_train = trn_user_item_feats_df_rank_model.groupby(["user_id"], as_index=False).count()["label"].values

if offline:
    val_user_item_feats_df_rank_model.sort_values(by="user_id", inplace=True)
    g_val = val_user_item_feats_df_rank_model.groupby(["user_id"], as_index=False).count()["label"].values

In [11]:
# 模型定义
lgb_rank = lgb.LGBMRanker(n_estimators=300, n_jobs=16)

In [12]:
# Training Model
if offline:
    lgb_ranker.fit(trn_user_item_feats_df_rank_model[lgb_cols], trn_user_item_feats_df_rank_model['label'], group=g_train,
                eval_set=[(val_user_item_feats_df_rank_model[lgb_cols], val_user_item_feats_df_rank_model['label'])], 
                eval_group= [g_val], eval_at=[1, 2, 3, 4, 5], eval_metric=['ndcg', ], early_stopping_rounds=50, )
else:
    lgb_rank.fit(trn_user_item_feats_df_rank_model[lgb_cols], trn_user_item_feats_df_rank_model["label"], group=g_train)

In [13]:
lgb_rank.best_iteration_

In [14]:
tst_user_item_feats_df_rank_model["pred_score"] = lgb_rank.predict(tst_user_item_feats_df_rank_model[lgb_cols], num_iteration=lgb_rank.best_iteration_)
tst_user_item_feats_df_rank_model[["user_id", "click_article_id", "pred_score"]].to_csv(save_path + "lgb_ranker_score.csv", index=False)

In [15]:
rank_results = tst_user_item_feats_df_rank_model[["user_id", "click_article_id", "pred_score"]]
rank_results["click_article_id"] = rank_results["click_article_id"].astype("int")
rank_results.head()

Unnamed: 0,user_id,click_article_id,pred_score
0,200000,123938,-4.050364
1,200001,123938,2.547849
2,200002,123938,-4.093964
3,200004,123938,3.312659
4,200006,123938,-4.059014


In [16]:
submit(rank_results, model_name="lgb_ranker")

In [17]:
def get_kfold_users(trn_df, n=5):
    """对用户执行五折交叉验证"""
    
    user_ids = trn_df.user_id.unique()
    user_set = [user_ids[i::n] for i in range(n)]
    
    return user_set

In [18]:
# 五折交叉验证模型
k_fold = 5
trn_df = trn_user_item_feats_df_rank_model
user_set = get_kfold_users(trn_df, n=k_fold)

sore_list = []
score_df = trn_df[["user_id", "click_article_id", "label"]]
sub_preds = np.zeros(tst_user_item_feats_df_rank_model.shape[0])

for n_fold, valid_user in enumerate(user_set):
    print("正在执行第{}折交叉验证......".format(n_fold+1))
    # 五折划分数据集
    train_idx = trn_df[~trn_df["user_id"].isin(valid_user)]
    valid_idx = trn_df[trn_df["user_id"].isin(valid_user)]
    
    # 训练集和验证集进行分组构建，利于排序
    train_idx.sort_values("user_id", inplace=True)
    g_train = train_idx.groupby("user_id", as_index=False).count()['label'].values
    
    valid_idx.sort_values("user_id", inplace=True)
    g_val = valid_idx.groupby("user_id", as_index=False).count()['label'].values
    
    lgb_ranker = lgb.LGBMRanker(boosting_type='gbdt', num_leaves=31, reg_alpha=0.0, reg_lambda=1,
                        max_depth=-1, n_estimators=100, subsample=0.7, colsample_bytree=0.7, subsample_freq=1,
                        learning_rate=0.01, min_child_weight=50, random_state=2018, n_jobs= 16)
    
    lgb_ranker.fit(train_idx[lgb_cols], train_idx['label'], group=g_train,
                   eval_set=[(valid_idx[lgb_cols], valid_idx['label'])], eval_group= [g_val], 
                   eval_at=[1, 2, 3, 4, 5], eval_metric=['ndcg', ], early_stopping_rounds=50, )
    
    # 预测验证集结果
    valid_idx['pred_score'] = lgb_ranker.predict(valid_idx[lgb_cols], num_iteration=lgb_ranker.best_iteration_)
    
    # 归一化
    valid_idx['pred_score'] = valid_idx[['pred_score']].transform(lambda x: norm_sim(x))
    valid_idx.sort_values(by=["user_id", "pred_score"], inplace=True)
    
    # rank
    valid_idx["pred_rank"] = valid_idx.groupby("user_id")["pred_score"].rank(ascending=False, method="first")
    sore_list.append(valid_idx[["user_id", "click_article_id", "pred_score", "pred_rank"]])
    
    # 线上验证,五折求平均
    if not offline:
        sub_preds += lgb_ranker.predict(tst_user_item_feats_df_rank_model[lgb_cols], lgb_ranker.best_iteration_)

正在执行第1折交叉验证......
[1]	valid_0's ndcg@1: 0.999917	valid_0's ndcg@2: 0.999969	valid_0's ndcg@3: 0.999969	valid_0's ndcg@4: 0.999969	valid_0's ndcg@5: 0.999969
Training until validation scores don't improve for 50 rounds
[2]	valid_0's ndcg@1: 0.999917	valid_0's ndcg@2: 0.999969	valid_0's ndcg@3: 0.999969	valid_0's ndcg@4: 0.999969	valid_0's ndcg@5: 0.999969
[3]	valid_0's ndcg@1: 0.999917	valid_0's ndcg@2: 0.999969	valid_0's ndcg@3: 0.999969	valid_0's ndcg@4: 0.999969	valid_0's ndcg@5: 0.999969
[4]	valid_0's ndcg@1: 0.999917	valid_0's ndcg@2: 0.999969	valid_0's ndcg@3: 0.999969	valid_0's ndcg@4: 0.999969	valid_0's ndcg@5: 0.999969
[5]	valid_0's ndcg@1: 0.999917	valid_0's ndcg@2: 0.999969	valid_0's ndcg@3: 0.999969	valid_0's ndcg@4: 0.999969	valid_0's ndcg@5: 0.999969
[6]	valid_0's ndcg@1: 0.999917	valid_0's ndcg@2: 0.999969	valid_0's ndcg@3: 0.999969	valid_0's ndcg@4: 0.999969	valid_0's ndcg@5: 0.999969
[7]	valid_0's ndcg@1: 0.999917	valid_0's ndcg@2: 0.999969	valid_0's ndcg@3: 0.999969	va

In [29]:
# 拼接所有的验证集上的预测数据，构成stacking数据类型
score_df_ = pd.concat(sore_list, axis=0)
score_df = score_df.merge(score_df_, how='left', on=['user_id', 'click_article_id'])
# 保存训练集交叉验证产生的新特征
score_df[['user_id', 'click_article_id', 'pred_score', 'pred_rank', 'label']].to_csv(save_path + 'trn_lgb_ranker_feats.csv', index=False)

In [30]:
sub_preds

array([0., 0., 0., ..., 0., 0., 0.])

In [31]:
tst_user_item_feats_df_rank_model['pred_score'] = sub_preds / k_fold
tst_user_item_feats_df_rank_model['pred_score'] = tst_user_item_feats_df_rank_model['pred_score'].transform(lambda x: norm_sim(x))
tst_user_item_feats_df_rank_model.sort_values(by=['user_id', 'pred_score'])
tst_user_item_feats_df_rank_model['pred_rank'] = tst_user_item_feats_df_rank_model.groupby(['user_id'])['pred_score'].rank(ascending=False, method='first')

# 保存测试集交叉验证的新特征
tst_user_item_feats_df_rank_model[['user_id', 'click_article_id', 'pred_score', 'pred_rank']].to_csv(save_path + 'tst_lgb_ranker_feats.csv', index=False)

In [32]:
rank_results = tst_user_item_feats_df_rank_model[['user_id', 'click_article_id', 'pred_score']]
rank_results['click_article_id'] = rank_results['click_article_id'].astype(int)
submit(rank_results, topk=5, model_name='lgb_ranker')

## LGB分类模型

In [33]:
# 模型及参数的定义
lgb_Classfication = lgb.LGBMClassifier(boosting_type='gbdt', num_leaves=31, reg_alpha=0.0, reg_lambda=1,
                            max_depth=-1, n_estimators=500, subsample=0.7, colsample_bytree=0.7, subsample_freq=1,
                            learning_rate=0.01, min_child_weight=50, random_state=2018, n_jobs= 16, verbose=10)  

In [34]:
# 模型训练
if offline:
    lgb_Classfication.fit(trn_user_item_feats_df_rank_model[lgb_cols], trn_user_item_feats_df_rank_model['label'],
                    eval_set=[(val_user_item_feats_df_rank_model[lgb_cols], val_user_item_feats_df_rank_model['label'])], 
                    eval_metric=['auc', ],early_stopping_rounds=50, )
else:
    lgb_Classfication.fit(trn_user_item_feats_df_rank_model[lgb_cols], trn_user_item_feats_df_rank_model['label'])

[LightGBM] [Info] Number of positive: 21, number of negative: 180265
[LightGBM] [Debug] Dataset::GetMultiBinFromAllFeatures: sparse rate 0.042669
[LightGBM] [Debug] init for col-wise cost 0.000008 seconds, init for row-wise cost 0.008362 seconds
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3786
[LightGBM] [Info] Number of data points in the train set: 180286, number of used features: 23
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.000116 -> initscore=-9.057661
[LightGBM] [Info] Start training from score -9.057661
[LightGBM] [Debug] Re-bagging, using 126322 data to train
[LightGBM] [Debug] Trained a tree with leaves = 1 and max_depth = 1
[LightGBM] [Debug] Re-bagging, using 126119 data to train
[LightGBM] [Debug] Trained a tree with leaves = 1 and max_depth = 1
[LightGBM] [Debug] Re-bagging, using 125822 data to train
[LightGBM] [Debug] Trained a tree with leaves = 1 and max_depth = 1
[LightGBM] [Debug] Re-bagging, using 126015 data to trai

In [35]:
tst_user_item_feats_df['pred_score'] = lgb_Classfication.predict_proba(tst_user_item_feats_df[lgb_cols])[:, 1]

In [36]:
# 将这里的排序结果保存一份，用户后面的模型融合
tst_user_item_feats_df[['user_id', 'click_article_id', 'pred_score']].to_csv(save_path + 'lgb_cls_score.csv', index=False)

In [37]:
# 预测结果重新排序, 及生成提交结果
rank_results = tst_user_item_feats_df[['user_id', 'click_article_id', 'pred_score']]
rank_results['click_article_id'] = rank_results['click_article_id'].astype(int)
submit(rank_results, topk=5, model_name='lgb_cls')

In [38]:
# 五折交叉验证，这里的五折交叉是以用户为目标进行五折划分
#  这一部分与前面的单独训练和验证是分开的
def get_kfold_users(trn_df, n=5):
    user_ids = trn_df['user_id'].unique()
    user_set = [user_ids[i::n] for i in range(n)]
    return user_set

k_fold = 5
trn_df = trn_user_item_feats_df_rank_model
user_set = get_kfold_users(trn_df, n=k_fold)

score_list = []
score_df = trn_df[['user_id', 'click_article_id', 'label']]
sub_preds = np.zeros(tst_user_item_feats_df_rank_model.shape[0])

# 五折交叉验证，并将中间结果保存用于staking
for n_fold, valid_user in enumerate(user_set):
    print("正在执行第{}折交叉验证......".format(n_fold+1))
    train_idx = trn_df[~trn_df['user_id'].isin(valid_user)] # add slide user
    valid_idx = trn_df[trn_df['user_id'].isin(valid_user)]
    
    # 模型及参数的定义
    lgb_Classfication = lgb.LGBMClassifier(boosting_type='gbdt', num_leaves=31, reg_alpha=0.0, reg_lambda=1,
                            max_depth=-1, n_estimators=100, subsample=0.7, colsample_bytree=0.7, subsample_freq=1,
                            learning_rate=0.01, min_child_weight=50, random_state=2018, n_jobs= 16, verbose=10)  
    # 训练模型
    lgb_Classfication.fit(train_idx[lgb_cols], train_idx['label'],eval_set=[(valid_idx[lgb_cols], valid_idx['label'])], 
                          eval_metric=['auc', ],early_stopping_rounds=50, )
    
    # 预测验证集结果
    valid_idx['pred_score'] = lgb_Classfication.predict_proba(valid_idx[lgb_cols], 
                                                              num_iteration=lgb_Classfication.best_iteration_)[:,1]
    
    # 对输出结果进行归一化 分类模型输出的值本身就是一个概率值不需要进行归一化
    # valid_idx['pred_score'] = valid_idx[['pred_score']].transform(lambda x: norm_sim(x))
    
    valid_idx.sort_values(by=['user_id', 'pred_score'])
    valid_idx['pred_rank'] = valid_idx.groupby(['user_id'])['pred_score'].rank(ascending=False, method='first')
    
    # 将验证集的预测结果放到一个列表中，后面进行拼接
    score_list.append(valid_idx[['user_id', 'click_article_id', 'pred_score', 'pred_rank']])
    
    # 如果是线上测试，需要计算每次交叉验证的结果相加，最后求平均
    if not offline:
        sub_preds += lgb_Classfication.predict_proba(tst_user_item_feats_df_rank_model[lgb_cols], 
                                                     num_iteration=lgb_Classfication.best_iteration_)[:,1]
    
score_df_ = pd.concat(score_list, axis=0)
score_df = score_df.merge(score_df_, how='left', on=['user_id', 'click_article_id'])
# 保存训练集交叉验证产生的新特征
score_df[['user_id', 'click_article_id', 'pred_score', 'pred_rank', 'label']].to_csv(save_path + 'trn_lgb_cls_feats.csv', index=False)
    
# 测试集的预测结果，多次交叉验证求平均,将预测的score和对应的rank特征保存，可以用于后面的staking，这里还可以构造其他更多的特征
tst_user_item_feats_df_rank_model['pred_score'] = sub_preds / k_fold
tst_user_item_feats_df_rank_model['pred_score'] = tst_user_item_feats_df_rank_model['pred_score'].transform(lambda x: norm_sim(x))
tst_user_item_feats_df_rank_model.sort_values(by=['user_id', 'pred_score'])
tst_user_item_feats_df_rank_model['pred_rank'] = tst_user_item_feats_df_rank_model.groupby(['user_id'])['pred_score'].rank(ascending=False, method='first')

# 保存测试集交叉验证的新特征
tst_user_item_feats_df_rank_model[['user_id', 'click_article_id', 'pred_score', 'pred_rank']].to_csv(save_path + 'tst_lgb_cls_feats.csv', index=False)

正在执行第1折交叉验证......
[LightGBM] [Info] Number of positive: 16, number of negative: 144216
[LightGBM] [Debug] Dataset::GetMultiBinFromAllFeatures: sparse rate 0.042686
[LightGBM] [Debug] init for col-wise cost 0.000009 seconds, init for row-wise cost 0.006024 seconds
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Debug] Using Dense Multi-Val Bin
[LightGBM] [Info] Total Bins 3770
[LightGBM] [Info] Number of data points in the train set: 144232, number of used features: 23
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.000111 -> initscore=-9.106479
[LightGBM] [Info] Start training from score -9.106479
[LightGBM] [Debug] Re-bagging, using 100874 data to train
[LightGBM] [Debug] Trained a tree with leaves = 1 and max_depth = 1
[1]	valid_0's auc: 0.5	valid_0's binary_logloss: 0.0025258
Training until validation scores don't improve for 50 rounds
[LightGBM] [Debug] Re-bagging, using 101017 data to train
[Li

In [39]:
tst_user_item_feats_df_rank_model[['user_id', 'click_article_id', 'pred_score', 'pred_rank']]

Unnamed: 0,user_id,click_article_id,pred_score,pred_rank
0,200000,123938,1.0,1.0
1,200001,123938,1.0,1.0
2,200002,123938,1.0,1.0
3,200004,123938,1.0,1.0
4,200006,123938,1.0,1.0
...,...,...,...,...
949995,247562,304976,1.0,19.0
949996,240259,255640,1.0,19.0
949997,241876,324987,1.0,19.0
949998,244133,272312,1.0,19.0


# 深度学习模型

## DIN模型

### 数据预处理

In [22]:
# 通过用户日志数据，统计历史点击情况
if offline:
    all_data = pd.read_csv("./data/train_click_log.csv")
else:
    trn_data = pd.read_csv("./data/train_click_log.csv")
    tst_data = pd.read_csv("./data/testA_click_log.csv")
    all_data = trn_data.append(tst_data)

In [23]:
all_data["click_article_id"].values.max()

364046

In [24]:
all_data["click_article_id"].nunique()

35380

In [25]:
all_data["click_article_id"] = all_data["click_article_id"].rank(method="dense").astype("int")

In [26]:
# 每一位用户的历史点击文章，作为DIN的输入
hist_click_article = all_data[["user_id", "click_article_id"]].groupby("user_id").agg({list}).reset_index()
hist_click_article.head()

Unnamed: 0_level_0,user_id,click_article_id
Unnamed: 0_level_1,Unnamed: 1_level_1,list
0,0,"[2343, 16105]"
1,1,"[29121, 6678]"
2,2,"[3222, 17493]"
3,3,"[5196, 3222]"
4,4,"[4359, 3789]"


In [27]:
hist_behavior_df = pd.DataFrame()
hist_behavior_df["user_id"] = hist_click_article["user_id"]
hist_behavior_df["hist_click_article_id"] = hist_click_article["click_article_id"]

In [28]:
hist_behavior_df.head()

Unnamed: 0,user_id,hist_click_article_id
0,0,"[2343, 16105]"
1,1,"[29121, 6678]"
2,2,"[3222, 17493]"
3,3,"[5196, 3222]"
4,4,"[4359, 3789]"


In [29]:
# 统计用户的历史点击文章的数量
click_num = hist_behavior_df["hist_click_article_id"].apply(lambda x: len(x))
click_num.values.shape

(250000,)

In [30]:
# copy数据
trn_user_item_feats_df_din_model = trn_user_item_feats_df.copy()

if offline:
    val_user_item_feats_df_din_model = val_user_item_feats_df.copy()
else: 
    val_user_item_feats_df_din_model = None
    
tst_user_item_feats_df_din_model = tst_user_item_feats_df.copy()

In [31]:
# 为每一个用户制作一个用户历史特征数据
trn_user_item_feats_df_din_model = trn_user_item_feats_df_din_model.merge(hist_behavior_df, on='user_id')

if offline:
    val_user_item_feats_df_din_model = val_user_item_feats_df_din_model.merge(hist_behavior_df, on='user_id')
else:
    val_user_item_feats_df_din_model = None

tst_user_item_feats_df_din_model = tst_user_item_feats_df_din_model.merge(hist_behavior_df, on='user_id')

In [32]:
# 处理得到用户的真实点击次数

trn_user_item_feats_df_din_model["length_name"]  = trn_user_item_feats_df_din_model["hist_click_article_id"].apply(lambda x: len(x))
trn_user_item_feats_df_din_model["length_name"]

0          2
1          2
2          2
3          2
4          2
          ..
180281     8
180282     8
180283     9
180284     6
180285    40
Name: length_name, Length: 180286, dtype: int64

In [33]:
trn_user_item_feats_df_din_model.columns

Index(['user_id', 'click_article_id', 'sim0', 'time_diff0', 'word_diff0',
       'sim_max', 'sim_min', 'sim_sum', 'sim_mean', 'score', 'rank', 'label',
       'click_size', 'time_diff_mean', 'active_level', 'click_environment',
       'click_deviceGroup', 'click_os', 'click_country', 'click_region',
       'click_referrer_type', 'user_time_hob1', 'user_time_hob2', 'word_hbo',
       'category_id', 'created_at_ts', 'words_count', 'is_cat_hab',
       'hist_click_article_id', 'length_name'],
      dtype='object')

In [34]:
# pytorch
# from deepctr_torch.models import DIN
# from deepctr_torch.inputs import SparseFeat, VarLenSparseFeat, get_feature_names, DenseFeat
# from tensorflow.keras.preprocessing.sequence import pad_sequences

In [35]:
from deepctr.models import DIN
from deepctr.feature_column import SparseFeat, VarLenSparseFeat, DenseFeat, get_feature_names
from tensorflow.keras.preprocessing.sequence import pad_sequences

from tensorflow.keras import backend as K
from tensorflow.keras.layers import *
from tensorflow.keras.models import *
from tensorflow.keras.callbacks import * 
import tensorflow as tf

In [36]:
# 把特征分开
sparse_fea = ['user_id', 'click_article_id', 'category_id', 'click_environment', 'click_deviceGroup', 
              'click_os', 'click_country', 'click_region', 'click_referrer_type', 'is_cat_hab']

# 行为特征
behavior_fea = ['click_article_id']

# 历史兴趣
hist_behavior_fea = ['hist_click_article_id']

dense_fea = ['sim0', 'time_diff0', 'word_diff0', 'sim_max', 'sim_min', 'sim_sum', 'sim_mean', 'score',
             'rank','click_size','time_diff_mean','active_level','user_time_hob1','user_time_hob2',
             'word_hbo','words_count']

In [37]:
df = trn_user_item_feats_df_din_model

# 检查数据的输入维度是否匹配

for feat in sparse_fea:
    max_index = df[feat].nunique()+1
    feat_un = df[feat].unique()
    raw2idx = {}
    
    if len(df[feat]) == df[feat].apply(lambda x: 0 <= x < max_index).sum():
        continue
    else:
        print(feat + "：维度不匹配, 其维度数据目{}， 最大索引{}，最小索引{}".format(max_index, feat_un.max(), feat_un.min()))
        
        tmp = dict(zip(df[feat].values, df[feat].rank(method="dense").astype("int").values))
        # 编码索引
        raw2idx[feat] = tmp
        df[feat] = df[feat].rank(method="dense").astype("int")

user_id：维度不匹配, 其维度数据目180001， 最大索引199999，最小索引0
click_article_id：维度不匹配, 其维度数据目214， 最大索引360826，最小索引12185
category_id：维度不匹配, 其维度数据目85， 最大索引455，最小索引7
click_environment：维度不匹配, 其维度数据目4， 最大索引4，最小索引1
click_deviceGroup：维度不匹配, 其维度数据目5， 最大索引5，最小索引1
click_os：维度不匹配, 其维度数据目9， 最大索引20，最小索引2


In [38]:
trn_user_item_feats_df_din_model.head()

Unnamed: 0,user_id,click_article_id,sim0,time_diff0,word_diff0,sim_max,sim_min,sim_sum,sim_mean,score,...,click_referrer_type,user_time_hob1,user_time_hob2,word_hbo,category_id,created_at_ts,words_count,is_cat_hab,hist_click_article_id,length_name
0,1,108,0.215748,1603305000,80,0.215748,0.215748,0.215748,0.215748,0.996221,...,2,0.343715,0.992865,266.0,47,1506581786000,242,0,"[2343, 16105]",2
1,11,108,0.068161,1600533000,54,0.068161,0.068161,0.068161,0.068161,0.996075,...,2,0.343551,0.992781,200.0,47,1506581786000,242,0,"[5196, 23679]",2
2,29,108,-0.023481,1595980000,28,-0.023481,-0.023481,-0.023481,-0.023481,0.995851,...,1,0.343456,0.992715,218.0,47,1506581786000,242,0,"[15942, 16656]",2
3,77,108,0.263226,1599786000,30,0.263226,0.263226,0.263226,0.263226,0.99637,...,2,0.343011,0.99278,213.5,47,1506581786000,242,0,"[23679, 1398]",2
4,83,108,-0.002702,1605934000,2,-0.002702,-0.002702,-0.002702,-0.002702,0.995544,...,2,0.34291,0.992766,244.5,47,1506581786000,242,0,"[21611, 4850]",2


In [39]:
def get_din_feats_columns(df, dense_fea, sparse_fea, behavior_fea, hist_behavior_fea, emb_dim=32, max_len=100):
    """
    数据准备函数:
    df: 数据集
    dense_fea: 数值型特征列
    sparse_fea: 离散型特征列
    behavior_fea: 用户的候选行为特征列
    his_behavior_fea: 用户的历史行为特征列
    embedding_dim: embedding的维度， 这里为了简单， 统一把离散型特征列采用一样的隐向量维度
    max_len: 用户序列的最大长度
    """
        
    # 系数特征
    
    sparse_feature_columns = [SparseFeat(name=feat, vocabulary_size=df[feat].nunique()+1, embedding_dim=emb_dim) for feat in sparse_fea]
    
    # 稠密特征
    dense_feature_columns = [DenseFeat(name=feat, dimension=1,) for feat in dense_fea]
    
    # 可变特征
    varlen_feature_columns = [VarLenSparseFeat(SparseFeat(name=feat, vocabulary_size=35380+1
                                                          , embedding_dim=emb_dim, embedding_name="click_article_id"), maxlen=max_len, length_name="length_name") for feat in hist_behavior_fea]
    
    dnn_featue_columns = sparse_feature_columns + dense_feature_columns + varlen_feature_columns
    
    # 构建数据为字典类型的格式
    x = {}
    for name in get_feature_names(dnn_featue_columns):
        if name in hist_behavior_fea:
            hist_list = [l for l in df[name]]
            x[name] = pad_sequences(hist_list, maxlen=max_len, padding="post")
        else:
            x[name] = df[name].values
    
    return x, dnn_featue_columns

In [40]:
## 数据预处理
# 稠密数据进行归一化操作
MMS = MinMaxScaler()

# 替换无群值
trn_user_item_feats_df_din_model.replace([np.inf, -np.inf], 0, inplace=True)
tst_user_item_feats_df_din_model.replace([np.inf, -np.inf], 0, inplace=True)

for feat in dense_fea:
#     print(feat)
    trn_user_item_feats_df_din_model[feat] = MMS.fit_transform(trn_user_item_feats_df_din_model[[feat]])
    if val_user_item_feats_df_din_model is not None:
        val_user_item_feats_df_din_model[feat] = MMS.fit_transform(val_user_item_feats_df_din_model[[feat]])
        
    tst_user_item_feats_df_din_model[feat] = MMS.fit_transform(tst_user_item_feats_df_din_model[[feat]])

In [41]:
# 训练:x
X_trn, dnn_feature_columns = get_din_feats_columns(trn_user_item_feats_df_din_model
                                                   , dense_fea, sparse_fea, behavior_fea
                                                   , hist_behavior_fea, max_len=50
                                                  )
# 训练:label
trn_user_item_feats_df_din_model["label"][1]= 1
Y_trn = trn_user_item_feats_df_din_model["label"].values

In [42]:
trn_user_item_feats_df_din_model["label"].nunique()

2

In [43]:
dnn_feature_columns

[SparseFeat(name='user_id', vocabulary_size=180001, embedding_dim=32, use_hash=False, dtype='int32', embeddings_initializer=<tensorflow.python.keras.initializers.initializers_v1.RandomNormal object at 0x7fa93b433d00>, embedding_name='user_id', group_name='default_group', trainable=True),
 SparseFeat(name='click_article_id', vocabulary_size=214, embedding_dim=32, use_hash=False, dtype='int32', embeddings_initializer=<tensorflow.python.keras.initializers.initializers_v1.RandomNormal object at 0x7fa93b433af0>, embedding_name='click_article_id', group_name='default_group', trainable=True),
 SparseFeat(name='category_id', vocabulary_size=85, embedding_dim=32, use_hash=False, dtype='int32', embeddings_initializer=<tensorflow.python.keras.initializers.initializers_v1.RandomNormal object at 0x7fa93b4334f0>, embedding_name='category_id', group_name='default_group', trainable=True),
 SparseFeat(name='click_environment', vocabulary_size=4, embedding_dim=32, use_hash=False, dtype='int32', embeddin

In [44]:
for feat in sparse_fea:
    print("{} voca_size: {}".format(feat, trn_user_item_feats_df_din_model[feat].nunique()+1))

user_id voca_size: 180001
click_article_id voca_size: 214
category_id voca_size: 85
click_environment voca_size: 4
click_deviceGroup voca_size: 5
click_os voca_size: 9
click_country voca_size: 12
click_region voca_size: 29
click_referrer_type voca_size: 8
is_cat_hab voca_size: 2


In [45]:
trn_user_item_feats_df_din_model[sparse_fea].describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
user_id,180286.0,89994.516685,51967.609432,1.0,44977.25,89992.5,135000.75,180000.0
click_article_id,180286.0,113.819038,67.289505,1.0,40.0,108.0,187.0,213.0
category_id,180286.0,47.202772,25.547243,1.0,18.0,47.0,73.0,84.0
click_environment,180286.0,2.968145,0.193526,1.0,3.0,3.0,3.0,3.0
click_deviceGroup,180286.0,1.486183,0.568238,1.0,1.0,1.0,2.0,4.0
click_os,180286.0,4.546776,2.58199,1.0,1.0,6.0,6.0,8.0
click_country,180286.0,1.300828,1.589765,1.0,1.0,1.0,1.0,11.0
click_region,180286.0,18.182599,7.09341,1.0,13.0,21.0,25.0,28.0
click_referrer_type,180286.0,2.069101,1.293896,1.0,1.0,2.0,2.0,7.0
is_cat_hab,180286.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [46]:
x_trn = X_trn.copy()
x_trn.pop("hist_click_article_id")

pd.DataFrame(x_trn)[sparse_fea].describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
user_id,180286.0,89994.516685,51967.609432,1.0,44977.25,89992.5,135000.75,180000.0
click_article_id,180286.0,113.819038,67.289505,1.0,40.0,108.0,187.0,213.0
category_id,180286.0,47.202772,25.547243,1.0,18.0,47.0,73.0,84.0
click_environment,180286.0,2.968145,0.193526,1.0,3.0,3.0,3.0,3.0
click_deviceGroup,180286.0,1.486183,0.568238,1.0,1.0,1.0,2.0,4.0
click_os,180286.0,4.546776,2.58199,1.0,1.0,6.0,6.0,8.0
click_country,180286.0,1.300828,1.589765,1.0,1.0,1.0,1.0,11.0
click_region,180286.0,18.182599,7.09341,1.0,13.0,21.0,25.0,28.0
click_referrer_type,180286.0,2.069101,1.293896,1.0,1.0,2.0,2.0,7.0
is_cat_hab,180286.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### 建模评估

In [47]:
# # 模型建立
# import torch
# device = 'cpu'
# use_cuda = False

# if use_cuda and torch.cuda.is_available():
#     print('cuda ready...')
#     device = 'cuda:0'
    
model = DIN(dnn_feature_columns, history_feature_list=behavior_fea, seed=2020)

The following Variables were used a Lambda layer's call (lambda), but
are not present in its tracked objects:
  <tf.Variable 'attention_sequence_pooling_layer/local_activation_unit/kernel:0' shape=(40, 1) dtype=float32>
  <tf.Variable 'attention_sequence_pooling_layer/local_activation_unit/bias:0' shape=(1,) dtype=float32>
It is possible that this is intended behavior, but it is more likely
an omission. This is a strong indication that this layer should be
formulated as a subclassed Layer rather than a Lambda layer.


The following Variables were used a Lambda layer's call (lambda), but
are not present in its tracked objects:
  <tf.Variable 'attention_sequence_pooling_layer/local_activation_unit/kernel:0' shape=(40, 1) dtype=float32>
  <tf.Variable 'attention_sequence_pooling_layer/local_activation_unit/bias:0' shape=(1,) dtype=float32>
It is possible that this is intended behavior, but it is more likely
an omission. This is a strong indication that this layer should be
formulated as a subclassed Layer rather than a Lambda layer.


In [48]:
model

<tensorflow.python.keras.engine.functional.Functional at 0x7fa9393e49a0>

In [49]:
X_trn["hist_click_article_id"]

array([[ 2343, 16105,     0, ...,     0,     0,     0],
       [ 5196, 23679,     0, ...,     0,     0,     0],
       [15942, 16656,     0, ...,     0,     0,     0],
       ...,
       [15979, 28355,  2456, ...,     0,     0,     0],
       [24211,  9215,  9214, ...,     0,     0,     0],
       [ 5846,  5224, 28702, ...,     0,     0,     0]], dtype=int32)

In [50]:
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["binary_crossentropy"])
model.fit(x=X_trn, y=Y_trn, epochs=10, batch_size=256)

Epoch 1/10
Epoch 2/10

KeyboardInterrupt: 

In [91]:
trn_user_item_feats_df_din_model["user_id"].unique()

array([     1,     11,     29, ..., 166040, 171342, 174255])

In [92]:
Y_trn

array([0., 1., 0., ..., 0., 0., 0.])

# 知识点
[torch.nn.Embedding的使用](https://blog.csdn.net/weixin_43532000/article/details/104429609?utm_medium=distribute.pc_relevant.none-task-blog-BlogCommendFromBaidu-1.control&depth_1-utm_source=distribute.pc_relevant.none-task-blog-BlogCommendFromBaidu-1.control)

In [93]:
train_data = torch.tensor([[1,1,2,2,2,3,4,2,3,1,2,2,2],
			[4,3,2,5,3,2,7,7,3,6,3,2,3]]
)

NameError: name 'torch' is not defined

In [94]:
train_data.max()

NameError: name 'train_data' is not defined

In [95]:
train_data.min()

NameError: name 'train_data' is not defined

In [96]:
len(train_data.unique())+2

NameError: name 'train_data' is not defined

In [97]:
embedd=nn.Embedding(len(train_data.unique())+2,5)

NameError: name 'nn' is not defined

In [153]:
out_put = embedd(train_data)
out_put

tensor([[[ 0.5020,  0.5123, -0.4282, -0.8447, -0.5466],
         [ 0.5020,  0.5123, -0.4282, -0.8447, -0.5466],
         [ 0.1316, -2.2351,  0.7376, -1.4687,  0.5698],
         [ 0.1316, -2.2351,  0.7376, -1.4687,  0.5698],
         [ 0.1316, -2.2351,  0.7376, -1.4687,  0.5698],
         [-0.0121, -2.9470, -0.9865,  0.1426,  0.6155],
         [-0.8315,  1.9206,  0.1890,  0.0061,  0.9114],
         [ 0.1316, -2.2351,  0.7376, -1.4687,  0.5698],
         [-0.0121, -2.9470, -0.9865,  0.1426,  0.6155],
         [ 0.5020,  0.5123, -0.4282, -0.8447, -0.5466],
         [ 0.1316, -2.2351,  0.7376, -1.4687,  0.5698],
         [ 0.1316, -2.2351,  0.7376, -1.4687,  0.5698],
         [ 0.1316, -2.2351,  0.7376, -1.4687,  0.5698]],

        [[-0.8315,  1.9206,  0.1890,  0.0061,  0.9114],
         [-0.0121, -2.9470, -0.9865,  0.1426,  0.6155],
         [ 0.1316, -2.2351,  0.7376, -1.4687,  0.5698],
         [ 1.4615,  0.2073,  1.0874, -1.0680,  0.6290],
         [-0.0121, -2.9470, -0.9865,  0.1426, 

In [154]:
out_put.shape

torch.Size([2, 13, 5])

In [99]:
from collections import Counter
import torch.nn as nn

# Let's say you have 2 sentences(lowercased, punctuations removed) :
sentences = "i am new to PyTorch i am having fun"
words = sentences.split(' ')

vocab = Counter(words)  # create a dictionary
vocab = sorted(vocab, key=vocab.get, reverse=True)
vocab_size = len(vocab)

# map words to unique indices
word2idx = {word: ind for ind, word in enumerate(vocab)}

# word2idx = {'i': 0, 'am': 1, 'new': 2, 'to': 3, 'pytorch': 4, 'having': 5, 'fun': 6}

encoded_sentences = [word2idx[word] for word in words]

# encoded_sentences = [0, 1, 2, 3, 4, 0, 1, 5, 6]
print(encoded_sentences)
# let's say you want embedding dimension to be 3
emb_dim = 3 


[0, 1, 2, 3, 4, 0, 1, 5, 6]


In [132]:
from sklearn.metrics import log_loss

In [147]:
log_loss(y_true,y_pred)

11.51319199744696

In [144]:
y_true = np.array([1, 0, 0])

In [145]:
y_pred = np.array([1, 0, 1])

In [146]:
y_pred

array([1, 0, 1])