In [1]:
import sys
sys.path.append(".")
sys.path.append("..")
#from RecallConfig import *
import RecallConfig
from MF import MF
from FM import FM
from DSSM import DSSM
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import pandas as pd
import random
from utils import *

MODEL_NAME = input("your model name:") # FM MF DSSM
assert MODEL_NAME in ['FM', 'MF', 'DSSM']

EMB_DIM = 32
hidden_unit = [32]

def setup_seed(seed):
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    np.random.seed(seed)
    random.seed(seed)
    torch.backends.cudnn.deterministic = True

setup_seed(114514)

# 读取特征
test_user_feature = pd.read_csv('../feature/test_user_feature.csv')
item_feature = pd.read_csv('../feature/item_feature.csv')

# 读取数据
train = pd.read_csv('../data/train_data.csv')
test = pd.read_csv('../data/test_data.csv')

for fea in RecallConfig.count_feature:
    test_user_feature[fea] = list(map(eval, list(test_user_feature[fea])))
    train[fea] = list(map(eval, list(train[fea])))
    test[fea] = list(map(eval, list(test[fea])))

if MODEL_NAME == 'MF':
    #提取标签是1的样本
    train = train[train['label']==1]
    test = test[test['label']==1]


train_y, test_y = torch.Tensor(list(train['label'])), torch.Tensor(list(test['label']))

In [2]:
User_Features = RecallConfig.user_feature + RecallConfig.count_feature
Item_Features = RecallConfig.item_feature
Features = User_Features + Item_Features

if MODEL_NAME == 'MF':
    model = MF(Features=Features, default_vector_dim=EMB_DIM)
elif MODEL_NAME == 'FM':
    model = FM(Features=Features, default_vector_dim=12)
elif MODEL_NAME == 'DSSM':
    model = DSSM(User_Features, Item_Features, default_vector_dim=Feature_Embedding_Dim, unit_hidden=hidden_unit)

print("{}:{}".format(MODEL_NAME, model.features))
# 训练
model.fit(train, train_y, test, test_y, epoch=5, batch_size=1024, lr=7e-2)
# model.fit(train, train_y, test, test_y, epoch=5, batch_size=1024, lr=1e-1)

['click_article_id', 'user_last_click_7t']
[2022-02-20 19:45:01.426112]start fit model
[2022-02-20 19:45:09.420725][epoch:1||train_loss/test_loss:0.5782/0.6266||train_auc/test_auc:-1.000/-1.000]
[2022-02-20 19:45:12.702114][epoch:2||train_loss/test_loss:0.4104/0.4699||train_auc/test_auc:-1.000/-1.000]
[2022-02-20 19:45:15.472738][epoch:3||train_loss/test_loss:0.3175/0.3824||train_auc/test_auc:-1.000/-1.000]
[2022-02-20 19:45:18.222483][epoch:4||train_loss/test_loss:0.2587/0.3270||train_auc/test_auc:-1.000/-1.000]
[2022-02-20 19:45:21.529101][epoch:5||train_loss/test_loss:0.2183/0.2888||train_auc/test_auc:-1.000/-1.000]


In [3]:
# 冻结模型参数
for parm in model.parameters():
    parm.requires_grad = False

model

MF(
  (embs): Normal_Embedding(
    (embs): ModuleDict(
      (click_article_id): Embedding(31116, 32)
      (count_feature): Embedding(31117, 32, padding_idx=31116)
    )
  )
  (ce): MSELoss()
)

In [4]:
# 为每一个用户生成召回序列
# 构造历史点击序列
from tqdm import tqdm

user_click_hist = {}
for i, row in tqdm(pd.read_csv('../data/train.csv').append(pd.read_csv('../data/test.csv')).sort_values(by=['user_id', "click_timestamp"]).iterrows()):
    user, item = row['user_id'], row['click_article_id']
    user_click_hist.setdefault(user, [])
    user_click_hist[user].append(item)

# 去掉最后一次点击
for user in user_click_hist:
    user_click_hist[user] = user_click_hist[user][:-1]

items = list(item_feature['click_article_id'])
def get_recall_dict(user_id, top_k):
    fs = merge_feature(user_id, test_user_feature, item_feature)
    scores = model.get_y_pre(fs).detach().numpy()
    candidates = [(items[i], scores[i]) for i in range(len(scores))]
    candidates = sorted(candidates, key=lambda x:x[1], reverse = True)
    res = []
    count = 0
    for candidate in candidates:
        if candidate[0] not in user_click_hist[user_id]:
            res.append(candidate)
            count += 1
            if count == top_k: break
    return res


recall_dict = {}
for user in tqdm(list(test_user_feature['user_id'])):
    recall_dict[user] = get_recall_dict(user, 300)

1112623it [03:02, 6091.43it/s]
100%|██████████| 50000/50000 [1:15:38<00:00, 11.02it/s]


In [5]:
# 获取测试集用户的最后一次点击

last_clk = pd.read_csv('../data/test.csv').sort_values(by=['user_id', "click_timestamp"]).groupby('user_id').tail(1)
# last_clk = pd.read_csv('../data/train.csv').append(pd.read_csv('../data/test.csv')).sort_values(by=['user_id', "click_timestamp"]).groupby('user_id').tail(1)

In [6]:
# 看一下召回率
recall_data_ = {}
for user in recall_dict:
    recall_data_[user] = [x[0] for x in recall_dict[user]]

ks = [1, 5, 10, 20, 30, 50, 100, 200, 300]
for k in ks:    
    clk_hit = 0
    # for i, row in last_clk[last_clk['user_id'].isin(list(test_user_feature['user_id']))].iterrows():
    for i, row in last_clk.iterrows():
        user, item = row['user_id'], row['click_article_id']
        if item in recall_data_[user][:k]:
            clk_hit += 1
    message = 'k = {}, hit rate: {}/{} = {:.2f}%'.format(k, clk_hit, len(last_clk), 100*clk_hit/len(last_clk))
    timelogger(message)

[2022-02-20 21:04:18.086801]k = 1, hit rate: 3602/50000 = 7.20%
[2022-02-20 21:04:26.436539]k = 5, hit rate: 10322/50000 = 20.64%
[2022-02-20 21:04:34.356249]k = 10, hit rate: 13704/50000 = 27.41%
[2022-02-20 21:04:43.005996]k = 20, hit rate: 17326/50000 = 34.65%
[2022-02-20 21:04:51.568061]k = 30, hit rate: 19502/50000 = 39.00%
[2022-02-20 21:05:00.797885]k = 50, hit rate: 22123/50000 = 44.25%
[2022-02-20 21:05:10.591909]k = 100, hit rate: 25765/50000 = 51.53%
[2022-02-20 21:05:22.009021]k = 200, hit rate: 29120/50000 = 58.24%
[2022-02-20 21:05:34.779077]k = 300, hit rate: 31040/50000 = 62.08%


In [7]:
# 保存召回字典
import json

class JsonEncoder(json.JSONEncoder):
    def default(self, obj):
        if isinstance(obj, int):
            return int(obj)
        elif isinstance(obj, float):
            return float(obj)
        elif isinstance(obj, np.ndarray):
            return obj.tolist()
        elif isinstance(obj, datetime):                                 
            return obj.__str__()
        else:
            return super(JsonEncoder, self).default(obj)

with open("./recall_data/{}_data.json".format(MODEL_NAME), 'w') as json_file:
    json.dump(recall_data_, json_file, ensure_ascii=False, cls=JsonEncoder)

'''
f = open(filename, 'r')
tmp = json.load(f)

注意读取出来的索引keys由原来的int变为str
'''

"\nf = open(filename, 'r')\ntmp = json.load(f)\n\n注意读取出来的索引keys由原来的int变为str\n"