In [1]:
import sys
sys.path.append(".")
sys.path.append("..")
import Config
from Bias_DNN import DNN
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import pandas as pd
import random
from utils import *

def setup_seed(seed):
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    np.random.seed(seed)
    random.seed(seed)
    torch.backends.cudnn.deterministic = True

setup_seed(114514)

# 读取特征
test_user_feature = pd.read_csv('../feature/test_user_feature.csv')
item_feature = pd.read_csv('../feature/item_feature.csv')

# 读取数据
train = pd.read_csv('../data/train_data.csv')
test = pd.read_csv('../data/test_data.csv')

for fea in Config.count_feature:
    test_user_feature[fea] = list(map(eval, list(test_user_feature[fea])))
    train[fea] = list(map(eval, list(train[fea])))
    test[fea] = list(map(eval, list(test[fea])))



train_y, test_y = torch.Tensor(list(train['label'])), torch.Tensor(list(test['label']))

### 这里放弃用time_diff特征作为bias, 使用后auc大幅降低(若不使用time_diff则auc会更低)

In [2]:
# 首先训练模型
model = DNN(
    Features = Config.item_feature + Config.count_feature + Config.user_feature + Config.match_feature, 
    hidden_unit = [32], 
    Bias_Features = [],
    bias_unit = []
)

model.fit(train, train_y, test, test_y, epoch=5, batch_size=1024, lr=5e-2)

[2022-02-24 19:53:59.390149]start fit model
[2022-02-24 19:54:48.419268][epoch:1||train_loss/test_loss:1.0240/1.0250||train_auc/test_auc:0.936/0.932]
[2022-02-24 19:55:33.870559][epoch:2||train_loss/test_loss:0.9917/0.9929||train_auc/test_auc:0.945/0.942]
[2022-02-24 19:56:21.985455][epoch:3||train_loss/test_loss:0.9681/0.9695||train_auc/test_auc:0.948/0.944]
[2022-02-24 19:57:10.760477][epoch:4||train_loss/test_loss:0.9502/0.9517||train_auc/test_auc:0.949/0.945]
[2022-02-24 19:57:56.682893][epoch:5||train_loss/test_loss:0.9358/0.9373||train_auc/test_auc:0.949/0.945]


In [3]:
# 冻结模型参数
for param in model.parameters():
    param.requires_grad = False

print(model)

DNN(
  (embs): Normal_Embedding(
    (embs): ModuleDict(
      (click_article_id): Embedding(31116, 32)
      (category_id): Embedding(290, 4)
      (count_feature): Embedding(31117, 32, padding_idx=31116)
      (click_environment): Embedding(3, 1)
      (click_deviceGroup): Embedding(5, 1)
      (click_os): Embedding(8, 1)
      (click_country): Embedding(11, 1)
      (click_region): Embedding(28, 2)
      (click_referrer_type): Embedding(7, 1)
      (time_diff): Embedding(4, 1)
    )
  )
  (fc): ModuleList(
    (0): Linear(in_features=172, out_features=32, bias=True)
    (1): Linear(in_features=32, out_features=1, bias=True)
  )
  (bias_fc): ModuleList()
  (ce): BCEWithLogitsLoss()
)


In [4]:
# 解析json
import json
def read_json(path):
    f = open(path, 'r')
    return json.load(f)

# 读取召回字典, 这里用MF单路召回
# recall_data = read_json('../recall/recall_data/MF_data.json')
recall_data = read_json('../recall/recall_data/DSSM_data.json')
timelogger(f"all users:{len(recall_data)}, recall nums per user:{len(recall_data[list(recall_data.keys())[0]])}")

[2022-02-24 19:58:01.208976]all users:50000, recall nums per user:300


In [5]:
# 排序, 取分数最高的10个作为推荐序列
def get_rec_seq(user_id, top_k):
    items = sorted(recall_data[str(user_id)]) # 候选样本
    fs = merge_feature(
        user_id = user_id,
        user_df = test_user_feature,
        item_df = item_feature[item_feature['click_article_id'].isin(items)],
        use_match_feature = True
    )
    scores = model.get_y_pre(fs).detach().numpy()
    candidates = [(items[i], scores[i]) for i in range(len(scores))]
    candidates = sorted(candidates, key=lambda x:x[1], reverse = True)
    return candidates[:top_k]
    
from tqdm import tqdm

rec_seq = {}
top_k = 10
for user in tqdm(list(test_user_feature['user_id'])):
    rec_seq[user] = get_rec_seq(user, top_k)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  item_df['user_id'] = user_id
100%|██████████| 50000/50000 [21:55<00:00, 38.01it/s]


In [11]:
# 生成推荐序列, 每个用户5篇, 保存推荐序列到 ./result.csv, columns = [user_id, article1, score1, ..., article5, score5]
result = {'user_id':[]}
for i in range(1,6):
    result[f'article{i}'] = []
    result[f'score{i}'] = []


for user in tqdm(list(test_user_feature['user_id'])):
    candidates = rec_seq[user]
    result['user_id'].append(user)
    for i in range(1,6):
        result[f'article{i}'].append(candidates[i-1][0])
        result[f'score{i}'].append(candidates[i-1][1])

result = pd.DataFrame(result)

import os

try:
    os.system('mkdir result')
except:
    pass

result.to_csv('./result/result.csv', index=False)
timelogger("successfully save result to ./result/result.csv")

100%|██████████| 50000/50000 [00:00<00:00, 126552.19it/s]


[2022-02-24 20:32:38.911406]successfully save result to ./result/result.csv
