In [1]:
import os
import torch
import argparse
import torch.nn as n
from transformers import T5Tokenizer
from module import Solomon
import math
import json
import torch
import random
import datetime
from rouge import rouge
from bleu import compute_bleu
from templates import exp_templates, seq_templates, topn_templates
from torch import nn
import pickle
import re
from transformers import (
    T5ForConditionalGeneration,
    LogitsProcessorList,
    MinLengthLogitsProcessor,
    NoBadWordsLogitsProcessor,
    HammingDiversityLogitsProcessor,
    RepetitionPenaltyLogitsProcessor,
    BeamSearchScorer,
    MaxLengthCriteria,
    StoppingCriteriaList,
)
from transformers.modeling_outputs import BaseModelOutput

import torch
from transformers.modeling_outputs import BaseModelOutput
import torch.nn as nn
import torch
import pickle
from transformers import T5Tokenizer
import torch.nn as nn

In [2]:
class MlpProjector(nn.Module):
    """Two-layer MLP that projects vectors of width ``rec_size`` to width ``llm_size``.

    Args:
        rec_size: Size of the last dimension of the input vectors.
        llm_size: Size of the last dimension of the output vectors; it is also
            the hidden width between the two linear layers.
    """

    def __init__(self, rec_size=64, llm_size=512):
        super().__init__()
        # Linear -> GELU -> Linear. The attribute name and the layer order are
        # kept so the state_dict keys remain "mlp_proj.0.*" and "mlp_proj.2.*".
        stages = [
            nn.Linear(rec_size, llm_size),
            nn.GELU(),
            nn.Linear(llm_size, llm_size),
        ]
        self.mlp_proj = nn.Sequential(*stages)

    def forward(self, x):
        """Return the projection of ``x``; only the last dimension changes size."""
        return self.mlp_proj(x)

In [3]:
# Use the first CUDA device when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [4]:
# Build the projector with its default sizes (rec_size=64, llm_size=512) and move it to `device`.
# NOTE(review): the weights are randomly initialised here; no checkpoint is loaded for it in the visible cells.
projector = MlpProjector().to(device)

In [5]:
# Checkpoint name passed to both `Solomon.from_pretrained` and `T5Tokenizer.from_pretrained` below.
model_version = 't5-small'

In [6]:
# Load the project-local `Solomon` model (imported from `module`) from the `model_version` checkpoint.
# NOTE(review): the model is not moved to `device` in the visible cells — confirm that is intended.
model = Solomon.from_pretrained(model_version)

In [None]:
# Load the tokenizer that matches the same `model_version` checkpoint as the model.
tokenizer = T5Tokenizer.from_pretrained(model_version)

In [None]:
# Number of item ids to generate; ids run from 1 to NUM_ITEMS inclusive.
NUM_ITEMS = 18357

# Build the strings 'item_1' ... 'item_18357' that a later cell tokenizes.
# range() is half-open, hence the "+ 1" on the upper bound.
input_list = [f"item_{i}" for i in range(1, NUM_ITEMS + 1)]

# Sanity check: show both ends of the list.
print(input_list[:5])  # first five entries
print(input_list[-5:])  # last five entries

['item_1', 'item_2', 'item_3', 'item_4', 'item_5']
['item_18353', 'item_18354', 'item_18355', 'item_18356', 'item_18357']


In [None]:
# Tokenize all item strings in one batch; padding=True pads the batch to a common length
# and return_tensors='pt' returns PyTorch tensors.
encoded_source = tokenizer(input_list, padding=True, return_tensors='pt')

In [None]:
# Token-id matrix of the tokenized item strings, made contiguous in memory.
source_seq = encoded_source['input_ids'].contiguous()

In [None]:
# Show the token ids of the first five item strings.
print("source_seq", source_seq[:5])

source_seq tensor([[2118,  834,  536,    1,    0,    0],
        [2118,  834,  357,    1,    0,    0],
        [2118,  834,  519,    1,    0,    0],
        [2118,  834,  591,    1,    0,    0],
        [2118,  834,  755,    1,    0,    0]])


In [None]:
# Map every token id to its embedding vector through the model's `shared` module.
# NOTE(review): this runs outside torch.no_grad(), so the result presumably tracks gradients — confirm that is intended.
text_emb = model.shared(source_seq)

In [None]:
# Print the shape of the token-embedding tensor.
# NOTE(review): the label "sahpe" looks like a typo for "shape"; it is left unchanged here because it is runtime output.
print("sahpe", text_emb.shape)

sahpe torch.Size([18357, 6, 512])


In [None]:
import torch
import pickle

# Step I: read the interaction file and build an item -> users mapping.
# Each line is parsed as whitespace-separated integers: the first value is the
# user id, the remaining values are the item ids listed for that user.
item_to_users = {}

with open('data/beauty/sequential.txt', 'r') as file:
    for line in file:
        data = list(map(int, line.strip().split()))
        if not data:
            # Skip blank lines (e.g. a trailing empty line) instead of failing on data[0].
            continue
        user_id = data[0] - 1  # user ids are 1-based in the file; embedding rows are 0-based
        for item in data[1:]:
            item_to_users.setdefault(item, []).append(user_id)

# Step II: load the user embeddings.
# NOTE(security): pickle.load can execute arbitrary code; only load files from a trusted source.
with open('SASRec_user_embed.pkl', 'rb') as f:
    # Expected to be a 2-D (num_users, emb_dim) array — TODO confirm against the file.
    user_embeddings = torch.tensor(pickle.load(f)).to(device)

# Step III: for every item, mean-pool the embeddings of the users listed for it.
# Results are stored at index (item id - 1), so the list needs one slot per id
# up to the LARGEST item id. Sizing it by the number of distinct items raises an
# IndexError as soon as the ids have gaps. Slots of ids that never occur stay None.
# Assumes item ids are >= 1 — TODO confirm.
num_slots = max(item_to_users) if item_to_users else 0
fused_embeddings = [None] * num_slots

for item, users in item_to_users.items():
    # Every key has at least one user, because keys are only created when a
    # user is appended, so no empty-list guard is needed here.
    item_user_emb = user_embeddings[users].mean(dim=0)  # (emb_dim,)

    # Store as a NumPy array at the item's 0-based position.
    fused_embeddings[item - 1] = item_user_emb.cpu().detach().numpy()

# Step IV: save the pooled embeddings as a .pkl file.
with open('fused_item_user_embeddings.pkl', 'wb') as f:
    pickle.dump(fused_embeddings, f)

print("融合后的嵌入已保存为 fused_item_user_embeddings.pkl 文件")

In [16]:
from torch.nn.utils.rnn import pad_sequence

# Inputs defined by earlier cells:
#   source_seq: token ids, shape (batch_size, sequence_length)
#   text_emb:   token embeddings, shape (batch_size, sequence_length, embedding_dim)

# True at the positions whose token id is neither 0 nor 1.
# NOTE(review): 0 and 1 are presumably the tokenizer's pad and end-of-sequence ids — confirm.
mask = ~((source_seq == 0) | (source_seq == 1))

# For every sample keep only the embedding rows selected by its mask row;
# the number of kept rows can differ from sample to sample.
filtered_text_em = [text_emb[row][mask[row]] for row in range(len(text_emb))]

# Zero-pad the per-sample results to a common length so they stack into one tensor.
padded_text_emb = pad_sequence(filtered_text_em, batch_first=True)

# NOTE: despite the "shape:" label, this prints the first five padded samples themselves.
print("shape:",padded_text_emb[:5])

shape: tensor([[[-34.5000,   6.8750,   9.9375,  ..., -13.0625, -19.5000,   0.5273],
         [-39.5000,   5.5938,  36.2500,  ..., -31.3750,  24.2500,   4.7812],
         [-26.3750,   2.9375,   5.7812,  ...,   5.6250,  -1.0156,  -2.5156],
         [  0.0000,   0.0000,   0.0000,  ...,   0.0000,   0.0000,   0.0000],
         [  0.0000,   0.0000,   0.0000,  ...,   0.0000,   0.0000,   0.0000]],

        [[-34.5000,   6.8750,   9.9375,  ..., -13.0625, -19.5000,   0.5273],
         [-39.5000,   5.5938,  36.2500,  ..., -31.3750,  24.2500,   4.7812],
         [-17.5000,   6.1562,  -4.2812,  ...,  -6.8750,  51.2500,  -0.2910],
         [  0.0000,   0.0000,   0.0000,  ...,   0.0000,   0.0000,   0.0000],
         [  0.0000,   0.0000,   0.0000,  ...,   0.0000,   0.0000,   0.0000]],

        [[-34.5000,   6.8750,   9.9375,  ..., -13.0625, -19.5000,   0.5273],
         [-39.5000,   5.5938,  36.2500,  ..., -31.3750,  24.2500,   4.7812],
         [-14.9375,   5.5938, -38.5000,  ...,  -0.3125,  31.0000,

In [24]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import pickle

# Inputs from earlier cells: `device` and `padded_text_emb` (zero-padded per-item token embeddings).

# Fixed seed so the randomly initialised gate layer below, and therefore the
# saved file, is identical on every run.
FUSION_SEED = 0
torch.manual_seed(FUSION_SEED)

text_emb = padded_text_emb.to(device)  # (num_items, max_kept_tokens, emb_dim)

# NOTE(security): pickle.load can execute arbitrary code; only load files from a trusted source.
with open('SASRec_item_embed.pkl', 'rb') as f:
    # One row per item; the row width must equal the text embedding width
    # because both vectors are concatenated and blended below.
    behavior_embeddings = torch.tensor(pickle.load(f)).to(device)


class AttentionMechanism(nn.Module):
    """Scalar gate computed from a text embedding and a behaviour embedding.

    The two vectors are concatenated and passed through a single linear layer
    followed by a sigmoid, which yields a weight in (0, 1).

    Args:
        embedding_dim: Width of each of the two input vectors.
    """

    def __init__(self, embedding_dim):
        super(AttentionMechanism, self).__init__()
        self.attention_layer = nn.Linear(embedding_dim * 2, 1)

    def forward(self, text_emb, behavior_emb):
        """Return the gate for one (text, behaviour) pair as a 0-dim tensor."""
        concat_embedding = torch.cat((text_emb, behavior_emb), dim=-1)  # (2 * embedding_dim,)
        concat_embedding = concat_embedding.unsqueeze(0)  # (1, 2 * embedding_dim)
        attention_score = torch.sigmoid(self.attention_layer(concat_embedding))  # (1, 1)
        return attention_score.squeeze()  # 0-dim scalar


# NOTE(review): this layer is used with its initial weights; nothing in this cell trains it.
attention_mechanism = AttentionMechanism(embedding_dim=512).to(device)

# Fail early with a clear message instead of an IndexError in the middle of the loop.
if len(behavior_embeddings) < len(text_emb):
    raise ValueError(
        f"behavior_embeddings has {len(behavior_embeddings)} rows but "
        f"text_emb has {len(text_emb)} items"
    )

fused_embeddings = []  # one fused NumPy vector per item, in item order

# Inference only: no_grad avoids building an autograd graph for every item.
with torch.no_grad():
    for idx in range(len(text_emb)):
        current_text_emb = text_emb[idx]  # (max_kept_tokens, emb_dim), zero-padded
        current_behavior_emb = behavior_embeddings[idx]  # (emb_dim,)

        # Drop the all-zero rows (the padding) so they do not dilute the mean.
        non_zero_mask = current_text_emb.abs().sum(dim=1).bool()
        filtered_text_emb = current_text_emb[non_zero_mask]

        if filtered_text_emb.numel() == 0:
            # No non-zero row at all: fall back to the mean over the original rows.
            pooled_text_emb = torch.mean(current_text_emb, dim=0)  # (emb_dim,)
        else:
            pooled_text_emb = torch.mean(filtered_text_emb, dim=0)  # (emb_dim,)

        # Blend the two views of the item with the scalar gate.
        attention_score = attention_mechanism(pooled_text_emb, current_behavior_emb)
        fused_embedding = attention_score * pooled_text_emb + (1 - attention_score) * current_behavior_emb

        fused_embeddings.append(fused_embedding.cpu().detach().numpy())

with open('fused_item_embeddings.pkl', 'wb') as f:
    pickle.dump(fused_embeddings, f)

print("融合后的嵌入已保存为 fused_item_embeddings.pkl 文件")


  behavior_embeddings = torch.tensor(pickle.load(f)).to(device)  # 长度为 11925，每个元素是 (512,) 的向量


融合后的嵌入已保存为 fused_item_embeddings.pkl 文件


In [None]:
import torch
import torch.nn as nn
import pickle

# Inputs from earlier cells: `device`, `projector` and `padded_text_emb`.
# `device` is deliberately NOT redefined here: the tensors must live on the same
# device as `projector`, which was moved to the `device` defined earlier.

# Fixed seed so the randomly initialised gate layer below is the same on every run.
# NOTE(review): `projector` is created in an earlier cell without a seed, so the
# saved file is only fully reproducible if that cell is seeded too.
FUSION_SEED = 0
torch.manual_seed(FUSION_SEED)

# Step I: load the user embeddings, the text embeddings and the behaviour embeddings.
# NOTE(security): pickle.load can execute arbitrary code; only load files from a trusted source.
with open('SASRec_user_embed.pkl', 'rb') as f:
    user_emb = torch.tensor(pickle.load(f)).to(device)

with open('SASRec_item_embed.pkl', 'rb') as f:
    behavior_embeddings = torch.tensor(pickle.load(f)).to(device)

text_emb = padded_text_emb.to(device)  # (num_items, max_kept_tokens, emb_dim), zero-padded


# Step II: gate that also takes a user embedding into account.
class AttentionMechanism(nn.Module):
    """Scalar gate computed from text, behaviour and user embeddings.

    The three vectors are concatenated and passed through a single linear layer
    followed by a sigmoid, which yields a weight in (0, 1).

    Args:
        text_dim: Width of the text vector.
        behavior_dim: Width of the behaviour vector.
        user_dim: Width of the user vector.
    """

    def __init__(self, text_dim, behavior_dim, user_dim):
        super(AttentionMechanism, self).__init__()
        # Input is the concatenation of all three vectors; output is one scalar.
        self.attention_layer = nn.Linear(text_dim + behavior_dim + user_dim, 1)

    def forward(self, text_emb, behavior_emb, user_emb):
        """Return the gate for one (text, behaviour, user) triple as a 0-dim tensor."""
        concat_embedding = torch.cat((text_emb, behavior_emb, user_emb), dim=-1)  # (text_dim + behavior_dim + user_dim,)
        concat_embedding = concat_embedding.unsqueeze(0)  # (1, text_dim + behavior_dim + user_dim)
        attention_score = torch.sigmoid(self.attention_layer(concat_embedding))  # (1, 1)
        return attention_score.squeeze()  # 0-dim scalar


# NOTE(review): this layer is used with its initial weights; nothing in this cell trains it.
attention_mechanism = AttentionMechanism(text_dim=512, behavior_dim=512, user_dim=512).to(device)

# Step III: pool, gate and fuse every item.
fused_embeddings = []  # one fused NumPy vector per item, in item order

# Inference only: no_grad avoids building an autograd graph through the
# projector and the gate for every item.
with torch.no_grad():
    # The same projector is applied to both matrices, so both must have a last
    # dimension equal to the projector's input width.
    behavior_embeddings = projector(behavior_embeddings)
    user_emb = projector(user_emb)

    for idx in range(len(text_emb)):
        current_text_emb = text_emb[idx]  # (max_kept_tokens, emb_dim), zero-padded
        current_behavior_emb = behavior_embeddings[idx]  # projected behaviour vector

        # Mean-pool over the real token rows only. `padded_text_emb` is zero-padded,
        # so averaging all rows would shrink the vector of items with fewer tokens.
        non_zero_mask = current_text_emb.abs().sum(dim=1).bool()
        filtered_text_emb = current_text_emb[non_zero_mask]
        if filtered_text_emb.numel() == 0:
            pooled_text_emb = torch.mean(current_text_emb, dim=0)
        else:
            pooled_text_emb = torch.mean(filtered_text_emb, dim=0)

        # NOTE(review): pairing item `idx` with user `idx % num_users` is a placeholder
        # assumption, not a real item-user relation — replace with an actual mapping.
        user_idx = idx % len(user_emb)
        current_user_emb = user_emb[user_idx]  # projected user vector

        # Gate computed from all three vectors, then used to blend text and behaviour.
        fusion_weight = attention_mechanism(pooled_text_emb, current_behavior_emb, current_user_emb)
        fused_embedding = fusion_weight * pooled_text_emb + (1 - fusion_weight) * current_behavior_emb

        fused_embeddings.append(fused_embedding.cpu().detach().numpy())

# Step IV: save the fused embeddings as a .pkl file.
# NOTE(review): this is the same file name the previous fusion cell writes, so it overwrites that result.
with open('fused_item_embeddings.pkl', 'wb') as f:
    pickle.dump(fused_embeddings, f)

# Report success.
print("融合后的嵌入已保存为 fused_item_embeddings.pkl 文件")


In [25]:
import pickle
import torch

# Load the pickled embeddings.
# NOTE(review): no visible cell in this notebook writes 'fused_user_embeddings.pkl' (the cells above
# write 'fused_item_user_embeddings.pkl' and 'fused_item_embeddings.pkl') — confirm where this file comes from.
# NOTE(security): pickle.load can execute arbitrary code; only load files from a trusted source.
with open('fused_user_embeddings.pkl', 'rb') as f:
    fused_embeddings = pickle.load(f)

# Convert the loaded object to a PyTorch tensor.
fused_embeddings_tensor = torch.tensor(fused_embeddings)

# Print the tensor's shape.
print("Shape of fused embeddings:", fused_embeddings_tensor.shape)

Shape of fused embeddings: torch.Size([22363, 512])
