In [3]:
import os
import torch
import argparse
import torch.nn as n
from transformers import T5Tokenizer
from module import Solomon
import math
import json
import torch
import random
import datetime
from rouge import rouge
from bleu import compute_bleu
from templates import exp_templates, seq_templates, topn_templates
from torch import nn
import pickle
import re
from transformers import (
    T5ForConditionalGeneration,
    LogitsProcessorList,
    MinLengthLogitsProcessor,
    NoBadWordsLogitsProcessor,
    HammingDiversityLogitsProcessor,
    RepetitionPenaltyLogitsProcessor,
    BeamSearchScorer,
    MaxLengthCriteria,
    StoppingCriteriaList,
)
from transformers.modeling_outputs import BaseModelOutput

import torch


In [4]:
# Experiment configuration used by the cells below.
data_dir = './data/beauty/'  # dataset folder handed to SeqDataLoader
model_version = 0  # NOTE(review): reassigned to 't5-small' in a later cell before it is used
task_num = 3  # passed to model.init_prompt(...)
prompt_num = 3  # passed to model.init_prompt(...)
lr = 0.0005  # learning rate given to the AdamW optimizer
epochs = 100  # not referenced in the visible cells
batch_size = 64
cuda = True  # request the GPU; only honoured when torch.cuda.is_available()
log_interval = 200  # not referenced in the visible cells
checkpoint = './checkpoint/beauty/'  # directory that holds model.pt
endure_times = 5  # not referenced in the visible cells
exp_len = 20  # column cap applied when slicing target token ids
negative_num = 99  # not referenced in the visible cells (the sampler cells pass a literal 99)


In [5]:
# Pick the compute device: CUDA only when it is both available and enabled via `cuda`.
if torch.cuda.is_available() and cuda:
    device = torch.device('cuda')
    print('Using CUDA')
else:
    device = torch.device('cpu')
    print('Using CPU')

Using CPU


In [6]:
# Create the checkpoint directory when missing. exist_ok=True replaces the
# separate exists() check and avoids the check-then-create race.
os.makedirs(checkpoint, exist_ok=True)
model_path = os.path.join(checkpoint, 'model.pt')

In [7]:
# Name of the pretrained checkpoint used for both the tokenizer and the model
# (replaces the placeholder value 0 assigned in the configuration cell).
model_version = 't5-small'

In [8]:
class SeqDataLoader:
    """Load the per-user item lists and id maps of one dataset folder.

    Args:
        data_dir: folder containing ``sequential.txt``, ``negative.txt`` and
            ``datamaps.json``. A trailing path separator is optional.

    Attributes:
        user2items_positive: int user id -> list of item-id strings read from
            ``sequential.txt``.
        user2items_negative: int user id -> list of item-id strings read from
            ``negative.txt``.
        id2user: the ``id2user`` entry of ``datamaps.json``.
        id2item: the ``id2item`` entry of ``datamaps.json``.
    """

    def __init__(self, data_dir):
        # os.path.join works with or without a trailing separator on data_dir.
        self.user2items_positive = self._read_user_items(os.path.join(data_dir, 'sequential.txt'))
        self.user2items_negative = self._read_user_items(os.path.join(data_dir, 'negative.txt'))

        with open(os.path.join(data_dir, 'datamaps.json'), 'r') as f:
            datamaps = json.load(f)
        self.id2user = datamaps['id2user']
        self.id2item = datamaps['id2item']

    @staticmethod
    def _read_user_items(path):
        """Parse lines of the form ``<user> <item> <item> ...`` into a dict."""
        user2items = {}
        with open(path, 'r') as f:
            for line in f:
                line = line.strip()
                if not line:
                    # Blank lines (e.g. a trailing newline) carry no record.
                    continue
                user, items = line.split(' ', 1)
                user2items[int(user)] = items.split(' ')
        return user2items

In [9]:
def compute_whole_word_id(seq_batch, tokenizer, max_len, placeholders=('PH',)):
    """Assign a shared index to the tokens of every ``user_xx`` / ``item_xx`` mention.

    Each sequence is tokenized with ``tokenizer.tokenize``. A mention starts at
    the token before a ``'_'`` token and extends over the following digit
    tokens and placeholder tokens. Mentions are numbered 1, 2, ... in order of
    appearance; all other positions get 0.

    Args:
        seq_batch: iterable of input strings.
        tokenizer: object exposing ``tokenize(str) -> list[str]``.
        max_len: length of every returned row; shorter rows are right-padded
            with 0 and longer rows are cut to ``max_len``.
        placeholders: token strings treated as part of the preceding id.
            A ``'_'`` directly after one of them does not start a new mention.

    Returns:
        A list with one list of ``max_len`` ints per input sequence.
    """
    placeholders = tuple(placeholders)
    whole_word_ids = []
    for seq in seq_batch:
        token_list = tokenizer.tokenize(seq)

        # A mention starts one token before '_'. idx > 0 is required: a leading
        # '_' has no preceding token, and a start of -1 would make the slice
        # assignment below insert elements instead of overwriting them.
        start_indices = [
            idx - 1
            for idx, token in enumerate(token_list)
            if token == '_' and idx > 0 and token_list[idx - 1] not in placeholders
        ]

        end_indices = []
        for start in start_indices:
            mover = start + 2  # user/item _ xx
            while mover < len(token_list) and (
                token_list[mover].isdigit() or token_list[mover] in placeholders
            ):
                mover += 1
            end_indices.append(mover)

        whole_word_id = [0] * len(token_list)  # 0 marks "not part of a mention"
        for word_no, (start, end) in enumerate(zip(start_indices, end_indices), start=1):
            whole_word_id[start:end] = [word_no] * (end - start)

        whole_word_ids.append(whole_word_id)

    # Make every row exactly max_len long so the batch can become a tensor.
    padded_whole_word_ids = []
    for whole_word_id in whole_word_ids:
        row = whole_word_id[:max_len]
        padded_whole_word_ids.append(row + [0] * (max_len - len(row)))

    return padded_whole_word_ids

In [10]:
def compute_whole_word_id1(seq_batch, tokenizer, max_len, placeholders=('PH', 'PU')):
    """Whole-word indices where both ``PH`` and ``PU`` belong to the preceding id.

    Each sequence is tokenized with ``tokenizer.tokenize``. A mention starts at
    the token before a ``'_'`` token and extends over the following digit
    tokens and placeholder tokens. Mentions are numbered 1, 2, ... in order of
    appearance; all other positions get 0.

    Args:
        seq_batch: iterable of input strings.
        tokenizer: object exposing ``tokenize(str) -> list[str]``.
        max_len: length of every returned row; shorter rows are right-padded
            with 0 and longer rows are cut to ``max_len``.
        placeholders: token strings treated as part of the preceding id.
            A ``'_'`` directly after one of them does not start a new mention.

    Returns:
        A list with one list of ``max_len`` ints per input sequence.
    """
    placeholders = tuple(placeholders)
    whole_word_ids = []
    for seq in seq_batch:
        token_list = tokenizer.tokenize(seq)

        # A mention starts one token before '_'. idx > 0 is required: a leading
        # '_' has no preceding token, and a start of -1 would make the slice
        # assignment below insert elements instead of overwriting them.
        start_indices = [
            idx - 1
            for idx, token in enumerate(token_list)
            if token == '_' and idx > 0 and token_list[idx - 1] not in placeholders
        ]

        end_indices = []
        for start in start_indices:
            mover = start + 2  # user/item _ xx
            while mover < len(token_list) and (
                token_list[mover].isdigit() or token_list[mover] in placeholders
            ):
                mover += 1
            end_indices.append(mover)

        whole_word_id = [0] * len(token_list)  # 0 marks "not part of a mention"
        for word_no, (start, end) in enumerate(zip(start_indices, end_indices), start=1):
            whole_word_id[start:end] = [word_no] * (end - start)

        whole_word_ids.append(whole_word_id)

    # Make every row exactly max_len long so the batch can become a tensor.
    padded_whole_word_ids = []
    for whole_word_id in whole_word_ids:
        row = whole_word_id[:max_len]
        padded_whole_word_ids.append(row + [0] * (max_len - len(row)))

    return padded_whole_word_ids


In [11]:
import random

class SeqSampler:
    """Draw training samples for the sequential task (task id 1).

    Users are visited in ``index_list`` order. Once every user has been
    visited, the order is reshuffled and the cursor restarts at 0.
    """

    def __init__(self, user2items_pos):
        self.task_id = 1
        self.max_seq_len = 21
        self.item_template = ' item_'

        self.user2items_pos = user2items_pos
        self.user_list = list(user2items_pos.keys())

        self.sample_num = len(self.user_list)
        self.index_list = list(range(self.sample_num))
        self.step = 0

    def check_step(self):
        """Restart and reshuffle the visiting order after a full pass."""
        if self.step != self.sample_num:
            return
        self.step = 0
        random.shuffle(self.index_list)

    def sample_seq(self, u):
        """Return a random contiguous slice of user ``u``'s history, each item suffixed with "PH".

        The slice never reaches the last two items of the history and is
        trimmed to its final ``max_seq_len`` entries. The history needs at
        least 4 items, otherwise ``random.randint`` is given an empty range.
        """
        history = self.user2items_pos[u]
        history_len = len(history)
        first = random.randint(0, history_len - 4)  # cannot be one of the last 3
        last = random.randint(first + 1, history_len - 3)  # cannot be one of the last 2
        window = history[first:last + 1]
        if len(window) > self.max_seq_len:
            window = window[-self.max_seq_len:]

        # Each item id gets the "PH" suffix.
        return ['{}PH'.format(item) for item in window]

    def sample(self, num):
        """Build ``num`` samples; returns ``(task, inputs, outputs, users)`` lists."""
        inputs = []
        outputs = []
        users = []
        for _ in range(num):
            self.check_step()
            u = self.user_list[self.index_list[self.step]]
            item_seg = self.sample_seq(u)
            template = random.choice(seq_templates)

            # The user id gets the "PU" suffix; all items but the last form the prompt.
            user_tag = '{}PU'.format(u)
            history_text = self.item_template.join(item_seg[:-1])
            inputs.append(template.format(user_tag, history_text))

            # The target is the last item with its two-character "PH" suffix removed.
            outputs.append(item_seg[-1][:-2])
            users.append(u)
            self.step += 1

        task = [self.task_id] * num
        return task, inputs, outputs, users



In [12]:
class TrainBatchify:
    """Serve training batches, rotating explanation -> sequential -> top-N.

    ``next_batch`` asks one of three samplers for ``batch_size`` samples and
    tokenizes the result with ``encode``.
    """

    def __init__(self, exp_data, user2items_pos, negative_num, item_num, tokenizer, exp_len, batch_size):
        self.exp_sampler = ExpSampler(exp_data)
        self.seq_sampler = SeqSampler(user2items_pos)
        self.topn_sampler = TopNSampler(user2items_pos, negative_num, item_num)
        self.tokenizer = tokenizer
        self.exp_len = exp_len
        self.batch_size = batch_size

        # Number of whole batches each sampler can supply per pass over its data.
        self.exp_num = int(self.exp_sampler.sample_num / batch_size)
        self.seq_num = int(self.seq_sampler.sample_num / batch_size)
        self.topn_num = int(self.topn_sampler.sample_num / batch_size)
        self.batch_num = self.exp_num + self.seq_num + self.topn_num
        self.batch_index = 0

    def encode(self, task, input_list, output_list,user):
        """Tokenize one batch.

        Returns ``(task, source_seq, source_mask, whole_word, target_seq, user)``.
        ``target_seq`` keeps at most ``exp_len`` columns; ``user`` is passed
        through untouched.
        """
        source = self.tokenizer(input_list, padding=True, return_tensors='pt')
        source_seq = source['input_ids'].contiguous()
        source_mask = source['attention_mask'].contiguous()

        # NOTE(review): the sequential and top-N samplers append "PU" to the user
        # id, but compute_whole_word_id only treats "PH" as part of an id, while
        # SeqBatchify.encode uses compute_whole_word_id1 (PH and PU). Confirm
        # whether training should use compute_whole_word_id1 as well.
        padded_len = source_seq.size(1)
        whole_word = torch.tensor(
            compute_whole_word_id(input_list, self.tokenizer, padded_len),
            dtype=torch.int64,
        ).contiguous()

        target = self.tokenizer(output_list, padding=True, return_tensors='pt')
        target_seq = target['input_ids'][:, :self.exp_len]
        task = torch.tensor(task, dtype=torch.int64)
        return task, source_seq, source_mask, whole_word, target_seq,user

    def next_batch(self):
        """Advance the rotation by one and return the encoded batch."""
        self.batch_index += 1
        slot = self.batch_index % 3
        if slot == 1:
            sampler = self.exp_sampler
        elif slot == 2:
            sampler = self.seq_sampler
        else:
            sampler = self.topn_sampler
        task_list, input_list, output_list, user = sampler.sample(self.batch_size)
        return self.encode(task_list, input_list, output_list, user)


In [13]:

class SeqBatchify:
    """Iterate over users in fixed order to build sequential-task evaluation batches.

    NOTE(review): ``next_batch`` returns the raw prompt/target/user lists and
    never calls ``encode``; callers that need tensors must call ``encode``
    themselves.
    """

    def __init__(self, user2items_pos, tokenizer, batch_size):
        self.task_id = 1
        self.max_seq_len = 21
        self.user_template = 'user_{} item_{}'
        self.item_template = ' item_'

        self.tokenizer = tokenizer
        self.user2items_pos = user2items_pos
        self.user_list = list(user2items_pos.keys())

        self.batch_size = batch_size
        self.sample_num = len(self.user_list)
        self.total_step = int(math.ceil(self.sample_num / self.batch_size))
        self.step = 0

    def encode(self, input_list, output_list, user_ids):
        """Tokenize prompts and targets.

        Returns ``(task, source_seq, source_mask, whole_word, user_ids_tensor,
        target_seq)``. Whole-word ids treat both "PH" and "PU" as part of the
        preceding id.
        """
        batch_len = len(input_list)
        source = self.tokenizer(input_list, padding=True, return_tensors='pt')
        source_seq = source['input_ids'].contiguous()
        source_mask = source['attention_mask'].contiguous()

        padded_len = source_seq.size(1)
        whole_word = torch.tensor(
            compute_whole_word_id1(input_list, self.tokenizer, padded_len),
            dtype=torch.int64,
        ).contiguous()

        target = self.tokenizer(output_list, padding=True, return_tensors='pt')
        target_seq = target['input_ids']
        task = torch.ones((batch_len,), dtype=torch.int64) * self.task_id
        # User ids become an int64 tensor alongside the token tensors.
        user_ids_tensor = torch.tensor(user_ids, dtype=torch.int64).contiguous()
        return task, source_seq, source_mask, whole_word, user_ids_tensor, target_seq

    def next_batch(self, valid=True):
        """Return ``(input_list, output_list, user_ids)`` for the next slice of users.

        With ``valid=True`` the final item of each history is held out first.
        The remaining history is trimmed to its last ``max_seq_len`` items;
        its last item is the target and the items before it form the prompt.
        The cursor wraps to the first user after the last batch.
        """
        if self.step == self.total_step:
            self.step = 0

        lo = self.step * self.batch_size
        hi = min(lo + self.batch_size, self.sample_num)
        self.step += 1

        input_list, output_list, user_ids = [], [], []
        for position in range(lo, hi):
            u = self.user_list[position]
            history = self.user2items_pos[u]
            if valid:
                history = history[:-1]  # hold out the last item
            if len(history) > self.max_seq_len:
                history = history[-self.max_seq_len:]

            # Prompt: "user_<id>PU" followed by space-separated "item_<id>PH" entries.
            prompt_items = ' '.join('item_{}PH'.format(x) for x in history[:-1])
            input_list.append('user_{}PU {}'.format(u, prompt_items))
            output_list.append(history[-1])
            user_ids.append(u)

        return input_list, output_list, user_ids


    def next_batch_valid(self):
        """Next batch with the last item of every history held out."""
        return self.next_batch()

    def next_batch_test(self):
        """Next batch built from the full histories."""
        return self.next_batch(False)




In [14]:
import random
import math

class TopNBatchify:
    """Iterate over users in fixed order to build top-N evaluation batches (task id 2).

    For validation the positive is each user's second-to-last item and the
    negatives are sampled at random; for test the positive is the last item
    and the negatives come from ``user2items_neg``.

    NOTE(review): ``next_batch`` returns the raw prompt/target/user lists and
    never calls ``encode``; callers that need tensors must call ``encode``
    themselves.
    """

    def __init__(self, user2items_pos, user2items_neg, negative_num, item_num, tokenizer, batch_size=128):
        self.task_id = 2
        self.user_template = 'user_{} item_{}'
        self.item_template = ' item_'
        self.negative_num = negative_num
        self.item_num = item_num

        self.tokenizer = tokenizer
        self.user2items_neg = user2items_neg
        self.user2item_set_pos = {}
        self.user2item_val = {}
        self.user2item_test = {}
        self.user_list = list(user2items_pos.keys())
        for user, items in user2items_pos.items():
            self.user2item_set_pos[user] = set([int(item) for item in items])
            self.user2item_val[user] = items[-2]
            self.user2item_test[user] = items[-1]

        self.batch_size = batch_size
        self.sample_num = len(self.user_list)
        self.total_step = int(math.ceil(self.sample_num / self.batch_size))
        self.step = 0

    def encode(self, input_list, output_list, user_ids):
        """Tokenize prompts and targets.

        Returns ``(task, source_seq, source_mask, whole_word, target_seq, user_ids)``;
        ``user_ids`` is passed through untouched.
        """
        sample_num = len(input_list)
        encoded_source = self.tokenizer(input_list, padding=True, return_tensors='pt')
        source_seq = encoded_source['input_ids'].contiguous()
        source_mask = encoded_source['attention_mask'].contiguous()
        max_len = source_seq.size(1)
        # NOTE(review): prompts carry a "PU" suffix on the user id, but
        # compute_whole_word_id only treats "PH" as part of an id. Confirm
        # whether compute_whole_word_id1 (PH and PU) is intended here.
        whole_word_ids = compute_whole_word_id(input_list, self.tokenizer, max_len)
        whole_word = torch.tensor(whole_word_ids, dtype=torch.int64).contiguous()
        encoded_target = self.tokenizer(output_list, padding=True, return_tensors='pt')
        target_seq = encoded_target['input_ids']
        task = torch.ones((sample_num,), dtype=torch.int64) * self.task_id
        return task, source_seq, source_mask, whole_word, target_seq, user_ids

    def sample_negative(self, user):
        """Return ``negative_num`` distinct item-id strings the user never interacted with.

        Ids are drawn uniformly from ``1..item_num``. The loop does not
        terminate if fewer than ``negative_num`` such ids exist.
        """
        item_set = set()
        items_pos = self.user2item_set_pos[user]
        while len(item_set) < self.negative_num:
            i = random.randint(1, self.item_num)
            if i not in items_pos:
                item_set.add(i)
        return [str(item) for item in item_set]

    def next_batch(self, valid=True):
        """Return ``(input_list, output_list, user_ids)`` for the next slice of users.

        Each prompt lists the positive item mixed into the shuffled negatives.
        The cursor wraps to the first user after the last batch.
        """
        if self.step == self.total_step:
            self.step = 0

        start = self.step * self.batch_size
        offset = min(start + self.batch_size, self.sample_num)
        self.step += 1

        input_list = []
        output_list = []
        user_ids = []

        for i in range(start, offset):
            u = self.user_list[i]
            if valid:
                item_pos = self.user2item_val[u]
                item_list_neg = self.sample_negative(u)
            else:
                item_pos = self.user2item_test[u]
                # Work on a copy: appending to and shuffling the stored list
                # would grow the user's negatives on every pass.
                item_list_neg = list(self.user2items_neg[u])
            item_list_neg.append(item_pos)
            random.shuffle(item_list_neg)
            # The user id gets the "PU" suffix.
            u_ph = f"{u}PU"
            input_seq = self.user_template.format(u_ph, self.item_template.join(item_list_neg))
            input_list.append(input_seq)
            output_list.append(item_pos)
            user_ids.append(u)

        return input_list, output_list, user_ids

    def next_batch_valid(self):
        """Next batch using the validation positive and sampled negatives."""
        return self.next_batch()

    def next_batch_test(self):
        """Next batch using the test positive and the stored negatives."""
        return self.next_batch(False)



In [15]:
class TopNSampler:
    """Draw training samples for the top-N task (task id 2).

    Positives come from each user's history without its last two items.
    Negatives are random item ids that appear nowhere in that user's history.
    """

    def __init__(self, user2items_pos, negative_num, item_num):
        self.task_id = 2
        self.item_template = ' item_'
        self.negative_num = negative_num
        self.item_num = item_num

        self.user_list = list(user2items_pos.keys())
        # Every item the user interacted with, as ints, for negative filtering.
        self.user2item_set_pos = {
            user: {int(item) for item in items}
            for user, items in user2items_pos.items()
        }
        # Training positives: the history minus its last two items.
        self.user2items_train = {
            user: items[:-2] for user, items in user2items_pos.items()
        }

        self.sample_num = len(self.user_list)
        self.index_list = list(range(self.sample_num))
        self.step = 0

    def check_step(self):
        """Restart and reshuffle the visiting order after a full pass."""
        if self.step != self.sample_num:
            return
        self.step = 0
        random.shuffle(self.index_list)

    def sample_negative(self, user):
        """Return ``negative_num`` distinct item-id strings absent from the user's history.

        Ids are drawn uniformly from ``1..item_num``. The loop does not
        terminate if fewer than ``negative_num`` such ids exist.
        """
        seen = self.user2item_set_pos[user]
        chosen = set()
        while len(chosen) < self.negative_num:
            candidate = random.randint(1, self.item_num)
            if candidate in seen:
                continue
            chosen.add(candidate)
        return list(map(str, chosen))

    def sample(self, num):
        """Build ``num`` samples; returns ``(task, inputs, outputs, user_ids)`` lists."""
        inputs = []
        outputs = []
        user_ids = []  # user id of every sample, in order
        for _ in range(num):
            self.check_step()
            u = self.user_list[self.index_list[self.step]]
            item_pos = random.choice(self.user2items_train[u])

            # Candidate list: sampled negatives plus the positive, shuffled.
            candidates = self.sample_negative(u)
            candidates.append(item_pos)
            random.shuffle(candidates)

            # The user id gets the "PU" suffix.
            user_tag = '{}PU'.format(u)

            template = random.choice(topn_templates)
            inputs.append(template.format(user_tag, self.item_template.join(candidates)))
            outputs.append(item_pos)
            user_ids.append(u)  # remember which user this sample belongs to
            self.step += 1

        task = [self.task_id] * num
        return task, inputs, outputs, user_ids  # user ids are returned as well

In [16]:
# Load the tokenizer and the sequential-recommendation corpus.
print('Loading data')
tokenizer = T5Tokenizer.from_pretrained(model_version)

seq_corpus = SeqDataLoader(data_dir)
nitem = len(seq_corpus.id2item)  # size of the id2item map; passed as item_num to the top-N samplers below

Loading data


In [17]:
# NOTE(review): the name `all` shadows the Python builtin all(); later cells
# (all.sample(...), all.sample_seq(...)) rely on this name, so rename them together.
all = SeqSampler(seq_corpus.user2items_positive)


In [18]:
# Draw 512 sequential-task samples and print a preview (task_list is printed in full).
task_list, input_list, output_list,user = all.sample(512)
print(task_list)
print(input_list[:20])
print(output_list[:5])
print(user[:5])

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 

In [19]:
# Top-N training sampler with 99 negatives per sample.
all1 = TopNSampler(seq_corpus.user2items_positive, 99, nitem)

In [20]:
# Draw one batch of top-N samples.
task, inputs, outputs, user_ids = all1.sample(batch_size)

In [21]:
# Preview the top-N batch: task ids, the first prompt, and the first targets and user ids.
print("task",task[:5])
print("inputs:",inputs[:1])
print(" outputs", outputs[:5])
print(" user_ids", user_ids[:5])

task [2, 2, 2, 2, 2]
inputs: ['Choose the best item from the candidates to recommend for user_1PU? item_2622 item_5575 item_6313 item_11690 item_4731 item_11743 item_308 item_10869 item_6471 item_7619 item_471 item_7842 item_3975 item_10041 item_1 item_6582 item_6625 item_4507 item_7606 item_1707 item_8486 item_11979 item_7133 item_2551 item_4393 item_9456 item_6928 item_534 item_3493 item_3012 item_2074 item_4822 item_3867 item_11468 item_9395 item_1778 item_5126 item_9397 item_6794 item_4090 item_9444 item_4096 item_2084 item_6206 item_6445 item_742 item_8059 item_2314 item_5832 item_125 item_6040 item_10681 item_11216 item_10078 item_10399 item_5669 item_10541 item_7857 item_69 item_8231 item_9906 item_1954 item_2539 item_4258 item_1856 item_90 item_5292 item_3586 item_2638 item_3965 item_8650 item_11598 item_1128 item_11531 item_3369 item_9755 item_5964 item_3185 item_2807 item_6624 item_11226 item_4528 item_4603 item_11964 item_1699 item_6249 item_2305 item_1689 item_1457 item_236

In [22]:
# Evaluation iterator for the sequential task, 64 users per batch.
seq_iterator = SeqBatchify(seq_corpus.user2items_positive, tokenizer, batch_size=64)


In [23]:
# NOTE(review): next_batch_valid() returns (input_list, output_list, user_ids) as plain
# lists, so `task`, `source_seq` and `source_mask` here hold prompts, targets and user
# ids respectively, not tensors. The names and print labels are misleading.
task, source_seq, source_mask= seq_iterator.next_batch_valid()
print("task",task)
print("source_seq",source_seq)
print(" source_mask", source_mask)



task ['user_1PU item_1PH item_2PH item_3PH', 'user_2PU item_6PH item_7PH item_8PH item_9PH item_10PH', 'user_3PU item_4PH item_12PH item_13PH item_14PH item_15PH item_16PH item_17PH', 'user_4PU item_20PH item_21PH item_22PH item_23PH', 'user_5PU item_4PH item_25PH item_26PH item_27PH item_28PH item_29PH item_30PH', 'user_6PU item_33PH item_34PH item_35PH item_4PH item_36PH item_37PH item_38PH item_39PH item_40PH item_41PH item_42PH item_43PH item_44PH item_45PH item_46PH item_47PH', 'user_7PU item_50PH item_51PH item_52PH item_53PH item_54PH', 'user_8PU item_56PH item_57PH item_58PH', 'user_9PU item_22PH item_63PH item_64PH item_65PH item_66PH item_67PH item_68PH item_69PH item_70PH item_71PH item_72PH item_73PH item_74PH item_75PH item_76PH item_77PH item_78PH item_79PH item_80PH item_81PH', 'user_10PU item_84PH item_85PH item_86PH item_83PH item_87PH item_59PH item_88PH item_89PH', 'user_11PU item_92PH item_93PH item_94PH item_83PH item_95PH', 'user_12PU item_15PH item_98PH item_99PH

In [24]:
# Evaluation iterator for the top-N task: 99 negatives, 64 users per batch.
topn_iterator = TopNBatchify(seq_corpus.user2items_positive,seq_corpus.user2items_negative,99, nitem,tokenizer, batch_size=64)

In [25]:
# NOTE(review): next_batch_valid() returns (input_list, output_list, user_ids), so
# `whole_word1` actually holds the prompt strings and `target1` the positive items.
whole_word1, target1, user1 =topn_iterator.next_batch_valid()
print("in",whole_word1[:5])
print("out",target1)
print("user:",user1)


in ['user_1PU item_3043 item_1177 item_2287 item_3045 item_12081 item_9186 item_9077 item_8987 item_3510 item_3129 item_6670 item_10064 item_6883 item_4674 item_2618 item_6892 item_4807 item_6885 item_11846 item_6211 item_5897 item_729 item_5467 item_692 item_763 item_5489 item_899 item_3332 item_6311 item_7304 item_10441 item_839 item_10932 item_2626 item_7892 item_6684 item_10920 item_11217 item_2787 item_1790 item_2158 item_3495 item_8353 item_11857 item_5249 item_7859 item_6380 item_4089 item_2557 item_1565 item_1742 item_9702 item_3613 item_10166 item_2178 item_11988 item_7370 item_438 item_9134 item_8104 item_199 item_5188 item_911 item_1904 item_821 item_8386 item_3212 item_5098 item_3646 item_7542 item_3702 item_2631 item_7008 item_4962 item_7988 item_2686 item_10747 item_4 item_10589 item_4596 item_9808 item_4462 item_8897 item_1173 item_4302 item_312 item_984 item_3606 item_3252 item_969 item_2089 item_7649 item_3169 item_8184 item_11081 item_8526 item_2322 item_1885 item_964

In [26]:
    for seq in input_list:
        token_list = tokenizer.tokenize(seq)
        print(token_list)

['▁Here', '▁is', '▁the', '▁purchase', '▁history', '▁list', '▁of', '▁user', '_', '1', 'PU', ':', '▁item', '_', '2', 'PH', '.', '▁Try', '▁to', '▁recommend', '▁next', '▁item', '▁to', '▁the', '▁user', '.']
['▁user', '_', '2', 'PU', '▁item', '_', '6', 'PH', '▁item', '_', '7', 'PH']
['▁Given', '▁the', '▁following', '▁purchase', '▁history', '▁of', '▁user', '_', '3', 'PU', ':', '▁item', '_', '13', 'PH', '▁item', '_', '14', 'PH', ',', '▁predict', '▁next', '▁possible', '▁item', '▁to', '▁be', '▁purchased', '▁by', '▁the', '▁user', '.']
['▁Here', '▁is', '▁the', '▁purchase', '▁history', '▁of', '▁user', '_', '4', 'PU', ':', '▁item', '_', '22', 'PH', '.', '▁What', '▁to', '▁recommend', '▁next', '▁for', '▁the', '▁user', '?']
['▁user', '_', '5', 'PU', '▁item', '_', '4', 'PH', '▁item', '_', '25', 'PH', '▁item', '_', '26', 'PH', '▁item', '_', '27', 'PH', '▁item', '_', '28', 'PH']
['▁Here', '▁is', '▁the', '▁purchase', '▁history', '▁of', '▁user', '_', '6', 'PU', ':', '▁item', '_', '46', 'PH', '.', '▁What', '

In [27]:
# Tokenize all prompts into padded tensors.
encoded_source = tokenizer(input_list, padding=True, return_tensors='pt')

In [28]:
# Show the tokenizer output (input_ids and attention_mask).
print("encoded_source", encoded_source)

encoded_source {'input_ids': tensor([[ 947,   19,    8,  ...,    0,    0,    0],
        [1139,  834,  357,  ...,    0,    0,    0],
        [9246,    8,  826,  ...,    0,    0,    0],
        ...,
        [ 947,   19,    8,  ...,    0,    0,    0],
        [  27,  253,    8,  ...,    0,    0,    0],
        [9246,    8,  826,  ...,    0,    0,    0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}


In [29]:
# Hand-written prompt for a tokenization check. The literal contains its own single
# quotes, and its user id has no "PU" suffix while the item id carries "PH".
text = "'Given the following purchase history of user_1: item_2PH, predict next possible item to be purchased by the user.'"

In [30]:
# Show how the sample prompt is split into tokens.
token = tokenizer.tokenize(text)
print(token)

['▁', "'", 'G', 'ive', 'n', '▁the', '▁following', '▁purchase', '▁history', '▁of', '▁user', '_', '1', ':', '▁item', '_', '2', 'PH', ',', '▁predict', '▁next', '▁possible', '▁item', '▁to', '▁be', '▁purchased', '▁by', '▁the', '▁user', '.', "'"]


In [31]:
# Encode the single sample prompt to token ids.
encoded = tokenizer(text, padding=True, return_tensors='pt')

In [32]:
# Show the token ids and attention mask of the sample prompt.
print("encoded", encoded)

encoded {'input_ids': tensor([[   3,   31,  517,  757,   29,    8,  826, 1242,  892,   13, 1139,  834,
          536,   10, 2118,  834,  357, 8023,    6, 9689,  416,  487, 2118,   12,
           36, 3907,   57,    8, 1139,    5,   31,    1]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1]])}


In [33]:
# Token-id tensor of the batch; this rebinds `source_seq`, which an earlier cell
# used for a list of target items.
source_seq = encoded_source['input_ids'].contiguous()

In [34]:
# Preview the first five rows of token ids.
print("source_seq", source_seq[:5])

source_seq tensor([[  947,    19,     8,  1242,   892,   570,    13,  1139,   834,   536,
         10744,    10,  2118,   834,   357,  8023,     5,  5263,    12,  1568,
           416,  2118,    12,     8,  1139,     5,     1,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     

In [35]:
# Attention mask of the batch (1 for tokens, 0 for padding in the printed rows).
source_mask = encoded_source['attention_mask'].contiguous()

In [36]:
# Preview the first five rows of the attention mask.
print("source_mask", source_mask[:5])

source_mask tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1,

In [37]:
# Padded sequence length of the batch (second dimension of source_seq).
max_len = source_seq.size(1)
print("max_len", max_len)

max_len 131


In [38]:
# Whole-word ids for the sampled prompts, counting both PH and PU as part of the
# preceding id, then converted to an int64 tensor.
whole_word_ids = compute_whole_word_id1(input_list, tokenizer, max_len)
whole_word = torch.tensor(whole_word_ids, dtype=torch.int64).contiguous()
print("whole_word_ids", whole_word_ids[:10])

print("list",input_list[:10])

whole_word_ids [[0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 2, 2, 2, 2, 3, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

In [43]:
# (row, column) of every token whose id is 10744. In the printed encoding above that
# id sits where the tokenizer output shows 'PU'. Hard-coded, so confirm it if the
# tokenizer changes.
pu_positions = (source_seq == 10744).nonzero(as_tuple=False)

In [45]:
# Show a slice (entries 10..49) of the PU positions.
print("pu_positions",pu_positions[10:50])

pu_positions tensor([[10,  9],
        [11,  3],
        [12, 10],
        [13,  9],
        [14, 10],
        [15,  3],
        [16, 10],
        [17, 10],
        [18, 10],
        [19,  9],
        [20,  9],
        [21,  9],
        [22,  9],
        [23,  9],
        [24, 10],
        [25, 10],
        [26,  9],
        [27,  9],
        [28,  9],
        [29,  9],
        [30,  9],
        [31,  3],
        [32, 10],
        [33, 10],
        [34,  9],
        [35, 10],
        [36,  9],
        [37,  9],
        [38,  9],
        [39,  3],
        [40,  9],
        [41,  9],
        [42, 10],
        [43, 10],
        [44,  9],
        [45,  9],
        [46,  9],
        [47,  3],
        [48,  9],
        [49,  9]])


In [None]:
# Assumes source_seq is an already existing tensor
partial_source_seq = source_seq[:5]

# Print this part of the tensor
print(partial_source_seq)

In [None]:
# Take the first 28 token ids of the first row
first_row = source_seq[0, :28]

# Drop zeros, since 0 is the padding id
non_zero_ids = first_row[first_row != 0]

# Decode with the tokenizer
decoded_text = tokenizer.decode(non_zero_ids.tolist())

# Print the decoded text
print(decoded_text)

In [None]:
# Rebuild the sampler's bookkeeping by hand and pick the entry at position 6.
user_list = list(seq_corpus.user2items_positive.keys())
sample_num = len(user_list)
index_list = list(range(sample_num))
idx = index_list[6]

In [None]:
# Number of users in the corpus.
print(sample_num)

In [None]:
# User id stored at the chosen position.
u = user_list[idx]

In [None]:
# Show the chosen user id.
print(u)

In [None]:
# Full interaction history (item-id strings) of the chosen user.
item_history = seq_corpus.user2items_positive[u]
print(item_history)

In [None]:
# Draw one random PH-suffixed segment for that user (`all` is the SeqSampler instance).
item_seg = all.sample_seq(u)
print(item_seg)

In [None]:
# Tokenize the target item ids into padded tensors.
encoded_target = tokenizer(output_list, padding=True, return_tensors='pt')

In [None]:
# Keep at most exp_len token columns of every target.
target_seq = encoded_target['input_ids'][:, :exp_len]
print("target_seq", target_seq[:5])

In [None]:
    whole_word_ids = []
    for seq in input_list:
        token_list = tokenizer.tokenize(seq)
        print("token_list", token_list)
        start_indices = []
        for idx, token in enumerate(token_list):
            if token == '_':
                start_indices.append(idx - 1)  # user_xx or item_xx, starts before _
        end_indices = []
        for start in start_indices:
            mover = start + 2  # user/item _ xx
            while mover < len(token_list) and token_list[mover].isdigit():
                mover += 1
            end_indices.append(mover)
        whole_word_id = [0] * len(token_list)  # padding
        for i, (start, end) in enumerate(zip(start_indices, end_indices)):
            whole_word_id[start:end] = [i + 1] * (end - start)  # leave 0 as padding token
            
        whole_word_ids.append(whole_word_id)
        
        padded_whole_word_ids = []
    for whole_word_id in whole_word_ids:
        padded_whole_word_ids.append(whole_word_id + [0] * (max_len - len(whole_word_id)))
        
 

In [None]:
# Build the model from the pretrained checkpoint, initialise its prompts, move it to
# the selected device and create the AdamW optimizer.
model = Solomon.from_pretrained(model_version)
model.init_prompt(task_num, prompt_num, device)
model.to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=lr)

In [None]:
# Tensor printing options for the inspection cells below.
torch.set_printoptions(threshold=1000, linewidth=80, precision=None, sci_mode=None)

In [None]:
# Embeddings produced from token ids plus whole-word ids. input_plus_whole_word is
# defined in module.Solomon (not shown here); presumably it sums the two, TODO confirm.
text_emb=model.input_plus_whole_word(source_seq, whole_word)


In [None]:
# Preview the embeddings of the first sequence.
print("text_emb", text_emb[:1])

In [None]:
# Shape of the embedding tensor.
print(text_emb.shape)

In [None]:


vocab = tokenizer.get_vocab()

# Look up the vocabulary id of "PH" (None when it is not a single vocabulary entry).
ph_token_id = vocab.get("PH", None)
print(ph_token_id)
# If the tokenizer splits "PH" into several sub-words, each sub-word id would have
# to be looked up instead (for example "p" and "h").

# Fetch and print the embedding only when the id exists. Printing unconditionally
# would raise NameError, or show a stale value from an earlier run.
if ph_token_id is not None:
    embed_layer = model.shared
    ph_embedding = embed_layer.weight[ph_token_id]
    print(ph_embedding)
else:
    print('"PH" is not a single vocabulary entry; no embedding to show.')

In [None]:

ph_token_id = 8023  # this id must be confirmed against the tokenizer actually in use

# Find the positions of PH in input_ids
ph_positions = (source_seq == ph_token_id).nonzero(as_tuple=False)

# Assumes only the PH position of the first batch row matters
# NOTE(review): ph_positions[0] is the first PH found anywhere in the batch; the
# indexing below uses row 0, which only matches if row 0 contains a PH.
first_ph_position = ph_positions[0][1]  # keep the column only, ignore the row
previous_word_position = first_ph_position - 1

# Embedding of the token before PH
previous_word_embedding = text_emb[0, previous_word_position, :]  # assumes batch_size=1
previous_word_embeddingPH = text_emb[0, previous_word_position+1, :]

# Id of the token before PH
previous_word_id = source_seq[0, previous_word_position]

# Convert the id back to a token with the tokenizer
tokenizer = T5Tokenizer.from_pretrained('t5-small')  # or use the tokenizer that matches your model
previous_word = tokenizer.convert_ids_to_tokens(previous_word_id.item())

# Print the results
print("Previous word embedding:", previous_word_embedding)
print("Previous word embeddingPH:", previous_word_embeddingPH)
print("Previous word:", previous_word)

In [None]:
ph_token_id = 8023  # hard-coded id used for the PH token

ph_positions = (source_seq == ph_token_id).nonzero(as_tuple=False)
print(ph_positions)

# Walk over every PH position
for pos in ph_positions:
    # Make sure PH is not the very first token of its row
    if pos[1] > 0:
        previous_word_position = pos[1] - 1
        # Id of the token before PH
        previous_word_id = source_seq[pos[0], previous_word_position]
        # Convert the id back to a token
        previous_word = tokenizer.convert_ids_to_tokens(previous_word_id.item())
        print(f"Batch {pos[0]}, Previous word before PH: {previous_word}")
    else:
        print(f"Batch {pos[0]}, PH is at the beginning of the sequence.")

In [None]:
class MlpProjector(nn.Module):
    """Two-layer MLP that maps vectors of size ``rec_size`` to size ``llm_size``.

    Layout: Linear(rec_size, llm_size) -> GELU -> Linear(llm_size, llm_size).

    Args:
        rec_size: Size of the last dimension of the input (default 64).
        llm_size: Size of the last dimension of the output (default 512).
    """

    def __init__(self, rec_size=64, llm_size=512):
        super().__init__()
        stages = [
            nn.Linear(rec_size, llm_size),
            nn.GELU(),
            nn.Linear(llm_size, llm_size),
        ]
        # Registered under the name ``mlp_proj`` so parameter names stay
        # ``mlp_proj.0.*`` and ``mlp_proj.2.*``.
        self.mlp_proj = nn.Sequential(*stages)

    def forward(self, x):
        """Apply the projection to ``x`` and return the result."""
        return self.mlp_proj(x)

In [None]:

# Overwrite the embedding at every "PH" placeholder with a projected item
# embedding. The item is identified by the token right before the PH, which is
# parsed as an integer item id.
# NOTE(review): this only works if the whole item id is a single token -- confirm.

# Hard-coded id of the PH token.
# NOTE(review): 8023 has to match the tokenizer that produced source_seq -- confirm.
ph_token_id = 8023

# (row, column) index pairs of every PH occurrence in the input ids.
ph_positions = (source_seq == ph_token_id).nonzero(as_tuple=False)

# Load the item embeddings.
# SECURITY: pickle.load can execute arbitrary code while loading; only open
# embedding files that come from a trusted source.
with open('SASRec_item_embed.pkl', 'rb') as f:
    item_embeds = pickle.load(f)

# Projector with default sizes, constructed fresh here; no trained weights are
# loaded in this cell.
projector = MlpProjector().to(device)

# Visit every PH position.
for pos in ph_positions:
    # A PH in the first column has no preceding token to read an id from.
    if pos[1] > 0:
        previous_word_position = pos[1] - 1
        # Id and token string of the token right before PH.
        previous_word_id = source_seq[pos[0], previous_word_position]
        previous_word = tokenizer.convert_ids_to_tokens(previous_word_id.item())

        # SentencePiece marks word-initial pieces with a leading "\u2581";
        # int() rejects that character, so strip it first. Tokens that are
        # still not numeric are reported and skipped instead of aborting the
        # whole loop with ValueError.
        try:
            previous = int(previous_word.lstrip("\u2581"))
        except ValueError:
            print(f"Batch {pos[0]}, token before PH is not an item id: {previous_word!r}")
            continue
        print(previous)

        # An id without a stored embedding is reported and skipped as well.
        try:
            item_embedding = item_embeds[previous]
        except (KeyError, IndexError):
            print(f"Batch {pos[0]}, no item embedding found for id {previous}.")
            continue

        # Convert the stored item embedding to a float32 tensor on the target device.
        item_embedding_tensor = torch.as_tensor(item_embedding, dtype=torch.float32, device=device).detach()

        # Project it with the MlpProjector built above.
        projected_embedding = projector(item_embedding_tensor)

        # Replace the embedding at the PH slot in place.
        text_emb[pos[0], pos[1], :] = projected_embedding

    else:
        print(f"Batch {pos[0]}, PH is at the beginning of the sequence.")

# Print part of the modified text_emb to check that the replacement happened.
print("Modified text_emb:")
print(text_emb[:1])  # first row of the batch only

In [None]:

# Re-inspect the first "PH" placeholder in the batch: print the embedding
# stored at the PH slot, the embedding of the token right before it, and that
# token.

# Hard-coded id of the PH token.
# NOTE(review): 8023 has to match the tokenizer that produced source_seq -- confirm.
ph_token_id = 8023

# (row, column) index pairs of every PH occurrence in the input ids.
ph_positions = (source_seq == ph_token_id).nonzero(as_tuple=False)

if ph_positions.shape[0] == 0:
    # Previously ph_positions[0] raised IndexError when nothing matched.
    print(f"Token id {ph_token_id} (PH) does not occur in source_seq.")
else:
    # Index with the row the first match was actually found in. The old code
    # always read row 0, which is wrong whenever row 0 contains no PH.
    first_ph_batch = ph_positions[0][0]
    first_ph_position = ph_positions[0][1]

    # Embedding currently stored at the PH slot itself.
    previous_word_embeddingPH = text_emb[first_ph_batch, first_ph_position, :]

    if first_ph_position > 0:
        previous_word_position = first_ph_position - 1

        # Embedding and id of the token immediately before PH.
        previous_word_embedding = text_emb[first_ph_batch, previous_word_position, :]
        previous_word_id = source_seq[first_ph_batch, previous_word_position]

        # Map the id back to its token string.
        previous_word = tokenizer.convert_ids_to_tokens(previous_word_id.item())

        print("Previous word embedding:", previous_word_embedding)
        print("Previous word embeddingPH:", previous_word_embeddingPH)
        print("Previous word:", previous_word)
    else:
        # A PH in column 0 has no predecessor; index -1 would silently wrap
        # around to the last token of the sequence.
        print("Previous word embeddingPH:", previous_word_embeddingPH)
        print(f"Batch {first_ph_batch}, PH is at the beginning of the sequence.")

In [None]:
        # Project the embedding stored under key/index 1 of item_embeds and print it.
        # Uses item_embeds, projector and device from the notebook namespace.
        # NOTE(review): every line of this cell is indented as if pasted from a
        # loop body; it relies on the notebook tolerating a uniformly indented
        # cell -- consider dedenting.
        item_embedding = item_embeds[1]
        
        # Convert the stored item embedding to a float32 tensor on the target device.
        item_embedding_tensor = torch.as_tensor(item_embedding, dtype=torch.float32, device=device).detach()
        
        # Project it with the MlpProjector instance.
        projected_embedding = projector(item_embedding_tensor)
        print("projected_embedding", projected_embedding)