In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import Transformer
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer
from torch.utils.data import DataLoader,Dataset
from torch.optim import AdamW
from torch.nn import CosineSimilarity


# 读取数据构建正负样本对

### 针对每一个用户去获取他的评论

In [None]:
def build_user_data(data, top_n=2):
    """Build labelled (user, hotel) samples from explicit ratings.

    For every user the result contains:

    * one sample per hotel the user actually rated (label 1 if rating > 3,
      otherwise 0), and
    * one sample per hotel that one of the user's ``top_n`` most similar users
      rated but the user did not, labelled from the similar user's rating.

    Hotels that nobody in that neighbourhood rated produce no sample; a missing
    rating is never treated as a low rating.

    Args:
        data: Anything ``pd.DataFrame`` accepts, with the columns ``user_id``,
            ``hotel_id``, ``rating`` and ``comment``.
        top_n: Number of most similar users whose ratings are borrowed.

    Returns:
        A tuple ``(X, y)``. ``X`` has the columns ``user_id`` and ``hotel_id``
        (integer category codes) and ``comment``. ``y`` has the columns
        ``label``, ``user_id`` and ``hotel_id`` with the original ids.
    """
    df = pd.DataFrame(data)

    # Ratings that were really given. Unrated cells stay NaN so "not rated"
    # can be told apart from any real rating value.
    rated_matrix = df.pivot_table(index='user_id', columns='hotel_id', values='rating')

    # Cosine similarity needs a dense matrix, so missing ratings count as 0
    # for the similarity computation only.
    user_hotel_matrix = rated_matrix.fillna(0)
    user_similarity = cosine_similarity(user_hotel_matrix)

    # One comment per hotel; when a hotel has several rows the last one wins.
    # Built once here instead of once per user.
    hotel_intros = df.set_index('hotel_id')['comment'].to_dict()

    def find_similar_users(user_id, similarity_matrix, top_n=2):
        """Return the ids of the ``top_n`` users most similar to ``user_id``."""
        user_index = user_hotel_matrix.index.get_loc(user_id)
        sim_scores = sorted(
            enumerate(similarity_matrix[user_index]),
            key=lambda x: x[1],
            reverse=True,
        )
        # Exclude the user by position rather than assuming they sort first:
        # an all-zero rating row has similarity 0 with itself.
        neighbours = [i for i, _ in sim_scores if i != user_index][:top_n]
        return [user_hotel_matrix.index[i] for i in neighbours]

    def rated_hotels(user_id):
        """Return ``{hotel_id: rating}`` for the hotels ``user_id`` rated."""
        return rated_matrix.loc[user_id].dropna().to_dict()

    def build_samples(user_id, top_users, hotel_ratings, hotel_intros):
        """Build the ``(user, hotel, label, comment)`` samples for one user."""
        samples = []
        labelled = set(hotel_ratings)

        # The user's own ratings.
        for hotel, rating in hotel_ratings.items():
            samples.append((user_id, hotel, int(rating > 3), hotel_intros.get(hotel, '')))

        # Ratings borrowed from similar users, most similar first. Each hotel
        # is labelled at most once so a (user, hotel) pair never gets
        # duplicate or contradictory labels.
        for similar_user in top_users:
            for hotel, rating in rated_hotels(similar_user).items():
                if hotel in labelled:
                    continue
                labelled.add(hotel)
                samples.append((user_id, hotel, int(rating > 3), hotel_intros.get(hotel, '')))

        return samples

    # Build the samples for every user.
    all_samples = []
    for user_id in user_hotel_matrix.index:
        top_users = find_similar_users(user_id, user_similarity, top_n)
        samples = build_samples(user_id, top_users, rated_hotels(user_id), hotel_intros)
        all_samples.extend(samples)

    samples_df = pd.DataFrame(all_samples, columns=['user_id', 'hotel_id', 'label', 'comment'])

    # Encode the ids as category codes on an explicit copy so the assignment
    # does not write into a slice of samples_df.
    X = samples_df[['user_id', 'hotel_id', 'comment']].copy()
    X['user_id'] = X['user_id'].astype('category').cat.codes
    X['hotel_id'] = X['hotel_id'].astype('category').cat.codes

    y = samples_df.loc[:, ['label', 'user_id', 'hotel_id']]

    return X, y


In [None]:
# Load the raw user review table and derive the labelled (user, hotel) samples.
# NOTE(review): '/user_data' and '/hotel_data' are absolute paths at the
# filesystem root; confirm they exist in the target environment.
data=pd.read_csv('/user_data')
X_user,y_user=build_user_data(data)
# Hotel table; a later cell reads its 'hotel_id' and 'hotel_intro' columns.
hotel_data=pd.read_csv('/hotel_data')

# 构建输入数据

In [None]:
# Group the reviews and merge them into one text per user and per hotel.
df = pd.DataFrame(data)
df2 = pd.DataFrame(hotel_data)


def _join_text(values):
    """Space-join the non-missing entries of a text Series.

    Returns NaN when the group has no usable text, so downstream code can treat
    it as missing. Missing entries are skipped because ``' '.join`` raises
    ``TypeError`` on NaN.
    """
    texts = values.dropna().astype(str)
    return ' '.join(texts) if len(texts) else np.nan


def _merge_hotel_text(row):
    """Combine a hotel's user comments and its own description into one string."""
    comment, intro = row['comment'], row['hotel_intro']
    if pd.notna(comment) and pd.notna(intro):
        return f"{comment}, {intro}"
    return comment if pd.notna(comment) else intro


# One row per user: all comments written by that user.
user_data = df.groupby('user_id')['comment'].apply(_join_text).reset_index()
# One row per hotel: all comments written about it, and its own description.
hotel_intro_users = df.groupby('hotel_id')['comment'].apply(_join_text).reset_index()
hotel_intro_hotel = df2.groupby('hotel_id')['hotel_intro'].apply(_join_text).reset_index()
# Outer join keeps hotels that have only comments or only a description.
combined_df = pd.merge(hotel_intro_users, hotel_intro_hotel, on='hotel_id', how='outer')

# Merge the 'comment' and 'hotel_intro' columns.
combined_df['text'] = combined_df.apply(_merge_hotel_text, axis=1)

# Keep only the columns needed downstream.
# NOTE: this rebinds `hotel_data` to a frame without 'hotel_intro', so this cell
# cannot be re-run without first re-running the cell that loads the hotel CSV.
hotel_data = combined_df[['hotel_id', 'text']]

In [None]:
print(hotel_data.head(10))  # per hotel: the comments of every user who stayed there plus the hotel's own description
print(user_data.head(10))  # per user: the comments that user wrote about the hotels they stayed at

# 双塔模型

In [None]:
class RMSNorm(nn.Module):
    """Root-mean-square normalisation over the last dimension.

    Each vector along the last axis is divided by ``sqrt(mean(x**2) + eps)``
    and then scaled by a learnable per-feature ``weight`` (initialised to 1).

    Args:
        dim: Size of the last dimension of the input.
        eps: Constant added under the square root for numerical stability.
    """

    def __init__(self, dim, eps=1e-6):
        super().__init__()
        self.eps = eps
        self.weight = nn.Parameter(torch.ones(dim))

    def forward(self, x):
        """Return ``x`` normalised by its RMS along the last dimension."""
        mean_square = x.pow(2).mean(dim=-1, keepdim=True)
        rms = (mean_square + self.eps).sqrt()
        scaled = self.weight * x
        return scaled / rms

class RoPE(nn.Module):
    """Additive sinusoidal positional encoding.

    NOTE: despite the class name, this module does not rotate query/key pairs
    as rotary position embedding does. It adds the fixed sine/cosine table to
    the input. The name is kept so existing callers keep working.

    Args:
        d_model: Size of the last (feature) dimension of the input.
        device: Stored for backward compatibility only. ``forward`` builds the
            encoding on the input's own device so the module keeps working
            after the model is moved.
        max_len: Stored but not used by ``forward``.
    """

    def __init__(self, d_model, device, max_len=5000):
        super(RoPE, self).__init__()
        self.d_model = d_model
        self.max_len = max_len
        self.device = device

    def forward(self, x):
        """Add positional encodings to ``x``.

        Args:
            x: Tensor whose dimension 1 is the sequence axis and whose last
                dimension is ``d_model`` (e.g. ``(batch, seq_len, d_model)``).

        Returns:
            ``x`` plus a ``(seq_len, d_model)`` sinusoid table, broadcast over
            the leading dimension.
        """
        seq_len = x.size(1)
        device = x.device  # follow the input, not the device captured at init

        position = torch.arange(seq_len, dtype=torch.float32, device=device).unsqueeze(1)
        div_term = torch.exp(
            torch.arange(0, self.d_model, 2, dtype=torch.float32, device=device)
            * -(torch.log(torch.tensor(10000.0, device=device)) / self.d_model)
        )
        angles = position * div_term

        pos_enc = torch.zeros(seq_len, self.d_model, device=device)
        pos_enc[:, 0::2] = torch.sin(angles)
        # For odd d_model there is one fewer odd column than even columns.
        pos_enc[:, 1::2] = torch.cos(angles[:, : self.d_model // 2])
        return x + pos_enc
    
class TransformerEncoder(nn.Module):
    """Token-level text encoder: embedding, positional encoding, Transformer, RMSNorm.

    Args:
        vocab_size: Number of token ids in the embedding table.
        d_model: Embedding / hidden size.
        nhead: Number of attention heads per layer.
        num_layers: Number of stacked encoder layers.
        max_len: Forwarded to the positional-encoding module.

    The feed-forward width is fixed at 2048 and the activation is GELU.
    """

    def __init__(self, vocab_size, d_model, nhead, num_layers, max_len=5000):
        super(TransformerEncoder, self).__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.rope = RoPE(d_model, device, max_len)
        self.transformer_encoder = nn.TransformerEncoder(
            # batch_first=True: inputs are (batch, seq, d_model). With the
            # default (seq-first) layout the batch axis would be treated as the
            # sequence and attention would mix different samples.
            nn.TransformerEncoderLayer(
                d_model=d_model,
                nhead=nhead,
                dim_feedforward=2048,
                activation='gelu',
                batch_first=True,
            ),
            num_layers=num_layers
        )
        self.rmsnorm = RMSNorm(d_model)

    def forward(self, x):
        """Encode a batch of token ids.

        Args:
            x: Integer tensor of shape ``(batch, seq_len)``. Token id 0 is
                treated as padding and is masked out as an attention key.

        Returns:
            Tensor of shape ``(batch, seq_len, d_model)`` with one vector per
            token position.
        """
        # True where the token is padding; shape (batch, seq_len), as
        # src_key_padding_mask expects for batch-first input.
        padding_mask = x == 0

        x = self.embedding(x)
        x = self.rope(x)
        x = self.transformer_encoder(x, src_key_padding_mask=padding_mask)
        x = self.rmsnorm(x)
        return x

In [None]:
class TripletDataset(Dataset):
    """Dataset of (user text, liked-hotel text, other-hotel text) triplets.

    Args:
        user_data: DataFrame with one row per user and the columns ``user_id``
            and ``comment``.
        hotel_data: DataFrame with one row per hotel, the column ``hotel_id``
            and a text column named ``text`` (preferred) or ``hotel_intro``.
        y_user: DataFrame with the columns ``user_id``, ``hotel_id`` and
            ``label``; rows with ``label == 1`` are positives.
        tokenizer_name: Name passed to ``AutoTokenizer.from_pretrained``.
        max_length: Every text is padded / truncated to this many tokens.

    Triplet indices are row positions in ``user_data`` / ``hotel_data``, derived
    from the order of first appearance of each id. This assumes each id appears
    on exactly one row.
    """

    def __init__(self, user_data, hotel_data, y_user, tokenizer_name='bert-base-chinese', max_length=128):
        self.user_data = user_data
        self.hotel_data = hotel_data
        self.y_user = y_user
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
        self.max_length = max_length

        # Map user_id / hotel_id to positional indices.
        self.user_id_to_index = {uid: idx for idx, uid in enumerate(user_data['user_id'].unique())}
        self.hotel_id_to_index = {hid: idx for idx, hid in enumerate(hotel_data['hotel_id'].unique())}

        self.triplets = self._create_triplets()

    def _hotel_texts(self):
        """Return the array of hotel texts, from ``text`` or else ``hotel_intro``."""
        column = 'text' if 'text' in self.hotel_data.columns else 'hotel_intro'
        return self.hotel_data[column].values

    def _create_triplets(self):
        """Build one ``(anchor, positive, negative)`` index triple per positive row.

        The negative is drawn uniformly (via ``np.random.choice``) from the
        hotels the user has no positive label for, so a hotel the user liked is
        never used as that user's negative. Users whose positives cover every
        hotel contribute no triplets.
        """
        positives = self.y_user[self.y_user['label'] == 1]
        pairs = list(zip(positives['user_id'], positives['hotel_id']))

        # Hotel indices each user liked.
        liked = {}
        for user_id, hotel_id in pairs:
            liked.setdefault(user_id, set()).add(self.hotel_id_to_index[hotel_id])

        all_hotel_indices = list(self.hotel_id_to_index.values())
        candidates_by_user = {}  # computed once per user, not once per row
        triplets = []
        for user_id, hotel_id in pairs:
            anchor_index = self.user_id_to_index[user_id]
            positive_index = self.hotel_id_to_index[hotel_id]

            if user_id not in candidates_by_user:
                candidates_by_user[user_id] = [
                    idx for idx in all_hotel_indices if idx not in liked[user_id]
                ]
            candidates = candidates_by_user[user_id]
            if candidates:
                negative_index = int(np.random.choice(candidates))
                triplets.append((anchor_index, positive_index, negative_index))

        return triplets

    def _tokenize(self, texts):
        """Tokenize ``texts`` into fixed-length PyTorch tensors."""
        # Ensure all texts are padded/truncated to the same length
        return self.tokenizer(texts, padding='max_length', truncation=True, max_length=self.max_length, return_tensors='pt')

    def __len__(self):
        return len(self.triplets)

    def __getitem__(self, index):
        """Return three ``(input_ids, attention_mask)`` pairs: anchor, positive, negative."""
        anchor_idx, positive_idx, negative_idx = self.triplets[index]
        hotel_texts = self._hotel_texts()
        anchor_text = self.user_data['comment'].values[anchor_idx]
        positive_text = hotel_texts[positive_idx]
        negative_text = hotel_texts[negative_idx]

        # Tokenize texts
        anchor_tokens = self._tokenize([anchor_text])
        positive_tokens = self._tokenize([positive_text])
        negative_tokens = self._tokenize([negative_text])

        # squeeze(0) drops the batch dimension added by tokenizing a 1-element list.
        return (anchor_tokens['input_ids'].squeeze(0), anchor_tokens['attention_mask'].squeeze(0)), \
               (positive_tokens['input_ids'].squeeze(0), positive_tokens['attention_mask'].squeeze(0)), \
               (negative_tokens['input_ids'].squeeze(0), negative_tokens['attention_mask'].squeeze(0))

def create_dataloader(user_data, hotel_data, y_user, batch_size=3, shuffle=False, tokenizer_name='bert-base-chinese'):
    """Wrap a ``TripletDataset`` built from the given frames in a ``DataLoader``.

    Args:
        user_data, hotel_data, y_user: Passed through to ``TripletDataset``.
        batch_size: Number of triplets per batch.
        shuffle: Whether the loader reshuffles the triplets.
        tokenizer_name: Tokenizer name handed to ``TripletDataset``.

    Returns:
        The configured ``DataLoader``.
    """
    return DataLoader(
        TripletDataset(user_data, hotel_data, y_user, tokenizer_name=tokenizer_name),
        batch_size=batch_size,
        shuffle=shuffle,
    )

hotel_df = pd.DataFrame(hotel_data)

dataloader = create_dataloader(user_data, hotel_df, y_user, batch_size=2, tokenizer_name='bert-base-chinese')

# Print a single batch as a sanity check. Each element is an
# [input_ids, attention_mask] pair. Without the `break` this loop would
# tokenize and print every batch in the dataset.
for batch in dataloader:
    anchor_tokens, positive_tokens, negative_tokens = batch
    print("Anchor tokens:", anchor_tokens)
    print("Positive tokens:", positive_tokens)
    print("Negative tokens:", negative_tokens)
    print("Anchor tokens type:", type(anchor_tokens[0]))
    break


In [None]:
class MLP(nn.Module):
    """Two-layer perceptron: ``Linear -> ReLU -> Linear``.

    Args:
        d_model: Input feature size.
        hidden_dim: Width of the hidden layer.
        output_dim: Output feature size.
    """

    def __init__(self, d_model, hidden_dim, output_dim):
        super().__init__()
        self.fc1 = nn.Linear(d_model, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, output_dim)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Project the last dimension of ``x`` from ``d_model`` to ``output_dim``."""
        hidden = self.fc1(x)
        activated = self.relu(hidden)
        return self.fc2(activated)

In [None]:
class DualTowerModel(nn.Module):
    """Two-tower model: one encoder for user text, one for hotel text.

    Both towers feed the same projection MLP. The positive and the negative
    hotel go through the same hotel encoder.

    Args:
        vocab_size, d_model, nhead, num_layers: Forwarded to each
            ``TransformerEncoder`` tower.
        hidden_dim, output_dim: Hidden and output sizes of the shared ``MLP``.
    """

    def __init__(self, vocab_size, d_model, nhead, num_layers,hidden_dim,output_dim):
        super().__init__()
        encoder_args = (vocab_size, d_model, nhead, num_layers)
        self.user_encoder = TransformerEncoder(*encoder_args)
        self.hotel_encoder = TransformerEncoder(*encoder_args)
        self.mlp = MLP(d_model, hidden_dim, output_dim)

    def _project(self, encoder, tokens):
        """Encode ``tokens`` with ``encoder`` and apply the shared MLP."""
        return self.mlp(encoder(tokens))

    def forward(self, input,pos,neg):
        """Return the representations of the user, positive hotel and negative hotel.

        Args:
            input: Token ids of the user (anchor) text.
            pos: Token ids of the positive hotel text.
            neg: Token ids of the negative hotel text.
        """
        user_repr = self._project(self.user_encoder, input)
        pos_repr = self._project(self.hotel_encoder, pos)
        neg_repr = self._project(self.hotel_encoder, neg)
        return user_repr, pos_repr, neg_repr

In [None]:
# Tokenizer is loaded here only to obtain the vocabulary size for the embedding tables.
tokenizer=AutoTokenizer.from_pretrained('bert-base-chinese')
# Train on GPU when one is available, otherwise on CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
vocab_size = tokenizer.vocab_size
# Encoder hyper-parameters (used by both towers).
d_model = 512
nhead = 8
num_layers = 6

# Projection MLP sizes: d_model -> hidden_dim -> output_dim.
hidden_dim=256
output_dim=128

model=DualTowerModel(vocab_size, d_model, nhead, num_layers,hidden_dim,output_dim)
# As the last expression of the cell, this also displays the module structure.
model.to(device)

In [None]:
def train_triplet_loss(model, dataloader, optimizer, device, margin=1.0):
    """Train ``model`` for one pass over ``dataloader`` with triplet margin loss.

    Args:
        model: Module called as ``model(anchor, positive, negative)`` that
            returns the three corresponding representations.
        dataloader: Iterable of batches ``(anchor, positive, negative)``. Each
            element is either a tensor of token ids or an
            ``[input_ids, attention_mask]`` pair, which is what default
            collation produces for ``(input_ids, attention_mask)`` tuples.
        optimizer: Optimizer over ``model``'s parameters.
        device: Device the model and the batches are moved to.
        margin: Margin of the triplet loss (L2 distance).

    Returns:
        The mean batch loss as a float, or ``nan`` when the dataloader yields
        no batches. The per-batch loss is also printed.
    """
    model.train()
    model.to(device)
    criterion = nn.TripletMarginLoss(margin=margin, p=2)  # L2 distance

    def _input_ids(tokens):
        # Only the token ids go into the model. Concatenating them with the
        # attention mask would feed the 0/1 mask rows in as extra "token ids".
        return tokens[0] if isinstance(tokens, (list, tuple)) else tokens

    batch_losses = []
    for batch in dataloader:
        anchor_tokens, positive_tokens, negative_tokens = batch

        anchor = _input_ids(anchor_tokens).to(device)
        positive = _input_ids(positive_tokens).to(device)
        negative = _input_ids(negative_tokens).to(device)

        optimizer.zero_grad()

        anchor_output, positive_output, negative_output = model(anchor, positive, negative)

        # Triplet loss
        loss = criterion(anchor_output, positive_output, negative_output)

        # Back-propagate and update the parameters
        loss.backward()
        optimizer.step()

        loss_value = loss.item()
        batch_losses.append(loss_value)
        print(f"Loss: {loss_value}")

    return sum(batch_losses) / len(batch_losses) if batch_losses else float('nan')

In [None]:
# Training loop
num_epochs=10
# AdamW over all parameters of both towers and the projection MLP.
optimizer = AdamW(model.parameters(), lr=1e-5)
# Same device selection as in the model-setup cell.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Each epoch is one full pass over `dataloader`; train_triplet_loss prints the per-batch loss.
for epoch in range(num_epochs):
    print(f"Epoch {epoch+1}/{num_epochs}")
    train_triplet_loss(model,dataloader,  optimizer, device)