In [33]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
import numpy as np
import os
import seaborn as sns
import pickle
import pandas as pd
import torch.backends.cudnn as cudnn
import random
from sklearn.metrics import classification_report

In [34]:
def fix_random_seed_as(random_seed):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random``, NumPy's legacy global RNG, and PyTorch's CPU
    and CUDA generators with ``random_seed``, and switches cuDNN to
    deterministic mode with autotuning (benchmark) disabled.

    Args:
        random_seed: Integer seed handed to each generator.
    """
    # cuDNN flags first: they are independent of the seeding calls below.
    cudnn.benchmark = False
    cudnn.deterministic = True

    seeders = (
        random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
        np.random.seed,
    )
    for seed_fn in seeders:
        seed_fn(random_seed)


fix_random_seed_as(2024)

In [35]:
# Experiment configuration shared by the cells below.
dataset_name = 'ml-100k'   # sub-directory name under both the dataset and checkpoint folders
model_name = 'dreamrec'    # sub-directory name of the pretrained model checkpoint
hidden_size = 128          # embedding width -- presumably matches the checkpoint; verify
# Prefer the first GPU, but fall back to CPU so the notebook still runs on
# machines without CUDA instead of failing at the first `.to(device)` call.
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
num_class = 26             # number of label columns predicted per item
batch_size = 128

In [36]:
def load_pretrained_emb_weight():
    """Return the pretrained item-embedding weights from the saved checkpoint.

    The checkpoint is read from ``saved/<model_name>/<dataset_name>/pretrain.pth``
    (both names are module-level settings) and mapped onto the CPU.

    Returns:
        The object stored under the ``'item_embedding.weight'`` key of the
        checkpoint.
    """
    checkpoint_file = os.path.join('saved', model_name, dataset_name, 'pretrain.pth')
    # NOTE(review): weights_only=False allows arbitrary unpickling -- only load
    # checkpoints from a trusted source.
    checkpoint = torch.load(checkpoint_file, map_location='cpu', weights_only=False)
    return checkpoint['item_embedding.weight']

In [37]:
class FeaturePredictionModel(nn.Module):
    """Linear probe that predicts per-item class logits from frozen item embeddings.

    Pipeline: frozen pretrained embedding lookup -> non-affine batch
    normalisation -> a single linear layer producing one logit per class.

    Args:
        pretrained_weight: Optional 2-D tensor ``(num_items, emb_dim)`` used as
            the frozen embedding table. When omitted, the weights are loaded
            with ``load_pretrained_emb_weight()``.
        num_classes: Optional number of output logits. When omitted, the
            module-level ``num_class`` setting is used.
    """

    def __init__(self, pretrained_weight=None, num_classes=None):
        super().__init__()
        if pretrained_weight is None:
            pretrained_weight = load_pretrained_emb_weight()
        if num_classes is None:
            num_classes = num_class

        # Size the downstream layers from the embedding table itself so a
        # checkpoint with a different width cannot cause a shape mismatch.
        emb_dim = pretrained_weight.size(1)

        # Embedding layer: maps item ids to their (frozen) pretrained vectors.
        self.embedding = nn.Embedding.from_pretrained(pretrained_weight, padding_idx=0, freeze=True)
        # Normalisation without learnable scale/shift, so the linear layer is
        # the only trainable component.
        self.norm = nn.BatchNorm1d(emb_dim, affine=False)
        # Linear probe: outputs one raw logit per class (not a scalar).
        self.Linearproblayer = nn.Linear(emb_dim, num_classes)
        self.init_weights(self.Linearproblayer)

    def init_weights(self, m):
        """Apply Xavier-uniform weights and a zero bias to ``m`` if it is ``nn.Linear``."""
        if isinstance(m, nn.Linear):  # only linear layers are re-initialised
            nn.init.xavier_uniform_(m.weight)
            if m.bias is not None:
                nn.init.zeros_(m.bias)

    def forward(self, item_id):
        """Return class logits for a batch of item ids.

        Args:
            item_id: Integer tensor of item ids; expected to be 1-D ``(batch,)``
                so the normalised embeddings are ``(batch, emb_dim)``.

        Returns:
            Tensor of raw logits with one column per class.
        """
        # Look up the embedding vector of every item id.
        item_emb = self.embedding(item_id)
        item_emb = self.norm(item_emb)
        # Project the normalised embedding onto the class logits.
        x = self.Linearproblayer(item_emb)
        return x

In [38]:
# Read the per-item class table for the configured dataset.
path = f'../datasets/data/{dataset_name}/item_class.pkl'
df = pd.read_pickle(path)
print(df.shape)
# Every column except `item_id` is cast to float and kept as the feature/label
# table; the ids themselves are pulled out as a plain array.
features = df.drop(columns=['item_id']).astype(float)
item_ids = df['item_id'].values

(1008, 27)


In [39]:
class FeaturePredictionDataset(Dataset):
    """Dataset pairing each item id with its row of feature/label values.

    Args:
        item_ids: Sequence/array of integer item ids, one per sample.
        features: Table (e.g. a pandas DataFrame or 2-D array) of numeric
            values with one row per entry of ``item_ids``.

    Raises:
        ValueError: If ``item_ids`` and ``features`` have different lengths.
    """

    def __init__(self, item_ids, features):
        if len(item_ids) != len(features):
            raise ValueError(
                f"item_ids and features must have the same length, "
                f"got {len(item_ids)} and {len(features)}"
            )
        self.item_ids = item_ids
        self.features = features
        # Convert once up front: a per-sample pandas `.iloc` lookup plus tensor
        # construction in `__getitem__` is needlessly slow.
        self._item_id_tensor = torch.tensor(np.asarray(item_ids), dtype=torch.long)
        self._feature_tensor = torch.tensor(np.asarray(features), dtype=torch.float32)

    def __len__(self):
        """Return the number of samples."""
        return len(self.item_ids)

    def __getitem__(self, idx):
        """Return ``(item_id, features)`` for position ``idx``.

        The item id is a ``torch.long`` tensor and the feature row is a
        ``torch.float32`` tensor.
        """
        return self._item_id_tensor[idx], self._feature_tensor[idx]

# Build the dataset, then carve out a seeded 80/20 train/validation split.
dataset = FeaturePredictionDataset(item_ids, features)
split_generator = torch.Generator().manual_seed(2024)
train_data, val_data = torch.utils.data.random_split(
    dataset, [0.8, 0.2], generator=split_generator
)
# Only the training loader reshuffles; validation keeps a fixed order.
train_dataloader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
val_dataloader = DataLoader(val_data, shuffle=False, batch_size=batch_size)

In [40]:
# Initialise the model on the configured device.
model = FeaturePredictionModel().to(device)
# Loss and optimiser. BCEWithLogitsLoss applies the sigmoid internally and
# scores every class as an independent binary decision.
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

# Training loop.
epochs = 20
for epoch in range(epochs):
    # ---- training phase ----
    model.train()
    running_loss = 0.0
    total_train = 0
    correct_train = 0
    # The label batch is named `target`, not `features`: reusing `features`
    # here would overwrite the module-level feature table and break a re-run
    # of the dataset cell after training.
    # `target` is assumed to be a (batch_size, num_class) 0/1 float tensor.
    for item_id, target in train_dataloader:
        item_id, target = item_id.to(device), target.to(device)

        optimizer.zero_grad()               # clear stale gradients
        outputs = model(item_id)            # forward pass -> raw logits
        loss = criterion(outputs, target)
        loss.backward()                     # back-propagate
        optimizer.step()                    # update parameters

        running_loss += loss.item()
        # Element-wise training accuracy at a 0.5 probability threshold.
        predicted_labels = (torch.sigmoid(outputs.detach()) > 0.5).float()
        correct_train += (predicted_labels == target).sum().item()
        total_train += target.numel()

    print(f"Epoch [{epoch+1}/{epochs}], train Loss: {running_loss/len(train_dataloader):.4f}, train Accuracy: {correct_train/total_train:.4f}")

    # ---- validation phase ----
    model.eval()
    running_val_loss = 0.0
    all_val_preds = []
    all_val_targets = []

    with torch.no_grad():  # no gradients needed for evaluation
        for item_id, target in val_dataloader:
            item_id, target = item_id.to(device), target.to(device)

            outputs = model(item_id)
            loss = criterion(outputs, target)
            predicted_labels = (torch.sigmoid(outputs) > 0.5).float()

            # Keep predictions and ground truth for the epoch-level metrics.
            all_val_preds.append(predicted_labels.cpu().numpy())
            all_val_targets.append(target.cpu().numpy())

            running_val_loss += loss.item()

    # Stack the per-batch arrays into (num_val_samples, num_class) matrices.
    all_val_preds = np.concatenate(all_val_preds, axis=0)
    all_val_targets = np.concatenate(all_val_targets, axis=0)

    # Element-wise accuracy over every (sample, class) cell.
    val_accuracy = (all_val_preds == all_val_targets).sum() / all_val_targets.size
    print(f"Validation Loss: {running_val_loss / len(val_dataloader):.4f} val Accuracy: {val_accuracy:.4f}")
    if epoch%10==9:
        # The detailed per-class report is only shown every 10th epoch, so it
        # is only computed then.
        val_report = classification_report(all_val_targets, all_val_preds, target_names=[f'Class {i}' for i in range(num_class)], zero_division=0)
        print(f"Validation Classification Report:\n{val_report}")
    


Epoch [1/20], train Loss: 0.8170, train Accuracy: 0.5088
Validation Loss: 0.7439 val Accuracy: 0.5245
Epoch [2/20], train Loss: 0.6999, train Accuracy: 0.5618
Validation Loss: 0.6767 val Accuracy: 0.5819
Epoch [3/20], train Loss: 0.6336, train Accuracy: 0.6631
Validation Loss: 0.6342 val Accuracy: 0.6933
Epoch [4/20], train Loss: 0.5932, train Accuracy: 0.7758
Validation Loss: 0.6051 val Accuracy: 0.7782
Epoch [5/20], train Loss: 0.5636, train Accuracy: 0.8454
Validation Loss: 0.5817 val Accuracy: 0.8224
Epoch [6/20], train Loss: 0.5383, train Accuracy: 0.8846
Validation Loss: 0.5595 val Accuracy: 0.8490
Epoch [7/20], train Loss: 0.5152, train Accuracy: 0.9048
Validation Loss: 0.5393 val Accuracy: 0.8617
Epoch [8/20], train Loss: 0.4946, train Accuracy: 0.9173
Validation Loss: 0.5201 val Accuracy: 0.8739
Epoch [9/20], train Loss: 0.4731, train Accuracy: 0.9252
Validation Loss: 0.5030 val Accuracy: 0.8816
Epoch [10/20], train Loss: 0.4570, train Accuracy: 0.9300
Validation Loss: 0.4862 