In [1]:
import torch
import numpy as np
def filepath(dataset):
    dataset_info = 'Data/'+dataset + '.info'
    training_set = 'Data/'+dataset + '_training.dat'
    test_set = 'Data/' + dataset + '_test.dat'
    negative_set = 'Data/' + dataset + '_negative.dat'
    return dataset_info, training_set, test_set, negative_set

if torch.backends.mps.is_available():
    print("MPS is available!")
    device = torch.device("mps")
elif torch.cuda.is_available():
    print("CUDA is available!")
    device = torch.device("cuda")
else:
    print("No GPU available, using CPU instead.")
    device = torch.device("cpu")


CUDA is available!


In [9]:
class InteractionDataset(torch.utils.data.Dataset):
    def __init__(self, user_ids, item_ids, labels,transform = None, device=None):
        self.user_ids = torch.tensor(user_ids, dtype=torch.long)
        self.item_ids = torch.tensor(item_ids, dtype=torch.long)
        self.labels = torch.tensor(labels, dtype = torch.float)
        
    
    def __len__(self):
        return len(self.user_ids)
    
    def __getitem__(self, idx):
        return self.user_ids[idx], self.item_ids[idx], self.labels[idx]


dataset = "ml-100k"
data_info, train_path, test_path, negative_path = filepath(dataset)
training_user_ids, training_items_ids, training_label = [], [], []

# 正样本loading。
with open(train_path, 'r') as file:
    for line in file:
        line = line.strip().split('\t')
        training_user_ids.append(int(line[0]))
        training_items_ids.append(int(line[1]))
        training_label.append(1)
print(f"正样本数量: {len(training_user_ids)}")
# 负样本loading.

# 测试集
test_dataset = {}
# 负样本
neg_users_ids = []
neg_items_ids = []

with open(negative_path, 'r') as file:
    for line in file:
        # 训练集 
        line = line.strip().split('\t')
        for i in range(100):
            training_user_ids.append(int(line[0]))
            neg_users_ids.append(int(line[0]))
        
        training_items_ids.append(int(line[1]))
        neg_items_ids.append(int(line[1]))
        for id in line[2].strip().split(' '):   
            training_items_ids.append(int(id))
            neg_items_ids.append(int(id))
        for i in range(100):
            training_label.append(0)
        
        # 测试集
        test_dataset[int(line[0])] = int(line[1])
        
print(f"负样本数量: {len(neg_items_ids)}")
print(f"总样本数量: {len(training_user_ids)}")
print(f"测试集数量: {len(test_dataset)}")
training_dataset = InteractionDataset(training_user_ids, training_items_ids, training_label,device=device)

正样本数量: 99057
负样本数量: 94300
总样本数量: 193357
测试集数量: 943


In [10]:

batch_size = 64
shuffle = True
num_workers = 4
training_loader = torch.utils.data.DataLoader(training_dataset, batch_size = batch_size, shuffle = shuffle, num_workers= 0)


In [11]:
from torch import nn
class BasicMF(nn.Module):
    def __init__(self, num_users, num_items, embedding_size):
        super(BasicMF, self).__init__()
        self.user_embeddings = nn.Embedding(num_users, embedding_size)
        self.item_embeddings = nn.Embedding(num_items, embedding_size)
        self.sigmoid = nn.Sigmoid()
        
    def forward(self, user_ids, item_ids):
        user_embedding = self.user_embeddings(user_ids)
        item_embedding = self.item_embeddings(item_ids)
        
        ratings = (user_embedding *item_embedding).sum(1)
        probabilities = self.sigmoid(ratings)
        return probabilities

def predict_all_items(model, user_ids, item_ids):
    model.eval()
    
    with torch.no_grad():
        predictions_of_user = {}
        predictions = model(user_ids, item_ids)
        predictions = predictions.flatten()
        
        for user_id, item_id, pred in zip(user_ids, item_ids, predictions):
            if user_id.item() not in predictions_of_user:
                predictions_of_user[user_id.item()] = [(item_id.item(), pred.item())]
            else:
                predictions_of_user[user_id.item()].append((item_id.item(), pred.item()))
                
    return predictions_of_user



def get_top_k_items_for_each_user(predictions_of_user, k=10):
    top_k_items_of_user = {}
    for user_id, item_predictions in predictions_of_user.items():
        sorted_predictions = sorted(item_predictions, key=lambda x: x[1], reverse=True)
        for i in range(k):
            if user_id not in top_k_items_of_user:
                top_k_items_of_user[user_id] = [sorted_predictions[i][0]]
            else:
                top_k_items_of_user[user_id].append(sorted_predictions[i][0])
            
    return top_k_items_of_user


In [12]:
def calculate_hit_rate(recommended_items, test_items):
    hits = 0
    total_users = len(test_items)
    
    for user_id, true_items in test_items.items():
        pred_items = recommended_items.get(user_id, [])
        #print(pred_items)
        if true_items in pred_items:
            hits += 1

    hit_rate = hits / total_users if total_users > 0 else 0
    return hit_rate

def calclulate_ndcg(recommended_items, test_items):
    ndcg = 0
    for user_id, items_id in test_items.items():
        pred_items = recommended_items.get(user_id, [])
        try:
            rank = pred_items.index(items_id) + 1
            dcg  = 1/np.log2(rank+1)
            idcg = 1/np.log2(2)
            ndcg += dcg/idcg
        except ValueError:
            ndcg += 0
    avg_ndcg = ndcg / len(test_items) if test_items else 0 
    return avg_ndcg

In [13]:
import logging
import os
from datetime import datetime

now = datetime.now()
date, time = str(now).split(' ')
time = time.replace(':', '-')

log_directory = f"log/{date}/"
log_filename = f"{time}_{dataset}training.log"

if not os.path.exists(log_directory):
    os.makedirs(log_directory)

log_path = os.path.join(log_directory, log_filename)
logging.basicConfig(filename=log_path, filemode='w', level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

logging.info(f'训练日志 {str(now)} {dataset}')


In [14]:
num_users = max(training_user_ids)
num_items = max(training_items_ids)
print(num_users)
print(num_items)
all_user_ids = torch.arange(num_users).to(device)
all_item_idx = torch.arange(num_items).to(device)
neg_users_ids = torch.tensor(neg_users_ids)
neg_items_ids = torch.tensor(neg_items_ids)
model = BasicMF(num_users+1, num_items+1, 64)
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

num_epochs = 100
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for user_ids, item_ids, labels in training_loader:
        if (user_ids >= num_users+1).any() or (item_ids >= num_items+1).any():
            out_of_range_user_ids = user_ids[user_ids >= num_users+1]
            out_of_range_item_ids = item_ids[item_ids >= num_items+1]
            print(f"Out of range user_ids: {out_of_range_user_ids}")
            print(f"Out of range item_ids: {out_of_range_item_ids}")
            continue 

        optimizer.zero_grad()
        try:
            outputs = model(user_ids, item_ids)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        except Exception as e:  
            print(f"An error occurred: {e}")
            print(user_ids)
            print(item_ids)
            print(model.user_embeddings.weight.size())
            print(model.item_embeddings.weight.size())
    pred = predict_all_items(model,neg_users_ids, neg_items_ids)
    top10 = get_top_k_items_for_each_user(pred, 10)
    hr = calculate_hit_rate(top10, test_dataset)
    ndcg = calclulate_ndcg(top10, test_dataset)
    avg_loss = total_loss / len(training_loader)
    
    logging.info(f"Epoch {epoch+1}: Avg. Loss = {avg_loss:.4f} HR = {hr:.10f} NDCG = {ndcg:.10f}")
    print(f"Epoch {epoch+1}: Avg. Loss = {avg_loss:.4f} HR = {hr:.10f} NDCG = {ndcg:.10f}")

942
1681
Epoch 1: Avg. Loss = 3.7330 HR = 0.1060445387 NDCG = 0.0499720828
Epoch 2: Avg. Loss = 2.9367 HR = 0.1028632025 NDCG = 0.0466662794
Epoch 3: Avg. Loss = 2.3023 HR = 0.0996818664 NDCG = 0.0459273115
Epoch 4: Avg. Loss = 1.7731 HR = 0.1145281018 NDCG = 0.0514372199
Epoch 5: Avg. Loss = 1.3424 HR = 0.1283138918 NDCG = 0.0581385096
Epoch 6: Avg. Loss = 0.9987 HR = 0.1675503712 NDCG = 0.0749414389
Epoch 7: Avg. Loss = 0.7444 HR = 0.2195121951 NDCG = 0.0969415339
Epoch 8: Avg. Loss = 0.5616 HR = 0.2682926829 NDCG = 0.1241498839
Epoch 9: Avg. Loss = 0.4345 HR = 0.3234358431 NDCG = 0.1544724690
Epoch 10: Avg. Loss = 0.3382 HR = 0.3658536585 NDCG = 0.1814783776
Epoch 11: Avg. Loss = 0.2694 HR = 0.4061505832 NDCG = 0.2059153710
Epoch 12: Avg. Loss = 0.2182 HR = 0.4284199364 NDCG = 0.2234646900
Epoch 13: Avg. Loss = 0.1802 HR = 0.4337221633 NDCG = 0.2334983991
Epoch 14: Avg. Loss = 0.1494 HR = 0.4506892895 NDCG = 0.2456916387
Epoch 15: Avg. Loss = 0.1251 HR = 0.4581124072 NDCG = 0.252662

In [15]:
import numpy as np
def calculate_hit_rate(recommended_items, test_items):
    hits = 0
    total_users = len(test_items)
    
    for user_id, true_items in test_items.items():
        pred_items = recommended_items.get(user_id, [])
        print(pred_items)
        if true_items in pred_items:
            hits += 1

    hit_rate = hits / total_users if total_users > 0 else 0
    return hit_rate

def calclulate_ndcg(recommended_items, test_items):
    ndcg = 0
    for user_id, items_id in test_items.items():
        pred_items = recommended_items.get(user_id, [])
        try:
            rank = pred_items.index(items_id) + 1
            dcg  = 1/np.log2(rank+1)
            idcg = 1/np.log2(2)
            ndcg += dcg/idcg
        except ValueError:
            ndcg += 0
    avg_ndcg = ndcg / len(test_items) if test_items else 0 
    return avg_ndcg
        
        
        

neg_users_ids = torch.tensor(neg_users_ids)
neg_items_ids = torch.tensor(neg_items_ids)
pred = predict_all_items(model,neg_users_ids, neg_items_ids)
pred
top10 = get_top_k_items_for_each_user(pred, 10)

  neg_users_ids = torch.tensor(neg_users_ids)
  neg_items_ids = torch.tensor(neg_items_ids)


In [16]:
top10 = get_top_k_items_for_each_user(pred, 10)
top10

{0: [30, 541, 133, 668, 526, 559, 300, 623, 852, 681],
 1: [437, 717, 754, 557, 148, 944, 98, 324, 911, 1210],
 2: [274, 754, 235, 85, 209, 256, 163, 1363, 242, 192],
 3: [525, 550, 1008, 386, 325, 182, 293, 282, 722, 582],
 4: [268, 807, 406, 752, 85, 592, 171, 1464, 503, 880],
 5: [25, 204, 375, 801, 173, 292, 551, 122, 252, 483],
 6: [247, 78, 93, 350, 452, 243, 150, 267, 668, 979],
 7: [34, 739, 613, 529, 101, 592, 195, 303, 355, 610],
 8: [342, 619, 537, 556, 318, 294, 230, 1123, 4, 1026],
 9: [540, 433, 235, 420, 201, 1020, 4, 44, 141, 807],
 10: [222, 446, 283, 288, 801, 385, 950, 195, 425, 1144],
 11: [706, 726, 613, 125, 44, 699, 400, 297, 993, 302],
 12: [288, 406, 544, 305, 310, 284, 608, 576, 407, 1179],
 13: [360, 358, 512, 1020, 552, 544, 211, 482, 3, 369],
 14: [267, 758, 239, 222, 1143, 36, 525, 532, 154, 1167],
 15: [169, 342, 443, 460, 1138, 310, 618, 1120, 495, 837],
 16: [567, 665, 549, 88, 313, 942, 1205, 159, 724, 135],
 17: [257, 229, 447, 1113, 326, 499, 856, 35

In [17]:
hr = calculate_hit_rate(top10, test_dataset)
hr


[30, 541, 133, 668, 526, 559, 300, 623, 852, 681]
[437, 717, 754, 557, 148, 944, 98, 324, 911, 1210]
[274, 754, 235, 85, 209, 256, 163, 1363, 242, 192]
[525, 550, 1008, 386, 325, 182, 293, 282, 722, 582]
[268, 807, 406, 752, 85, 592, 171, 1464, 503, 880]
[25, 204, 375, 801, 173, 292, 551, 122, 252, 483]
[247, 78, 93, 350, 452, 243, 150, 267, 668, 979]
[34, 739, 613, 529, 101, 592, 195, 303, 355, 610]
[342, 619, 537, 556, 318, 294, 230, 1123, 4, 1026]
[540, 433, 235, 420, 201, 1020, 4, 44, 141, 807]
[222, 446, 283, 288, 801, 385, 950, 195, 425, 1144]
[706, 726, 613, 125, 44, 699, 400, 297, 993, 302]
[288, 406, 544, 305, 310, 284, 608, 576, 407, 1179]
[360, 358, 512, 1020, 552, 544, 211, 482, 3, 369]
[267, 758, 239, 222, 1143, 36, 525, 532, 154, 1167]
[169, 342, 443, 460, 1138, 310, 618, 1120, 495, 837]
[567, 665, 549, 88, 313, 942, 1205, 159, 724, 135]
[257, 229, 447, 1113, 326, 499, 856, 351, 686, 318]
[456, 705, 566, 556, 576, 125, 428, 99, 285, 250]
[73, 574, 535, 612, 778, 282, 400,

0.5068928950159067

In [18]:
ndcg = calclulate_ndcg(top10, test_dataset)
ndcg

0.3199456142163394