In [1]:
!pip install surprise

Collecting surprise
  Using cached surprise-0.1-py2.py3-none-any.whl.metadata (327 bytes)
Collecting scikit-surprise (from surprise)
  Using cached scikit_surprise-1.1.4-cp311-cp311-linux_x86_64.whl
Using cached surprise-0.1-py2.py3-none-any.whl (1.8 kB)
Installing collected packages: scikit-surprise, surprise
Successfully installed scikit-surprise-1.1.4 surprise-0.1


In [1]:
def precision_at_k(actual, predicted, k):
    """
    Calculate Precision@k.
    
    Parameters:
        actual (set): The set of items that the user actually interacted with or purchased.
        predicted (list): The list of items recommended to the user, ranked by relevance.
        k (int): The number of top items to consider for the calculation.
    
    Returns:
        float: The Precision@k score, which measures the proportion of recommended items
               in the top-k list that are relevant (i.e., present in the actual set).
    """
    # Convert actual interactions to a set for faster membership testing
    actual_set = set(actual)
    
    # Select the top-k predicted items and convert to a set
    predicted_set = set(predicted[:k])
    
    # Handle edge case where no predictions are made
    if len(predicted_set) == 0:
        return 0.0
    
    # Calculate precision as the ratio of relevant recommended items in the top-k list
    return len(actual_set & predicted_set) / float(k)


def recall_at_k(actual, predicted, k):
    """
    Calculate Recall@k.
    
    Parameters:
        actual (set): The set of items that the user actually interacted with or purchased.
        predicted (list): The list of items recommended to the user, ranked by relevance.
        k (int): The number of top items to consider for the calculation.
    
    Returns:
        float: The Recall@k score, which measures the proportion of relevant items
               from the actual set that are included in the top-k recommended list.
    """
    # Convert actual interactions to a set for faster membership testing
    actual_set = set(actual)
    
    # Select the top-k predicted items and convert to a set
    predicted_set = set(predicted[:k])
    
    # Handle edge case where the actual set is empty
    if len(actual_set) == 0:
        return 0.0
    
    # Calculate recall as the ratio of relevant items retrieved from the actual set
    return len(actual_set & predicted_set) / float(len(actual_set))


def f1_at_k(actual, predicted, k):
    """
    Calculate F1@k score.
    
    Parameters:
        actual (set): The set of items that the user actually interacted with or purchased.
        predicted (list): The list of items recommended to the user, ranked by relevance.
        k (int): The number of top items to consider for the calculation.
    
    Returns:
        float: The F1@k score, which is the harmonic mean of Precision@k and Recall@k.
    """
    # Calculate Precision@k and Recall@k
    precision = precision_at_k(actual, predicted, k)
    recall = recall_at_k(actual, predicted, k)
    
    # Handle edge case where both precision and recall are zero to avoid division by zero
    if precision + recall == 0:
        return 0.0
    
    # Calculate F1 score as the harmonic mean of precision and recall
    return 2 * (precision * recall) / (precision + recall)


## Data Preparation

In [2]:
import pandas as pd
import numpy as np
import random
from surprise import Dataset, Reader, SVD, SlopeOne, CoClustering, SVDpp
from surprise import accuracy
from tqdm.auto import tqdm
from sklearn.metrics import roc_auc_score
from collections import defaultdict

seed = 42
random.seed(seed)
np.random.seed(seed)
data_dir = '/home/sagemaker-user/Data/'
model_dir = '/home/sagemaker-user/Models/'
log_dir = '/home/sagemaker-user/Logs/'

In [3]:
train_df = pd.read_pickle(data_dir+'clean_data/train_set.pkl')
test_df = pd.read_pickle(data_dir+'clean_data/test_set.pkl')

In [5]:
train_df.user_id.value_counts()

user_id
AG73BVBKUOH22USSFJA5ZWL7AKXA    1426
AHPOHKN4PU4W3V5PGFL7AGTAD2AA    1198
AHEMJ62SUJPUYNWGROPI6MUAYQ5A     924
AEYVPPWR4CIKWX4BGYKCBCDL2CZQ     924
AH665SQ6SQF6DXAGYIQFCX76LALA     778
                                ... 
AFI6HA2MBKJJEMGAIBHQOGPO5TMQ       2
AFI6G2GDXADHCF5F5NOQ4YGOFLJQ       2
AFI6F65FPRQLQZCNY5QGHRHBCAOQ       2
AFI6F4Q7RL5DWL7OQIPM5Z6AE2TA       2
AFI6JBTMHUXK77EE2DHURX7PFXTQ       2
Name: count, Length: 290475, dtype: int64

In [8]:
train_df.shape

(1258132, 5)

In [6]:
train_df.head()

Unnamed: 0,user_id,parent_asin,label,rating,timestamp
0,AE22236AFRRSMQIKGG7TPTB75QEA,B0002C7FHC,1,5,2009-09-19 19:42:10
1,AE22236AFRRSMQIKGG7TPTB75QEA,B00UFKQKLS,1,5,2014-03-07 15:31:31
2,AE22236AFRRSMQIKGG7TPTB75QEA,B01I6X61OQ,1,5,2014-03-07 17:06:29
3,AE22236AFRRSMQIKGG7TPTB75QEA,B0713WBZM7,0,0,2009-09-19 19:42:10
4,AE22236AFRRSMQIKGG7TPTB75QEA,B0BVM3J8GW,0,0,2014-03-07 15:31:31


## Model Training

In [8]:
import os
from surprise import Dataset, Reader, SVD, SVDpp, SlopeOne, CoClustering, accuracy
from surprise.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import pandas as pd
import numpy as np

# Load and preprocess explicit feedback data for collaborative filtering
# Define rating range for explicit feedback
reader_explicit = Reader(rating_scale=(1, 5))  # Explicit ratings range from 1 to 5

# Load training data
# train_df should be a DataFrame containing 'user_id', 'parent_asin' (item ID), and 'rating' columns
train_explicit = Dataset.load_from_df(train_df[['user_id', 'parent_asin', 'rating']], reader_explicit)
trainset_explicit = train_explicit.build_full_trainset()  # Convert to training set format required by Surprise library

# Load testing data
# test_df should contain 'user_id', 'parent_asin', and 'rating' (or 'label' for classification)
test_set_explicit = list(zip(test_df['user_id'], test_df['parent_asin'], test_df['rating']))

# Define models for explicit feedback prediction
# Removed NMF model as per the requirement
models_explicit = {
    'SVD': SVD(random_state=seed),  # Singular Value Decomposition
    'SVD++': SVDpp(random_state=seed),  # Enhanced version of SVD considering implicit feedback
    # 'SlopeOne': SlopeOne(),  # Simple and efficient collaborative filtering algorithm
    'CoClustering': CoClustering(random_state=seed)  # Co-clustering approach for collaborative filtering
}

# Train each model and evaluate predictions
model_predictions_explicit = {}  # Store predictions for each model
model_metrics_explicit = {}  # Store evaluation metrics for each model

for model_name, model in models_explicit.items():
    print(f"\nTraining model: {model_name}")
    model.fit(trainset_explicit)  # Train the model on the training set

    print("Making predictions on the test set...")
    predictions = model.test(test_set_explicit)  # Generate predictions on the test set

    # Calculate RMSE (Root Mean Square Error) for prediction quality
    rmse = accuracy.rmse(predictions, verbose=True)

    # Store predictions
    model_predictions_explicit[model_name] = predictions

    # Calculate ROC AUC (Area Under the Receiver Operating Characteristic Curve)
    y_true = test_df['label']  # True binary labels for the test set
    y_pred = [pred.est for pred in predictions]  # Predicted ratings
    roc_auc = roc_auc_score(y_true, y_pred)

    # Store evaluation metrics
    model_metrics_explicit[model_name] = {
        'RMSE': rmse,
        'ROC_AUC': roc_auc
    }


Training model: SVD
Making predictions on the test set...
RMSE: 1.8507

Training model: SVD++
Making predictions on the test set...
RMSE: 1.8464

Training model: CoClustering
Making predictions on the test set...
RMSE: 1.9400


In [4]:
import os
from surprise import Dataset, Reader, SVD, SVDpp, SlopeOne, CoClustering, accuracy
from surprise.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import pandas as pd
import numpy as np

# Load and preprocess explicit feedback data for collaborative filtering
# Define rating range for explicit feedback
reader_explicit = Reader(rating_scale=(1, 5))  # Explicit ratings range from 1 to 5

# Load training data
# train_df should be a DataFrame containing 'user_id', 'parent_asin' (item ID), and 'rating' columns
train_explicit = Dataset.load_from_df(train_df[['user_id', 'parent_asin', 'rating']], reader_explicit)
trainset_explicit = train_explicit.build_full_trainset()  # Convert to training set format required by Surprise library

# Load testing data
# test_df should contain 'user_id', 'parent_asin', and 'rating' (or 'label' for classification)
test_set_explicit = list(zip(test_df['user_id'], test_df['parent_asin'], test_df['rating']))

# Define models for explicit feedback prediction
# Removed NMF model as per the requirement
models_explicit = {
    'SVD': SVD(random_state=seed),  # Singular Value Decomposition
    'SVD++': SVDpp(random_state=seed),  # Enhanced version of SVD considering implicit feedback
    # 'SlopeOne': SlopeOne(),  # Simple and efficient collaborative filtering algorithm
    'CoClustering': CoClustering(random_state=seed)  # Co-clustering approach for collaborative filtering
}

# Train each model and evaluate predictions
model_predictions_explicit = {}  # Store predictions for each model
model_metrics_explicit = {}  # Store evaluation metrics for each model

In [5]:
for model_name, model in models_explicit.items():
    print(f"\nTraining model: {model_name}")
    model.fit(trainset_explicit)  # Train the model on the training set

    print("Making predictions on the test set...")
    predictions = model.test(test_set_explicit)  # Generate predictions on the test set

    # Calculate RMSE (Root Mean Square Error) for prediction quality
    rmse = accuracy.rmse(predictions, verbose=True)

    # Store predictions
    model_predictions_explicit[model_name] = predictions

    # Calculate ROC AUC (Area Under the Receiver Operating Characteristic Curve)
    y_true = test_df['label']  # True binary labels for the test set
    y_pred = [pred.est for pred in predictions]  # Predicted ratings
    roc_auc = roc_auc_score(y_true, y_pred)

    # Store evaluation metrics
    model_metrics_explicit[model_name] = {
        'RMSE': rmse,
        'ROC_AUC': roc_auc
    }

# Define the directory for saving results
result_dir = os.path.join('../Results', 'CF_model')  # Update path to reflect 'collaborative_filtering'

# Create the directory if it doesn't exist
os.makedirs(result_dir, exist_ok=True)

# Save evaluation metrics to a text file
metrics_file_path = os.path.join(result_dir, 'metrics.txt')

with open(metrics_file_path, 'w', encoding='utf-8') as f:
    f.write("Evaluation Metrics for Explicit Feedback Models\n")
    f.write("=============================================\n\n")
    for model_name, metrics in model_metrics_explicit.items():
        f.write(f"Model: {model_name}\n")
        f.write(f"RMSE: {metrics['RMSE']:.4f}\n")
        f.write(f"ROC AUC: {metrics['ROC_AUC']:.4f}\n\n")

print(f"Evaluation metrics have been saved to {metrics_file_path}")

# Save predictions for each model to separate files
for model_name, predictions in model_predictions_explicit.items():
    # Define the file path for predictions
    predictions_file_path = os.path.join(result_dir, f"{model_name}_predictions.txt")
    
    # Write predictions to the file
    with open(predictions_file_path, 'w', encoding='utf-8') as f:
        f.write(f"{model_name} Model Predictions\n")
        f.write("=============================================\n\n")
        for pred in predictions:
            f.write(f"User ID: {pred.uid}, Item ID: {pred.iid}, True Rating: {pred.r_ui}, Predicted Rating: {pred.est}\n")
    
    print(f"Predictions for the {model_name} model have been saved to {predictions_file_path}")



Training model: SVD
Making predictions on the test set...
RMSE: 1.8507

Training model: SVD++
Making predictions on the test set...
RMSE: 1.8464

Training model: CoClustering
Making predictions on the test set...
RMSE: 1.9400
Evaluation metrics have been saved to ../Results/CF_model/metrics.txt
Predictions for the SVD model have been saved to ../Results/CF_model/SVD_predictions.txt
Predictions for the SVD++ model have been saved to ../Results/CF_model/SVD++_predictions.txt
Predictions for the CoClustering model have been saved to ../Results/CF_model/CoClustering_predictions.txt


## Model Comparison

In [6]:
from collections import defaultdict

# 10. Generate recommendation lists and compute evaluation metrics
topK_values = [5, 10, 15]  # Define the top-K values to evaluate (e.g., top-5, top-10, top-15 recommendations)
results_explicit = {
    model_name: {k: {'precision': [], 'recall': [], 'f1': []} for k in topK_values}
    for model_name in models_explicit.keys()
}  # Initialize a dictionary to store evaluation metrics (precision, recall, F1) for each model and K value

print("Generating recommendations and calculating evaluation metrics for explicit feedback models...")

# Build a dictionary mapping each user to the set of items they actually interacted with or purchased
user_actual = (
    test_df[test_df['label'] == 1]  # Filter the test data for items labeled as "purchased" or "positive interactions"
    .groupby('user_id')['parent_asin']  # Group by user ID
    .apply(set)  # Convert each user's items to a set
    .to_dict()  # Convert the grouped object to a dictionary
)

# Get the list of all unique users in the test dataset
all_users = test_df['user_id'].unique()

# Process predictions for each model
for model_name, predictions in tqdm(model_predictions_explicit.items(), desc="Processing explicit models"):
    user_recommendations = defaultdict(list)  # Initialize a dictionary to store recommendations for each user
    
    # Group predictions by user
    user_pred_dict = defaultdict(list)
    for pred in predictions:
        # Each prediction contains a user ID (uid), an item ID (iid), and an estimated rating (est)
        user_pred_dict[pred.uid].append((pred.iid, pred.est))
    
    # Generate a sorted recommendation list for each user
    for user, item_scores in user_pred_dict.items():
        # Sort items by estimated score in descending order
        sorted_items = sorted(item_scores, key=lambda x: x[1], reverse=True)
        # Extract only the item IDs from the sorted list
        recommended_items = [item for item, score in sorted_items]
        # Store the sorted recommendations for the user
        user_recommendations[user] = recommended_items
    
    # Evaluate the recommendations for each user
    for user in tqdm(all_users, desc=f"Evaluating {model_name}"):
        actual = user_actual.get(user, set())  # Get the set of items the user actually interacted with
        predicted = user_recommendations.get(user, [])  # Get the list of recommended items for the user
        
        # Calculate evaluation metrics for each K value
        for k in topK_values:
            precision = precision_at_k(actual, predicted, k)  # Calculate Precision@k
            recall = recall_at_k(actual, predicted, k)  # Calculate Recall@k
            f1 = f1_at_k(actual, predicted, k)  # Calculate F1@k
            
            # Append the metrics to the results dictionary
            results_explicit[model_name][k]['precision'].append(precision)
            results_explicit[model_name][k]['recall'].append(recall)
            results_explicit[model_name][k]['f1'].append(f1)


Generating recommendations and calculating evaluation metrics for explicit feedback models...


Processing explicit models:   0%|          | 0/3 [00:00<?, ?it/s]

Evaluating SVD:   0%|          | 0/290493 [00:00<?, ?it/s]

Evaluating SVD++:   0%|          | 0/290493 [00:00<?, ?it/s]

Evaluating CoClustering:   0%|          | 0/290493 [00:00<?, ?it/s]

In [7]:
# 11. Print evaluation results for each model
for model_name in models_explicit.keys():
    print(f"\nModel: {model_name}")
    for k in topK_values:
        # Calculate average precision, recall, and F1-score for top-K recommendations
        avg_precision = np.mean(results_explicit[model_name][k]['precision'])
        avg_recall = np.mean(results_explicit[model_name][k]['recall'])
        avg_f1 = np.mean(results_explicit[model_name][k]['f1'])
        
        # Print evaluation metrics for the current top-K value
        print(f"\nEvaluation metrics for top-{k}:")
        print(f"Precision@{k}: {avg_precision:.4f}")
        print(f"Recall@{k}: {avg_recall:.4f}")
        print(f"F1-score@{k}: {avg_f1:.4f}")
    
    # Print RMSE and ROC AUC for the model
    print(f"RMSE: {model_metrics_explicit[model_name]['RMSE']:.4f}")
    print(f"ROC AUC: {model_metrics_explicit[model_name]['ROC_AUC']:.4f}")


Model: SVD

Evaluation metrics for top-5:
Precision@5: 0.8018
Recall@5: 0.7227
F1-score@5: 0.7365

Evaluation metrics for top-10:
Precision@10: 0.5535
Recall@10: 0.9404
F1-score@10: 0.6755

Evaluation metrics for top-15:
Precision@15: 0.3960
Recall@15: 0.9870
F1-score@15: 0.5497
RMSE: 1.8507
ROC AUC: 0.8627

Model: SVD++

Evaluation metrics for top-5:
Precision@5: 0.8000
Recall@5: 0.7223
F1-score@5: 0.7355

Evaluation metrics for top-10:
Precision@10: 0.5548
Recall@10: 0.9422
F1-score@10: 0.6770

Evaluation metrics for top-15:
Precision@15: 0.3981
Recall@15: 0.9903
F1-score@15: 0.5523
RMSE: 1.8464
ROC AUC: 0.8598

Model: CoClustering

Evaluation metrics for top-5:
Precision@5: 0.7894
Recall@5: 0.7189
F1-score@5: 0.7292

Evaluation metrics for top-10:
Precision@10: 0.5563
Recall@10: 0.9457
F1-score@10: 0.6793

Evaluation metrics for top-15:
Precision@15: 0.4006
Recall@15: 0.9942
F1-score@15: 0.5554
RMSE: 1.9400
ROC AUC: 0.8226


In [8]:
# Define the path to save evaluation metrics
metrics_file_path = os.path.join(result_dir, 'metrics_classify.txt')

# Write evaluation metrics to the file
with open(metrics_file_path, 'w', encoding='utf-8') as f:
    f.write("Evaluation Metrics for Explicit Feedback Models\n")
    f.write("========================\n\n")
    
    # Iterate through each model and save its metrics
    for model_name in models_explicit.keys():
        f.write(f"Model: {model_name}\n")
        
        # Write evaluation metrics for each top-K value
        for k in topK_values:
            avg_precision = np.mean(results_explicit[model_name][k]['precision'])  # Average Precision@k
            avg_recall = np.mean(results_explicit[model_name][k]['recall'])  # Average Recall@k
            avg_f1 = np.mean(results_explicit[model_name][k]['f1'])  # Average F1@k
            
            f.write(f"\nEvaluation Metrics for top-{k}:\n")
            f.write(f"Precision@{k}: {avg_precision:.4f}\n")
            f.write(f"Recall@{k}: {avg_recall:.4f}\n")
            f.write(f"F1-score@{k}: {avg_f1:.4f}\n")
        
        # Write global metrics: RMSE and ROC AUC
        f.write(f"RMSE: {model_metrics_explicit[model_name]['RMSE']:.4f}\n")
        f.write(f"ROC AUC: {model_metrics_explicit[model_name]['ROC_AUC']:.4f}\n")
        f.write("\n------------------------\n\n")

# Notify the user that the metrics have been saved
print(f"Evaluation metrics have been saved to {metrics_file_path}")


Evaluation metrics have been saved to ../Results/CF_model/metrics_classify.txt


In [9]:
import os
import pickle
import faiss
import numpy as np
import h5py
from sklearn.preprocessing import normalize
from tqdm.notebook import tqdm
from collections import defaultdict

# Define the directory to save results
result_dir = os.path.join('../Results', 'CF_model')
os.makedirs(result_dir, exist_ok=True)  # Create the directory if it doesn't exist

# Identify the best model based on RMSE
best_model_name = min(model_metrics_explicit, key=lambda x: model_metrics_explicit[x]['RMSE'])  # Select model with lowest RMSE
best_rmse = model_metrics_explicit[best_model_name]['RMSE']
print(f"Best model: {best_model_name}, RMSE: {best_rmse:.4f}")

# Save the best model to a file
model_file_path = os.path.join(result_dir, f"{best_model_name}_best_model.pkl")
with open(model_file_path, 'wb') as f:
    pickle.dump(models_explicit[best_model_name], f)  # Serialize and save the best model
print(f"The best model has been saved to {model_file_path}")


Best model: SVD++, RMSE: 1.8464
The best model has been saved to ../Results/CF_model/SVD++_best_model.pkl


## Latent Factors and FAISS Index

In [10]:
# Extract user and item latent factor matrices
user_factors = models_explicit[best_model_name].pu  # User factor matrix: shape (n_users, n_factors)
item_factors = models_explicit[best_model_name].qi  # Item factor matrix: shape (n_items, n_factors)

# Retrieve user and item ID mappings
trainset = models_explicit[best_model_name].trainset  # Access the trainset used by the model
user_id_map = {trainset.to_raw_uid(i): i for i in range(trainset.n_users)}  # Map raw user IDs to internal IDs
item_id_map = {trainset.to_raw_iid(i): i for i in range(trainset.n_items)}  # Map raw item IDs to internal IDs

# Save user factors and ID mapping
np.save(os.path.join(result_dir, 'user_factors.npy'), user_factors)  # Save user latent factors as a NumPy array
with open(os.path.join(result_dir, 'user_id_map.pkl'), 'wb') as f:
    pickle.dump(user_id_map, f)  # Save user ID mapping as a pickle file

# Save item factors and ID mapping
np.save(os.path.join(result_dir, 'item_factors.npy'), item_factors)  # Save item latent factors as a NumPy array
with open(os.path.join(result_dir, 'item_id_map.pkl'), 'wb') as f:
    pickle.dump(item_id_map, f)  # Save item ID mapping as a pickle file

print("User and item vectors and mappings have been saved.")

User and item vectors and mappings have been saved.


In [11]:
# Create FAISS index for item vectors
print("Creating FAISS index...")
item_factors_normalized = normalize(item_factors, norm='l2').astype('float32')  # Normalize item factors for cosine similarity
d = item_factors_normalized.shape[1]  # Dimensionality of the vectors
index = faiss.IndexFlatIP(d)  # Create a FAISS index for inner product similarity
index.add(item_factors_normalized)  # Add normalized item factors to the index
faiss.write_index(index, os.path.join(result_dir, 'item_index.faiss'))  # Save the index to a file
print("FAISS vector index has been saved as 'item_index.faiss'.")

# Load the saved model and latent factor vectors
with open(model_file_path, 'rb') as f:
    best_model = pickle.load(f)  # Load the best model

user_factors = np.load(os.path.join(result_dir, 'user_factors.npy')).astype('float32')  # Load user factors
item_factors = np.load(os.path.join(result_dir, 'item_factors.npy')).astype('float32')  # Load item factors

# Load user and item ID mappings
with open(os.path.join(result_dir, 'user_id_map.pkl'), 'rb') as f:
    user_id_map = pickle.load(f)  # Load user ID mapping
with open(os.path.join(result_dir, 'item_id_map.pkl'), 'rb') as f:
    item_id_map = pickle.load(f)  # Load item ID mapping

# Create reverse mappings for user and item IDs
item_id_inverse_mapping = {v: k for k, v in item_id_map.items()}  # Reverse mapping for item IDs
user_id_inverse_mapping = {v: k for k, v in user_id_map.items()}  # Reverse mapping for user IDs

# Load the saved FAISS index
index = faiss.read_index(os.path.join(result_dir, 'item_index.faiss'))  # Load the FAISS index from file
print("FAISS index successfully loaded.")

Creating FAISS index...
FAISS vector index has been saved as 'item_index.faiss'.
FAISS index successfully loaded.


## Batch Recommendations

In [12]:
# Normalize user factors for cosine similarity
faiss.normalize_L2(user_factors)

# Define recommendation parameters
topK = 20  # Number of items to recommend per user
batch_size = 1000  # Number of users processed in each batch
n_users = user_factors.shape[0]  # Total number of users
all_recommendations = {}  # Dictionary to store recommendations for all users

# Load actual items purchased by each user
user_actual = (
    test_df[test_df['label'] == 1]  # Filter positive interactions (purchased items)
    .groupby('user_id')['parent_asin']  # Group by user ID and collect item IDs
    .apply(set)  # Convert item lists to sets for each user
    .to_dict()  # Convert the grouped object to a dictionary
)

print("Generating recommendation lists using FAISS...")

# Batch processing to generate recommendations for all users
for start in tqdm(range(0, n_users, batch_size), desc="Generating recommendations"):
    end = min(start + batch_size, n_users)  # Define batch range
    batch_user_factors = user_factors[start:end]  # Extract user factors for the batch
    
    # Search for top-K similar items for the current batch of users
    D, I = index.search(batch_user_factors, topK)  # D: Similarity scores, I: Indices of recommended items
    
    # Process recommendations for each user in the batch
    for i in range(end - start):
        user_idx = start + i  # Calculate the global user index
        recommended_item_indices = I[i]  # Get recommended item indices for the user
        # Convert internal item indices to raw item IDs using trainset mapping
        recommended_items = [best_model.trainset.to_raw_iid(item_idx) for item_idx in recommended_item_indices]
        all_recommendations[user_idx] = recommended_items  # Store recommendations for the user

print("Recommendation list generation completed.")

Generating recommendation lists using FAISS...


Generating recommendations:   0%|          | 0/291 [00:00<?, ?it/s]

Recommendation list generation completed.


In [13]:
# Map recommendations back to raw user and item IDs, and filter out already-seen items
user_recommendations_raw = {}

for user_idx, recommended_item_ids in all_recommendations.items():
    user_id = user_id_inverse_mapping.get(user_idx, None)  # Map internal user ID to raw user ID
    if user_id is not None:
        # Get items the user has already interacted with (raw IDs)
        seen_items = user_actual.get(user_id, set())
        # Filter out items already seen by the user
        recommended_item_ids_filtered = [item_id for item_id in recommended_item_ids if item_id not in seen_items]
        # Keep only the top-K recommendations
        recommended_item_ids_filtered = recommended_item_ids_filtered[:topK]
        user_recommendations_raw[user_id] = recommended_item_ids_filtered  # Store the filtered recommendations

# Check example recommendations
print("\nSample recommendation results:")
for user_id, recs in list(user_recommendations_raw.items())[:5]:
    print(f"User ID: {user_id}, Number of recommendations: {len(recs)}, Recommendations: {recs}")

# Save recommendations to an HDF5 file
with h5py.File(os.path.join(result_dir, 'recommendations.h5'), 'w') as hf:
    for user_id, recommended_items in user_recommendations_raw.items():
        # Pad the recommendation list to ensure uniform length (e.g., top-K)
        recommended_items_padded = recommended_items + [''] * (topK - len(recommended_items))
        hf.create_dataset(user_id, data=np.string_(recommended_items_padded))  # Save as fixed-length strings

print(f"Recommendations saved to {os.path.join(result_dir, 'recommendations.h5')}")

# Load recommendations from the HDF5 file
loaded_recommendations = {}
with h5py.File(os.path.join(result_dir, 'recommendations.h5'), 'r') as hf:
    for user_id in hf.keys():
        recommended_items = hf[user_id][:]  # Load the recommendation list
        # Convert bytes to strings and remove padding
        recommended_items = [item.decode('utf-8') for item in recommended_items if item.decode('utf-8')]
        loaded_recommendations[user_id] = recommended_items  # Store the recommendations

print("Recommendations successfully loaded.")

# Check loaded recommendations
print("\nSample loaded recommendation results:")
for user_id, recs in list(loaded_recommendations.items())[:5]:
    print(f"User ID: {user_id}, Number of recommendations: {len(recs)}, Recommendations: {recs}")



Sample recommendation results:
User ID: AE22236AFRRSMQIKGG7TPTB75QEA, Number of recommendations: 20, Recommendations: ['B007NFLN1K', 'B005HSPOVU', 'B01KZTB3HE', 'B0BS72KD59', 'B00E3UKVVQ', 'B07FLSK36C', 'B07S3RFL7V', 'B01FZIWFLS', 'B09FPR3626', 'B003194PBC', 'B09PRVBLTL', 'B000W7IR10', 'B071489L22', 'B01K4KU5HI', 'B07DN8GCLG', 'B00T62YNQK', 'B0002568EK', 'B081RM74FQ', 'B07KF5G7XP', 'B00O4DKHRK']
User ID: AE222MW56PH6JXPIB6XSAMCBTLNQ, Number of recommendations: 20, Recommendations: ['B0002ASBYU', 'B0BL9SWZH5', 'B007ZTL4LS', 'B06Y3RQB1N', 'B07NLZTHD7', 'B0096JVJWO', 'B014854DDM', 'B0BDZPM7SQ', 'B00SG7O9Q0', 'B001CS2MKA', 'B019BJEIRQ', 'B00G9YL8XY', 'B089JHTRMM', 'B0009XPD64', 'B09XJV967L', 'B005OQ35RK', 'B0BC2RFPPR', 'B008EVOTVI', 'B003IWGYF2', 'B01N6YKXZU']
User ID: AE222N3VUKMF3GO6D4LHTELE7UWA, Number of recommendations: 20, Recommendations: ['B06XFRPM63', 'B006684SEE', 'B004PBIJ4O', 'B01A5VEPTA', 'B00DM0OBNM', 'B008XKX3BC', 'B07M683JK4', 'B07CNC8HR8', 'B01H1GDWT6', 'B000HHLRJO', 'B07

In [14]:
# Define a function to get recommendations for a single user
def get_user_recommendations(user_id: str, loaded_recommendations: dict) -> list:
    """
    Retrieve the recommendation list for a single user.
    
    Parameters:
        user_id (str): Unique identifier for the user.
        loaded_recommendations (dict): Dictionary containing loaded recommendation results.
    
    Returns:
        list: A list of recommended items for the user. Returns an empty list if the user is not found.
    """
    return loaded_recommendations.get(user_id, [])  # Return the recommendations or an empty list if user not found

# Test recommendations for a single user
example_user_id = 'AE22236AFRRSMQIKGG7TPTB75QEA'  # Replace with the user ID you want to test
recommended_items = get_user_recommendations(example_user_id, loaded_recommendations)

# Display the user's recommendations
if recommended_items:
    print(f"\nRecommendation list for user {example_user_id}:")
    for rank, item in enumerate(recommended_items, start=1):  # Enumerate recommendations with rankings
        print(f"{rank}. {item}")
else:
    print(f"\nNo recommendations found for user {example_user_id} or user does not exist.")


Recommendation list for user AE22236AFRRSMQIKGG7TPTB75QEA:
1. B007NFLN1K
2. B005HSPOVU
3. B01KZTB3HE
4. B0BS72KD59
5. B00E3UKVVQ
6. B07FLSK36C
7. B07S3RFL7V
8. B01FZIWFLS
9. B09FPR3626
10. B003194PBC
11. B09PRVBLTL
12. B000W7IR10
13. B071489L22
14. B01K4KU5HI
15. B07DN8GCLG
16. B00T62YNQK
17. B0002568EK
18. B081RM74FQ
19. B07KF5G7XP
20. B00O4DKHRK
