In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from scipy import sparse
import random
import json
import time
from tqdm import tqdm

# Set random seed for reproducibility
random.seed(2023)
np.random.seed(2023)

# Configuration Parameters
k = 40            # Number of nearest neighbors (not used in Top-N but kept for reference)
num_u = 12        # Number of top similar users to consider
num_i = 19        # Number of top candidate items from user filtering
total_i = 19      # Total number of candidate items after item filtering
data_path = 'ml-100k/ua.base'  # Path to the training data
test_path = 'ml-100k/ua.test'  # Path to the test data

# Function to read data
def read_data(train_path, test_path):
    r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
    rating_base = pd.read_csv(train_path, sep='\t', names=r_cols)
    rating_test = pd.read_csv(test_path, sep='\t', names=r_cols)
    
    # Convert to numpy arrays
    rate_train = rating_base.to_numpy()
    rate_test = rating_test.to_numpy()
    
    # Adjust indices to start from 0
    rate_train[:, :2] -= 1
    rate_test[:, :2] -= 1
    
    return rate_train, rate_test

# Collaborative Filtering Class for Top-N Recommendation
class CollaborativeFilteringTopN:
    def __init__(self, Y_data: np.ndarray, mode='user') -> None:
        """
        Initialize the CF model.

        Parameters:
        - Y_data: numpy array of shape (n_samples, 3), each row is [entity1_id, entity2_id, rating]
        - mode: 'user' for User-User CF, 'item' for Item-Item CF
        """
        self.Y_data = Y_data
        self.mode = mode
        if mode == 'user':
            self.n_entities = int(np.max(self.Y_data[:, 0])) + 1  # Number of unique users
            self.n_items = int(np.max(self.Y_data[:, 1])) + 1    # Number of unique items
            self.interaction_matrix = sparse.lil_matrix((self.n_entities, self.n_items), dtype=int)
        elif mode == 'item':
            self.n_entities = int(np.max(self.Y_data[:, 0])) + 1  # Number of unique items
            self.n_users = int(np.max(self.Y_data[:, 1])) + 1     # Number of unique users
            self.interaction_matrix = sparse.lil_matrix((self.n_entities, self.n_users), dtype=int)
        else:
            raise ValueError("Mode must be 'user' or 'item'")

    def fit(self) -> None:
        """
        Build the interaction matrix and compute the similarity matrix.
        """
        for row in self.Y_data:
            entity1 = int(row[0])
            entity2 = int(row[1])
            self.interaction_matrix[entity1, entity2] = 1  # Binary interaction

        # Convert to CSR format for efficient computations
        self.interaction_matrix = self.interaction_matrix.tocsr()

        # Compute similarity matrix
        self.similarity_matrix = cosine_similarity(self.interaction_matrix, self.interaction_matrix)
    
    def get_similarity(self):
        return self.similarity_matrix

# User Filtering Function
def sort_uf_items(target_user, user_similarity, user_item_matrix, num_u=12, num_i=19):
    """
    User Filtering: Select candidate items based on top similar users.

    Parameters:
    - target_user: User ID for whom to generate recommendations
    - user_similarity: Precomputed user similarity matrix
    - user_item_matrix: Binary user-item interaction matrix
    - num_u: Number of top similar users to consider
    - num_i: Number of top candidate items to select

    Returns:
    - List of candidate item IDs
    """
    # Get similarity scores for the target user
    sim_scores = user_similarity[target_user].copy()
    
    # Exclude the target user itself
    sim_scores[target_user] = -1
    
    # Get top num_u similar users
    top_users = np.argsort(sim_scores)[-num_u:]
    
    # Aggregate items from top similar users
    candidate_items = {}
    for user in top_users:
        sim = sim_scores[user]
        items = user_item_matrix[user].nonzero()[1]
        for item in items:
            if item not in candidate_items:
                candidate_items[item] = 0.0
            candidate_items[item] += sim
    
    # Remove items already interacted with by the target user
    user_items = set(user_item_matrix[target_user].nonzero()[1])
    candidate_items = {item: score for item, score in candidate_items.items() if item not in user_items}
    
    # Sort candidates based on aggregated similarity scores
    sorted_candidates = sorted(candidate_items.items(), key=lambda x: x[1], reverse=True)
    
    # Select top num_i candidates
    candidate_items = [item for item, score in sorted_candidates[:num_i]]
    
    return candidate_items

# Item Filtering Function
def soft_if_items(target_user, candidate_items, item_similarity, user_item_matrix, total_i=19):
    """
    Item Filtering: Refine candidate items based on item similarity.

    Parameters:
    - target_user: User ID for whom to generate recommendations
    - candidate_items: Initial list of candidate item IDs from user filtering
    - item_similarity: Precomputed item similarity matrix
    - user_item_matrix: Binary user-item interaction matrix
    - total_i: Total number of candidate items to select

    Returns:
    - Refined list of candidate item IDs
    """
    user_items = user_item_matrix[target_user].nonzero()[1]
    candidate_scores = {}
    
    for item in user_items:
        sim_scores = item_similarity[item]
        top_sim_items = np.argsort(sim_scores)[-total_i:]
        for sim_item in top_sim_items:
            if sim_item not in user_items and sim_item in candidate_items:
                if sim_item not in candidate_scores:
                    candidate_scores[sim_item] = 0.0
                candidate_scores[sim_item] += sim_scores[sim_item]
    
    # Sort candidates based on aggregated similarity scores
    sorted_candidates = sorted(candidate_scores.items(), key=lambda x: x[1], reverse=True)
    refined_candidates = [item for item, score in sorted_candidates[:total_i]]
    
    return refined_candidates

# Generate Candidates by Combining User and Item Filtering
def generate_candidates(target_user, user_similarity, item_similarity, user_item_matrix, num_u=12, num_i=19, total_i=19):
    """
    Generate candidate items by combining user and item filtering.

    Parameters:
    - target_user: User ID for whom to generate recommendations
    - user_similarity: User similarity matrix
    - item_similarity: Item similarity matrix
    - user_item_matrix: Binary user-item interaction matrix
    - num_u: Number of top similar users to consider
    - num_i: Number of top candidate items from user filtering
    - total_i: Total number of candidate items after item filtering

    Returns:
    - List of refined candidate item IDs
    """
    # User Filtering
    candidate_items = sort_uf_items(target_user, user_similarity, user_item_matrix, num_u, num_i)
    
    # Item Filtering
    refined_candidates = soft_if_items(target_user, candidate_items, item_similarity, user_item_matrix, total_i)
    
    return refined_candidates

# Recommendation Function
def recommend(target_user, user_similarity, item_similarity, user_item_matrix, k=10, num_u=12, num_i=19, total_i=19):
    """
    Generate top-N recommendations for a target user.

    Parameters:
    - target_user: User ID for whom to generate recommendations
    - user_similarity: User similarity matrix
    - item_similarity: Item similarity matrix
    - user_item_matrix: Binary user-item interaction matrix
    - k: Number of top recommendations to return
    - num_u: Number of top similar users to consider
    - num_i: Number of top candidate items from user filtering
    - total_i: Total number of candidate items after item filtering

    Returns:
    - List of top-N recommended item IDs
    """
    candidates = generate_candidates(target_user, user_similarity, item_similarity, user_item_matrix, num_u, num_i, total_i)
    
    # If more candidates than needed, rank them based on item popularity
    if len(candidates) > k:
        item_popularity = user_item_matrix.sum(axis=0).A1
        ranked_candidates = sorted(candidates, key=lambda x: item_popularity[x], reverse=True)
        return ranked_candidates[:k]
    else:
        return candidates

# Evaluation Function for Hit@k
def evaluate_cf(user_similarity, item_similarity, user_item_matrix, test_data, k=10, num_u=12, num_i=19, total_i=19):
    """
    Evaluate CF recommendations using Hit@k.

    Parameters:
    - user_similarity: User similarity matrix
    - item_similarity: Item similarity matrix
    - user_item_matrix: Binary user-item interaction matrix
    - test_data: Test dataset as a numpy array
    - k: Number of top recommendations to consider for Hit@k
    - num_u: Number of top similar users to consider
    - num_i: Number of top candidate items from user filtering
    - total_i: Total number of candidate items after item filtering

    Returns:
    - Hit@k score
    """
    hit = 0
    total = 0
    for row in tqdm(test_data, desc = "Processing..."):
        user = int(row[0])
        true_item = int(row[1])
        
        recommendations = recommend(user, user_similarity, item_similarity, user_item_matrix, k, num_u, num_i, total_i)
        
        if true_item in recommendations:
            hit += 1
        total += 1
    
    hit_at_k = hit / total if total > 0 else 0
    return hit_at_k

# Main Execution
def main():
    # Read training and test data
    print("Reading data...")
    rate_train, rate_test = read_data(data_path, test_path)
    print("Data reading completed.")
    
    # Initialize and fit User-User CF
    print("Fitting User-User Collaborative Filtering...")
    cf_user = CollaborativeFilteringTopN(rate_train, mode='user')
    cf_user.fit()
    user_similarity = cf_user.get_similarity()
    print("User-User CF fitting completed.")
    
    # Initialize and fit Item-Item CF
    print("Fitting Item-Item Collaborative Filtering...")
    # For Item-Item CF, swap user and item
    rate_train_ii = rate_train[:, [1, 0, 2]].copy()
    rate_test_ii = rate_test[:, [1, 0, 2]].copy()
    cf_item = CollaborativeFilteringTopN(rate_train_ii, mode='item')
    cf_item.fit()
    item_similarity = cf_item.get_similarity()
    print("Item-Item CF fitting completed.")
    
    # Convert training data to user-item matrix for recommendation functions
    print("Building user-item interaction matrix...")
    n_users = int(rate_train[:, 0].max()) + 1
    n_items = int(rate_train[:, 1].max()) + 1
    user_item_matrix = sparse.lil_matrix((n_users, n_items), dtype=int)
    for row in rate_train:
        user = int(row[0])
        item = int(row[1])
        user_item_matrix[user, item] = 1
    user_item_matrix = user_item_matrix.tocsr()
    print("User-item interaction matrix built.")
    
    # Evaluate User-User CF
    print("Evaluating User-User CF with Hit@10...")
    hit_at_10_uu = evaluate_cf(user_similarity, item_similarity, user_item_matrix, rate_test, k=10, num_u=num_u, num_i=num_i, total_i=total_i)
    print(f"User-User CF Hit@10: {hit_at_10_uu:.4f}")
    
    # Evaluate Item-Item CF
    print("Evaluating Item-Item CF with Hit@10...")
    # For Item-Item CF evaluation, use the swapped test data and interaction matrix
    # Create item-user interaction matrix for evaluation
    n_items_ii = int(rate_train_ii[:, 0].max()) + 1
    n_users_ii = int(rate_train_ii[:, 1].max()) + 1
    item_user_matrix = sparse.lil_matrix((n_items_ii, n_users_ii), dtype=int)
    for row in rate_train_ii:
        item = int(row[0])
        user = int(row[1])
        item_user_matrix[item, user] = 1
    item_user_matrix = item_user_matrix.tocsr()
    
    hit_at_10_ii = evaluate_cf(item_similarity, user_similarity, item_user_matrix, rate_test_ii, k=10, num_u=num_u, num_i=num_i, total_i=total_i)
    print(f"Item-Item CF Hit@10: {hit_at_10_ii:.4f}")
    
    # Example Recommendation
    example_user = 0  # Change as needed
    top_10_recommendations = recommend(example_user, user_similarity, item_similarity, user_item_matrix, k=10, num_u=num_u, num_i=num_i, total_i=total_i)
    print(f"Top 10 recommendations for User {example_user}: {top_10_recommendations}")

if __name__ == "__main__":
    main()


Reading data...
Data reading completed.
Fitting User-User Collaborative Filtering...
User-User CF fitting completed.
Fitting Item-Item Collaborative Filtering...
Item-Item CF fitting completed.
Building user-item interaction matrix...
User-item interaction matrix built.
Evaluating User-User CF with Hit@10...


Processing...: 100%|██████████| 9430/9430 [06:42<00:00, 23.43it/s] 


User-User CF Hit@10: 0.2133
Evaluating Item-Item CF with Hit@10...


Processing...: 100%|██████████| 9430/9430 [06:05<00:00, 25.77it/s]

Item-Item CF Hit@10: 0.0709
Top 10 recommendations for User 0: [293, 116, 404, 422, 201, 545, 654, 567, 264, 272]



