In [255]:
import pandas as pd
import numpy as np
from scipy import sparse
from tqdm import tqdm
import json

# Constants
mode = 'user'
method = np.dot
n_recommendations = 10
n_candidates = 19

In [256]:
def read_json(file):
    with open(file) as f:
        return json.load(f)

# Reading data file:
dataset = read_json("ml_100k.json")

movies_map = {}
distinct_movies = []
for elem in dataset:
    watched = elem[0].split(' | ')
    for movie in watched:
        if movie not in movies_map:
            movies_map[movie] = len(movies_map)
            distinct_movies.append(movie)
n_movies = len(movies_map)
print ("Number of different movies: ", n_movies)

Number of different movies:  1493


In [257]:
class CollaborativeFiltering:
    """
    Collaborative Filtering class for User-User or Item-Item predictions.
    """
    def __init__(self, method=method, mode=mode) -> None:
        """
        Initialize the collaborative filtering model.
        
        Parameters:
        - method: similarity function
        - mode: 'user' for User-User CF, 'item' for Item-Item CF
        """
        self.method = method
        self.mode = mode
        self.S = None # Similarity matrix

    def fit(self) -> None:
        """
        Compute the similarity matrix.
        """
        if self.mode == 'user':
            print("Computing User-User CF Similarity Matrix...")
            user_list = []
            for elem in dataset:
                watched_list = [0 for _ in range(n_movies)]
                watched = elem[0].split(' | ')
                for movie in watched:
                    watched_list[movies_map[movie]] = 1
                user_list.append(watched_list)
            user_matrix = np.array(user_list)
            self.S = self.method(user_matrix, user_matrix.T)
        else:  # mode == 'item'
            print("Computing Item-Item CF Similarity Matrix...")
            movie_list = [[0. for _ in range(len(dataset))] for _ in range(n_movies)]
            for i, elem in enumerate(dataset):
                movies = elem[0].split(' | ')
                for movie in movies:
                    movie_list[movies_map[movie]][i] = 1
            item_matrix = np.array(movie_list)
            self.S = self.method(item_matrix, item_matrix.T)
            
        # np.fill_diagonal(self.S, 0)

    def recommend(self, userid: int):
        watched = dataset[userid][0].split(' | ')
        movies_score = {}
        if self.mode == 'user':
            similariests = sorted(list(enumerate(self.S[userid])), key=lambda x:x[-1], reverse=True)[:n_recommendations]
            dvd = sum([e[-1] for e in similariests])
            for other_userid, similarity in similariests:
                weight = similarity / dvd
                other_watched = dataset[other_userid][0].split(' | ')
                for movie in other_watched:
                    if movie not in watched:
                        if movie not in movies_score:
                            movies_score[movie] = 0.
                        movies_score[movie] += weight
            candidate_pairs = list(sorted(movies_score.items(), key=lambda x:x[-1], reverse=True))
            candidate_items = [e[0] for e in candidate_pairs][:n_candidates]
        else:  # mode == 'item'
            for movie in watched:
                similariests = sorted(list(enumerate(self.S[movies_map[movie]])), key=lambda x: x[-1], reverse=True)[:n_recommendations]
                for movieid, weight in similariests:
                    other_movieid = distinct_movies[movieid]
                    if other_movieid not in watched:
                        if other_movieid not in movies_score:
                            movies_score[other_movieid] = 0.
                        movies_score[other_movieid] += weight
            candidate_pairs = list(sorted(movies_score.items(), key=lambda x:x[-1], reverse=True))
            candidate_items = [e[0] for e in candidate_pairs][:n_candidates]
        return candidate_items
    
    def evaluate_hit_rate(self) -> None:
        """
        Evaluate Hit Rate@N for the CF model.
        """
        if self.mode == 'user':
            print("Computing HR@10 using User-User CF...")
        else:  # mode == 'item'
            print("Computing HR@10 using Item-Item CF...")
        hits = 0
        total = len(dataset)
        for user_id, elem in enumerate(tqdm(dataset)):
            recommended = self.recommend(user_id)
            if elem[-1] in recommended:
                hits += 1
        print(hits / total)

In [258]:
# User-User CF
cf_user = CollaborativeFiltering(mode='user')
cf_user.fit()
cf_user.evaluate_hit_rate()

# Item-Item CF
cf_item = CollaborativeFiltering(mode='item')
cf_item.fit()
cf_item.evaluate_hit_rate()

Computing User-User CF Similarity Matrix...
Computing HR@10 using User-User CF...


100%|██████████| 943/943 [00:00<00:00, 1118.46it/s]


0.16118769883351008
Computing Item-Item CF Similarity Matrix...
Computing HR@10 using Item-Item CF...


100%|██████████| 943/943 [00:28<00:00, 33.46it/s]

0.15270413573700956



