# Custom User-based Model
This notebook aims to create a `UserBased` class that inherits from the `AlgoBase` class (Surprise package) and that can be customized with various similarity metrics, peer groups and score aggregation functions.

In [33]:
# reloads modules automatically before entering the execution of code
%load_ext autoreload
%autoreload 2

# standard library imports
import sys

# third parties imports
import numpy as np 
import pandas as pd


from surprise import AlgoBase
from surprise import Dataset, Reader
from surprise import KNNWithMeans
from surprise import PredictionImpossible
import heapq

# local imports
sys.path.append("C:/Users/belgn/OneDrive - UCL/Master 3/Cours/Recommander systems") #adjust the path to your local path
from constants import Constant as C  
from loaders import load_ratings
from loaders import load_items
from loaders import load_links
from loaders import load_tags

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# 1. Loading Data
Prepare a dataset to help implement a user-based recommender system.

In [34]:
# Load the ratings and wrap the (user, item, rating) columns in a Surprise dataset.
# The reader declares the rating scale as 0.5 to 5.0.
df_ratings = load_ratings()
reader = Reader(rating_scale=(0.5, 5.0))
data = Dataset.load_from_df(
    df_ratings[['userId', 'movieId', 'rating']],
    reader,
)

# Full trainset: every available rating is used for training.
trainset = data.build_full_trainset()

# Anti-testset built from that trainset; it is used below as the set of pairs to predict.
anti_testset = trainset.build_anti_testset()

# 2. Explore Surprise's user-based algorithm
Display user-based predictions and the similarity matrix on the test dataset using the `KNNWithMeans` class.

In [35]:
# -- Similarity options used to explore Surprise's user-based algorithm --
Param_Sim = dict(
    name='msd',        # metric: mean squared difference
    min_support=3,     # at least 3 items in common before two users are compared
    user_based=True,   # user-user similarity
)

In [36]:
# Notebook-level reference model: it is reused further down to inspect the
# similarities of user 11.
algo = KNNWithMeans(
    k=3,
    min_k=2,
    sim_options=Param_Sim,
)
algo.fit(trainset)

Computing the msd similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithMeans at 0x1afc97e2920>

In [37]:
# Estimated rating of raw user 11 for raw item 364 with the reference model.
prediction = algo.predict(11, 364)
print(prediction.est)


2.49203431372549


### Playing with KNN

In [38]:
def test_min_k(min_k_value, n_display=30):
    """Fit a user-based ``KNNWithMeans`` with the given ``min_k`` and print its first predictions.

    Relies on the notebook-level ``trainset`` and ``anti_testset``.

    Parameters
    ----------
    min_k_value : int
        Value passed as ``min_k`` to ``KNNWithMeans``.
    n_display : int, optional
        Number of anti-testset pairs to predict and print (default 30).
    """
    print(f"\n--- Résultats avec min_k = {min_k_value} ---")

    sim_options = {
        'name': 'msd',
        'min_support': 3,
        'user_based': True
    }

    algo = KNNWithMeans(k=3, min_k=min_k_value, sim_options=sim_options)
    algo.fit(trainset)

    # Only the displayed pairs are predicted: predicting the whole anti-testset
    # just to print its first rows is wasted work.
    predictions = algo.test(anti_testset[:n_display])

    for p in predictions:
        print(f"user: {p.uid}, item: {p.iid}, est: {p.est:.2f}")


In [39]:
# Run the experiment with min_k = 1.
test_min_k(min_k_value=1)


--- Résultats avec min_k = 1 ---
Computing the msd similarity matrix...
Done computing similarity matrix.
user: 11, item: 1214, est: 3.60
user: 11, item: 364, est: 2.49
user: 11, item: 4308, est: 1.60
user: 11, item: 527, est: 3.90
user: 13, item: 1997, est: 2.80
user: 13, item: 4993, est: 3.24
user: 13, item: 2700, est: 2.80
user: 13, item: 1721, est: 1.24
user: 13, item: 527, est: 3.24
user: 17, item: 2028, est: 3.81
user: 17, item: 4993, est: 4.13
user: 17, item: 1214, est: 3.69
user: 17, item: 4308, est: 1.69
user: 19, item: 1997, est: 3.50
user: 19, item: 2028, est: 3.50
user: 19, item: 4993, est: 3.50
user: 19, item: 5952, est: 3.50
user: 19, item: 2700, est: 3.50
user: 19, item: 1721, est: 3.50
user: 19, item: 1214, est: 3.50
user: 19, item: 364, est: 3.50
user: 23, item: 1997, est: 2.78
user: 23, item: 2700, est: 2.35
user: 27, item: 1997, est: 4.67
user: 27, item: 2028, est: 5.00
user: 27, item: 5952, est: 5.00
user: 27, item: 2700, est: 4.67
user: 27, item: 1721, est: 3.10
u

In [40]:
# Run the experiment with min_k = 2.
test_min_k(min_k_value=2)


--- Résultats avec min_k = 2 ---
Computing the msd similarity matrix...
Done computing similarity matrix.
user: 11, item: 1214, est: 3.17
user: 11, item: 364, est: 2.49
user: 11, item: 4308, est: 3.17
user: 11, item: 527, est: 3.90
user: 13, item: 1997, est: 2.80
user: 13, item: 4993, est: 2.80
user: 13, item: 2700, est: 2.80
user: 13, item: 1721, est: 2.80
user: 13, item: 527, est: 2.80
user: 17, item: 2028, est: 3.81
user: 17, item: 4993, est: 4.13
user: 17, item: 1214, est: 3.25
user: 17, item: 4308, est: 3.25
user: 19, item: 1997, est: 3.50
user: 19, item: 2028, est: 3.50
user: 19, item: 4993, est: 3.50
user: 19, item: 5952, est: 3.50
user: 19, item: 2700, est: 3.50
user: 19, item: 1721, est: 3.50
user: 19, item: 1214, est: 3.50
user: 19, item: 364, est: 3.50
user: 23, item: 1997, est: 2.78
user: 23, item: 2700, est: 2.35
user: 27, item: 1997, est: 4.67
user: 27, item: 2028, est: 4.67
user: 27, item: 5952, est: 4.67
user: 27, item: 2700, est: 4.67
user: 27, item: 1721, est: 4.67
u

In [41]:
# Run the experiment with min_k = 3.
test_min_k(min_k_value=3)


--- Résultats avec min_k = 3 ---
Computing the msd similarity matrix...
Done computing similarity matrix.
user: 11, item: 1214, est: 3.17
user: 11, item: 364, est: 3.17
user: 11, item: 4308, est: 3.17
user: 11, item: 527, est: 3.17
user: 13, item: 1997, est: 2.80
user: 13, item: 4993, est: 2.80
user: 13, item: 2700, est: 2.80
user: 13, item: 1721, est: 2.80
user: 13, item: 527, est: 2.80
user: 17, item: 2028, est: 3.25
user: 17, item: 4993, est: 3.25
user: 17, item: 1214, est: 3.25
user: 17, item: 4308, est: 3.25
user: 19, item: 1997, est: 3.50
user: 19, item: 2028, est: 3.50
user: 19, item: 4993, est: 3.50
user: 19, item: 5952, est: 3.50
user: 19, item: 2700, est: 3.50
user: 19, item: 1721, est: 3.50
user: 19, item: 1214, est: 3.50
user: 19, item: 364, est: 3.50
user: 23, item: 1997, est: 2.56
user: 23, item: 2700, est: 2.56
user: 27, item: 1997, est: 4.67
user: 27, item: 2028, est: 4.67
user: 27, item: 5952, est: 4.67
user: 27, item: 2700, est: 4.67
user: 27, item: 1721, est: 4.67
u

### 1st Observation min_k {1 -> 3}
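The predictions differ across the three tests.
As `min_k` increases, the model requires more valid neighbors before it aggregates their ratings; when fewer than `min_k` neighbors are available, the prediction falls back to the user's mean rating, so more estimates collapse to that mean (e.g. user 13 is predicted 2.80 for every item once `min_k` ≥ 2).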


In [42]:
def test_min_support(min_support_value, n_display=30):
    """Fit a user-based ``KNNWithMeans`` with the given ``min_support`` and print its first predictions.

    Each printed line also shows ``actual_k``, the number of neighbours reported
    in the prediction details. Relies on the notebook-level ``trainset`` and
    ``anti_testset``.

    Parameters
    ----------
    min_support_value : int
        Value stored under ``'min_support'`` in the similarity options.
    n_display : int, optional
        Number of anti-testset pairs to predict and print (default 30).
    """
    print(f"\n--- Résultats avec min_support = {min_support_value} ---")

    sim_options = {
        'name': 'msd',
        'min_support': min_support_value,
        'user_based': True
    }

    algo = KNNWithMeans(k=3, min_k=2, sim_options=sim_options)
    algo.fit(trainset)

    # Only the displayed pairs are predicted: predicting the whole anti-testset
    # just to print its first rows is wasted work.
    predictions = algo.test(anti_testset[:n_display])

    for p in predictions:
        # .get() keeps the loop alive if a prediction carries no 'actual_k' entry
        # in its details instead of failing with a KeyError.
        actual_k = p.details.get('actual_k', 'n/a')
        print(f"user: {p.uid}, item: {p.iid}, est: {p.est:.2f}, actual_k: {actual_k}")


In [43]:
# Run the experiment with min_support = 1.
test_min_support(min_support_value=1)


--- Résultats avec min_support = 1 ---
Computing the msd similarity matrix...
Done computing similarity matrix.
user: 11, item: 1214, est: 2.68, actual_k: 3
user: 11, item: 364, est: 3.05, actual_k: 3
user: 11, item: 4308, est: 2.36, actual_k: 2
user: 11, item: 527, est: 3.66, actual_k: 3
user: 13, item: 1997, est: 2.41, actual_k: 2
user: 13, item: 4993, est: 3.74, actual_k: 3
user: 13, item: 2700, est: 2.60, actual_k: 2
user: 13, item: 1721, est: 0.85, actual_k: 3
user: 13, item: 527, est: 3.67, actual_k: 3
user: 17, item: 2028, est: 3.66, actual_k: 3
user: 17, item: 4993, est: 3.85, actual_k: 3
user: 17, item: 1214, est: 2.92, actual_k: 3
user: 17, item: 4308, est: 1.91, actual_k: 3
user: 19, item: 1997, est: 3.50, actual_k: 1
user: 19, item: 2028, est: 3.43, actual_k: 2
user: 19, item: 4993, est: 3.86, actual_k: 2
user: 19, item: 5952, est: 4.47, actual_k: 3
user: 19, item: 2700, est: 3.50, actual_k: 1
user: 19, item: 1721, est: 1.43, actual_k: 2
user: 19, item: 1214, est: 2.89, ac

In [44]:
# Run the experiment with min_support = 2.
test_min_support(min_support_value=2)


--- Résultats avec min_support = 2 ---
Computing the msd similarity matrix...
Done computing similarity matrix.
user: 11, item: 1214, est: 2.99, actual_k: 2
user: 11, item: 364, est: 3.05, actual_k: 3
user: 11, item: 4308, est: 2.36, actual_k: 2
user: 11, item: 527, est: 3.90, actual_k: 2
user: 13, item: 1997, est: 2.41, actual_k: 2
user: 13, item: 4993, est: 3.92, actual_k: 2
user: 13, item: 2700, est: 2.60, actual_k: 2
user: 13, item: 1721, est: 0.85, actual_k: 3
user: 13, item: 527, est: 3.53, actual_k: 2
user: 17, item: 2028, est: 3.66, actual_k: 3
user: 17, item: 4993, est: 4.13, actual_k: 2
user: 17, item: 1214, est: 3.34, actual_k: 2
user: 17, item: 4308, est: 2.11, actual_k: 2
user: 19, item: 1997, est: 3.50, actual_k: 0
user: 19, item: 2028, est: 3.50, actual_k: 1
user: 19, item: 4993, est: 3.50, actual_k: 1
user: 19, item: 5952, est: 3.50, actual_k: 1
user: 19, item: 2700, est: 3.50, actual_k: 0
user: 19, item: 1721, est: 3.50, actual_k: 1
user: 19, item: 1214, est: 3.50, ac

In [45]:
# Run the experiment with min_support = 3.
test_min_support(min_support_value=3)


--- Résultats avec min_support = 3 ---
Computing the msd similarity matrix...
Done computing similarity matrix.
user: 11, item: 1214, est: 3.17, actual_k: 1
user: 11, item: 364, est: 2.49, actual_k: 2
user: 11, item: 4308, est: 3.17, actual_k: 1
user: 11, item: 527, est: 3.90, actual_k: 2
user: 13, item: 1997, est: 2.80, actual_k: 0
user: 13, item: 4993, est: 2.80, actual_k: 1
user: 13, item: 2700, est: 2.80, actual_k: 0
user: 13, item: 1721, est: 2.80, actual_k: 1
user: 13, item: 527, est: 2.80, actual_k: 1
user: 17, item: 2028, est: 3.81, actual_k: 2
user: 17, item: 4993, est: 4.13, actual_k: 2
user: 17, item: 1214, est: 3.25, actual_k: 1
user: 17, item: 4308, est: 3.25, actual_k: 1
user: 19, item: 1997, est: 3.50, actual_k: 0
user: 19, item: 2028, est: 3.50, actual_k: 0
user: 19, item: 4993, est: 3.50, actual_k: 0
user: 19, item: 5952, est: 3.50, actual_k: 0
user: 19, item: 2700, est: 3.50, actual_k: 0
user: 19, item: 1721, est: 3.50, actual_k: 0
user: 19, item: 1214, est: 3.50, ac

### 2nd Observation min_support {1->3}

The predictions differ across the tests. As `min_support` increases, fewer user pairs qualify for a similarity computation, as shown by the decreasing `actual_k`.  
This leaves fewer neighbors available for each prediction, which often results in a "default" estimate (here, the user's mean rating) and, overall, less personalized recommendations.

### -> Simple look at user 11

In [46]:
# Look up the similarity row of raw user 11 in the notebook-level `algo` model.
inner_uid = trainset.to_inner_uid(11)  # raw userId -> inner id
similarities = algo.sim[inner_uid]

import numpy as np

# Every other user paired with its similarity to user 11 (the user itself is skipped).
neighbors = []
for other_inner_id, other_sim in enumerate(similarities):
    if other_inner_id != inner_uid:
        neighbors.append((other_inner_id, other_sim))

# The 5 most similar users, highest similarity first.
top_neighbors = heapq.nlargest(5, neighbors, key=lambda pair: pair[1])

for neighbor_inner_id, score in top_neighbors:
    raw_id = trainset.to_raw_uid(neighbor_inner_id)
    print(f"neighbour: user {raw_id}, similarity = {score:.4f}")

neighbour: user 23, similarity = 0.4324
neighbour: user 17, similarity = 0.2462
neighbour: user 13, similarity = 0.0000
neighbour: user 19, similarity = 0.0000
neighbour: user 27, similarity = 0.0000


---

# 3. Implement and explore a customizable user-based algorithm
Create a self-made user-based algorithm that allows customizing the similarity metric, the peer group calculation and the aggregation function.

In [47]:
class UserBased(AlgoBase):
    """User-based k-NN rating predictor with mean-centred aggregation.

    The estimate for user ``u`` and item ``i`` is the mean rating of ``u`` plus
    the similarity-weighted average of ``r_vi - mean_v`` over the ``k`` most
    similar users ``v`` who rated ``i``. Only strictly positive similarities
    are used.

    Parameters
    ----------
    k : int
        Maximum number of neighbours aggregated per prediction.
    min_k : int
        Minimum number of neighbours required for aggregation; with fewer
        neighbours the estimate is the user's mean rating.
    sim_options : dict, optional
        Similarity configuration. The keys read by this class are ``'name'``
        (``'msd'`` by default, or ``'jacard'`` / ``'jaccard'``) and
        ``'min_support'`` (default 1, applied to the MSD similarity only).
    **kwargs
        Forwarded to ``AlgoBase.__init__``.
    """

    # Names accepted in sim_options['name']. "jacard" is the spelling used so far
    # in this notebook; "jaccard" is accepted as an alias.
    _SUPPORTED_SIMILARITIES = ("msd", "jacard", "jaccard")

    def __init__(self, k=3, min_k=1, sim_options=None, **kwargs):
        # A mutable default ({}) would be one dict shared by every instance
        # created without sim_options, so a fresh dict is built per instance.
        if sim_options is None:
            sim_options = {}
        AlgoBase.__init__(self, sim_options=sim_options, **kwargs)
        self.k = k
        self.min_k = min_k

    def fit(self, trainset):
        """Train on ``trainset``: rating matrix, similarity matrix and user means.

        Returns
        -------
        UserBased
            ``self``, so that calls can be chained.
        """
        AlgoBase.fit(self, trainset)

        # Step 1: dense user x item rating matrix
        self.compute_rating_matrix()

        # Step 2: user x user similarity matrix
        self.compute_similarity_matrix()

        # Step 3: mean rating of every user (NaN for a user without ratings)
        self.mean_ratings = []
        for uid in range(self.trainset.n_users):
            ratings = [r for (_, r) in self.trainset.ur[uid]]
            self.mean_ratings.append(np.mean(ratings) if ratings else np.nan)

        return self

    def estimate(self, u, i):
        """Estimate the rating of inner user ``u`` for inner item ``i``.

        Parameters
        ----------
        u : int
            Inner user id.
        i : int
            Inner item id.

        Returns
        -------
        float
            ``mean_u`` plus the weighted average of the neighbours' mean-centred
            ratings, or ``mean_u`` alone when fewer than ``min_k`` neighbours
            are usable.

        Raises
        ------
        PredictionImpossible
            If the user and/or the item is unknown to the trainset.
        """
        if not (self.trainset.knows_user(u) and self.trainset.knows_item(i)):
            raise PredictionImpossible("User and/or item is unknown.")

        mean_u = self.mean_ratings[u]

        # Candidate neighbours: users who rated item i, kept only when their
        # similarity to u is strictly positive and their mean rating is defined.
        candidates = []
        for v, r_vi in self.trainset.ir[i]:
            if v == u:
                continue
            sim = self.sim[u, v]
            if not np.isnan(self.mean_ratings[v]) and sim > 0:
                candidates.append((sim, r_vi, self.mean_ratings[v]))

        # Keep the k most similar candidates
        k_best = heapq.nlargest(self.k, candidates, key=lambda x: x[0])
        actual_k = len(k_best)

        if actual_k < self.min_k:
            return mean_u  # not enough neighbours: fall back to the user's mean

        num = sum(sim * (r_vi - mean_v) for sim, r_vi, mean_v in k_best)
        den = sum(abs(sim) for sim, _, _ in k_best)
        # den is 0 only when no neighbour was kept (possible when min_k is 0)
        return mean_u + num / den if den != 0 else mean_u

    def compute_rating_matrix(self):
        """Store the ratings in ``self.ratings_matrix``, a dense users x items array.

        Entries without a rating are NaN.
        """
        m = self.trainset.n_users
        n = self.trainset.n_items

        self.ratings_matrix = np.full((m, n), np.nan)

        for uid in range(m):
            for iid, rating in self.trainset.ur[uid]:
                self.ratings_matrix[uid, iid] = rating

    def compute_similarity_matrix(self):
        """Fill ``self.sim`` with the symmetric user-user similarity matrix.

        Requires ``self.ratings_matrix`` (see ``compute_rating_matrix``).

        * ``'msd'``: ``1 / (1 + msd)`` where ``msd`` is the mean squared
          difference over the items both users rated; 0 when they share fewer
          than ``min_support`` items (or none at all).
        * ``'jacard'`` / ``'jaccard'``: number of items rated by both users
          divided by the number of items rated by at least one of them.
          ``min_support`` is not applied to this metric.

        Raises
        ------
        ValueError
            If ``sim_options['name']`` is not a supported similarity.
        """
        sim_type = self.sim_options.get("name", "msd")
        if sim_type not in self._SUPPORTED_SIMILARITIES:
            # Without this check the loop below would use an undefined (or stale)
            # similarity value for an unsupported name.
            raise ValueError(
                f"Unknown similarity '{sim_type}'. "
                f"Supported names: {', '.join(self._SUPPORTED_SIMILARITIES)}."
            )
        min_support = self.sim_options.get("min_support", 1)

        m = self.trainset.n_users
        self.sim = np.eye(m)  # self-similarity is 1, hence the diagonal of ones

        # rated[u, i] is True when user u rated item i; computed once for all pairs
        rated = ~np.isnan(self.ratings_matrix)

        for u in range(m):
            row_u = self.ratings_matrix[u]
            rated_u = rated[u]
            for v in range(u + 1, m):  # upper triangle only, mirrored below
                rated_v = rated[v]

                if sim_type == "msd":
                    common_mask = rated_u & rated_v
                    support = np.sum(common_mask)
                    # support > 0 guards against the mean of an empty array
                    # (NaN) when min_support is set to 0 or less.
                    if support > 0 and support >= min_support:
                        row_v = self.ratings_matrix[v]
                        diffs = row_u[common_mask] - row_v[common_mask]
                        msd = np.mean(diffs ** 2)
                        similarity = 1 / (1 + msd)
                    else:
                        similarity = 0
                else:  # "jacard" / "jaccard"
                    intersection = np.sum(rated_u & rated_v)
                    union = np.sum(rated_u | rated_v)
                    similarity = intersection / union if union != 0 else 0

                self.sim[u, v] = similarity
                self.sim[v, u] = similarity


# 4. Compare KNNWithMeans with UserBased
Try to replicate KNNWithMeans with your self-made UserBased and check that outcomes are identical

In [48]:
# Similarity options shared by both models
sim_options = {
    'name': 'msd',
    'min_support': 3,
    'user_based': True
}

# Reference model: Surprise's KNNWithMeans
knn_model = KNNWithMeans(k=3, min_k=2, sim_options=sim_options)
knn_model.fit(trainset)
knn_preds = knn_model.test(anti_testset)

# Our own UserBased model, configured identically
user_model = UserBased(k=3, min_k=2, sim_options=sim_options)
user_model.fit(trainset)
user_preds = []

for uid, iid, _ in anti_testset:
    try:
        inner_uid = trainset.to_inner_uid(uid)
    except ValueError:
        # Unknown user: there is no personal mean to fall back on, so the
        # global mean is used instead of reading an unbound/stale inner id.
        user_preds.append((uid, iid, trainset.global_mean))
        continue

    try:
        inner_iid = trainset.to_inner_iid(iid)
        est = user_model.estimate(inner_uid, inner_iid)
    except (ValueError, PredictionImpossible):
        # Unknown item or impossible prediction: simple fallback to the user's mean.
        # Any other exception is a real bug and is left to propagate.
        est = user_model.mean_ratings[inner_uid]
    user_preds.append((uid, iid, est))

# Side-by-side display of the first 30 predictions (fewer if fewer are available)
print("\nComparaison des 30 premières prédictions :")
for rank, (knn_pred, user_pred) in enumerate(zip(knn_preds[:30], user_preds[:30]), start=1):
    print(f"[{rank}] uid: {knn_pred.uid}, iid: {knn_pred.iid} → KNN: {knn_pred.est:.4f} | UserBased: {user_pred[2]:.4f}")


Computing the msd similarity matrix...
Done computing similarity matrix.

Comparaison des 30 premières prédictions :
[1] uid: 11, iid: 1214 → KNN: 3.1667 | UserBased: 3.1667
[2] uid: 11, iid: 364 → KNN: 2.4920 | UserBased: 2.4920
[3] uid: 11, iid: 4308 → KNN: 3.1667 | UserBased: 3.1667
[4] uid: 11, iid: 527 → KNN: 3.8989 | UserBased: 3.8989
[5] uid: 13, iid: 1997 → KNN: 2.8000 | UserBased: 2.8000
[6] uid: 13, iid: 4993 → KNN: 2.8000 | UserBased: 2.8000
[7] uid: 13, iid: 2700 → KNN: 2.8000 | UserBased: 2.8000
[8] uid: 13, iid: 1721 → KNN: 2.8000 | UserBased: 2.8000
[9] uid: 13, iid: 527 → KNN: 2.8000 | UserBased: 2.8000
[10] uid: 17, iid: 2028 → KNN: 3.8125 | UserBased: 3.8125
[11] uid: 17, iid: 4993 → KNN: 4.1283 | UserBased: 4.1283
[12] uid: 17, iid: 1214 → KNN: 3.2500 | UserBased: 3.2500
[13] uid: 17, iid: 4308 → KNN: 3.2500 | UserBased: 3.2500
[14] uid: 19, iid: 1997 → KNN: 3.5000 | UserBased: 3.5000
[15] uid: 19, iid: 2028 → KNN: 3.5000 | UserBased: 3.5000
[16] uid: 19, iid: 4993 →

In [49]:
# (Optional) Same comparison as a DataFrame, restricted to the first 30 predictions
first_knn_preds = knn_preds[:30]
comparison_df = pd.DataFrame({
    "user_id": [p.uid for p in first_knn_preds],
    "item_id": [p.iid for p in first_knn_preds],
    "knn_est = ": [p.est for p in first_knn_preds],
    "userbased_est = ": [user_preds[idx][2] for idx in range(30)],
})

# Absolute error between the two estimates
estimate_gap = comparison_df["knn_est = "] - comparison_df["userbased_est = "]
comparison_df["abs_diff"] = estimate_gap.abs()

# Display
comparison_df.head(10)

Unnamed: 0,user_id,item_id,knn_est =,userbased_est =,abs_diff
0,11,1214,3.166667,3.166667,0.0
1,11,364,2.492034,2.492034,0.0
2,11,4308,3.166667,3.166667,0.0
3,11,527,3.898897,3.898897,0.0
4,13,1997,2.8,2.8,0.0
5,13,4993,2.8,2.8,0.0
6,13,2700,2.8,2.8,0.0
7,13,1721,2.8,2.8,0.0
8,13,527,2.8,2.8,0.0
9,17,2028,3.8125,3.8125,0.0


# 5. Compare MSD and Jaccard
Compare predictions made with MSD similarity and Jaccard similarity (selected in the code with the `'jacard'` name).


In [50]:
sim_options_msd = {'name': 'msd', 'min_support': 3, 'user_based': True}
sim_options_jacard = {'name': 'jacard', 'user_based': True}


def _estimate_raw_pairs(model, raw_pairs):
    """Return ``model.estimate`` for each raw ``(uid, iid, _)`` triple.

    Raw ids are converted to inner ids with the notebook-level ``trainset``.
    """
    return [
        model.estimate(trainset.to_inner_uid(uid), trainset.to_inner_iid(iid))
        for uid, iid, _ in raw_pairs
    ]


# Both models are evaluated on the same first 30 anti-testset pairs
first_pairs = anti_testset[:30]

# MSD model
model_msd = UserBased(k=3, min_k=2, sim_options=sim_options_msd)
model_msd.fit(trainset)
preds_msd = _estimate_raw_pairs(model_msd, first_pairs)

# Jacard model
model_jacard = UserBased(k=3, min_k=2, sim_options=sim_options_jacard)
model_jacard.fit(trainset)
preds_jacard = _estimate_raw_pairs(model_jacard, first_pairs)

# Comparison
import pandas as pd
comparison_df = pd.DataFrame({
    "user_id": [uid for uid, _, _ in first_pairs],
    "item_id": [iid for _, iid, _ in first_pairs],
    "estimate_msd": preds_msd,
    "estimate_jacard": preds_jacard
})
comparison_df["abs_diff"] = (comparison_df["estimate_msd"] - comparison_df["estimate_jacard"]).abs()
comparison_df.head(10)



Unnamed: 0,user_id,item_id,estimate_msd,estimate_jacard,abs_diff
0,11,1214,3.166667,2.754027,0.412639
1,11,364,2.492034,2.80033,0.308296
2,11,4308,3.166667,2.590774,0.575893
3,11,527,3.898897,3.987805,0.088908
4,13,1997,2.8,2.841667,0.041667
5,13,4993,2.8,3.535763,0.735763
6,13,2700,2.8,2.591667,0.208333
7,13,1721,2.8,0.969102,1.830898
8,13,527,2.8,3.59024,0.79024
9,17,2028,3.8125,3.618812,0.193688
