# Custom User-based Model
The present notebooks aims at creating a UserBased class that inherits from the Algobase class (surprise package) and that can be customized with various similarity metrics, peer groups and score aggregation functions. 

In [None]:
# reloads modules automatically before entering the execution of code
%load_ext autoreload
%autoreload 2

# standard library imports
import sys

# third parties imports
import numpy as np 
import pandas as pd

from surprise.model_selection import train_test_split, KFold, cross_validate, LeaveOneOut
from surprise import AlgoBase
from surprise import Dataset, Reader, accuracy 
from surprise import KNNWithMeans, KNNBasic
from surprise import PredictionImpossible
import heapq

from sklearn.metrics import mean_squared_error, mean_absolute_error
import numpy as np
import pandas as pd

# local imports
sys.path.append("C:/Users/belgn/OneDrive - UCL/Master 3/Cours/Recommander systems") #adjust the path to your local path
from constants import Constant as C 

from loaders import load_ratings




The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# 1. Loading Data
Prepare a dataset in order to help implementing a user-based recommender system

In [None]:
df_ratings= load_ratings(hack = True)
reader = Reader(rating_scale=(0.5, 5.0))
data = Dataset.load_from_df(df_ratings[['userId', 'movieId', 'rating']], reader)

# Construire le trainset et testset. 
trainset, testset= train_test_split(data,test_size = 0.2 )

# Construire l'anti-testset
anti_testset = trainset.build_anti_testset()

# 2. Creation of our user based model ( ITR, Jacard and msd )

In [34]:
class UserBased(AlgoBase):
    def __init__(self, k=3, min_k=1, sim_options={}, **kwargs):
        AlgoBase.__init__(self, sim_options=sim_options, **kwargs)
        self.k = k
        self.min_k = min_k

        
    def fit(self, trainset):
        AlgoBase.fit(self, trainset)
         # Étape 1 : construire la matrice des notes utilisateur-item
        self.compute_rating_matrix()

        # Étape 2 : construire la matrice de similarité utilisateur-utilisateur
        self.compute_similarity_matrix()

        # Étape 3 : calcul de la moyenne des notes pour chaque utilisateur
        self.mean_ratings = []
        for uid in range(self.trainset.n_users):
            ratings = [r for (_, r) in self.trainset.ur[uid]]
            if ratings:
                self.mean_ratings.append(np.mean(ratings))
            else:
                self.mean_ratings.append(np.nan)

        return self
    
    def estimate(self, u, i):
        if not (self.trainset.knows_user(u) and self.trainset.knows_item(i)):
            raise PredictionImpossible("User and/or item is unknown.")

        mean_u = self.mean_ratings[u]
        neighbors = self.trainset.ir[i]  # utilisateurs ayant noté l’item i

        # Construire liste des voisins valides : (similarité, note)
        neighbor_sims = []
        for v, r_vi in neighbors:
            if v == u:
                continue
            sim = self.sim[u, v]
            if not np.isnan(self.mean_ratings[v]) and sim > 0:
                neighbor_sims.append((sim, r_vi, self.mean_ratings[v]))

        # Garder les k plus similaires
        k_best = heapq.nlargest(self.k, neighbor_sims, key=lambda x: x[0])
        actual_k = len(k_best)

        if actual_k >= self.min_k:
            num = sum(sim * (r_vi - mean_v) for sim, r_vi, mean_v in k_best)
            den = sum(abs(sim) for sim, _, _ in k_best)
            estimate = mean_u + num / den if den != 0 else mean_u
        else:
            estimate = mean_u  # fallback

        estimate = max(0.5, min(5.0, estimate))
        
        return estimate

                    
    def compute_rating_matrix(self):
        m = self.trainset.n_users
        n = self.trainset.n_items

        self.ratings_matrix = np.empty((m, n))
        self.ratings_matrix[:] = np.nan

        for uid in range(m):
            for iid, rating in self.trainset.ur[uid]:
                self.ratings_matrix[uid, iid] = rating
    
    def show_rating_matrix(self):
        user_ids = [self.trainset.to_raw_uid(uid) for uid in range(self.trainset.n_users)]
        item_ids = [self.trainset.to_raw_iid(iid) for iid in range(self.trainset.n_items)]

        df = pd.DataFrame(self.ratings_matrix, index=user_ids, columns=item_ids)
        return df
    
    def compute_sparsity(self):
        total_cells = self.ratings_matrix.size
        filled_cells = np.count_nonzero(~np.isnan(self.ratings_matrix))
        sparsity = 1 - (filled_cells / total_cells)
        return sparsity
    
    
    
    def compute_similarity_matrix(self):
        # soit MSD, jacard, ou ITR
        m = self.trainset.n_users
        self.sim = np.eye(m)  # Similarité par rapport à soi-même = 1 => diago de 1

        sim_type = self.sim_options.get("name", "msd")
        min_support = self.sim_options.get("min_support", 1)

        ############################################################################
        def compute_sigma(ratings):
            mean_rating = np.mean(ratings)
            return np.sqrt(np.mean([(r - mean_rating) ** 2 for r in ratings]))
        
        def sim_triangle_prime(u_r, v_r):
            common = ~np.isnan(u_r) & ~np.isnan(v_r)
            if not np.any(common):
                return 0.0
            diff = u_r[common] - v_r[common]
            num = np.sqrt(np.sum(diff ** 2))
            denom = np.sqrt(np.sum(u_r[common] ** 2)) + np.sqrt(np.sum(v_r[common] ** 2))
            return 1 - num / denom if denom != 0 else 0.0

        def sim_urp(u_r, v_r):
            u_vals = u_r[~np.isnan(u_r)]
            v_vals = v_r[~np.isnan(v_r)]
            if len(u_vals) == 0 or len(v_vals) == 0:
                return 0.0
            mean_u = np.mean(u_vals)
            mean_v = np.mean(v_vals)
            sigma_u = compute_sigma(u_vals)
            sigma_v = compute_sigma(v_vals)
            diff_mean = abs(mean_u - mean_v)
            diff_sigma = abs(sigma_u - sigma_v)
            return 1 - (1 / (1 + np.exp(-diff_mean * diff_sigma)))

        ############################################################################
        
        for u in range(m):
            for v in range(u + 1, m):  # calculer triangle supérieur seulement
                row_u = self.ratings_matrix[u]
                row_v = self.ratings_matrix[v]

                if sim_type == "msd":
                    common_mask = ~np.isnan(row_u) & ~np.isnan(row_v)
                    support = np.sum(common_mask)
                    if support >= min_support:
                        diffs = row_u[common_mask] - row_v[common_mask]
                        msd = np.mean(diffs ** 2)
                        similarity = 1 / (1 + msd)
                    else:
                        similarity = 0

                elif sim_type == "jacard":
                    items_u = ~np.isnan(row_u)
                    items_v = ~np.isnan(row_v)
                    intersection = np.sum(items_u & items_v)
                    union = np.sum(items_u | items_v)
                    similarity = intersection / union if union != 0 else 0

                elif sim_type == "itr":
                    similarity = sim_triangle_prime(row_u, row_v) * sim_urp(row_u, row_v)
                    
                
                self.sim[u, v] = similarity
                self.sim[v, u] = similarity

# 3. Training model 

In [35]:
model = UserBased(k=3 ,min_k= 1, sim_options={"name": "itr"})
model.fit(trainset)

# Affichage de la matrice
df_matrix = model.show_rating_matrix()
print(df_matrix.head()) 
print(model.compute_sparsity())

        356     2338    1209    3697    3418    3396    27826   2136    \
148312     4.0     NaN     NaN     NaN     3.0     NaN     NaN     2.0   
140124     5.0     2.0     NaN     NaN     NaN     NaN     NaN     NaN   
97225      5.0     NaN     4.0     3.5     4.5     NaN     NaN     NaN   
127670     NaN     NaN     NaN     3.0     3.0     NaN     NaN     NaN   
59013      5.0     NaN     NaN     NaN     2.5     NaN     NaN     NaN   

        111     480     ...  112623  26231   4066    85179   100556  8341    \
148312     NaN     4.5  ...     NaN     NaN     NaN     NaN     NaN     NaN   
140124     4.0     5.0  ...     NaN     NaN     NaN     NaN     NaN     NaN   
97225      NaN     NaN  ...     NaN     NaN     NaN     NaN     NaN     NaN   
127670     4.0     NaN  ...     NaN     NaN     NaN     NaN     NaN     NaN   
59013      NaN     NaN  ...     NaN     NaN     NaN     NaN     NaN     NaN   

        149606  95499   78653   27792   
148312     NaN     NaN     NaN     NaN 

 # 4. Testing the model

In [None]:

y_true_custom, y_pred_custom = [], []

for uid, iid, true_rating in testset:
    if trainset.knows_user(uid) and trainset.knows_item(iid):
        inner_uid = trainset.to_inner_uid(uid)
        inner_iid = trainset.to_inner_iid(iid)

        try:
            est = model.estimate(inner_uid, inner_iid)
        except:
            est = np.nan

        if not np.isnan(est):
            y_true_custom.append(true_rating)
            y_pred_custom.append(est)

rmse_custom = mean_squared_error(y_true_custom, y_pred_custom)
mae_custom = mean_absolute_error(y_true_custom, y_pred_custom)


knn = KNNBasic(k=3, sim_options={"name": "msd", "min_k" : 1, "user_based": True})
knn.fit(trainset)
preds_knn = knn.test(testset)

y_true_knn = [pred.r_ui for pred in preds_knn]
y_pred_knn = [pred.est for pred in preds_knn]

rmse_knn = mean_squared_error(y_true_knn, y_pred_knn)
mae_knn = mean_absolute_error(y_true_knn, y_pred_knn)


comparison_df = pd.DataFrame({
    "Modèle": ["Custom UserBased", "KNNBasics Surprise"],
    "RMSE": [rmse_custom, rmse_knn],
    "MAE": [mae_custom, mae_knn]
})

print(comparison_df)


Computing the msd similarity matrix...
Done computing similarity matrix.
               Modèle      RMSE       MAE
0    Custom UserBased  0.807677  0.709432
1  KNNBasics Surprise  0.801287  0.678380


In [37]:
preds= []
for uid, iid, _ in anti_testset[:30]:
    inner_uid = trainset.to_inner_uid(uid)
    inner_iid = trainset.to_inner_iid(iid)
    est = model.estimate(inner_uid, inner_iid)
    preds.append(est)
    

comparison_df = pd.DataFrame({
    "user_id": [uid for uid, _, _ in testset[:30]],
    "item_id": [iid for _, iid, _ in testset[:30]],
    "estimate_itr": preds,})  

comparison_df

Unnamed: 0,user_id,item_id,estimate_itr
0,259277,6378,1.899659
1,142654,53322,3.923475
2,171022,92420,1.905749
3,116578,1359,3.552886
4,154250,2791,3.302023
5,208156,2288,4.630576
6,38195,2396,3.250726
7,96711,1974,2.481835
8,249258,5900,4.425115
9,28187,2396,3.276965


In [38]:
from collections import defaultdict

# Générer les prédictions sur TOUT l'anti-testset
preds = []
for uid, iid, _ in anti_testset:
    try:
        inner_uid = trainset.to_inner_uid(uid)
        inner_iid = trainset.to_inner_iid(iid)
        est = model.estimate(inner_uid, inner_iid)
        preds.append((uid, iid, est))
    except:
        continue  # ignorer les cas impossibles

#  Top-N
def get_top_n(predictions, n=5):
    top_n = defaultdict(list)

    for uid, iid, est in predictions:
        top_n[uid].append((iid, est))

    # Garder les n meilleurs items triés par score décroissant
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    return top_n

top_n_preds = get_top_n(preds, n=5)

# Transformer en DataFrame pour affichage
rows = []
for uid, items in top_n_preds.items():
    for iid, est in items:
        rows.append({
            "user_id": uid,
            "item_id": iid,
            "predicted_rating": est
        })

topn_df = pd.DataFrame(rows)
print(topn_df.head())

   user_id  item_id  predicted_rating
0   148312     1423               5.0
1   148312    38499               5.0
2   148312      318               5.0
3   148312     1953               5.0
4   148312      549               5.0


## Ne pas regarder en dessous ( ancien code de base )

## ----------------------------------------------------------------------

# 2. Explore Surprise's user-based algorithm
Displays user-based predictions and similarity matrix on the test dataset using the KNNWithMeans class

In [7]:
# -- using surprise's user-based algorithm, explore the impact of different parameters and displays predictions --
# Options de similarité
Param_Sim = {
    'name': 'msd',         # métrique : mean squared difference
    'min_support': 3,      # au moins 3 films en commun pour comparer deux users
    'user_based': True     # user-user similarity
}

In [8]:
algo = KNNWithMeans(k=3, min_k=2, sim_options=Param_Sim)
algo.fit(trainset)

Computing the msd similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithMeans at 0x1fd4f872b60>

In [9]:
prediction = algo.predict(uid=11, iid=364)
print(prediction.est)


4.252920516369196


## 4. Playing with KNN

In [10]:
def test_min_k(min_k_value):
    print("--------------------------------")
    print(f"\n--- Résultats avec min_k = {min_k_value} ---")
    
    sim_options = {
        'name': 'msd',
        'min_support': 3,
        'user_based': True
    }

    algo = KNNWithMeans(k=3, min_k=min_k_value, sim_options=sim_options)
    algo.fit(trainset)
    predictions = algo.test(anti_testset)

    # Afficher les 30 premières prédictions
    for p in predictions[:30]:
        print(f"user: {p.uid}, item: {p.iid}, est: {p.est:.2f}")
        
    rmse = accuracy.rmse(predictions, verbose=False)
    print(f"📉 RMSE pour min_k = {min_k_value} : {rmse:.4f}")


In [11]:
for value in range(1, 4):  # min_k de 1 à 3
    test_min_k(value)

--------------------------------

--- Résultats avec min_k = 1 ---
Computing the msd similarity matrix...
Done computing similarity matrix.
user: 1, item: 10, est: 2.62
user: 1, item: 17, est: 3.19
user: 1, item: 39, est: 2.53
user: 1, item: 47, est: 2.98
user: 1, item: 50, est: 4.04
user: 1, item: 52, est: 2.13
user: 1, item: 62, est: 2.77
user: 1, item: 110, est: 2.70
user: 1, item: 144, est: 2.13
user: 1, item: 150, est: 1.80
user: 1, item: 153, est: 2.13
user: 1, item: 161, est: 2.67
user: 1, item: 165, est: 3.06
user: 1, item: 168, est: 2.86
user: 1, item: 185, est: 2.41
user: 1, item: 186, est: 1.54
user: 1, item: 208, est: 1.23
user: 1, item: 222, est: 2.61
user: 1, item: 223, est: 3.44
user: 1, item: 225, est: 1.91
user: 1, item: 235, est: 2.23
user: 1, item: 248, est: 1.78
user: 1, item: 253, est: 2.77
user: 1, item: 261, est: 2.41
user: 1, item: 265, est: 2.72
user: 1, item: 266, est: 3.00
user: 1, item: 272, est: 3.39
user: 1, item: 273, est: 2.20
user: 1, item: 292, est: 1.

### 1st Observation min_k {1 -> 3}
the prediction are different for the different tests.
As "min_k" increases, the model becomes more selective about which users are considered neighbors, which leads to fewer valid neighbors being used in the prediction.


In [12]:
def test_min_support(min_support_value):
    print(f"\n--- Résultats avec min_support = {min_support_value} ---")
    
    sim_options = {
        'name': 'msd',
        'min_support': min_support_value,
        'user_based': True
    }

    algo = KNNWithMeans(k=3, min_k=2, sim_options=sim_options)
    algo.fit(trainset)
    predictions = algo.test(anti_testset)

    for p in predictions[:30]:
        print(f"user: {p.uid}, item: {p.iid}, est: {p.est:.2f}, actual_k: {p.details['actual_k']}")

    rmse = accuracy.rmse(predictions, verbose=False)
    print(f"📉 RMSE pour min_k = {min_support_value} : {rmse:.4f}")


In [13]:
for value in range(1, 4):  # min_k de 1 à 5
    test_min_support(value)


--- Résultats avec min_support = 1 ---
Computing the msd similarity matrix...
Done computing similarity matrix.
user: 1, item: 10, est: 2.30, actual_k: 3
user: 1, item: 17, est: 3.09, actual_k: 3
user: 1, item: 39, est: 2.70, actual_k: 3
user: 1, item: 47, est: 2.03, actual_k: 3
user: 1, item: 50, est: 3.24, actual_k: 3
user: 1, item: 52, est: 2.37, actual_k: 3
user: 1, item: 62, est: 3.18, actual_k: 3
user: 1, item: 110, est: 2.47, actual_k: 3
user: 1, item: 144, est: 1.79, actual_k: 3
user: 1, item: 150, est: 2.27, actual_k: 3
user: 1, item: 153, est: 1.49, actual_k: 3
user: 1, item: 161, est: 3.18, actual_k: 3
user: 1, item: 165, est: 2.57, actual_k: 3
user: 1, item: 168, est: 1.97, actual_k: 3
user: 1, item: 185, est: 2.26, actual_k: 3
user: 1, item: 186, est: 2.57, actual_k: 3
user: 1, item: 208, est: 1.18, actual_k: 3
user: 1, item: 222, est: 3.11, actual_k: 3
user: 1, item: 223, est: 2.60, actual_k: 3
user: 1, item: 225, est: 2.70, actual_k: 3
user: 1, item: 235, est: 3.01, act

### 2nd Observation min_support {1->3}

The prediction are different for the different tests. As min_support increases, fewer user pairs qualify for similarity calculation, we can see it with the "actual_k".  
This causes fewer neighbors to be available for predictions, which often results by a "default estimation (for example here : user mean), and overall less personalized recommendations.

### -> Simple look at user 11

In [14]:
inner_uid = trainset.to_inner_uid(11)  # conversion userId → inner_id
similarities = algo.sim[inner_uid]

# the best 5 neighbours
import numpy as np

neighbors = [(i, sim) for i, sim in enumerate(similarities) if i != inner_uid]
top_neighbors = sorted(neighbors, key=lambda x: x[1], reverse=True)[:5]

for neighbor_inner_id, score in top_neighbors:
    raw_id = trainset.to_raw_uid(neighbor_inner_id)
    print(f"neighbour: user {raw_id}, similarity = {score:.4f}")

neighbour: user 450, similarity = 1.0000
neighbour: user 125, similarity = 0.9412
neighbour: user 562, similarity = 0.9412
neighbour: user 147, similarity = 0.9231
neighbour: user 595, similarity = 0.9231


---

# 3. Implement and explore a customizable user-based algorithm
Create a self-made user-based algorithm allowing to customize the similarity metric, peer group calculation and aggregation function

In [None]:
model = UserBased(k=5, sim_options={"name": "itr"})
model.fit(trainset)

# Affichage de la matrice
df_matrix = model.show_rating_matrix()
print(df_matrix.head()) 
print(model.compute_sparsity())

   31      1029    1061    1129    1172    1263    1287    1293    1339    \
1     2.5     3.0     3.0     2.0     4.0     2.0     2.0     2.0     3.5   
2     NaN     NaN     NaN     NaN     NaN     NaN     NaN     NaN     NaN   
3     NaN     NaN     NaN     NaN     NaN     NaN     NaN     NaN     NaN   
4     NaN     NaN     NaN     NaN     NaN     NaN     NaN     NaN     NaN   
5     NaN     NaN     NaN     NaN     NaN     NaN     NaN     NaN     NaN   

   1343    ...  134528  134783  137595  138204  60832   64997   72380   \
1     2.0  ...     NaN     NaN     NaN     NaN     NaN     NaN     NaN   
2     NaN  ...     NaN     NaN     NaN     NaN     NaN     NaN     NaN   
3     NaN  ...     NaN     NaN     NaN     NaN     NaN     NaN     NaN   
4     NaN  ...     NaN     NaN     NaN     NaN     NaN     NaN     NaN   
5     NaN  ...     NaN     NaN     NaN     NaN     NaN     NaN     NaN   

   129     4736    6425    
1     NaN     NaN     NaN  
2     NaN     NaN     NaN  
3     Na

In [17]:
print(model.compute_sparsity())

0.9835608583913366


# 4. Compare KNNWithMeans with UserBased
Try to replicate KNNWithMeans with your self-made UserBased and check that outcomes are identical

In [18]:
# Options communes
sim_options = {
    'name': 'msd',
    'min_support': 3,
    'user_based': True
}

# Entraînement du modèle KNNWithMeans
knn_model = KNNWithMeans(k=3, min_k=2, sim_options=sim_options)
knn_model.fit(trainset)
knn_preds = knn_model.test(anti_testset)

# Entraînement de notre modèle UserBased
user_model = UserBased(k=3, min_k=2, sim_options=sim_options)
user_model.fit(trainset)
user_preds = []

for uid, iid, _ in anti_testset:
    try:
        inner_uid = trainset.to_inner_uid(uid)
        inner_iid = trainset.to_inner_iid(iid)
        est = user_model.estimate(inner_uid, inner_iid)
    except:
        est = user_model.mean_ratings[inner_uid]  # fallback simple si erreur
    user_preds.append((uid, iid, est))

# Affichage des 30 premières comparaisons
print("\nComparaison des 30 premières prédictions :")
for i in range(30):
    knn_est = knn_preds[i].est
    user_est = user_preds[i][2]
    print(f"[{i+1}] uid: {knn_preds[i].uid}, iid: {knn_preds[i].iid} → KNN: {knn_est:.4f} | UserBased: {user_est:.4f}")


Computing the msd similarity matrix...
Done computing similarity matrix.

Comparaison des 30 premières prédictions :
[1] uid: 1, iid: 10 → KNN: 2.6248 | UserBased: 2.6248
[2] uid: 1, iid: 17 → KNN: 3.1901 | UserBased: 3.1901
[3] uid: 1, iid: 39 → KNN: 2.5295 | UserBased: 2.5295
[4] uid: 1, iid: 47 → KNN: 2.9796 | UserBased: 2.9796
[5] uid: 1, iid: 50 → KNN: 4.0432 | UserBased: 4.0432
[6] uid: 1, iid: 52 → KNN: 2.1271 | UserBased: 2.1271
[7] uid: 1, iid: 62 → KNN: 2.7740 | UserBased: 2.7740
[8] uid: 1, iid: 110 → KNN: 2.6954 | UserBased: 2.6954
[9] uid: 1, iid: 144 → KNN: 2.1305 | UserBased: 2.1305
[10] uid: 1, iid: 150 → KNN: 1.8002 | UserBased: 1.8002
[11] uid: 1, iid: 153 → KNN: 2.1304 | UserBased: 2.1304
[12] uid: 1, iid: 161 → KNN: 2.6733 | UserBased: 2.6733
[13] uid: 1, iid: 165 → KNN: 3.0645 | UserBased: 3.0645
[14] uid: 1, iid: 168 → KNN: 2.8552 | UserBased: 2.8552
[15] uid: 1, iid: 185 → KNN: 2.4117 | UserBased: 2.4117
[16] uid: 1, iid: 186 → KNN: 1.5386 | UserBased: 1.5386
[17

In [19]:
# (Optionnel) Comparaison dans un DataFrame
comparison_df = pd.DataFrame({
    "user_id": [pred.uid for pred in knn_preds[:30]],
    "item_id": [pred.iid for pred in knn_preds[:30]],
    "knn_est = ": [pred.est for pred in knn_preds[:30]],
    "userbased_est = ": [user_preds[i][2] for i in range(30)],
})

# 4. Erreur absolue
comparison_df["abs_diff"] = (comparison_df["knn_est = "] - comparison_df["userbased_est = "]).abs()

# Affichage
comparison_df.head(10)

Unnamed: 0,user_id,item_id,knn_est =,userbased_est =,abs_diff
0,1,10,2.624776,2.624776,0.0
1,1,17,3.190128,3.190128,0.0
2,1,39,2.529483,2.529483,0.0
3,1,47,2.979633,2.979633,0.0
4,1,50,4.043171,4.043171,0.0
5,1,52,2.127121,2.127121,0.0
6,1,62,2.774034,2.774034,0.0
7,1,110,2.695398,2.695398,0.0
8,1,144,2.130513,2.130513,0.0
9,1,150,1.800208,1.800208,0.0


# 5. Compare MSD and Jacard
Compare predictions made with MSD similarity and Jacard similarity


In [20]:
sim_options_msd = {'name': 'msd', 'min_support': 3, 'user_based': True}
sim_options_jacard = {'name': 'jacard', 'user_based': True}
sim_option_ITR = {'name' : 'itr'}

# Modèle MSD
model_msd = UserBased(k=3, min_k=2, sim_options=sim_options_msd)
model_msd.fit(trainset)
preds_msd = []
for uid, iid, _ in anti_testset[:30]:
    inner_uid = trainset.to_inner_uid(uid)
    inner_iid = trainset.to_inner_iid(iid)
    est = model_msd.estimate(inner_uid, inner_iid)
    preds_msd.append(est)

# Modèle Jacard
model_jacard = UserBased(k=3, min_k=2, sim_options=sim_options_jacard)
model_jacard.fit(trainset)
preds_jacard = []
for uid, iid, _ in anti_testset[:30]:
    inner_uid = trainset.to_inner_uid(uid)
    inner_iid = trainset.to_inner_iid(iid)
    est = model_jacard.estimate(inner_uid, inner_iid)
    preds_jacard.append(est)

# Comparaison
import pandas as pd
comparison_df = pd.DataFrame({
    "user_id": [uid for uid, _, _ in anti_testset[:30]],
    "item_id": [iid for _, iid, _ in anti_testset[:30]],
    "estimate_msd": preds_msd,
    "estimate_jacard": preds_jacard
})
comparison_df["abs_diff"] = (comparison_df["estimate_msd"] - comparison_df["estimate_jacard"]).abs()
comparison_df.head(10)



Unnamed: 0,user_id,item_id,estimate_msd,estimate_jacard,abs_diff
0,1,10,2.624776,2.430417,0.194359
1,1,17,3.190128,2.799238,0.39089
2,1,39,2.529483,2.203109,0.326374
3,1,47,2.979633,2.923179,0.056454
4,1,50,4.043171,2.93789,1.105281
5,1,52,2.127121,2.72361,0.596489
6,1,62,2.774034,1.441311,1.332723
7,1,110,2.695398,3.319595,0.624197
8,1,144,2.130513,2.893976,0.763463
9,1,150,1.800208,2.03212,0.231912


# Comparaison entre Itr et Jacard 

In [None]:
sim_options_msd = {'name': 'msd', 'min_support': 3, 'user_based': True}
sim_options_jacard = {'name': 'jacard', 'user_based': True}
sim_option_itr = {'name' : 'itr'}

# Modèle ITR
model_itr = UserBased(k=3, min_k=2, sim_options=sim_option_itr)
model_itr.fit(trainset)
preds_itr = []
for uid, iid, _ in anti_testset[:30]:
    inner_uid = trainset.to_inner_uid(uid)
    inner_iid = trainset.to_inner_iid(iid)
    est = model_itr.estimate(inner_uid, inner_iid)
    preds_itr.append(est)

# Modèle Jacard
model_jacard = UserBased(k=3, min_k=2, sim_options=sim_options_jacard)
model_jacard.fit(trainset)
preds_jacard = []
for uid, iid, _ in anti_testset[:30]:
    inner_uid = trainset.to_inner_uid(uid)
    inner_iid = trainset.to_inner_iid(iid)
    est = model_jacard.estimate(inner_uid, inner_iid)
    preds_jacard.append(est)

# Comparaison
import pandas as pd
comparison_df = pd.DataFrame({
    "user_id": [uid for uid, _, _ in anti_testset[:30]],
    "item_id": [iid for _, iid, _ in anti_testset[:30]],
    "estimate_itr": preds_itr,
    "estimate_jacard": preds_jacard
})
comparison_df["abs_diff"] = (comparison_df["estimate_itr"] - comparison_df["estimate_jacard"]).abs()
comparison_df.head(10)


Unnamed: 0,user_id,item_id,estimate_itr,estimate_jacard,abs_diff
0,1,10,2.306011,2.430417,0.124406
1,1,17,3.082265,2.799238,0.283028
2,1,39,2.57236,2.203109,0.369251
3,1,47,2.367102,2.923179,0.556077
4,1,50,3.135791,2.93789,0.197901
5,1,52,2.083982,2.72361,0.639628
6,1,62,2.908457,1.441311,1.467146
7,1,110,3.298014,3.319595,0.021581
8,1,144,1.853023,2.893976,1.040954
9,1,150,1.978474,2.03212,0.053647
