# Code to explore the data and format it

In [None]:
import pandas as pd
import os
from helpers import *
from sklearn.model_selection import train_test_split


# Load the users, items (movies) and ratings tables through the project helper.
# NOTE(review): the previews shown in the cell output are presumably printed
# inside load_movielens_data() -- confirm in helpers.py.
data_users, data_items, data_ratings = load_movielens_data()

# Build one flat table for the SVM experiments: each rating row is joined with
# the movie title (on movie_id) and with the user's age and gender (on user_id).
svm_data = data_ratings.merge(data_items[['movie_id', 'title']], on='movie_id').merge(data_users[['user_id', 'age', 'gender']], on='user_id')

# Quick sanity check of the merged table: first rows and overall shape.
print("\nSVM-ready DataFrame:")
print(svm_data.head())
print(f"SVM data shape: {svm_data.shape}")


Users DataFrame:
   user_id  age gender  occupation zip_code
0        1   24      M  technician    85711
1        2   53      F       other    94043
2        3   23      M      writer    32067
3        4   24      M  technician    43537
4        5   33      F       other    15213
Users shape: (943, 5)

Movies DataFrame:
   movie_id              title
0         1   Toy Story (1995)
1         2   GoldenEye (1995)
2         3  Four Rooms (1995)
3         4  Get Shorty (1995)
4         5     Copycat (1995)
Movies shape: (1682, 24)

Ratings DataFrame:
   user_id  movie_id  rating  timestamp
0      196       242       3  881250949
1      186       302       3  891717742
2       22       377       1  878887116
3      244        51       2  880606923
4      166       346       1  886397596
Ratings shape: (100000, 4)

SVM-ready DataFrame:
   user_id  movie_id  rating  timestamp                       title  age  \
0      196       242       3  881250949                Kolya (1996)   49   
1     

In [None]:
import numpy as np
from typing import NamedTuple, List, Dict, Tuple
class PairwiseDataset(NamedTuple):
    """Data container for pairwise comparisons.

    The four arrays are parallel: position ``i`` in each of them describes the
    same comparison "user ``users[i]`` compared movie ``movie_j[i]`` with
    movie ``movie_k[i]``".
    """
    # Identifier of the user who made each comparison.
    users: np.ndarray
    # Identifier of the first movie of each compared pair.
    movie_j: np.ndarray
    # Identifier of the second movie of each compared pair.
    movie_k: np.ndarray
    # Comparison label: +1 when movie_j was rated strictly higher, -1 otherwise.
    preferences: np.ndarray

def create_pairwise_dataset(svm_data, drop_ties=False):
    """Turn per-user ratings into ordered pairwise movie comparisons.

    Every user's ratings are joined with themselves, so a user with ``n``
    rated movies yields ``n * (n - 1)`` ordered pairs ``(j, k)`` with
    ``j != k``. The result therefore grows quadratically with the number of
    ratings per user.

    Args:
        svm_data: DataFrame with at least the columns ``user_id``,
            ``movie_id`` and ``rating``.
        drop_ties: When True, pairs whose two ratings are equal are removed.
            When False (default) such pairs are kept and labelled -1 in both
            orderings.

    Returns:
        PairwiseDataset whose ``preferences`` are +1 when ``rating_j`` is
        strictly greater than ``rating_k`` and -1 otherwise.
    """
    ratings = svm_data[['user_id', 'movie_id', 'rating']]

    # Self-join on the user to enumerate every ordered pair of rated movies.
    pairs = ratings.merge(ratings, on='user_id', suffixes=('_j', '_k'))

    # A movie is never compared with itself.
    keep = pairs['movie_id_j'] != pairs['movie_id_k']
    if drop_ties:
        keep = keep & (pairs['rating_j'] != pairs['rating_k'])
    pairs = pairs[keep]

    # Compute the label as a standalone Series instead of assigning a new
    # column into the filtered frame (which triggers SettingWithCopyWarning).
    preference = (pairs['rating_j'] > pairs['rating_k']).astype(int) * 2 - 1

    # The merge suffixes produce 'movie_id_j' / 'movie_id_k'.
    return PairwiseDataset(
        users=pairs['user_id'].values,
        movie_j=pairs['movie_id_j'].values,
        movie_k=pairs['movie_id_k'].values,
        preferences=preference.values
    )

def split_pairwise_dataset(dataset, p_test=0.1, seed=1):
    """Randomly divide a PairwiseDataset into a training and a test part.

    One uniform number in [0, 1) is drawn per comparison from a generator
    seeded with ``seed``; comparisons whose draw is below ``p_test`` go to the
    test part, all others to the training part.

    Returns:
        (train_data, test_data), both PairwiseDataset instances.
    """
    generator = np.random.default_rng(seed)
    draws = generator.uniform(size=len(dataset.preferences))
    in_test = draws < p_test
    in_train = np.logical_not(in_test)

    def subset(selection):
        # Apply the same boolean selection to all four parallel arrays.
        return PairwiseDataset(
            users=dataset.users[selection],
            movie_j=dataset.movie_j[selection],
            movie_k=dataset.movie_k[selection],
            preferences=dataset.preferences[selection],
        )

    return subset(in_train), subset(in_test)

def mse(predictions, targets):
    """Return the mean of the squared differences between predictions and targets."""
    residual = predictions - targets
    return np.mean(residual * residual)

class AltSVMPairwisePredictor:
    """Scores pairwise comparisons with user and movie feature vectors.

    Both feature matrices are drawn from a standard normal distribution at
    construction time; no training happens inside this class.
    """

    def __init__(self, train_data: PairwiseDataset, num_features=20, seed=1):
        """Allocate random features sized from the largest ids in ``train_data``."""
        generator = np.random.default_rng(seed)

        # Ids are used directly as row indices, hence "largest id + 1" rows.
        user_count = np.max(train_data.users) + 1
        highest_movie = max(np.max(train_data.movie_j), np.max(train_data.movie_k))
        movie_count = highest_movie + 1

        # Draw order (users first, then movies) determines the seeded values.
        self.user_features = generator.normal(size=[user_count, num_features])
        self.movie_features = generator.normal(size=[movie_count, num_features])

    def __call__(self, test_data: PairwiseDataset):
        """Return, per comparison, the dot product of the user vector with (movie_j - movie_k)."""
        user_vectors = self.user_features[test_data.users]
        movie_gap = (
            self.movie_features[test_data.movie_j]
            - self.movie_features[test_data.movie_k]
        )
        return np.sum(user_vectors * movie_gap, axis=1)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered['preference'] = (filtered['rating_j'] > filtered['rating_k']).astype(int) * 2 - 1


Training data shape: (16080649, 3)
Test data shape: (4020163, 3)


In [None]:

def _group_indices(keys):
    """Map each distinct value in `keys` to the ascending positions where it occurs."""
    order = np.argsort(keys, kind="stable")
    values, starts = np.unique(keys[order], return_index=True)
    return dict(zip(values.tolist(), np.split(order, starts[1:])))


def train_alt_svm(train_data: PairwiseDataset, test_data: PairwiseDataset, num_features=20, user_reg=20, movie_reg=20, max_iter=1000, stop_crit=1e-4):
    """Fit a pairwise predictor by alternating regularized least squares.

    The model predicts ``u_i . (v_j - v_k)`` for a comparison (i, j, k) and is
    fitted to the +/-1 preferences with a squared loss. Each iteration solves
    a ridge regression for every user (movies fixed) and then for every movie
    (users and all other movies fixed).

    Args:
        train_data: comparisons used to fit the features.
        test_data: comparisons used only for the reported test error.
        num_features: length of each feature vector.
        user_reg: ridge strength of the per-user solves.
        movie_reg: ridge strength of the per-movie solves.
        max_iter: maximum number of alternating iterations.
        stop_crit: stop once the train error changes by less than this
            between two consecutive iterations.

    Returns:
        The trained AltSVMPairwisePredictor.
    """
    predictor = AltSVMPairwisePredictor(train_data, num_features)
    user_features = predictor.user_features
    movie_features = predictor.movie_features

    users = train_data.users
    movie_j = train_data.movie_j
    movie_k = train_data.movie_k
    prefs = train_data.preferences

    # The grouping of comparisons by user / movie never changes, so build it
    # once instead of scanning the full arrays for every user and movie in
    # every iteration.
    rows_by_user = _group_indices(users)
    rows_as_j = _group_indices(movie_j)
    rows_as_k = _group_indices(movie_k)
    no_rows = np.empty(0, dtype=np.intp)
    movies = np.unique(np.concatenate((movie_j, movie_k))).tolist()

    user_ridge = np.eye(num_features) * user_reg
    movie_ridge = np.eye(num_features) * movie_reg

    prev_train_error = None

    for iteration in range(max_iter):
        # Update user features: regress the preferences on (v_j - v_k).
        for user, rows in rows_by_user.items():
            v_diff = movie_features[movie_j[rows]] - movie_features[movie_k[rows]]
            r = prefs[rows]
            user_features[user] = np.linalg.solve(v_diff.T @ v_diff + user_ridge, v_diff.T @ r)

        # Update movie features one movie at a time, holding the others fixed.
        for movie in movies:
            rows_j = rows_as_j.get(movie, no_rows)
            rows_k = rows_as_k.get(movie, no_rows)

            # Movie in the j role: u.v - u.v_k = r  =>  u.v = r + u.v_k
            u_j = user_features[users[rows_j]]
            t_j = prefs[rows_j] + np.sum(u_j * movie_features[movie_k[rows_j]], axis=1)

            # Movie in the k role: u.v_j - u.v = r  =>  u.v = u.v_j - r
            u_k = user_features[users[rows_k]]
            t_k = np.sum(u_k * movie_features[movie_j[rows_k]], axis=1) - prefs[rows_k]

            u_combined = np.vstack((u_j, u_k))
            t_combined = np.concatenate((t_j, t_k))

            movie_features[movie] = np.linalg.solve(u_combined.T @ u_combined + movie_ridge, u_combined.T @ t_combined)

        train_error = mse(predictor(train_data), train_data.preferences)
        test_error = mse(predictor(test_data), test_data.preferences)
        print(f"Iteration {iteration+1}: Train Error: {train_error:.4f}, Test Error: {test_error:.4f}")

        if prev_train_error is not None and abs(train_error - prev_train_error) < stop_crit:
            break
        prev_train_error = train_error

    return predictor


In [None]:
import numpy as np

class AltSVMPairwisePredictor:
    """Pairwise preference model with per-comparison coefficients.

    The score of a comparison (user i, movie j, movie k) is
    ``user_features[i] . (movie_features[j] - movie_features[k])``.
    ``alpha`` and ``beta`` hold one coefficient per training comparison and
    are used by ``train`` to rebuild the user and movie features.
    """

    def __init__(self, train_data, num_features=20, seed=1):
        """Allocate random features and zero coefficients sized from ``train_data``.

        Args:
            train_data: object exposing the arrays ``users``, ``movie_j``,
                ``movie_k`` and ``preferences``.
            num_features: length of each feature vector.
            seed: seed of the generator used for the initial features and for
                the sampling done in ``train``.
        """
        rng = np.random.default_rng(seed)

        # Ids are used directly as row indices, hence "largest id + 1" rows.
        num_users = np.max(train_data.users) + 1
        num_movies = max(np.max(train_data.movie_j), np.max(train_data.movie_k)) + 1

        self.user_features = rng.normal(size=[num_users, num_features])
        self.movie_features = rng.normal(size=[num_movies, num_features])
        self.alpha = np.zeros(len(train_data.preferences))
        self.beta = np.zeros(len(train_data.preferences))
        # Keep the seeded generator so that training is reproducible as well.
        self._rng = rng

    def __call__(self, test_data):
        """Return, per comparison, the dot product of the user vector with (movie_j - movie_k)."""
        u = self.user_features[test_data.users]
        v_j = self.movie_features[test_data.movie_j]
        v_k = self.movie_features[test_data.movie_k]
        return np.sum(u * (v_j - v_k), axis=1)

    def _rebuild_movie_features(self, users, movie_j, movie_k, prefs):
        """Set every movie vector to the beta-weighted sum of its comparisons' user vectors."""
        self.movie_features[:] = 0.0
        # Comparisons with a zero coefficient contribute nothing, so only the
        # non-zero ones are visited.
        active = np.flatnonzero(self.beta[:len(prefs)])
        if active.size == 0:
            return
        contrib = (self.beta[active] * prefs[active])[:, None] * self.user_features[users[active]]
        # ufunc.at accumulates correctly when a movie index repeats.
        np.add.at(self.movie_features, movie_j[active], contrib)
        np.subtract.at(self.movie_features, movie_k[active], contrib)

    def _rebuild_user_features(self, users, movie_j, movie_k, prefs):
        """Set every user vector to the alpha-weighted sum of its comparisons' movie differences."""
        self.user_features[:] = 0.0
        active = np.flatnonzero(self.alpha[:len(prefs)])
        if active.size == 0:
            return
        v_diff = self.movie_features[movie_j[active]] - self.movie_features[movie_k[active]]
        contrib = (self.alpha[active] * prefs[active])[:, None] * v_diff
        np.add.at(self.user_features, users[active], contrib)

    def train(self, train_data, lambda_reg=0.1, num_epochs=10, learning_rate=0.01, S=5, T=4):
        """Train the AltSVM model using pairwise comparison data with delta updates.

        Each epoch rebuilds the movie features from ``beta``, applies ``T * S``
        sampled ``beta`` updates, rebuilds the user features from ``alpha`` and
        applies ``T * S`` sampled ``alpha`` updates. Comparisons are sampled
        with the generator seeded in ``__init__``.

        Args:
            train_data: the comparisons the predictor was constructed with
                (``alpha`` / ``beta`` are indexed by comparison position).
            lambda_reg: accepted for interface compatibility; not used.
            num_epochs: number of alternating epochs.
            learning_rate: accepted for interface compatibility; not used.
            S: number of sampled updates per inner block.
            T: number of inner blocks per half-epoch.
        """
        users = np.asarray(train_data.users)
        movie_j = np.asarray(train_data.movie_j)
        movie_k = np.asarray(train_data.movie_k)
        prefs = np.asarray(train_data.preferences)
        num_pairs = len(prefs)

        for epoch in range(num_epochs):
            # Update item features V
            self._rebuild_movie_features(users, movie_j, movie_k, prefs)

            # Block of beta updates
            for _ in range(T):
                for _ in range(S):
                    idx = self._rng.integers(num_pairs)
                    i, j, k, pref = users[idx], movie_j[idx], movie_k[idx], prefs[idx]
                    u_vec = self.user_features[i]

                    # NOTE(review): the step is the raw residual, unscaled by
                    # learning_rate or the feature norm -- confirm against
                    # equation (10) of the reference this was derived from.
                    pred = np.dot(u_vec, self.movie_features[j] - self.movie_features[k])
                    delta = pref - pred
                    self.beta[idx] += delta
                    # Keep V consistent with the changed coefficient.
                    self.movie_features[j] += delta * pref * u_vec
                    self.movie_features[k] -= delta * pref * u_vec

            # Update user features U
            self._rebuild_user_features(users, movie_j, movie_k, prefs)

            # Block of alpha updates
            for _ in range(T):
                for _ in range(S):
                    idx = self._rng.integers(num_pairs)
                    i, j, k, pref = users[idx], movie_j[idx], movie_k[idx], prefs[idx]
                    v_diff = self.movie_features[j] - self.movie_features[k]

                    # NOTE(review): same unscaled residual step as above --
                    # confirm against equation (8).
                    pred = np.dot(self.user_features[i], v_diff)
                    delta = pref - pred
                    self.alpha[idx] += delta
                    # Keep U consistent with the changed coefficient.
                    self.user_features[i] += delta * pref * v_diff

            print(f"Epoch {epoch + 1}/{num_epochs} completed")

        print("Training completed.")


In [None]:
%load_ext autoreload
%autoreload 2

import numpy as np
import scipy.io
import matplotlib.pyplot as plt
from test_utils import test
from typing import NamedTuple, List, Dict, Tuple

# Read the MATLAB file and keep the variable stored under the "ratings" key.
data = scipy.io.loadmat("movielens100k.mat")["ratings"]

In [None]:
class Dataset(NamedTuple):
    """Data container with three parallel arrays of the same length.

    Position ``i`` in each array describes one rating: movie ``movies[i]``
    was given the value ``ratings[i]`` by user ``users[i]``.
    """

    # Movie index of each rating.
    movies: np.ndarray
    # User index of each rating.
    users: np.ndarray
    # Rating value of each (movie, user) pair.
    ratings: np.ndarray


def load_data() -> Dataset:
    """Read the ratings matrix from ``movielens100k.mat`` and return its non-zero entries as a Dataset.

    The first index of the matrix is used as the movie index and the second
    as the user index.
    """
    ratings_matrix = scipy.io.loadmat("movielens100k.mat")["ratings"]
    # Positions of the entries that actually hold a rating.
    movie_idx, user_idx = ratings_matrix.nonzero()
    # Flatten the selected entries into a 1-D array of rating values.
    values = ratings_matrix[movie_idx, user_idx].A1
    return Dataset(movies=movie_idx, users=user_idx, ratings=values)


# Load all available ratings as parallel (movies, users, ratings) arrays.
dataset = load_data()

# Indices are used as 0-based positions, so the counts are "largest index + 1".
num_users = np.max(dataset.users) + 1
num_movies = np.max(dataset.movies) + 1

# Summary of what was loaded.
print(
    f"Loaded {len(dataset.ratings)} ratings of {num_users} users for {num_movies} movies."
)
def split_dataset(dataset, p_test=0.1, seed=1):
    """
    Randomly partition a dataset into a training part and a test part.

    Inputs:
        dataset: Dataset
        p_test: float
            probability (0 < p_test < 1) that a data point is assigned to the test set
        seed: integer
            seed of the random generator that decides the assignment

    Returns:
        train_dataset: Dataset
        test_dataset: Dataset

    >>> split_dataset(Dataset(np.array([0, 0]), np.array([1, 0]), np.array([2.0, 1.0])), p_test=0)
    (Dataset(movies=array([0, 0]), users=array([1, 0]), ratings=array([2., 1.])), Dataset(movies=array([], dtype=int64), users=array([], dtype=int64), ratings=array([], dtype=float64)))

    >>> split_dataset(Dataset(np.array([0, 0]), np.array([1, 0]), np.array([2.0, 1.0])), p_test=1)
    (Dataset(movies=array([], dtype=int64), users=array([], dtype=int64), ratings=array([], dtype=float64)), Dataset(movies=array([0, 0]), users=array([1, 0]), ratings=array([2., 1.])))
    """
    # Generator documentation: https://numpy.org/doc/stable/reference/random/index.html
    # rng.uniform() must be called exactly once here to match the automatic test case.
    rng = np.random.default_rng(seed)

    ### SOLUTION
    draws = rng.uniform(size=len(dataset.ratings))
    goes_to_test = draws < p_test
    stays_in_train = np.logical_not(goes_to_test)

    movies, users, ratings = dataset

    train_data = Dataset(
        movies=movies[stays_in_train],
        users=users[stays_in_train],
        ratings=ratings[stays_in_train],
    )
    test_data = Dataset(
        movies=movies[goes_to_test],
        users=users[goes_to_test],
        ratings=ratings[goes_to_test],
    )
    ### END SOLUTION

    return train_data, test_data
# Hold out about 10% of the ratings as a test set, with a fixed seed.
# NOTE(review): `pruned_dataset` is not defined in the visible cells --
# presumably a filtered version of `dataset` built elsewhere; confirm.
train_data, test_data = split_dataset(pruned_dataset, p_test=0.1, seed=10)
print("Number of training points:", len(train_data.ratings))
print("Number of test points:", len(test_data.ratings))
def to_matrix(dataset, num_movies, num_users):
    """
    Construct a dense matrix out of the dataset.

    Input:
        dataset: Dataset
        num_movies: number of rows of the result
        num_users: number of columns of the result

    Output:
        matrix: np.array of floats: (# movies, # users) -> rating (float) or np.nan if unavailable

    >>> to_matrix(Dataset(np.array([1, 1, 0]), np.array([0, 1, 0]), np.array([1.0, 3.0, 2.5])), 3, 2)
    array([[2.5, nan],
           [1. , 3. ],
           [nan, nan]])
    """
    # Start with NaN everywhere: unavailable ratings stay NaN.
    # (np.nan is used because the np.NaN alias no longer exists in NumPy 2.)
    m = np.full([num_movies, num_users], np.nan)
    ### SOLUTION
    # Skip the assignment for an empty dataset: empty index arrays may not
    # have an integer dtype and would then be rejected as indices.
    if len(dataset.ratings) > 0:
        m[dataset.movies, dataset.users] = dataset.ratings
    ### END SOLUTION
    ### HINT: Edit `m` by filling in the available ratings
    return m

class MatrixFactorizationPredictor(Predictor):
    """Predicts a rating as the dot product of a user vector and a movie vector."""

    def __init__(self, train_data: Dataset, num_features=20, seed=1):
        # Features for movies and users start as independent N(0, 1) draws.

        # Generator documentation: https://numpy.org/doc/stable/reference/random/index.html
        # To match the tests, rng.normal() is called exactly twice: first for
        # the movies, then for the users.
        rng = np.random.default_rng(seed)

        # Indices are used as row positions, hence "largest index + 1" rows.
        movie_count = np.max(train_data.movies) + 1
        user_count = np.max(train_data.users) + 1
        ### SOLUTION
        self.movie_features = rng.normal(size=[movie_count, num_features])
        self.user_features = rng.normal(size=[user_count, num_features])
        ### END SOLUTION

        # Training would normally happen here; it is done step by step
        # outside of the class instead.

    def __call__(self, test_data: Dataset):
        """
        Predict the rating of each user/movie pair as the dot product
        of the user's and the movie's representation vectors.

        >>> train_data = Dataset(np.array([0, 0, 1, 1, 2]), np.array([1, 2, 3, 4, 5]), np.array([4.0, 1.0, 1.0, 2.0, 1.0]))
        >>> test_data = Dataset(np.array([0, 1, 2, 1]), np.array([1, 2, 2, 0]), np.array([1.0, 2.0, 2.5, 5.0]))
        >>> mean_predictor = MatrixFactorizationPredictor(train_data)
        >>> mean_predictor(test_data)  # the factorization is not yet optimized here
        array([ 2.62654714, -2.89866225,  0.70909287,  5.29901482])
        """
        ### SOLUTION
        per_rating_user = self.user_features[test_data.users]
        per_rating_movie = self.movie_features[test_data.movies]
        return np.sum(per_rating_user * per_rating_movie, axis=1)
        ### END SOLUTION
# Hyper-parameters of the alternating least squares (ALS) run below.
movie_regularization = 20  # ridge strength of the per-movie solves
user_regularization = 20  # ridge strength of the per-user solves
max_iterations = 1000  # upper bound on the number of alternating sweeps
stop_criterion = 1e-4  # minimum change of the train error required to continue

# The feature arrays are updated in place below, so `predictor` always
# reflects the latest values without any reassignment.
predictor = MatrixFactorizationPredictor(train_data)
user_features = predictor.user_features
movie_features = predictor.movie_features
# NOTE(review): `rng` is not used anywhere in this cell -- possibly read by a later cell; confirm.
rng = np.random.default_rng(0)

# Train error of the previous sweep; None until the first sweep has finished.
prev_train_error = None

for iteration in range(max_iterations):
    # Optimize the user features
    for user in np.unique(train_data.users):
        # Update `user_features[user]` by optimizing the regularized corresponding least squares objective
        mask = train_data.users == user
        user_movies = train_data.movies[mask]
        ### SOLUTION
        # Ridge regression of this user's ratings on the features of the rated movies:
        # solve (M^T M + reg * I) x = M^T r.
        M = movie_features[user_movies]
        r = train_data.ratings[mask]
        user_features[user] = np.linalg.solve(
            M.T @ M + np.eye(M.shape[1]) * user_regularization, M.T @ r
        )
        ### END SOLUTION

    # Optimize the movie features using least squares
    for movie in np.unique(train_data.movies):
        # Update `movie_features[movie]` by optimizing the regularized corresponding least squares objective
        mask = train_data.movies == movie
        movie_users = train_data.users[mask]
        ### SOLUTION
        # Same ridge regression with the roles swapped: this movie's ratings
        # are regressed on the features of the users who rated it.
        M = user_features[movie_users]
        r = train_data.ratings[mask]
        movie_features[movie] = np.linalg.solve(
            M.T @ M + np.eye(M.shape[1]) * movie_regularization, M.T @ r
        )
        ### END SOLUTION

    # Report the errors after each full sweep over users and movies.
    train_error = mse(predictor(train_data), train_data.ratings)
    print(f"Train error after step {iteration+1}: {train_error}")
    print(
        f"Test error after step {iteration+1}: {mse(predictor(test_data), test_data.ratings)}"
    )

    # Stop if the training error is not going down more than 'stop_criterion'
    ### SOLUTION
    if (
        prev_train_error is not None
        and np.abs(train_error - prev_train_error) < stop_criterion
    ):
        break
    prev_train_error = train_error
    ### END SOLUTION