# Code to explore the data and format it

In [None]:
import pandas as pd
import numpy as np
import os
from helpers_1 import *
from sklearn.model_selection import train_test_split
from typing import NamedTuple, List, Dict, Tuple



# Fetch the users, items and ratings tables through the project helper
# (presumably the helper also prints the previews shown in the cell output).
data_users, data_items, data_ratings = load_movielens_data()

# One row per rating, enriched with the movie title and the user's age/gender,
# which is the table the SVM pipeline consumes.
svm_data = (
    data_ratings
    .merge(data_items[['movie_id', 'title']], on='movie_id')
    .merge(data_users[['user_id', 'age', 'gender']], on='user_id')
)

print("\nSVM-ready DataFrame:")
print(svm_data.head())
print(f"SVM data shape: {svm_data.shape}")


Users DataFrame:
   user_id  age gender  occupation zip_code
0        1   24      M  technician    85711
1        2   53      F       other    94043
2        3   23      M      writer    32067
3        4   24      M  technician    43537
4        5   33      F       other    15213
Users shape: (943, 5)

Movies DataFrame:
   movie_id              title
0         1   Toy Story (1995)
1         2   GoldenEye (1995)
2         3  Four Rooms (1995)
3         4  Get Shorty (1995)
4         5     Copycat (1995)
Movies shape: (1682, 24)

Ratings DataFrame:
   user_id  movie_id  rating  timestamp
0      196       242       3  881250949
1      186       302       3  891717742
2       22       377       1  878887116
3      244        51       2  880606923
4      166       346       1  886397596
Ratings shape: (100000, 4)

SVM-ready DataFrame:
   user_id  movie_id  rating  timestamp                       title  age  \
0      196       242       3  881250949                Kolya (1996)   49   
1     

# Core structure of the SVM algorithm

In [None]:
class PairwiseDataset(NamedTuple):
    """Pairwise comparisons stored as four parallel arrays.

    Position ``n`` in every field describes the same comparison: a user, the
    two movies being compared, and the label of that comparison.
    """
    # Identifier of the user who made each comparison.
    users: np.ndarray
    # Identifier of the first movie (j) of each compared pair.
    movie_j: np.ndarray
    # Identifier of the second movie (k) of each compared pair.
    movie_k: np.ndarray
    # Label of each comparison; create_pairwise_dataset fills it with +1 / -1.
    preferences: np.ndarray

def create_pairwise_dataset(svm_data):
    """Build per-user pairwise movie comparisons from a ratings table.

    The ratings are self-joined on ``user_id`` so that every row pairs two
    movies rated by the same user. Rows pairing a movie with itself are
    dropped. Each remaining ordered pair (j, k) is labelled +1 when the user
    rated j strictly higher than k, and -1 otherwise.

    NOTE(review): equal ratings are labelled -1 for both (j, k) and (k, j),
    because the comparison is strict. Confirm whether ties should be dropped.

    Args:
        svm_data: DataFrame with at least the columns ``user_id``,
            ``movie_id`` and ``rating``.

    Returns:
        PairwiseDataset whose four arrays are aligned row by row.
    """
    ratings = svm_data[['user_id', 'movie_id', 'rating']]

    # Self-join: the suffixes turn the duplicated columns into
    # movie_id_j / rating_j and movie_id_k / rating_k.
    pairs = ratings.merge(ratings, on='user_id', suffixes=('_j', '_k'))

    # Keep only comparisons between two different movies.
    pairs = pairs[pairs['movie_id_j'] != pairs['movie_id_k']]

    # Compute the label as a standalone array instead of assigning a new
    # column on the filtered slice, which raised SettingWithCopyWarning.
    preference = np.where(pairs['rating_j'] > pairs['rating_k'], 1, -1)

    return PairwiseDataset(
        users=pairs['user_id'].values,
        movie_j=pairs['movie_id_j'].values,
        movie_k=pairs['movie_id_k'].values,
        preferences=preference,
    )

def split_pairwise_dataset(dataset, p_test=0.1, seed=1):
    """Randomly partition a pairwise dataset into train and test parts.

    Every comparison independently draws a uniform number in [0, 1) from a
    generator seeded with ``seed``; it goes to the test part when the draw is
    below ``p_test`` and to the train part otherwise.

    Args:
        dataset: PairwiseDataset to split.
        p_test: Per-comparison probability of landing in the test part.
        seed: Seed of the random generator used for the draws.

    Returns:
        Tuple ``(train_data, test_data)`` of PairwiseDataset objects.
    """
    generator = np.random.default_rng(seed)
    draws = generator.uniform(size=len(dataset.preferences))
    in_test = draws < p_test

    def select(mask):
        # Apply the same boolean mask to all four parallel arrays.
        return PairwiseDataset(
            users=dataset.users[mask],
            movie_j=dataset.movie_j[mask],
            movie_k=dataset.movie_k[mask],
            preferences=dataset.preferences[mask],
        )

    return select(~in_test), select(in_test)

def mse(predictions, targets):
    """Return the mean of the squared differences between predictions and targets."""
    residuals = predictions - targets
    return np.mean(np.square(residuals))




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered['preference'] = (filtered['rating_j'] > filtered['rating_k']).astype(int) * 2 - 1


Training data shape: (16080649, 3)
Test data shape: (4020163, 3)


In [None]:

class AltSVMPairwisePredictor:
    """Low-rank pairwise preference model trained by alternating updates.

    The model keeps one feature vector per user and per movie, and one dual
    variable per training comparison for each side (``alpha`` for the user
    step, ``beta`` for the movie step). The score of a comparison
    (user i, movie j, movie k) is ``u_i . (v_j - v_k)``.

    Attributes:
        user_features: Array with one row of ``num_features`` values per user
            index in ``0 .. max(user id)``.
        movie_features: Array with one row of ``num_features`` values per
            movie index in ``0 .. max(movie id)``.
        alpha: One dual variable per training comparison (user step).
        beta: One dual variable per training comparison (movie step).
    """

    def __init__(self, train_data, num_features=20, seed=1):
        """Initialise random features and zero dual variables.

        Args:
            train_data: Object exposing ``users``, ``movie_j``, ``movie_k``
                and ``preferences`` arrays (e.g. a PairwiseDataset). Ids are
                used directly as row indices.
            num_features: Length of every feature vector.
            seed: Seed of the generator used for the initial features and
                for sampling comparisons during training.
        """
        rng = np.random.default_rng(seed)

        num_users = np.max(train_data.users) + 1
        num_movies = max(np.max(train_data.movie_j), np.max(train_data.movie_k)) + 1

        self.user_features = rng.normal(size=[num_users, num_features])
        self.movie_features = rng.normal(size=[num_movies, num_features])
        self.alpha = np.zeros(len(train_data.preferences))
        self.beta = np.zeros(len(train_data.preferences))
        # Kept so that train() samples from the seeded stream instead of the
        # global NumPy random state, making runs reproducible.
        self._rng = rng

    def __call__(self, test_data):
        """Return the score ``u_i . (v_j - v_k)`` of every comparison in ``test_data``."""
        u = self.user_features[test_data.users]
        v_j = self.movie_features[test_data.movie_j]
        v_k = self.movie_features[test_data.movie_k]
        return np.sum(u * (v_j - v_k), axis=1)

    def _rebuild_movie_features(self, users, movie_j, movie_k, prefs):
        """Recompute every movie vector from ``beta`` and the user features."""
        # Comparisons whose dual variable is zero contribute nothing, so only
        # the non-zero ones are gathered.
        active = np.flatnonzero(self.beta)
        contrib = (self.beta[active] * prefs[active])[:, None] * self.user_features[users[active]]

        # A comparison adds to its movie j and subtracts from its movie k.
        as_j = np.zeros_like(self.movie_features)
        as_k = np.zeros_like(self.movie_features)
        np.add.at(as_j, movie_j[active], contrib)
        np.add.at(as_k, movie_k[active], contrib)
        self.movie_features[:] = as_j - as_k

    def _rebuild_user_features(self, users, movie_j, movie_k, prefs):
        """Recompute every user vector from ``alpha`` and the movie features."""
        active = np.flatnonzero(self.alpha)
        diff = self.movie_features[movie_j[active]] - self.movie_features[movie_k[active]]
        contrib = (self.alpha[active] * prefs[active])[:, None] * diff

        rebuilt = np.zeros_like(self.user_features)
        np.add.at(rebuilt, users[active], contrib)
        self.user_features[:] = rebuilt

    def train(self, train_data, lambda_reg=0.1, num_epochs=10, learning_rate=0.01, S=5, T=4):
        """Train the AltSVM model using pairwise comparison data with delta updates.

        Each epoch rebuilds the movie features from ``beta``, applies ``T * S``
        sampled ``beta`` updates, rebuilds the user features from ``alpha`` and
        applies ``T * S`` sampled ``alpha`` updates. The sampled updates run
        sequentially.

        Args:
            train_data: The comparisons the model was constructed with; the
                dual variables are indexed by position in this dataset.
            lambda_reg: Accepted for interface compatibility; currently unused.
            num_epochs: Number of alternating passes.
            learning_rate: Accepted for interface compatibility; currently unused.
            S: Number of sampled updates per round.
            T: Number of rounds per step.

        Raises:
            ValueError: If ``train_data`` does not have one comparison per
                dual variable.
        """
        users = np.asarray(train_data.users)
        movie_j = np.asarray(train_data.movie_j)
        movie_k = np.asarray(train_data.movie_k)
        prefs = np.asarray(train_data.preferences)
        num_pairs = len(prefs)

        if num_pairs != len(self.alpha):
            raise ValueError(
                f"train_data has {num_pairs} comparisons but the model was "
                f"initialised with {len(self.alpha)}"
            )

        for epoch in range(num_epochs):
            # Update item features V
            self._rebuild_movie_features(users, movie_j, movie_k, prefs)

            # Block of beta updates (T rounds of S sampled comparisons)
            for _ in range(T):
                for _ in range(S):
                    idx = int(self._rng.integers(num_pairs))
                    i, j, k, pref = users[idx], movie_j[idx], movie_k[idx], prefs[idx]
                    u_vec = self.user_features[i]

                    # Compute delta minimizing equation (10)
                    pred = np.dot(u_vec, self.movie_features[j] - self.movie_features[k])
                    delta = pref - pred
                    self.beta[idx] += delta
                    # Mirror the change of beta[idx] in the two movie vectors
                    # so they stay consistent with the dual variables.
                    self.movie_features[j] += delta * pref * u_vec
                    self.movie_features[k] -= delta * pref * u_vec

            # Update user features U
            self._rebuild_user_features(users, movie_j, movie_k, prefs)

            # Block of alpha updates (T rounds of S sampled comparisons)
            for _ in range(T):
                for _ in range(S):
                    idx = int(self._rng.integers(num_pairs))
                    i, j, k, pref = users[idx], movie_j[idx], movie_k[idx], prefs[idx]
                    v_diff = self.movie_features[j] - self.movie_features[k]

                    # Compute delta minimizing equation (8)
                    pred = np.dot(self.user_features[i], v_diff)
                    delta = pref - pred
                    self.alpha[idx] += delta
                    self.user_features[i] += delta * pref * v_diff

            print(f"Epoch {epoch + 1}/{num_epochs} completed")

        print("Training completed.")
