# Project: Movie Recommender

**Course:** Machine Learning in Production <br>
**Professor:** Soubeiga Armel <br>
**Contributors:**

*   Bellouch Ayoub
*   Mafkoud Khaoula
*   Hamid Hiba
*   Berkani Mohammed Adam 
*   Brunel Nangoum-Tchatchoua

In [None]:
import pandas as pd
import numpy as np
import os
from urllib.request import urlretrieve
import zipfile
from sklearn.model_selection import train_test_split


# --- Download the dataset if it doesn't exist ---
# The extracted 'ml-100k' directory doubles as the "already downloaded" marker.
if not os.path.exists('ml-100k'):
    print("Downloading MovieLens 100k dataset...")
    # Fetch over HTTPS: the archive is extracted locally, so it should not be
    # retrieved over a plaintext connection that can be tampered with.
    url = 'https://files.grouplens.org/datasets/movielens/ml-100k.zip'
    urlretrieve(url, 'ml-100k.zip')
    with zipfile.ZipFile('ml-100k.zip', 'r') as zip_ref:
        zip_ref.extractall()
    print("Download and extraction complete.")

# --- Load the data ---
# u.data contains the ratings (tab-separated, no header row)
data_cols = ['user_id', 'item_id', 'rating', 'timestamp']
ratings_df = pd.read_csv('ml-100k/u.data', sep='\t', names=data_cols)

# u.item contains movie titles (pipe-separated, no header row).
# All 24 columns are named so that `usecols` can select by name; only
# item_id and title are actually kept.
item_cols = ['item_id', 'title'] + [f'col{i}' for i in range(22)]
movies_df = pd.read_csv(
    'ml-100k/u.item',
    sep='|',
    names=item_cols,
    encoding='latin-1',
    usecols=['item_id', 'title'],
)

# Merge the two dataframes to have movie titles and ratings in one place
df = pd.merge(ratings_df, movies_df, on='item_id')

print("Data loaded successfully!")
df.head()

Downloading MovieLens 100k dataset...
Download and extraction complete.
Data loaded successfully!


Unnamed: 0,user_id,item_id,rating,timestamp,title
0,196,242,3,881250949,Kolya (1996)
1,186,302,3,891717742,L.A. Confidential (1997)
2,22,377,1,878887116,Heavyweights (1994)
3,244,51,2,880606923,Legends of the Fall (1994)
4,166,346,1,886397596,Jackie Brown (1997)


In [None]:
def create_user_item_matrix(df):
    """
    Pivot a long-format ratings table into a user x item matrix.

    Args:
        df (pd.DataFrame): Ratings table with at least the columns
            user_id, item_id and rating.

    Returns:
        pd.DataFrame: One row per user_id and one column per item_id, holding
                      the rating as the cell value. Cells for (user, item)
                      pairs without a rating are NaN.
    """
    # pivot_table aggregates with its default function (mean) when the same
    # (user_id, item_id) pair occurs more than once.
    matrix = df.pivot_table(values="rating", index="user_id", columns="item_id")
    return matrix

# Build the user x item matrix from the merged ratings/titles dataframe.
user_item_matrix = create_user_item_matrix(df)

# Print it for a quick visual check; cells without a rating show up as NaN.
print(user_item_matrix)


item_id  1     2     3     4     5     6     7     8     9     10    ...  \
user_id                                                              ...   
1         5.0   3.0   4.0   3.0   3.0   5.0   4.0   1.0   5.0   3.0  ...   
2         4.0   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   2.0  ...   
3         NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...   
4         NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...   
5         4.0   3.0   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...   
...       ...   ...   ...   ...   ...   ...   ...   ...   ...   ...  ...   
939       NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   5.0   NaN  ...   
940       NaN   NaN   NaN   2.0   NaN   NaN   4.0   5.0   3.0   NaN  ...   
941       5.0   NaN   NaN   NaN   NaN   NaN   4.0   NaN   NaN   NaN  ...   
942       NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...   
943       NaN   5.0   NaN   NaN   NaN   NaN   NaN   NaN   3.0   NaN  ...   

item_id  16

In [None]:
# Keep only the three columns the factorization model consumes.
ratings = df[['user_id', 'item_id', 'rating']]

# Hold out 20% of the ratings for evaluation; the fixed random_state makes
# the split reproducible across runs.
train_df, test_df = train_test_split(ratings, test_size=0.2, random_state=42)

# Distinct user and item ids that occur in the training split.
train_users = train_df['user_id'].unique()
train_items = train_df['item_id'].unique()

print("Train ratings:", train_df.shape[0])
print("Test ratings:", test_df.shape[0])

Train ratings: 80000
Test ratings: 20000


In [None]:
import numpy as np

def initialize_matrices(n_users, n_items, n_factors):
    """
    Create small random starting values for the latent-factor matrices.

    Both matrices are drawn from a standard normal distribution using NumPy's
    global random state and scaled by 0.01.

    Args:
        n_users (int): Number of users (rows of P).
        n_items (int): Number of items (rows of Q).
        n_factors (int): Number of latent factors (columns of P and Q).

    Returns:
        tuple: (P, Q) where
            - P (np.ndarray) has shape (n_users, n_factors)
            - Q (np.ndarray) has shape (n_items, n_factors)
    """
    init_scale = 0.01
    # P is drawn before Q; the order matters for reproducibility when the
    # caller has seeded the global random state.
    user_factors = init_scale * np.random.randn(n_users, n_factors)
    item_factors = init_scale * np.random.randn(n_items, n_factors)
    return user_factors, item_factors




In [None]:
def train_model(train_data, P, Q, learning_rate, regularization, epochs):
    """
    Trains the matrix factorization model using SGD.

    For every observed rating r the prediction is dot(P[u], Q[i]) and both
    factor vectors take one regularized gradient step computed from the SAME
    pre-update values:

        P[u] += lr * (error * Q[i] - reg * P[u])
        Q[i] += lr * (error * P[u] - reg * Q[i])

    P and Q are modified in place; the returned arrays are the same objects
    that were passed in.

    Args:
        train_data (list of tuples): Each tuple is (user_idx, item_idx, rating).
            It is iterated once per epoch, in the given order.
        P (np.ndarray): User-feature matrix (n_users x n_factors)
        Q (np.ndarray): Item-feature matrix (n_items x n_factors)
        learning_rate (float): Learning rate (alpha)
        regularization (float): Regularization parameter (lambda)
        epochs (int): Number of passes over the training data

    Returns:
        tuple: Trained (P, Q)
    """
    for _ in range(epochs):
        for u_idx, i_idx, r in train_data:
            # Snapshot the user vector: P[u_idx] is a view that the in-place
            # update below mutates, and the item update must use the value
            # from before this step, not the freshly updated one.
            p_u = P[u_idx].copy()
            q_i = Q[i_idx]

            # Prediction error for this rating
            error = r - np.dot(p_u, q_i)

            # Simultaneous update of both latent vectors
            P[u_idx] += learning_rate * (error * q_i - regularization * p_u)
            Q[i_idx] += learning_rate * (error * p_u - regularization * q_i)

    return P, Q


In [None]:
def calculate_rmse(test_data, P, Q):
    """
    Calculates the Root Mean Squared Error (RMSE) on the test set.

    Each rating is predicted as dot(P[user_idx], Q[item_idx]).

    Args:
        test_data (iterable of tuples): Each tuple is (user_idx, item_idx, rating).
            Any iterable works; it is consumed exactly once.
        P (np.ndarray): The trained user-feature matrix.
        Q (np.ndarray): The trained item-feature matrix.

    Returns:
        float: The RMSE value.

    Raises:
        ValueError: If test_data contains no ratings, since the RMSE is
            undefined in that case.
    """
    squared_error_sum = 0.0
    n_ratings = 0
    # Count while iterating so the input does not need to support len().
    for u_idx, i_idx, r in test_data:
        residual = r - np.dot(P[u_idx], Q[i_idx])
        squared_error_sum += residual ** 2
        n_ratings += 1

    if n_ratings == 0:
        raise ValueError("test_data is empty; RMSE is undefined.")

    return np.sqrt(squared_error_sum / n_ratings)


In [None]:

# Hyperparameters
n_factors = 20
learning_rate = 0.01
regularization = 0.02
epochs = 15


# Train/test split (80/20; fixed random_state keeps the split reproducible)
ratings = df[['user_id', 'item_id', 'rating']]
train_df, test_df = train_test_split(ratings, test_size=0.2, random_state=42)


# Map raw ids seen in the training set to contiguous row indices of P and Q
train_users = train_df['user_id'].unique()
train_items = train_df['item_id'].unique()
user2idx = {uid: i for i, uid in enumerate(train_users)}
item2idx = {iid: i for i, iid in enumerate(train_items)}


# Initialize P and Q.
# This happens only after the id->index maps exist, so the matrix sizes are
# guaranteed to match the indices used below instead of depending on
# variables left over from an earlier cell.
n_users = len(user2idx)
n_items = len(item2idx)
P, Q = initialize_matrices(n_users, n_items, n_factors)


# Prepare train/test triplets of (user_idx, item_idx, rating)
train_data = [(user2idx[u], item2idx[i], r)
              for u, i, r in zip(train_df['user_id'], train_df['item_id'], train_df['rating'])]

# Test ratings whose user or item never appears in the training set have no
# learned vector and are skipped.
test_data = [(user2idx[u], item2idx[i], r)
             for u, i, r in zip(test_df['user_id'], test_df['item_id'], test_df['rating'])
             if u in user2idx and i in item2idx]


# Train the model (train_model updates P and Q in place and returns them)
P_trained, Q_trained = train_model(train_data, P, Q, learning_rate, regularization, epochs)


# Evaluate on test set
rmse = calculate_rmse(test_data, P_trained, Q_trained)
print(f"\nTest RMSE: {rmse:.4f}")




Test RMSE: 0.9372


In [None]:
def recommend_top_movies(user_id, P, Q, movie_titles_df, R_df, top_n=10):
    """
    Recommends the top N movies for a given user.

    Scores are dot(P[user], Q[item]). Only movies that have a trained item
    vector are scored: a movie missing from the training set has no learned
    factors, so any score for it would be arbitrary. Movies the user has
    already rated in R_df are excluded.

    The raw-id -> row-index lookups use the module-level ``user2idx`` and
    ``item2idx`` dictionaries built alongside P and Q.

    Args:
        user_id (int): The ID of the user.
        P (np.ndarray): The trained user-feature matrix.
        Q (np.ndarray): The trained item-feature matrix (only for items in training).
        movie_titles_df (pd.DataFrame): Dataframe with item_id and title.
        R_df (pd.DataFrame): Ratings dataframe (user_id, item_id, rating) used
            to determine which movies the user has already seen.
        top_n (int): The number of movies to recommend.

    Returns:
        pd.DataFrame: Up to top_n rows with columns ['title', 'pred_rating'],
            sorted by predicted rating in descending order. Fewer rows are
            returned when fewer candidate movies exist.

    Raises:
        ValueError: If user_id was not part of the training data.
    """
    if user_id not in user2idx:
        raise ValueError("User ID not found in training data.")

    user_vector = P[user2idx[user_id]]  # p_u

    # Movies this user has already rated must not be recommended again.
    seen_items = set(R_df.loc[R_df['user_id'] == user_id, 'item_id'])

    item_ids = movie_titles_df['item_id']
    is_trained = item_ids.isin(list(item2idx))
    is_unseen = ~item_ids.isin(seen_items)

    # Fresh positional index, so the returned index is simply the position
    # among the candidate movies.
    candidates = (
        movie_titles_df.loc[is_trained & is_unseen, ['item_id', 'title']]
        .reset_index(drop=True)
    )

    # Explicit integer dtype keeps the fancy-index valid when there are no
    # candidates at all.
    item_rows = np.asarray(
        [item2idx[iid] for iid in candidates['item_id']], dtype=int
    )
    candidates = candidates.assign(pred_rating=Q[item_rows].dot(user_vector))

    top_movies = candidates.sort_values(by='pred_rating', ascending=False).head(top_n)
    return top_movies[['title', 'pred_rating']]

# Example: top-10 recommendations for user 5. Because train_df is passed as
# R_df, only movies the user rated in the training split are filtered out.
top_recs = recommend_top_movies(
    user_id=5,
    P=P_trained,
    Q=Q_trained,
    movie_titles_df=movies_df,
    R_df=train_df,
    top_n=10,
)
print(top_recs)


                                                  title  pred_rating
133                          Wrong Trousers, The (1993)     3.991773
103                               Godfather, The (1972)     3.959989
10                           Usual Suspects, The (1995)     3.910691
1310                             Pather Panchali (1955)     3.865023
91    Wallace & Gromit: The Best of Aardman Animatio...     3.846696
257   Paradise Lost: The Child Murders at Robin Hood...     3.839263
464                                  Rear Window (1954)     3.836564
54                     Shawshank Redemption, The (1994)     3.800022
139                                 12 Angry Men (1957)     3.793522
47                                  Pulp Fiction (1994)     3.791349
