# Step 1: Import Necessary Libraries

In [1]:
import pandas as pd
import numpy as np
from scipy.sparse.linalg import svds
from sklearn.metrics import mean_squared_error
from math import sqrt
from sklearn.model_selection import train_test_split

# Step 2: Load the Datasets

### 2.1. Load movie metedata with descriptions

This CSV file includes: movieId, title, genres, year, description.

In [2]:
movies_df = pd.read_csv('movielens_movies_with_descriptions.csv')
print("Movies with descriptions shape:", movies_df.shape)
print(movies_df.head())

Movies with descriptions shape: (3883, 5)
   movieId                        title                        genres  year  \
0        1                    Toy Story   Animation|Children's|Comedy  1995   
1        2                      Jumanji  Adventure|Children's|Fantasy  1995   
2        3             Grumpier Old Men                Comedy|Romance  1995   
3        4            Waiting to Exhale                  Comedy|Drama  1995   
4        5  Father of the Bride Part II                        Comedy  1995   

                                         description  
0  Led by Woody, Andy's toys live happily in his ...  
1  When siblings Judy and Peter discover an encha...  
2  A family wedding reignites the ancient feud be...  
3  Cheated on, mistreated and stepped on, the wom...  
4  Just when George Banks has recovered from his ...  


### 2.2. Load user demographics

The users.dat file is delimited by "::".

In [3]:
users_df = pd.read_csv('movielens-1m/users.dat', sep='::', engine='python',
                         header=None, names=['userId', 'Gender', 'Age', 'Occupation', 'Zip-code'])
print("Users dataset shape:", users_df.shape)
print(users_df.head())

Users dataset shape: (6040, 5)
   userId Gender  Age  Occupation Zip-code
0       1      F    1          10    48067
1       2      M   56          16    70072
2       3      M   25          15    55117
3       4      M   45           7    02460
4       5      M   25          20    55455


### 2.3 Load ratings data

The rating.dat file is delimited by "::".

In [4]:
ratings_df = pd.read_csv('movielens-1m/ratings.dat', sep='::', engine='python',
                         header=None, names=['userId', 'movieId', 'rating', 'timestamp'])
print("Ratings dataset shape:", ratings_df.shape)
print(ratings_df.head())

Ratings dataset shape: (1000209, 4)
   userId  movieId  rating  timestamp
0       1     1193       5  978300760
1       1      661       3  978302109
2       1      914       3  978301968
3       1     3408       4  978300275
4       1     2355       5  978824291


# Step 3: Split Ratings into Training and Test Sets

We'll split 80% of the data for training and the rest for testing.

In [5]:
train_data, test_data = train_test_split(ratings_df, test_size=0.2, random_state=42)

Build a full user-item matrix from the entire ratings dataset.\
Rows: users, Columns: movieId. Missing ratings are filled with zeros.

In [6]:
R_full = ratings_df.pivot(index='userId', columns='movieId', values='rating').fillna(0)

We start with the full matrix and then "mask" the test ratings for training. Then, set the corresponding entry in R_train for each test record to 0. After that, convert the training matrix to a NumPy array.

In [7]:
R_train = R_full.copy()
for idx, row in test_data.iterrows():
    R_train.loc[row['userId'], row['movieId']] = 0

# Convert training matrix to a NumPy array.
R_train = R_train.values

# Step 4: Normalize the Training Ratings Matrix and Perform SVD

Compute the mean rating for each user from the training matrix. Then, demean the training matrix.

In [12]:
user_ratings_mean = np.mean(R_train, axis=1)

R_train_demeaned = R_train - user_ratings_mean.reshape(-1, 1)

Perform SVD on the demeaned training matrix.

In [13]:
k = 50
U, sigma, Vt = svds(R_train_demeaned, k=k)
sigma = np.diag(sigma)

Reconstruct the approximated ratings matrix.

In [14]:
R_train_predicted = np.dot(np.dot(U, sigma), Vt) + user_ratings_mean.reshape(-1, 1)

# Step 5: Evaluate the Model on the Test Set

Create the full user-item rating matrix from ratings_df:

In [9]:
R_df = ratings_df.pivot(index='userId', columns='movieId', values='rating').fillna(0)

Compute RMSE for all test entries.

In [15]:
test_preds = []
test_truth = []

`R_df` is the pivoted `DataFrame` we created, with columns corresponding to `movieIds`.\
We'll use `R_df.columns.get_loc(movie)` to get the column index.

In [17]:
for idx, row in test_data.iterrows():
    user = int(row['userId'])
    movie = int(row['movieId'])
    true_rating = row['rating']
    # Get the column index corresponding to the movieId.
    try:
        movie_idx = R_df.columns.get_loc(movie)
    except KeyError:
        # If a movie in the test set is not in the training pivot, skip it.
        continue
    pred_rating = R_train_predicted[user - 1, movie_idx]  # user indexing: userId 1 corresponds to index 0
    test_preds.append(pred_rating)
    test_truth.append(true_rating)

test_rmse = sqrt(mean_squared_error(test_truth, test_preds))
print(f"RMSE on test set: {test_rmse:.4f}\n")

RMSE on test set: 2.7405



# Step 6: Recommend Movies Using the Movie Metadata

## Recommend movies for a given user based on predicted ratings.
    
Parameters:
- user_id: The ID of the user.
- R_predicted_df: DataFrame of predicted ratings (rows: userId, columns: movieId).
- movies_df: DataFrame containing movie metadata (with descriptions).
- ratings_df: Original ratings DataFrame to determine movies already rated.
- num_recommendations: Number of recommendations to output.
    
Returns:
- A DataFrame with the top recommended movies and their details.

In [18]:
def recommend_movies(user_id, R_predicted_df, movies_df, ratings_df, num_recommendations=5):
    # Get predicted ratings for the specified user and sort them.
    user_predictions = R_predicted_df.loc[user_id].sort_values(ascending=False)
    
    # Identify movies already rated by the user.
    user_rated_movies = ratings_df[ratings_df.userId == user_id]['movieId'].tolist()
    
    # Filter out movies the user has already rated.
    recommendations = movies_df[~movies_df['movieId'].isin(user_rated_movies)].copy()
    
    # Map the predicted rating to each movie.
    recommendations['PredictedRating'] = recommendations['movieId'].map(user_predictions)
    
    # Sort recommendations by predicted rating.
    recommendations = recommendations.sort_values('PredictedRating', ascending=False)
    
    return recommendations.head(num_recommendations)

# Create a DataFrame from R_train_predicted with the same row (user) and column (movieId) labels as R_full.
R_predicted_df = pd.DataFrame(R_train_predicted, index=R_full.index, columns=R_full.columns)

Get top 5 recommendations for a sample user (e.g., user 50).

In [19]:
recommended_movies = recommend_movies(50, R_predicted_df, movies_df, ratings_df, num_recommendations=5)

print("Top 5 movie recommendations for user 1:")
print(recommended_movies[['movieId', 'title', 'year', 'genres', 'description', 'PredictedRating']])

Top 5 movie recommendations for user 1:
      movieId          title  year                  genres  \
3412     3481  High Fidelity  2000                  Comedy   
3684     3753    The Patriot  2000        Action|Drama|War   
3486     3555          U-571  2000         Action|Thriller   
3827     3897  Almost Famous  2000            Comedy|Drama   
1239     1259    Stand by Me  1986  Adventure|Comedy|Drama   

                                            description  PredictedRating  
3412  When record store owner Rob Gordon gets dumped...         1.610278  
3684  After proving himself on the field of battle i...         1.341067  
3486  In the midst of World War II, the battle under...         1.177776  
3827  Almost Famous is an autobiographical inspired ...         1.164735  
1239  After the death of a friend, a writer recounts...         1.137794  
