# Step 1: Import Necessary Libraries

In [None]:
import pandas as pd
import numpy as np
from scipy.sparse.linalg import svds
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_squared_error
from math import sqrt
import os
from sklearn.model_selection import train_test_split

# Step 2: Datasets

## Load the Datasets

In [None]:
# 2.1 Load movie metadata with descriptions.
# The CSV must provide the columns: movieId, title, genres, year, description.
# Look for the file in the working directory first; otherwise fall back to the
# parent directory.
movies_with_des_dir = (
    "movielens_movies_with_descriptions.csv"
    if os.path.exists("movielens_movies_with_descriptions.csv")
    else "../movielens_movies_with_descriptions.csv"
)
movies_df = pd.read_csv(filepath_or_buffer=movies_with_des_dir)
print("Movies dataset shape:", movies_df.shape)
print(movies_df.head(), "\n")

Movies dataset shape: (3883, 5)
   movieId                        title                        genres  year  \
0        1                    Toy Story   Animation|Children's|Comedy  1995   
1        2                      Jumanji  Adventure|Children's|Fantasy  1995   
2        3             Grumpier Old Men                Comedy|Romance  1995   
3        4            Waiting to Exhale                  Comedy|Drama  1995   
4        5  Father of the Bride Part II                        Comedy  1995   

                                         description  
0  Led by Woody, Andy's toys live happily in his ...  
1  When siblings Judy and Peter discover an encha...  
2  A family wedding reignites the ancient feud be...  
3  Cheated on, mistreated and stepped on, the wom...  
4  Just when George Banks has recovered from his ...   



In [None]:
# 2.2 Load ratings data.
# ratings.dat has no header row and separates fields with "::"; a
# multi-character separator requires pandas' python parsing engine.
# Look in the working directory first, then in the parent directory.
ratings_dir = (
    'movielens-1m/ratings.dat'
    if os.path.exists('movielens-1m/ratings.dat')
    else '../movielens-1m/ratings.dat'
)
ratings_df = pd.read_csv(
    ratings_dir,
    sep='::',
    engine='python',
    header=None,
    names=['userId', 'movieId', 'rating', 'timestamp'],
)
print("Ratings dataset shape:", ratings_df.shape)
print(ratings_df.head(), "\n")

Ratings dataset shape: (1000209, 4)
   userId  movieId  rating  timestamp
0       1     1193       5  978300760
1       1      661       3  978302109
2       1      914       3  978301968
3       1     3408       4  978300275
4       1     2355       5  978824291 



In [None]:
# 2.3 Load user demographics for further analysis.
# users.dat uses the same headerless, "::"-separated layout as the ratings file.
# Look in the working directory first, then in the parent directory.
users_dir = (
    'movielens-1m/users.dat'
    if os.path.exists('movielens-1m/users.dat')
    else '../movielens-1m/users.dat'
)
users_df = pd.read_csv(
    users_dir,
    sep='::',
    engine='python',
    header=None,
    names=['userId', 'Gender', 'Age', 'Occupation', 'Zip-code'],
)
print("Users dataset shape:", users_df.shape)
print(users_df.head(), "\n")

Users dataset shape: (6040, 5)
   userId Gender  Age  Occupation Zip-code
0       1      F    1          10    48067
1       2      M   56          16    70072
2       3      M   25          15    55117
3       4      M   45           7    02460
4       5      M   25          20    55455 



# Step 3: Create the User-Item Rating Matrix

In [5]:
# Reshape the long ratings table into a wide matrix: one row per userId,
# one column per movieId, cell value = rating.
# User/movie pairs without a rating come out of the pivot as NaN and are
# replaced with 0.
R_df = (
    ratings_df
    .pivot(index='userId', columns='movieId', values='rating')
    .fillna(0)
)
print("User-Item matrix shape (R_df):", R_df.shape)

User-Item matrix shape (R_df): (6040, 3706)


# Step 4: Prepare Training Data and Apply SVD

In [15]:
# To follow the SVD formula from the PDF:
#   1. Center the data: R_adj = R - mean(R) per user.
#   2. Decompose the centered matrix: R_adj = U Σ Vᵀ.
#   3. Reconstruct the prediction: R_predicted = U Σ Vᵀ + mean(R).

# Split the ratings into training and test sets (80% train, 20% test).
train_data, test_data = train_test_split(ratings_df, test_size=0.2, random_state=42)
print("Train set size:", train_data.shape)
print("Test set size:", test_data.shape)

# Build the training matrix: start from the full user-item matrix and zero out
# (mask) every held-out test rating. The (userId, movieId) labels are translated
# to integer positions once so the whole mask is applied in a single vectorised
# assignment instead of a Python-level loop over every test row.
row_pos = R_df.index.get_indexer(test_data['userId'])
col_pos = R_df.columns.get_indexer(test_data['movieId'])
# get_indexer reports unknown labels as -1, which would silently address the
# last row/column; fail loudly instead.
if (row_pos < 0).any() or (col_pos < 0).any():
    raise KeyError("test_data contains userId/movieId values that are missing from R_df")

# Work on a float copy so R_df itself is left untouched.
R_train = R_df.to_numpy(dtype=float, copy=True)
R_train[row_pos, col_pos] = 0
# Labelled view of the masked training matrix.
R_train_df = pd.DataFrame(R_train, index=R_df.index, columns=R_df.columns)

# Apply SVD Using the Formulas (Centering, Decomposition, Reconstruction)
# Per-user row mean of the training matrix.
# NOTE: this averages over ALL movie columns, so unrated and masked entries
# (stored as 0) are included; it is not the mean of the user's observed ratings.
user_ratings_mean = np.mean(R_train, axis=1)
# Center the training matrix (this is X_adj in the formulas).
R_train_demeaned = R_train - user_ratings_mean.reshape(-1, 1)

# Perform a truncated SVD on the demeaned training matrix.
# Here, k is the number of latent factors; adjust based on your dataset.
k = 50
U, sigma, Vt = svds(R_train_demeaned, k=k)
# svds returns the singular values as a 1-D array; expand to a diagonal matrix
# so it can be used directly in the matrix product below.
sigma = np.diag(sigma)

# Reconstruct the approximated ratings matrix using the SVD formula:
# R_predicted = U Σ Vᵀ + user_mean
R_train_predicted = np.dot(np.dot(U, sigma), Vt) + user_ratings_mean.reshape(-1, 1)

Train set size: (800167, 4)
Test set size: (200042, 4)


# Step 5: Use SVD Predictions to Compute Item-Item Similarity

In [None]:
# Item-item similarity is derived from the SVD-reconstructed ratings rather
# than from the raw R_df.
R_predicted_df = pd.DataFrame(
    data=R_train_predicted,
    index=R_df.index,
    columns=R_df.columns,
)
# Transpose so that each row handed to cosine_similarity is one movie.
item_sim_matrix_svd = cosine_similarity(R_predicted_df.transpose())
item_sim_df_svd = pd.DataFrame(
    data=item_sim_matrix_svd,
    index=R_df.columns,
    columns=R_df.columns,
)
print("Item similarity matrix shape (SVD predictions):", item_sim_df_svd.shape)

Item similarity matrix shape (SVD predictions): (3706, 3706)


# Step 6: Apply PCA to the Training Data

In [18]:
# Using scikit-learn's PCA to reduce dimensionality and then reconstruct the ratings.
# random_state is fixed (same seed as the train/test split) because PCA may
# select a randomized SVD solver for a matrix of this size; without a seed the
# components, and therefore the recommendations, can vary between runs.
pca = PCA(n_components=k, random_state=42)
# Fit PCA on the centered training matrix and project every user onto the
# k principal components.
R_train_pca = pca.fit_transform(R_train_demeaned)
# Reconstruct: R_pca_pred = inverse_transform + user mean.
R_pca_pred = pca.inverse_transform(R_train_pca) + user_ratings_mean.reshape(-1, 1)
# Convert PCA predictions to a DataFrame labelled like R_df.
R_pca_pred_df = pd.DataFrame(R_pca_pred, index=R_df.index, columns=R_df.columns)

In [19]:
# PCA-based similarity: transpose the PCA-reconstructed ratings so each row is
# one movie, then take pairwise cosine similarity between movies.
item_sim_matrix_pca = cosine_similarity(R_pca_pred_df.transpose())
item_sim_df_pca = pd.DataFrame(
    data=item_sim_matrix_pca,
    index=R_df.columns,
    columns=R_df.columns,
)
print("Item similarity matrix shape (PCA predictions):", item_sim_df_pca.shape)

Item similarity matrix shape (PCA predictions): (3706, 3706)


# Step 7: Item-Based Collaborative Filtering Recommendation

## Recommend movies similar to a given movie using item-based collaborative filtering.
  Parameters:
  - movie_id (int): The ID of the reference movie.
  - item_sim_df (DataFrame): Movie-to-movie cosine similarity matrix.
  - movies_df (DataFrame): Movie metadata with descriptions.
  - top_n (int): Number of similar movies to return.
      
  Returns:
  - DataFrame: Recommended movies with similarity scores.

In [20]:
def recommend_similar_movies(movie_id, item_sim_df, movies_df, top_n=5):
    """
    Return the movies most similar to ``movie_id`` according to ``item_sim_df``.

    Parameters:
      movie_id: Label of the reference movie in ``item_sim_df``.
      item_sim_df (DataFrame): Square movie-to-movie similarity matrix whose
        index and columns are movie IDs.
      movies_df (DataFrame): Movie metadata; must contain a 'movieId' column.
      top_n (int): Number of most-similar movies to select.

    Returns:
      DataFrame: Rows of ``movies_df`` for the selected movies plus a
      'Similarity' column, sorted by descending similarity. Selected IDs that
      have no row in ``movies_df`` are absent from the result. Returns None
      (after printing a message) when ``movie_id`` is not in the matrix.
    """
    known_ids = item_sim_df.index
    if movie_id not in known_ids:
        print(f"Movie ID {movie_id} not found in similarity matrix.")
        return None

    # Similarity of every other movie to the reference (the movie itself is dropped).
    neighbour_scores = item_sim_df.loc[movie_id].drop(movie_id)

    # Rank all neighbours and keep the IDs of the best top_n.
    ranked = neighbour_scores.sort_values(ascending=False)
    best_ids = ranked.head(top_n).index.tolist()

    def _score_of(mid):
        # Look up the similarity of one movie ID to the reference movie.
        return neighbour_scores[mid]

    # Pull the metadata rows for the selected IDs and attach their scores.
    is_selected = movies_df['movieId'].isin(best_ids)
    picked = movies_df[is_selected].copy()
    picked['Similarity'] = picked['movieId'].apply(_score_of)
    return picked.sort_values('Similarity', ascending=False)

In [26]:
# Example: show the top 5 neighbours of one reference movie, first with the
# SVD-based similarity matrix and then with the PCA-based one.
reference_movie = 318

# SVD-based similarity.
similar_movies_svd = recommend_similar_movies(
    reference_movie, item_sim_df_svd, movies_df, top_n=5
)
print(f"Top 5 movies similar to movie {reference_movie} (using SVD predictions):")
if similar_movies_svd is None:
    print("No recommendations available (SVD).")
else:
    print(
        similar_movies_svd[
            ['movieId', 'title', 'year', 'genres', 'description', 'Similarity']
        ]
    )

# PCA-based similarity.
similar_movies_pca = recommend_similar_movies(
    reference_movie, item_sim_df_pca, movies_df, top_n=5
)
print(f"\nTop 5 movies similar to movie {reference_movie} (using PCA predictions):")
if similar_movies_pca is None:
    print("No recommendations available (PCA).")
else:
    print(
        similar_movies_pca[
            ['movieId', 'title', 'year', 'genres', 'description', 'Similarity']
        ]
    )

Top 5 movies similar to movie 318 (using SVD predictions):
      movieId                     title  year          genres  \
1656     1704         Good Will Hunting  1997           Drama   
35         36          Dead Man Walking  1995           Drama   
589       593  The Silence of the Lambs  1991  Drama|Thriller   
2432     2501               October Sky  1999           Drama   
1337     1358               Sling Blade  1996  Drama|Thriller   

                                            description  Similarity  
1656  Will Hunting has a genius-level IQ but chooses...    0.777827  
35    A justice drama based on a true story about a ...    0.725351  
589   FBI trainee, Clarice Starling ventures into a ...    0.724682  
2432  Based on the true story of Homer Hickam, a coa...    0.724449  
1337  Karl Childers is a mentally disabled man who h...    0.708641  

Top 5 movies similar to movie 318 (using PCA predictions):
      movieId                     title  year          genres  \
1656 

# Step 8: Evaluate the SVD and PCA Models Based on Recommended Movie Titles

In [23]:
# Ground truth: hand-picked titles treated as "similar" to the reference movie
# (movie 318). This is a domain-specific judgement call; edit the set to match
# your own ground truth.
ground_truth_titles = {
    "Fight Club",
    "Forrest Gump",
    "Pulp Fiction",
    "The Godfather",
    "The Green Mile",
}

# Calculate Precision@K Based on Movie Titles

def precision_at_k(recommended_df, ground_truth_titles, k=5):
    """
    Compute precision@k based on the recommended movie titles.

    Only the first ``k`` rows of ``recommended_df`` are considered, in the
    order given. The denominator is always ``k``, so a recommendation list
    shorter than ``k`` is penalised for the slots it leaves empty.

    Parameters:
      recommended_df (DataFrame): Recommended movies; must have a 'title' column.
      ground_truth_titles (set): Ground-truth similar movie titles (any
        container supporting ``in`` works). Matching is exact string equality.
      k (int): Number of recommendations considered; must be positive.

    Returns:
      float: Fraction of the k slots filled by a ground-truth title (0.0 to 1.0).

    Raises:
      ValueError: If ``k`` is not positive. (Previously k=0 surfaced as a bare
        ZeroDivisionError and a negative k produced a meaningless negative value.)
    """
    if k <= 0:
        raise ValueError(f"k must be a positive integer, got {k}")
    # Titles occupying the first k recommendation slots.
    top_titles = recommended_df['title'].head(k).tolist()
    hits = sum(1 for title in top_titles if title in ground_truth_titles)
    return hits / k

# Report Precision@5 for each similarity method, skipping a method whose
# recommendation step returned None.
if similar_movies_svd is not None:
    prec_svd = precision_at_k(
        recommended_df=similar_movies_svd,
        ground_truth_titles=ground_truth_titles,
        k=5,
    )
    print(f"Precision@5 (SVD): {prec_svd:.4f}")

if similar_movies_pca is not None:
    prec_pca = precision_at_k(
        recommended_df=similar_movies_pca,
        ground_truth_titles=ground_truth_titles,
        k=5,
    )
    print(f"Precision@5 (PCA): {prec_pca:.4f}")

Precision@5 (SVD): 0.0000
Precision@5 (PCA): 0.2000
