<a href="https://colab.research.google.com/github/MahdiTheGreat/RecommenderSystem/blob/main/DAS_Ass2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!git clone https://github.com/MahdiTheGreat/RecommenderSystem.git
%cd RecommenderSystem
%ls

Cloning into 'RecommenderSystem'...
remote: Enumerating objects: 10, done.[K
remote: Counting objects: 100% (10/10), done.[K
remote: Compressing objects: 100% (9/9), done.[K
remote: Total 10 (delta 2), reused 0 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (10/10), 115.14 KiB | 848.00 KiB/s, done.
Resolving deltas: 100% (2/2), done.
/content/RecommenderSystem
DAS_Ass2.ipynb  movie_reviews.zip  README.md


In [2]:
!unzip "movie_reviews.zip" -d "DAS_Ass2"
%ls

Archive:  movie_reviews.zip
  inflating: DAS_Ass2/movie_genres.csv  
  inflating: DAS_Ass2/user_reviews.csv  
[0m[01;34mDAS_Ass2[0m/  DAS_Ass2.ipynb  movie_reviews.zip  README.md


In [6]:
#content filtering
import pandas as pd
import numpy as np

# File paths
movie_genres_path = "DAS_Ass2/movie_genres.csv"
user_reviews_path = "DAS_Ass2/user_reviews.csv"

### STEP 1: Load Data
# Load movie genre data (X)
movie_genres_df = pd.read_csv(movie_genres_path)

# Load user review data (Y)
user_reviews_df = pd.read_csv(user_reviews_path)

### STEP 2: Data Cleaning
# Remove non-numeric columns
X = movie_genres_df.iloc[:, 2:].to_numpy()  # Drop movie index and title, keep genre features
Y = user_reviews_df.iloc[:, 2:].to_numpy()  # Drop user index and name, keep ratings

### STEP 3: Replace Missing Ratings (0s with NaN for clarity)
Y = Y.astype(np.float64)  # Change the data type of Y to float
Y[Y == 0] = np.nan  # This makes it easier to handle missing data

### STEP 4: Display Matrix Shapes
print(f"X Shape (Movies × Features): {X.shape}")
print(f"Y Shape (Users × Movies): {Y.shape}")

### Optional: Display a small sample of the cleaned matrices
print("Sample X (Movie Features):")
print(X[:5, :])  # First 5 movies, all features

print("Sample Y (User Ratings):")
print(Y[:5, :5])  # First 5 users, first 5 movies

# Store user preference vectors
Theta = np.zeros((len(Y), X.shape[1]))
print(f"Theta Shape (Users × Features): {Theta.shape}")

X Shape (Movies × Features): (2000, 25)
Y Shape (Users × Movies): (600, 2000)
Sample X (Movie Features):
[[1 0 0 0 0 1 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0]
 [0 1 1 0 1 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [1 1 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0]
 [1 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0]
 [0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0]]
Sample Y (User Ratings):
[[nan nan nan nan nan]
 [nan nan nan nan nan]
 [nan nan nan nan nan]
 [nan nan nan nan nan]
 [nan nan nan nan nan]]
Theta Shape (Users × Features): (600, 25)


In [14]:
from sklearn.linear_model import RidgeCV
from sklearn.impute import SimpleImputer

# Define lambda values to tune
lambda_values = [0.01, 0.1, 1, 10, 100]

# Iterate over the first 5 users
for user_id in range(len(Y)):
    # Get user ratings (Y row)
    y_user = Y[user_id, :]  # Ratings given by this user (some are NaN)

    # Find rated movies (i.e., non-NaN values)
    rated_movies = ~np.isnan(y_user)

    # If the user has rated at least one movie, train the model
    if np.sum(rated_movies) > 0:
        # Extract features (X) and corresponding ratings (y) for rated movies
        X_train = X[rated_movies, :]
        y_train = y_user[rated_movies]

        # Train Ridge regression with cross-validation to find the best λ
        ridge_model = RidgeCV(alphas=lambda_values, store_cv_values=True)
        ridge_model.fit(X_train, y_train)

        # Store learned user preferences (Θ)
        Theta[user_id, :] = ridge_model.coef_





In [62]:
def recommended_movies(Y_pred,top_k=5,users_num=5):
 if users_num is None:
     users_num = len(Y_pred)
 # Recommend top 5 movies for each of the first 5 users
 recommendations = np.argsort(-Y_pred, axis=1)[:, :top_k]  # Sort descending

 # Convert movie indices to actual movie titles
 recommended_movies = []
 for user_id in range(users_num):
     recommended_movies.append(movie_genres_df.iloc[recommendations[user_id], 1].values)

 # Extract user names from the original dataset
 user_names = user_reviews_df.iloc[:users_num, 1].values

 # Display recommendations with user names
 for user_id in range(users_num):
     print(f"\n🔹 **{user_names[user_id]} should watch:**")
     for movie in recommended_movies[user_id]:
         print(f"   🎬 {movie}")
 return recommendations

In [63]:
# Predict ratings for all movies
Y_pred = Theta @ X.T  # Matrix multiplication ΘX^T

# Mask out already rated movies (to avoid recommending watched ones)
Y_pred[np.where(~np.isnan(Y[:len(Y), :]))] = -np.inf

recommendations=recommended_movies(Y_pred,)


🔹 **Vincent should watch:**
   🎬 When the Game Stands Tall
   🎬 Head Over Heels
   🎬 Cheap Thrills
   🎬 Beneath the Planet of the Apes
   🎬 The Story of Us

🔹 **Edgar should watch:**
   🎬 Pirates of the Caribbean: Dead Man's Chest
   🎬 Hoffa
   🎬 The Adventures of Tintin
   🎬 Straight Out of Brooklyn
   🎬 Predators

🔹 **Addilyn should watch:**
   🎬 Maid in Manhattan
   🎬 Morning Glory
   🎬 Brazil
   🎬 Ice Princess
   🎬 The Order

🔹 **Marlee should watch:**
   🎬 American Hero
   🎬 Molière
   🎬 The Good Thief
   🎬 Inception
   🎬 The Perfect Man

🔹 **Javier should watch:**
   🎬 Unbroken
   🎬 Mulan
   🎬 Frozen
   🎬 Soul Survivors
   🎬 Harley Davidson and the Marlboro Man


In [64]:
#collaborative filtering

from sklearn.decomposition import TruncatedSVD
import numpy as np
import pandas as pd

# Replace missing values (NaNs) with the user’s average rating for better SVD performance
Y_filled = np.copy(Y)
for i in range(Y.shape[0]):  # Loop over users
    user_mean = np.nanmean(Y[i, :])  # Compute mean rating for user i
    Y_filled[i, np.isnan(Y[i, :])] = user_mean  # Replace NaN with mean rating


In [65]:
# Apply Truncated SVD to factorize Y into two low-rank matrices
svd = TruncatedSVD(n_components=10)
Theta = svd.fit_transform(Y_filled)  # User preferences (Users × Latent Features)
X = svd.components_.T  # Movie features (Movies × Latent Features)

# Predict ratings for all users and movies
Y_pred = Theta @ X.T  # Matrix multiplication to reconstruct predicted ratings

# Mask out already rated movies (avoid recommending watched ones)
Y_pred[np.where(~np.isnan(Y[:len(Y), :]))] = -np.inf
recommendations=recommended_movies(Y_pred)


🔹 **Vincent should watch:**
   🎬 The Warrior's Way
   🎬 Miss Congeniality
   🎬 Bathory: Countess of Blood
   🎬 The Adventures of Tintin
   🎬 Rosemary's Baby

🔹 **Edgar should watch:**
   🎬 The Woman Chaser
   🎬 Narc
   🎬 Tycoon
   🎬 Witness
   🎬 The Net

🔹 **Addilyn should watch:**
   🎬 About Last Night
   🎬 Idle Hands
   🎬 Torque
   🎬 Shooting Fish
   🎬 Life or Something Like It

🔹 **Marlee should watch:**
   🎬 Time Bandits
   🎬 The Good Thief
   🎬 I Love You Phillip Morris
   🎬 Set It Off
   🎬 The Magic Sword: Quest for Camelot

🔹 **Javier should watch:**
   🎬 Time Bandits
   🎬 Loser
   🎬 Open Season
   🎬 A Cinderella Story
   🎬 Star Trek: The Motion Picture


In [66]:
# Collaborative Filtering with Optimized k using SVD

from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import mean_squared_error
import numpy as np
import pandas as pd


# Range of k values to test
k_values = list(range(5, 100, 5))  # This will create a list from 1 to 100, stepping by 5


# Create a mask for known ratings (not NaN)
known_ratings = ~np.isnan(Y)

# Create training set: Copy Y and hide 20% of ratings for validation
Y_train = np.copy(Y)
num_hidden = int(0.2 * np.sum(known_ratings))  # 20% of known ratings

# Randomly select indices to hide
hidden_indices = np.argwhere(known_ratings)
np.random.shuffle(hidden_indices)
hidden_indices = hidden_indices[:num_hidden]

# Hide ratings in Y_train for validation
for i, j in hidden_indices:
    Y_train[i, j] = np.nan

# Replace missing values in Y_train with user mean
Y_filled = np.copy(Y_train)
for i in range(Y.shape[0]):
    user_mean = np.nanmean(Y_train[i, :])
    Y_filled[i, np.isnan(Y_train[i, :])] = user_mean

# Store RMSE values for different k
rmse_scores = []

for k in k_values:
    svd = TruncatedSVD(n_components=k)
    Theta = svd.fit_transform(Y_filled)
    X = svd.components_.T  # Movies × Features

    # Predict missing ratings
    Y_pred = Theta @ X.T

    # Compute RMSE on the hidden ratings
    true_ratings = Y[hidden_indices[:, 0], hidden_indices[:, 1]]
    predicted_ratings = Y_pred[hidden_indices[:, 0], hidden_indices[:, 1]]

    rmse = np.sqrt(mean_squared_error(true_ratings, predicted_ratings))
    rmse_scores.append(rmse)

    print(f"🔹 For k = {k}, RMSE = {rmse:.4f}")

# Find the best k (minimum RMSE)
best_k = k_values[np.argmin(rmse_scores)]
print(f"\n✅ Best k: {best_k} with RMSE = {min(rmse_scores):.4f}")

### STEP 5: Train Final Model with Best k

# Replace missing values (NaNs) with the user’s average rating for final training
Y_filled_final = np.copy(Y)
for i in range(Y.shape[0]):
    user_mean = np.nanmean(Y[i, :])
    Y_filled_final[i, np.isnan(Y[i, :])] = user_mean

# Apply Truncated SVD with the best k
svd_final = TruncatedSVD(n_components=best_k)
Theta = svd_final.fit_transform(Y_filled_final)
X = svd_final.components_.T  # Movie features (Movies × Latent Features)

# Predict ratings for all users and movies
Y_pred = Theta @ X.T  # Matrix multiplication to reconstruct predicted ratings

recommendations=recommended_movies(Y_pred)

🔹 For k = 5, RMSE = 1.1899
🔹 For k = 10, RMSE = 1.1898
🔹 For k = 15, RMSE = 1.1899
🔹 For k = 20, RMSE = 1.1898
🔹 For k = 25, RMSE = 1.1896
🔹 For k = 30, RMSE = 1.1891
🔹 For k = 35, RMSE = 1.1892
🔹 For k = 40, RMSE = 1.1890
🔹 For k = 45, RMSE = 1.1894
🔹 For k = 50, RMSE = 1.1890
🔹 For k = 55, RMSE = 1.1896
🔹 For k = 60, RMSE = 1.1894
🔹 For k = 65, RMSE = 1.1892
🔹 For k = 70, RMSE = 1.1885
🔹 For k = 75, RMSE = 1.1891
🔹 For k = 80, RMSE = 1.1902
🔹 For k = 85, RMSE = 1.1893
🔹 For k = 90, RMSE = 1.1900
🔹 For k = 95, RMSE = 1.1893

✅ Best k: 70 with RMSE = 1.1885

🔹 **Vincent should watch:**
   🎬 The General's Daughter
   🎬 Magnolia
   🎬 Risen
   🎬 Dawn of the Planet of the Apes
   🎬 About Time

🔹 **Edgar should watch:**
   🎬 Good Kill
   🎬 The Other End of the Line
   🎬 Go for It!
   🎬 A Scanner Darkly
   🎬 Hoffa

🔹 **Addilyn should watch:**
   🎬 The Magic Sword: Quest for Camelot
   🎬 Dirty Work
   🎬 Lilyhammer
   🎬 Neighbors
   🎬 Jonah: A VeggieTales Movie

🔹 **Marlee should watch:**
   🎬