<a href="https://colab.research.google.com/github/MahdiTheGreat/RecommenderSystem/blob/main/DAS_Ass2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!git clone https://github.com/MahdiTheGreat/RecommenderSystem.git
%cd RecommenderSystem
%ls

Cloning into 'RecommenderSystem'...
remote: Enumerating objects: 16, done.[K
remote: Counting objects: 100% (16/16), done.[K
remote: Compressing objects: 100% (15/15), done.[K
remote: Total 16 (delta 6), reused 0 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (16/16), 147.54 KiB | 1.97 MiB/s, done.
Resolving deltas: 100% (6/6), done.
/content/RecommenderSystem
DAS_Ass2.ipynb  movie_reviews.zip  README.md


In [2]:
!unzip "movie_reviews.zip" -d "DAS_Ass2"
%ls

Archive:  movie_reviews.zip
  inflating: DAS_Ass2/movie_genres.csv  
  inflating: DAS_Ass2/user_reviews.csv  
[0m[01;34mDAS_Ass2[0m/  DAS_Ass2.ipynb  movie_reviews.zip  README.md


In [3]:
#content filtering
import pandas as pd
import numpy as np

# File paths
movie_genres_path = "DAS_Ass2/movie_genres.csv"
user_reviews_path = "DAS_Ass2/user_reviews.csv"

### STEP 1: Load Data
# Load movie genre data (X)
movie_genres_df = pd.read_csv(movie_genres_path)

# Load user review data (Y)
user_reviews_df = pd.read_csv(user_reviews_path)

### STEP 2: Data Cleaning
# Remove non-numeric columns
X = movie_genres_df.iloc[:, 2:].to_numpy()  # Drop movie index and title, keep genre features
Y = user_reviews_df.iloc[:, 2:].to_numpy()  # Drop user index and name, keep ratings

### STEP 3: Replace Missing Ratings (0s with NaN for clarity)
Y = Y.astype(np.float64)  # Change the data type of Y to float
Y[Y == 0] = np.nan  # This makes it easier to handle missing data

### STEP 4: Display Matrix Shapes
print(f"X Shape (Movies × Features): {X.shape}")
print(f"Y Shape (Users × Movies): {Y.shape}")

### Optional: Display a small sample of the cleaned matrices
print("Sample X (Movie Features):")
print(X[:5, :])  # First 5 movies, all features

print("Sample Y (User Ratings):")
print(Y[:5, :5])  # First 5 users, first 5 movies

# Store user preference vectors
Theta = np.zeros((len(Y), X.shape[1]))
print(f"Theta Shape (Users × Features): {Theta.shape}")

X Shape (Movies × Features): (2000, 25)
Y Shape (Users × Movies): (600, 2000)
Sample X (Movie Features):
[[1 0 0 0 0 1 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0]
 [0 1 1 0 1 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [1 1 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0]
 [1 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0]
 [0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0]]
Sample Y (User Ratings):
[[nan nan nan nan nan]
 [nan nan nan nan nan]
 [nan nan nan nan nan]
 [nan nan nan nan nan]
 [nan nan nan nan nan]]
Theta Shape (Users × Features): (600, 25)


In [None]:
from sklearn.linear_model import RidgeCV
from sklearn.impute import SimpleImputer

# Define lambda values to tune
lambda_values = [0.01, 0.1, 1, 10, 100]

# Iterate over the first 5 users
for user_id in range(len(Y)):
    # Get user ratings (Y row)
    y_user = Y[user_id, :]  # Ratings given by this user (some are NaN)

    # Find rated movies (i.e., non-NaN values)
    rated_movies = ~np.isnan(y_user)

    # If the user has rated at least one movie, train the model
    if np.sum(rated_movies) > 0:
        # Extract features (X) and corresponding ratings (y) for rated movies
        X_train = X[rated_movies, :]
        y_train = y_user[rated_movies]

        # Train Ridge regression with cross-validation to find the best λ
        ridge_model = RidgeCV(alphas=lambda_values, store_cv_values=True)
        ridge_model.fit(X_train, y_train)

        # Store learned user preferences (Θ)
        Theta[user_id, :] = ridge_model.coef_



In [16]:
def recommended_movies(Y_pred,top_k=5,users_num=5):
 Y_pred[np.where(~np.isnan(Y[:len(Y), :]))] = -np.inf
 if users_num is None:
     users_num = len(Y_pred)
 # Recommend top 5 movies for each of the first 5 users
 recommendations = np.argsort(-Y_pred, axis=1)[:, :top_k]  # Sort descending

 # Convert movie indices to actual movie titles
 recommended_movies = []
 for user_id in range(users_num):
     recommended_movies.append(movie_genres_df.iloc[recommendations[user_id], 1].values)

 # Extract user names from the original dataset
 user_names = user_reviews_df.iloc[:users_num, 1].values

 # Display recommendations with user names
 for user_id in range(users_num):
     print(f"\n🔹 **{user_names[user_id]} should watch:**")
     for movie in recommended_movies[user_id]:
         print(f"   🎬 {movie}")
 return recommendations

In [6]:
# Predict ratings for all movies
Y_pred = Theta @ X.T  # Matrix multiplication ΘX^T

# Mask out already rated movies (to avoid recommending watched ones)
Y_pred[np.where(~np.isnan(Y[:len(Y), :]))] = -np.inf

recommendations=recommended_movies(Y_pred,)


🔹 **Vincent should watch:**
   🎬 Evolution
   🎬 What the #$*! Do We (K)now!?
   🎬 Dark City
   🎬 The Returned
   🎬 The Unborn

🔹 **Edgar should watch:**
   🎬 Alpha and Omega 4: The Legend of the Saw Toothed Cave
   🎬 The Magic Sword: Quest for Camelot
   🎬 Stargate: The Ark of Truth
   🎬 9
   🎬 Centurion

🔹 **Addilyn should watch:**
   🎬 Alvin and the Chipmunks: Chipwrecked
   🎬 Alvin and the Chipmunks: The Road Chip
   🎬 Alvin and the Chipmunks
   🎬 Hannah Montana: The Movie
   🎬 Spice World

🔹 **Marlee should watch:**
   🎬 Zodiac
   🎬 Suspect Zero
   🎬 Gone Girl
   🎬 Narc
   🎬 Regression

🔹 **Javier should watch:**
   🎬 Hannah Montana: The Movie
   🎬 Sinbad: Legend of the Seven Seas
   🎬 Lilo & Stitch
   🎬 The Last Song
   🎬 Mrs. Doubtfire


In [14]:
#collaborative filtering

from sklearn.decomposition import TruncatedSVD
import numpy as np
import pandas as pd

# Replace missing values (NaNs) with the user’s average rating for better SVD performance
Y_filled = np.copy(Y)
for i in range(Y.shape[0]):  # Loop over users
    user_mean = np.nanmean(Y[i, :])  # Compute mean rating for user i
    Y_filled[i, np.isnan(Y[i, :])] = user_mean  # Replace NaN with mean rating

# Create a mask for known ratings (not NaN)
known_ratings = ~np.isnan(Y)

# Create training set: Copy Y and hide 20% of ratings for validation
Y_train = np.copy(Y_filled)
num_hidden = int(0.2 * np.sum(known_ratings))  # 20% of known ratings

# Randomly select indices to hide
hidden_indices = np.argwhere(known_ratings)
np.random.shuffle(hidden_indices)
hidden_indices = hidden_indices[:num_hidden]
true_ratings = Y[hidden_indices[:, 0], hidden_indices[:, 1]]

# Hide ratings in Y_train for validation
for i, j in hidden_indices:
    Y_train[i, j] = np.nan


In [17]:
# Apply Truncated SVD to factorize Y into two low-rank matrices
svd = TruncatedSVD(n_components=10)
Theta = svd.fit_transform(Y_filled)  # User preferences (Users × Latent Features)
X = svd.components_.T  # Movie features (Movies × Latent Features)

# Predict ratings for all users and movies
Y_pred = Theta @ X.T  # Matrix multiplication to reconstruct predicted ratings

# Mask out already rated movies (avoid recommending watched ones)
predicted_ratings = Y_pred[hidden_indices[:, 0], hidden_indices[:, 1]]
rmse = np.sqrt(mean_squared_error(true_ratings, predicted_ratings))
accuracy=accuracy_score(true_ratings, np.round(predicted_ratings))

print(f"For content filtering with latent factor model: RMSE = {rmse:.4f},accuracy = {accuracy:.4f}")

recommendations=recommended_movies(Y_pred)

For content filtering with latent factor model: RMSE = 1.1001,accuracy = 0.3153

🔹 **Vincent should watch:**
   🎬 The Fourth Kind
   🎬 World War Z
   🎬 The Lost Skeleton of Cadavra
   🎬 Space Battleship Yamato
   🎬 Sugar Town

🔹 **Edgar should watch:**
   🎬 A Sound of Thunder
   🎬 Bathory: Countess of Blood
   🎬 The Unborn
   🎬 Loser
   🎬 Seeking a Friend for the End of the World

🔹 **Addilyn should watch:**
   🎬 Now You See Me 2
   🎬 About Last Night
   🎬 I Love You Phillip Morris
   🎬 Torque
   🎬 Heli

🔹 **Marlee should watch:**
   🎬 Speed
   🎬 Suspect Zero
   🎬 Torque
   🎬 The Good Thief
   🎬 World War Z

🔹 **Javier should watch:**
   🎬 Gigli
   🎬 The Cabin in the Woods
   🎬 500 Days of Summer
   🎬 Mongol: The Rise of Genghis Khan
   🎬 Zipper


In [18]:
# Collaborative Filtering with Optimized k using SVD

from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import mean_squared_error,accuracy_score
import numpy as np
import pandas as pd


# Range of k values to test
k_values = list(range(5, 100, 5))  # This will create a list from 1 to 100, stepping by 5


# Store RMSE values for different k
rmse_scores = []
accuracy_scores=[]

for k in k_values:
    svd = TruncatedSVD(n_components=k)
    Theta = svd.fit_transform(Y_filled)
    X = svd.components_.T  # Movies × Features

    # Predict missing ratings
    Y_pred = Theta @ X.T

    # Compute RMSE on the hidden ratings
    true_ratings = Y[hidden_indices[:, 0], hidden_indices[:, 1]]
    predicted_ratings = Y_pred[hidden_indices[:, 0], hidden_indices[:, 1]]

    rmse = np.sqrt(mean_squared_error(true_ratings, predicted_ratings))
    rmse_scores.append(rmse)

    accuracy=accuracy_score(true_ratings, np.round(predicted_ratings))
    accuracy_scores.append(accuracy)

    print(f"🔹 For k = {k}, RMSE = {rmse:.4f},accuracy = {accuracy:.4f}")

# Find the best k (minimum RMSE)
best_k = k_values[np.argmin(rmse_scores)]
print(f"\n✅ Best k: {best_k} with RMSE = {min(rmse_scores):.4f}")

### STEP 5: Train Final Model with Best k

# Replace missing values (NaNs) with the user’s average rating for final training
Y_filled_final = np.copy(Y)
for i in range(Y.shape[0]):
    user_mean = np.nanmean(Y[i, :])
    Y_filled_final[i, np.isnan(Y[i, :])] = user_mean

# Apply Truncated SVD with the best k
svd_final = TruncatedSVD(n_components=best_k)
Theta = svd_final.fit_transform(Y_filled_final)
X = svd_final.components_.T  # Movie features (Movies × Latent Features)

# Predict ratings for all users and movies
Y_pred = Theta @ X.T  # Matrix multiplication to reconstruct predicted ratings

recommendations=recommended_movies(Y_pred)

🔹 For k = 5, RMSE = 1.1228,accuracy = 0.3098
🔹 For k = 10, RMSE = 1.1002,accuracy = 0.3144
🔹 For k = 15, RMSE = 1.0777,accuracy = 0.3219
🔹 For k = 20, RMSE = 1.0561,accuracy = 0.3262
🔹 For k = 25, RMSE = 1.0373,accuracy = 0.3307
🔹 For k = 30, RMSE = 1.0172,accuracy = 0.3389
🔹 For k = 35, RMSE = 0.9996,accuracy = 0.3464
🔹 For k = 40, RMSE = 0.9795,accuracy = 0.3534
🔹 For k = 45, RMSE = 0.9626,accuracy = 0.3622
🔹 For k = 50, RMSE = 0.9451,accuracy = 0.3694
🔹 For k = 55, RMSE = 0.9291,accuracy = 0.3743
🔹 For k = 60, RMSE = 0.9118,accuracy = 0.3815
🔹 For k = 65, RMSE = 0.8947,accuracy = 0.3903
🔹 For k = 70, RMSE = 0.8799,accuracy = 0.4006
🔹 For k = 75, RMSE = 0.8647,accuracy = 0.4064
🔹 For k = 80, RMSE = 0.8504,accuracy = 0.4133
🔹 For k = 85, RMSE = 0.8334,accuracy = 0.4218
🔹 For k = 90, RMSE = 0.8188,accuracy = 0.4315
🔹 For k = 95, RMSE = 0.8048,accuracy = 0.4363

✅ Best k: 95 with RMSE = 0.8048

🔹 **Vincent should watch:**
   🎬 When the Game Stands Tall
   🎬 Cheap Thrills
   🎬 Splice
   