<a href="https://colab.research.google.com/github/NeSma237/collaborative-filtering-recommender/blob/main/collaborative_filtering_recommender.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
!mkdir collaborative-filtering-recommender
!cd collaborative-filtering-recommender
!mkdir data notebooks src results
!touch README.md requirements.txt
!touch README.md requirements.txt

In [5]:
!wget http://files.grouplens.org/datasets/movielens/ml-100k.zip
!unzip ml-100k.zip

import pandas as pd

# The MovieLens 100K files carry no header row, so column names are supplied
# explicitly for each file.
rating_columns = ['user_id', 'movie_id', 'rating', 'timestamp']

movie_info_columns = ['movie_id', 'movie_title', 'release_date', 'video_release_date', 'IMDb_URL']
genre_columns = [
    'unknown', 'Action', 'Adventure', 'Animation', "Children's", 'Comedy',
    'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror',
    'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
]

user_columns = ['user_id', 'age', 'gender', 'occupation', 'zip_code']

# u.data is tab-separated; u.item and u.user are pipe-separated.
ratings = pd.read_csv('ml-100k/u.data', sep='\t', names=rating_columns)
movies = pd.read_csv(
    'ml-100k/u.item',
    sep='|',
    names=movie_info_columns + genre_columns,
    encoding='latin-1',
)
users = pd.read_csv('ml-100k/u.user', sep='|', names=user_columns)

--2025-04-25 17:44:53--  http://files.grouplens.org/datasets/movielens/ml-100k.zip
Resolving files.grouplens.org (files.grouplens.org)... 128.101.65.152
Connecting to files.grouplens.org (files.grouplens.org)|128.101.65.152|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4924029 (4.7M) [application/zip]
Saving to: ‘ml-100k.zip’


2025-04-25 17:44:54 (5.53 MB/s) - ‘ml-100k.zip’ saved [4924029/4924029]

Archive:  ml-100k.zip
   creating: ml-100k/
  inflating: ml-100k/allbut.pl       
  inflating: ml-100k/mku.sh          
  inflating: ml-100k/README          
  inflating: ml-100k/u.data          
  inflating: ml-100k/u.genre         
  inflating: ml-100k/u.info          
  inflating: ml-100k/u.item          
  inflating: ml-100k/u.occupation    
  inflating: ml-100k/u.user          
  inflating: ml-100k/u1.base         
  inflating: ml-100k/u1.test         
  inflating: ml-100k/u2.base         
  inflating: ml-100k/u2.test         
  inflating: ml-100k/u3.base    

In [6]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

class UserBasedCF:
    """User-based collaborative filtering over a dense user-item matrix.

    ``fit`` pivots a ratings table into a user x movie matrix (unrated cells
    are filled with 0) and computes the pairwise cosine similarity between
    user rows. ``predict`` estimates a rating as the similarity-weighted
    average of the ratings given by the ``k`` most similar *other* users.
    """

    def __init__(self):
        # Both attributes remain None until fit() has been called.
        self.user_similarity = None
        self.ratings_matrix = None

    def fit(self, ratings_df):
        """Build the user-item matrix and the user-user similarity matrix.

        Args:
            ratings_df: DataFrame with 'user_id', 'movie_id' and 'rating'
                columns.

        Returns:
            self, so calls can be chained (``UserBasedCF().fit(df)``).
        """
        # Rows are users, columns are movies; missing ratings become 0.
        self.ratings_matrix = ratings_df.pivot_table(
            index='user_id',
            columns='movie_id',
            values='rating'
        ).fillna(0)

        # Entry [i, j] is the cosine similarity between user rows i and j.
        self.user_similarity = cosine_similarity(self.ratings_matrix)
        return self

    def predict(self, user_id, movie_id, k=5):
        """Predict the rating ``user_id`` would give ``movie_id``.

        Args:
            user_id: Label of the user in the fitted matrix index.
            movie_id: Label of the movie in the fitted matrix columns.
            k: Number of most-similar other users to consult.

        Returns:
            The similarity-weighted mean rating among those of the ``k``
            nearest neighbours who rated the movie, or 0 when the user or
            movie is unknown, ``k`` is not positive, or none of the
            neighbours rated the movie.

        Raises:
            AttributeError: If called before ``fit``.
        """
        if self.ratings_matrix is None or self.user_similarity is None:
            raise AttributeError("UserBasedCF.predict() called before fit()")

        if user_id not in self.ratings_matrix.index or movie_id not in self.ratings_matrix.columns:
            return 0  # unknown user or movie

        # A non-positive k means "no neighbours"; without this guard a
        # negative k would slice almost the whole user list.
        if k <= 0:
            return 0

        user_idx = self.ratings_matrix.index.get_loc(user_id)
        similarities = np.asarray(self.user_similarity[user_idx])

        # Rank users by descending similarity, then drop the target user by
        # index. Skipping position 0 is not safe: another user with an equal
        # similarity score can be ranked ahead of the target user.
        ranked = np.argsort(-similarities, kind='stable')
        neighbours = ranked[ranked != user_idx][:k]

        neighbour_ratings = self.ratings_matrix[movie_id].to_numpy()[neighbours]
        # A stored 0 means "not rated", so those neighbours carry no weight.
        has_rated = neighbour_ratings > 0
        weights = similarities[neighbours][has_rated]

        similarity_sum = weights.sum()
        if similarity_sum == 0:
            return 0

        return np.dot(weights, neighbour_ratings[has_rated]) / similarity_sum

In [7]:
from sklearn.metrics import mean_squared_error
import numpy as np

def rmse(y_true, y_pred):
    """Return the root-mean-squared error between true and predicted values."""
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

def precision_recall_at_k(predictions, k=10, threshold=3.5):
    """Return mean precision@k and mean recall@k across users.

    Args:
        predictions: Iterable of 5-tuples
            ``(user_id, item_id, true_rating, estimated_rating, extra)``;
            the item id and the last field are ignored.
        k: Number of top-ranked items (by estimated rating) considered per
            user.
        threshold: Minimum rating for an item to count as relevant (true
            rating) or recommended (estimated rating).

    Returns:
        ``(mean_precision, mean_recall)`` averaged over all users. A user
        with no recommended items in the top k contributes precision 0, and
        a user with no relevant items contributes recall 0. Empty
        ``predictions`` yields ``(0.0, 0.0)``.
    """
    # Group (estimated, true) rating pairs by user.
    user_est_true = {}
    for uid, _, true_r, est, _ in predictions:
        user_est_true.setdefault(uid, []).append((est, true_r))

    # Without any users np.mean([]) would return nan and emit a warning.
    if not user_est_true:
        return 0.0, 0.0

    precisions = []
    recalls = []

    for user_ratings in user_est_true.values():
        # Rank this user's items by estimated rating, best first.
        ranked = sorted(user_ratings, key=lambda pair: pair[0], reverse=True)
        top_k = ranked[:k]

        # Relevant items over the user's whole list.
        n_rel = sum(true_r >= threshold for (_, true_r) in ranked)

        # Items in the top k that are actually recommended.
        n_rec_k = sum(est >= threshold for (est, _) in top_k)

        # Items in the top k that are both recommended and relevant.
        n_rel_and_rec_k = sum(
            (true_r >= threshold) and (est >= threshold)
            for (est, true_r) in top_k
        )

        # Precision@k: share of *recommended* items that are relevant, so the
        # denominator is n_rec_k rather than k.
        precisions.append(n_rel_and_rec_k / n_rec_k if n_rec_k != 0 else 0)

        # Recall@k: share of relevant items that were recommended.
        recalls.append(n_rel_and_rec_k / n_rel if n_rel != 0 else 0)

    return np.mean(precisions), np.mean(recalls)