# Movie Recommender System with Collaborative Filtering using K-Nearest Neighbors




This notebook demonstrates a movie recommender system that uses item-based collaborative
filtering with the k-nearest neighbors (KNN) algorithm. It loads movie ratings and movie
metadata, builds a sparse movie-by-user rating matrix, finds movies with similar rating
patterns, and recommends movies to a user based on that user's highest-rated movie.

Author: [Madhu]




In [1]:
# Importing Libraries
import numpy as np
import pandas as pd
from sklearn.neighbors import NearestNeighbors  # Add this line for NearestNeighbors
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.sparse import csr_matrix
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)


In [2]:
# Load the ratings dataset from ratings.csv in the current working directory.
# Later cells read the userId, movieId and rating columns of this frame.
ratings = pd.read_csv("ratings.csv")
# Bare last expression so the notebook renders a preview of the first rows.
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [3]:
# Load the movie metadata from movies.csv in the current working directory.
# Later cells read the movieId and title columns of this frame.
movies = pd.read_csv("movies.csv")
# Bare last expression so the notebook renders a preview of the first rows.
movies.head()


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
# Basic statistics: total rating rows and the number of distinct movies / users
# that appear in the ratings table.
n_ratings = ratings.shape[0]
# dropna=False keeps the count identical to len(series.unique()).
n_movies = ratings['movieId'].nunique(dropna=False)
n_users = ratings['userId'].nunique(dropna=False)


In [5]:
# Report the dataset size and how dense the ratings are per user and per movie.
# The two averages are plain ratios of the counts computed in the previous cell,
# rounded to two decimals.
print(f"Number of ratings: {n_ratings}")
print(f"Number of unique movieId's: {n_movies}")
print(f"Number of unique users: {n_users}")
print(f"Average ratings per user: {round(n_ratings/n_users, 2)}")
print(f"Average ratings per movie: {round(n_ratings/n_movies, 2)}")

Number of ratings: 100836
Number of unique movieId's: 9724
Number of unique users: 610
Average ratings per user: 165.3
Average ratings per movie: 10.37


In [6]:
# User frequency: how many movieId entries each user has in the ratings table.
user_freq = (
    ratings.groupby('userId')['movieId']
    .count()
    .rename('n_ratings')
    .reset_index()
)
print(user_freq.head())


   userId  n_ratings
0       1        232
1       2         29
2       3         39
3       4        216
4       5         44


In [7]:
# Find the movies with the lowest and highest mean rating.
# mean_rating is a one-column DataFrame ('rating') indexed by movieId, so
# idxmin / idxmax return movieId values.
mean_rating = ratings.groupby('movieId').agg({'rating': 'mean'})
lowest_rated, highest_rated = (
    mean_rating['rating'].idxmin(),
    mean_rating['rating'].idxmax(),
)


In [8]:
# Show the metadata rows for the lowest- and highest-rated movies (in that order).
print(movies[movies['movieId'].eq(lowest_rated)])
print(movies[movies['movieId'].eq(highest_rated)])

      movieId         title   genres
2689     3604  Gypsy (1962)  Musical
    movieId            title           genres
48       53  Lamerica (1994)  Adventure|Drama


In [9]:
# Movie statistics using Bayesian average
# Per-movie number of ratings ('count') and plain mean rating ('mean'),
# indexed by movieId.
movie_stats = ratings.groupby('movieId')['rating'].agg(['count', 'mean'])

# A plain mean is unreliable for movies with very few ratings, so shrink each
# movie's mean toward the typical movie:
#     bayesian_avg = (C * m + sum_of_ratings) / (C + count)
# where C is the average number of ratings per movie and m is the average of
# the per-movie means. sum_of_ratings is recovered as count * mean.
prior_count = movie_stats['count'].mean()
prior_mean = movie_stats['mean'].mean()
movie_stats['bayesian_avg'] = (
    (prior_count * prior_mean + movie_stats['count'] * movie_stats['mean'])
    / (prior_count + movie_stats['count'])
)

In [10]:
# Creating user-item matrix using scipy csr matrix
def create_matrix(df):
    """Build a sparse movie-by-user rating matrix and its ID/index lookups.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns "userId", "movieId" and "rating".

    Returns
    -------
    X : scipy.sparse.csr_matrix
        Shape (M, N) with M distinct movies (rows) and N distinct users
        (columns); each entry is the rating a user gave a movie.
    user_mapper : dict
        userId -> column index of X.
    movie_mapper : dict
        movieId -> row index of X.
    user_inv_mapper : dict
        Column index of X -> userId.
    movie_inv_mapper : dict
        Row index of X -> movieId.
    """
    # np.unique yields the sorted distinct IDs and, through return_inverse, the
    # position of every row's ID inside that sorted array, which is exactly
    # the matrix index for that row. This replaces per-row dict lookups.
    user_ids, user_index = np.unique(df["userId"].to_numpy(), return_inverse=True)
    movie_ids, movie_index = np.unique(df["movieId"].to_numpy(), return_inverse=True)
    N = len(user_ids)
    M = len(movie_ids)

    # Map Ids to indices
    user_mapper = dict(zip(user_ids, range(N)))
    movie_mapper = dict(zip(movie_ids, range(M)))

    # Map indices to IDs
    user_inv_mapper = dict(zip(range(N), user_ids))
    movie_inv_mapper = dict(zip(range(M), movie_ids))

    # Rows are movies and columns are users, so row vectors can be compared
    # directly when looking for similar movies.
    X = csr_matrix(
        (df["rating"].to_numpy(), (movie_index, user_index)),
        shape=(M, N),
    )

    return X, user_mapper, movie_mapper, user_inv_mapper, movie_inv_mapper

# Build the sparse rating matrix (one row per movie, one column per user) and
# the ID <-> index lookup dictionaries used by the functions below.
X, user_mapper, movie_mapper, user_inv_mapper, movie_inv_mapper = create_matrix(ratings)




Find similar movies using KNN

In [11]:
def find_similar_movies(movie_id, X, k, metric='cosine', show_distance=False):
    """Return the k movies whose rating vectors are closest to ``movie_id``.

    Relies on the module-level ``movie_mapper`` (movieId -> row of X) and
    ``movie_inv_mapper`` (row of X -> movieId) dictionaries.

    Parameters
    ----------
    movie_id : int
        ID of the query movie; must be a key of ``movie_mapper``.
    X : sparse matrix
        Rating matrix with one row per movie.
    k : int
        Number of similar movies wanted. Fewer are returned when X has
        fewer than k other rows.
    metric : str, default 'cosine'
        Distance metric forwarded to ``NearestNeighbors``.
    show_distance : bool, default False
        When True, return ``(movie_id, distance)`` pairs instead of bare IDs.

    Returns
    -------
    list
        Movie IDs ordered from most to least similar, or
        ``(movie_id, distance)`` tuples when ``show_distance`` is True.

    Raises
    ------
    KeyError
        If ``movie_id`` is not present in ``movie_mapper``.
    """
    movie_ind = movie_mapper[movie_id]
    movie_vec = X[movie_ind].reshape(1, -1)

    # Request one extra neighbour because the query movie is returned as a
    # neighbour of itself, but never more than the number of rows in X
    # (NearestNeighbors rejects n_neighbors > n_samples).
    n_neighbors = min(k + 1, X.shape[0])
    kNN = NearestNeighbors(n_neighbors=n_neighbors, algorithm="brute", metric=metric)
    kNN.fit(X)

    # Always fetch distances so both return shapes share one code path;
    # kneighbors returns (distances, indices), each of shape (1, n_neighbors).
    distances, indices = kNN.kneighbors(movie_vec, return_distance=True)

    # Drop the query movie by index rather than by position: when several
    # movies have identical vectors the query is not guaranteed to rank first.
    neighbours = [
        (movie_inv_mapper[ind], float(dist))
        for ind, dist in zip(indices[0], distances[0])
        if ind != movie_ind
    ][:k]

    if show_distance:
        return neighbours
    return [neighbour_id for neighbour_id, _ in neighbours]


In [12]:
# Example usage for recommending movies to a user based on their highest-rated movie
def recommend_movies_for_user(user_id, X, user_mapper, movie_mapper, movie_inv_mapper, k=10):
    """Print up to ``k`` movies similar to the user's highest-rated movie.

    Reads the module-level ``ratings`` and ``movies`` DataFrames and delegates
    the similarity search to ``find_similar_movies``.

    Parameters
    ----------
    user_id : int
        ID of the user to recommend for.
    X : sparse matrix
        Movie-by-user rating matrix passed through to ``find_similar_movies``.
    user_mapper, movie_mapper, movie_inv_mapper : dict
        Accepted for interface compatibility; this function does not read them.
    k : int, default 10
        Number of similar movies to request.

    Returns
    -------
    None
        Results (or an explanatory message) are printed.
    """
    user_ratings = ratings[ratings['userId'] == user_id]

    if user_ratings.empty:
        print(f"User with ID {user_id} does not exist.")
        return

    # Series.max() skips NaN, unlike the builtin max(), whose result depends on
    # where a NaN sits and can leave the equality mask below empty.
    top_rating = user_ratings['rating'].max()
    top_rated = user_ratings.loc[user_ratings['rating'] == top_rating, 'movieId']
    if top_rated.empty:
        # Only reachable when every rating of this user is missing.
        print(f"User with ID {user_id} has no valid ratings.")
        return

    # Ties are broken by taking the first matching row.
    movie_id = top_rated.iloc[0]

    movie_titles = dict(zip(movies['movieId'], movies['title']))
    movie_title = movie_titles.get(movie_id, "Movie not found")

    if movie_title == "Movie not found":
        print(f"Movie with ID {movie_id} not found.")
        return

    similar_ids = find_similar_movies(movie_id, X, k)

    print(f"Since you watched {movie_title}, you might also like:")
    for similar_id in similar_ids:
        print(movie_titles.get(similar_id, "Movie not found"))


In [13]:
# Example usage: print 10 recommendations seeded by this user's highest-rated movie.
# The user ID must appear in the ratings table; otherwise a "does not exist"
# message is printed instead.
user_id = 200  # Replace with your actual user ID
recommend_movies_for_user(user_id, X, user_mapper, movie_mapper, movie_inv_mapper, k=10)


Since you watched Clueless (1995), you might also like:
Sleepless in Seattle (1993)
Beauty and the Beast (1991)
Pretty Woman (1990)
Mrs. Doubtfire (1993)
Aladdin (1992)
Dave (1993)
Batman (1989)
Speed (1994)
Interview with the Vampire: The Vampire Chronicles (1994)
True Lies (1994)


In [14]:
# Checking user details: preview the rating rows recorded for one user.
user_id = 20  # Replace with your numeric user ID
user_details = ratings.loc[ratings['userId'].eq(user_id)]

if not user_details.empty:
    print(user_details.head())
else:
    print(f"User with ID {user_id} does not exist.")

      userId  movieId  rating   timestamp
2977      20        2     3.0  1054038313
2978      20        8     1.0  1054038422
2979      20       13     4.0  1054038425
2980      20       34     4.0  1054038093
2981      20       48     5.0  1054038357
