# 🎬 Movie Recommendation Engine (Content-Based Filtering)

This notebook builds a basic content-based movie recommender using genres from the MovieLens 100K dataset.

In [1]:
import os
import sys
from pathlib import Path
from urllib.parse import quote_plus

import numpy as np
import pandas as pd
from dotenv import load_dotenv
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sqlalchemy import create_engine
from sqlalchemy.engine import URL


# Make the project-local `config` module importable.
# The directory that holds config.py can be overridden with the
# MOVIE_REC_PROJECT_ROOT environment variable; when it is unset, the path
# the notebook was originally written against is used.
PROJECT_ROOT = os.path.abspath(
    os.environ.get(
        "MOVIE_REC_PROJECT_ROOT",
        "/Users/atharvagurav/Documents/movie-recommendation-system",
    )
)

# Guarded so that re-running this cell does not append the same entry again.
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

import config

In [2]:
# Connection settings are read from the project-local `config` module.
# (Loading them from a .env file is currently disabled.)
# load_dotenv()

# Convert DB_PORT to an integer
DB_PORT = int(config.DB_PORT)

# Build the connection URL from its components instead of formatting a string
# by hand: URL.create takes the raw username and password, so neither has to
# be percent-escaped manually (the hand-built URL escaped only the password).
DATABASE_URL = URL.create(
    drivername="postgresql",
    username=config.DB_USER,
    password=config.DB_PASSWORD,
    host=config.DB_HOST,
    port=DB_PORT,
    database=config.DB_NAME,
)

# Engine used by the pd.read_sql calls that load the data.
engine = create_engine(DATABASE_URL)

In [3]:
# Read the two source tables through the SQLAlchemy engine
movies = pd.read_sql("SELECT movie_id, title, genre FROM movies", con=engine)
ratings = pd.read_sql("SELECT user_id, movie_id, rating FROM ratings", con=engine)

# Ratings joined to their movie's title and genre (default inner join on movie_id)
merged = pd.merge(ratings, movies, on="movie_id")

# Optional: keep CSV snapshots of all three frames under data_csv/
Path("data_csv").mkdir(exist_ok=True)
movies.to_csv("data_csv/movies.csv", index=False)
ratings.to_csv("data_csv/ratings.csv", index=False)
merged.to_csv("data_csv/merged_data.csv", index=False)

In [4]:
# # Step 1: Get unique movies and reset index BEFORE building the similarity matrix
# movies = merged[['movie_id', 'title', 'genre']].drop_duplicates().reset_index(drop=True)

# # Step 2: Rebuild vectorizer + similarity matrix from the fresh list
# tfidf = TfidfVectorizer(tokenizer=lambda x: x.split(', '))
# tfidf_matrix = tfidf.fit_transform(movies['genre'])

# # Step 3: Recompute cosine similarity
# cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# # Step 4: Rebuild movie title → index mapping
# movie_indices = pd.Series(movies.index, index=movies['title']).drop_duplicates()


In [5]:
# movies

In [6]:
# merged.head()

In [7]:
# # movies = movies(drop=True)

# # Map movie titles to indices
# movie_indices = pd.Series(movies.index, index=movies['title']).drop_duplicates()


In [8]:
# # Recommend similar movies based on a given title
# def recommend_movies(title, num_recommendations=5):
#     if title not in movie_indices:
#         return f"\u274c '{title}' not found in movie list."

#     try:
#         idx = int(movie_indices[title])
#         sim_scores = list(enumerate(cosine_sim[idx]))
#         sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
#         sim_scores = sim_scores[1:num_recommendations + 1]  # Skip self

#         movie_indices_list = [i[0] for i in sim_scores if i[0] < len(movies)]
#         return movies.iloc[movie_indices_list][['title', 'genre']]
#     except IndexError:
#         return f"❌ Index error for title: {title}"

In [9]:
# recommend_movies("GoldenEye (1995)")


In [10]:
# movies['title'].sample(10).tolist()


In [11]:
# # Recommend movies based on a user's top-rated titles
# def recommend_for_user(user_id, top_n_movies=3, recs_per_movie=2):
#     top_movies = merged[(merged['user_id'] == user_id) & (merged['rating'] >= 4)]['title'].head(top_n_movies)
#     recommendations = pd.DataFrame()

#     for title in top_movies:
#         recs = recommend_movies(title, num_recommendations=recs_per_movie)
#         if isinstance(recs, pd.DataFrame):
#             recommendations = pd.concat([recommendations, recs])

#     return recommendations.drop_duplicates(subset='title').reset_index(drop=True)


In [12]:
# print(recommend_movies("GoldenEye (1995)"))
# print(recommend_for_user(10))

In [13]:
# def recommend_for_user_weighted(user_id, recs_total=10):
#     # Get all movies user has rated highly
#     user_ratings = merged[(merged['user_id'] == user_id) & (merged['rating'] >= 3)]
    
#     if user_ratings.empty:
#         return f"⚠️ No high-rated movies found for user {user_id}."
    
#     # Initialize an empty score array
#     scores = np.zeros(cosine_sim.shape[0])
    
#     # Loop through each liked movie
#     for _, row in user_ratings.iterrows():
#         title = row['title']
#         rating = row['rating']
        
#         if title in movie_indices:
#             idx = movie_indices[title]
#             if isinstance(idx, pd.Series):
#                 idx = idx.iloc[0]
#             idx = int(idx)
            
#             similarity_scores = np.array(cosine_sim[idx]).flatten()
#             scores += similarity_scores * rating


#     # Get indices of top recommendations
#     user_seen_indices = [movie_indices[title] for title in user_ratings['title'] if title in movie_indices]
#     scores[user_seen_indices] = 0  # Mask out already seen movies

#     top_indices = scores.argsort()[::-1][:recs_total]
#     recommended_movies = movies.iloc[top_indices][['title', 'genre']].reset_index(drop=True)
    
#     return recommended_movies


In [14]:
# print(recommend_for_user_weighted(user_id=10))


In [15]:
# Compute movie-level stats: mean rating and number of ratings per title
movie_stats = (
    merged.groupby('title')['rating']
    .agg(avg_rating='mean', rating_count='count')
    .reset_index()
)

# Join movie stats back into movies.
# Stat columns left over from an earlier run of this cell are dropped first;
# otherwise a re-run would merge them again as avg_rating_x / avg_rating_y and
# the recommend_* functions would no longer find 'avg_rating'.
movies = (
    movies.drop(columns=['avg_rating', 'rating_count'], errors='ignore')
    .merge(movie_stats, on='title')
)

# Vectorize genres using TF-IDF; every ', '-separated genre is one token.
# token_pattern=None tells scikit-learn the default pattern is intentionally
# unused because a custom tokenizer is supplied (avoids its UserWarning).
tfidf = TfidfVectorizer(tokenizer=lambda x: x.split(', '), token_pattern=None)
tfidf_matrix = tfidf.fit_transform(movies['genre'])

# Compute similarity matrix (row i / column j follow the row order of `movies`)
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Map movie titles to row indices. When a title occurs on several rows only the
# first row is kept, so a lookup always yields a single index. (The previous
# Series.drop_duplicates() compared the index values, which are already unique,
# and therefore removed nothing.)
movie_indices = pd.Series(movies.index, index=movies['title'])
movie_indices = movie_indices[~movie_indices.index.duplicated(keep='first')]



In [34]:


# Recommend similar movies based on a given title
def recommend_movies(title, num_recommendations=5):
    """Return the movies whose genre vectors are most similar to ``title``.

    Reads the notebook-level ``movies`` frame, the ``cosine_sim`` matrix and
    the ``movie_indices`` title -> row-index lookup.

    Args:
        title: Movie title to look up in ``movie_indices``.
        num_recommendations: Maximum number of rows to return.

    Returns:
        A DataFrame with the columns ``title``, ``genre``, ``avg_rating`` and
        ``rating_count``, ordered by decreasing similarity (ties keep row
        order). The queried movie's own row is never included. If the title is
        unknown, or an ``IndexError`` occurs while looking up its row, an
        error-message string is returned instead.
    """
    if title not in movie_indices:
        return f"\u274c '{title}' not found in movie list."

    try:
        idx = movie_indices[title]
        # A title that appears on several rows yields a Series; use the first row.
        if isinstance(idx, pd.Series):
            idx = idx.iloc[0]
        idx = int(idx)

        similarities = np.asarray(cosine_sim[idx]).ravel()

        # Stable descending order, so equally similar movies keep their row order.
        ranked = np.argsort(-similarities, kind="stable")

        # Drop the query row explicitly. Movies with identical genres all tie
        # at the top score, so the query is not guaranteed to be ranked first
        # and slicing off position 0 could discard a different movie.
        ranked = ranked[(ranked != idx) & (ranked < len(movies))]
        ranked = ranked[:max(num_recommendations, 0)]

        return movies.iloc[ranked][['title', 'genre', 'avg_rating', 'rating_count']]
    except IndexError:
        return f"❌ Index error for title: {title}"

# Recommend movies based on a user's top-rated titles
def recommend_for_user(user_id, top_n_movies=3, recs_per_movie=3):
    """Collect genre-based recommendations seeded by titles the user liked.

    The seeds are the first ``top_n_movies`` rows of the notebook-level
    ``merged`` frame in which ``user_id`` gave a rating of 4 or more.
    ``recommend_movies`` is called once per seed and the results are combined.

    Args:
        user_id: Value matched against ``merged['user_id']``.
        top_n_movies: Maximum number of liked titles used as seeds.
        recs_per_movie: Number of recommendations requested per seed.

    Returns:
        A DataFrame of recommendations, de-duplicated by ``title`` (first
        occurrence wins) with a fresh 0..n-1 index. When there are no seeds,
        or no seed produced a result, an empty DataFrame with the columns
        ``title``, ``genre``, ``avg_rating`` and ``rating_count`` is returned.
    """
    liked = merged[(merged['user_id'] == user_id) & (merged['rating'] >= 4)]
    seed_titles = liked['title'].head(top_n_movies)

    # Gather the per-seed frames and concatenate once at the end.
    frames = []
    for title in seed_titles:
        recs = recommend_movies(title, num_recommendations=recs_per_movie)
        # recommend_movies reports a failed lookup as a string; skip those.
        if isinstance(recs, pd.DataFrame):
            frames.append(recs)

    # Nothing to combine: return an empty, correctly shaped frame. Calling
    # drop_duplicates(subset='title') on a column-less frame raises KeyError.
    if not frames:
        return pd.DataFrame(columns=['title', 'genre', 'avg_rating', 'rating_count'])

    combined = pd.concat(frames)
    return combined.drop_duplicates(subset='title').reset_index(drop=True)

# Global popularity-aware recommendation using similarity + crowd wisdom
def recommend_for_user_global(user_id, top_n_movies=3, recs_total=10):
    """Recommend unseen movies by genre similarity weighted by popularity.

    Uses the notebook-level ``merged``, ``movies``, ``cosine_sim`` and
    ``movie_indices`` objects. The similarity rows of the user's seed titles
    (the first ``top_n_movies`` rows of ``merged`` the user rated 4 or more)
    are summed, and each movie's total is multiplied by
    ``log1p(rating_count) * avg_rating``.

    Args:
        user_id: Value matched against ``merged['user_id']``.
        top_n_movies: Maximum number of liked titles used as seeds.
        recs_total: Maximum number of rows to return.

    Returns:
        A DataFrame with the columns ``title``, ``genre``, ``avg_rating`` and
        ``rating_count``, ordered by decreasing score, with a fresh 0..n-1
        index. Movies the user has already rated, and movies whose score is
        not positive, are never returned, so the result can have fewer than
        ``recs_total`` rows and is empty for a user with no liked titles.
    """
    def _row_position(title):
        # A title that appears on several rows yields a Series; use the first row.
        position = movie_indices[title]
        if isinstance(position, pd.Series):
            position = position.iloc[0]
        return int(position)

    user_rows = merged[merged['user_id'] == user_id]
    user_seen = user_rows['title'].unique()
    user_top = user_rows[user_rows['rating'] >= 4]['title'].head(top_n_movies)

    # Sum the similarity rows of every seed title.
    scores = np.zeros(cosine_sim.shape[0])
    for title in user_top:
        if title in movie_indices:
            scores += np.asarray(cosine_sim[_row_position(title)]).flatten()

    # Weight scores by popularity: log1p(rating_count) * avg_rating.
    # Converted to a plain array so `scores` stays positional; multiplying an
    # ndarray by a Series would silently turn it into a label-indexed Series.
    popularity = (np.log1p(movies['rating_count']) * movies['avg_rating']).to_numpy(dtype=float)
    scores = scores * popularity

    # Remove already seen
    seen_positions = np.array(
        [_row_position(title) for title in user_seen if title in movie_indices],
        dtype=int,
    )
    scores[seen_positions] = 0

    # Rank by score and keep only positive scores, so seen movies (zeroed
    # above) and movies unrelated to every seed are not used as filler.
    ranked = scores.argsort()[::-1]
    ranked = ranked[scores[ranked] > 0][:recs_total]
    return movies.iloc[ranked][['title', 'genre', 'avg_rating', 'rating_count']].reset_index(drop=True)


In [35]:
# Demo: genre-based recommendations seeded from user 15's liked titles.
# A single-title lookup is kept here for reference:
# print(recommend_movies("Devil in a Blue Dress (1995)"))
print(recommend_for_user(user_id=15))



                               title                                genre  \
0       Devil in a Blue Dress (1995)  Crime, Film-Noir, Mystery, Thriller   
1                       Laura (1944)            Crime, Film-Noir, Mystery   
2                   Chinatown (1974)         Film-Noir, Mystery, Thriller   
3                Time Tracers (1995)            Action, Adventure, Sci-Fi   
4                 Judge Dredd (1995)            Action, Adventure, Sci-Fi   
5      Star Trek: Generations (1994)            Action, Adventure, Sci-Fi   
6                    Only You (1994)                      Comedy, Romance   
7           Perez Family, The (1995)                      Comedy, Romance   
8  Pyromaniac's Love Story, A (1995)                      Comedy, Romance   

   avg_rating  rating_count  
0    3.385965            57  
1    4.100000            40  
2    4.136054           147  
3    1.500000             2  
4    2.897436            39  
5    3.336207           116  
6    3.153846      

In [36]:
# Demo: popularity-weighted recommendations for user 15.
print(recommend_for_user_global(user_id=15))

                                title  \
0          Princess Bride, The (1987)   
1      When Harry Met Sally... (1989)   
2                Groundhog Day (1993)   
3       Much Ado About Nothing (1993)   
4                    True Lies (1994)   
5                   Annie Hall (1977)   
6  Four Weddings and a Funeral (1994)   
7     Empire Strikes Back, The (1980)   
8      Philadelphia Story, The (1940)   
9         Sleepless in Seattle (1993)   

                                            genre  avg_rating  rating_count  
0              Action, Adventure, Comedy, Romance    4.172840           324  
1                                 Comedy, Romance    3.910345           290  
2                                 Comedy, Romance    3.764286           280  
3                                 Comedy, Romance    4.062500           176  
4              Action, Adventure, Comedy, Romance    3.562500           208  
5                                 Comedy, Romance    3.911111           180  
6 