In [70]:
# IMPORT LIBRARIES
import numpy as np
import pandas as pd


# 1. Dataset

* Use a small public dataset of items (e.g., a list of movies with plot summaries, or other textual descriptions).
* Make sure the dataset is easy to handle (maybe 100–500 rows) so the solution remains quick to implement and run.
* Include the dataset in your forked repository or provide instructions/link on how to download it.


In [71]:
# If the data is already loaded in a DataFrame `df`, you can save it locally with the following code:
# df.to_csv('DATA/top_100_movies.csv', index=False)

# Or alternatively, you can use this link to download the data:
# https://www.kaggle.com/datasets/shreyasur965/imdb-top-100-movies-dataset?select=top_100_movies.csv

In [72]:
# Once the CSV is in place, read it into a DataFrame
df = pd.read_csv('DATA/top_100_movies.csv')

# Confirm the dataset size is within the 100-500 row guideline
if 100 <= len(df) <= 500:
    print('Data has between 100-500 rows')

Data has between 100-500 rows


In [73]:
# Preview the first 5 rows of the dataframe (head() with no argument returns 5 rows)
df.head()

Unnamed: 0,rank,title,description,genre,rating,year
0,1,The Shawshank Redemption,Two imprisoned men bond over a number of years...,['Drama'],9.3,1994
1,2,The Godfather,The aging patriarch of an organized crime dyna...,"['Crime', 'Drama']",9.2,1972
2,3,The Dark Knight,When the menace known as the Joker wreaks havo...,"['Action', 'Crime', 'Drama']",9.0,2008
3,4,The Godfather Part II,The early life and career of Vito Corleone in ...,"['Crime', 'Drama']",9.0,1974
4,5,12 Angry Men,The jury in a New York City murder trial is fr...,"['Crime', 'Drama']",9.0,1957


In [74]:
# Sanity check: count the missing (NaN) values in each column
df.isnull().sum()


rank           0
title          0
description    0
genre          0
rating         0
year           0
dtype: int64

## I'd say the best approach is not to rely on the title, since it can be misleading (e.g., the movie 'IT'), which on its own tells the user nothing relevant. Descriptions and genres are 100% relevant, but what about years, and what about the movie's release date? All of these will be accounted for.

# 2. Approach

* Content-Based: At a minimum, use text similarity to recommend items.
For instance, you can transform both the user’s text input and each item’s description into TF-IDF vectors and compute cosine similarity.

* Return the top N similar items (e.g., top 5).

In [None]:
import re
# Uncomment the next line to install the embedding library into the kernel's environment:
# %pip install sentence-transformers
import os 
# Set before sentence_transformers is imported so the setting is already in place
# NOTE(review): the ordering looks intentional — confirm before moving this import.
os.environ["TOKENIZERS_PARALLELISM"] = "false" # disable parallelism to avoid deadlocks
from sentence_transformers import SentenceTransformer

# Load a BERT-based model for sentence embeddings.
# This module-level `model` is the object build_vectors() calls `.encode` on.
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

def build_vectors(user_input, df):
    """Embed the user's request and every movie, and detect numeric hints.

    Parameters
    ----------
    user_input : str
        Free-text request typed by the user.
    df : pandas.DataFrame
        Movie table; the 'description' and 'genre' columns are read.

    Returns
    -------
    tuple
        ``(movie_vectors, query_vector, rating_match, year_match)`` where
        ``movie_vectors`` is the description embeddings plus the genre
        embeddings, ``query_vector`` is the embedding of ``user_input``, and
        the two matches are ``re.Match`` objects (or ``None``) for a rating
        such as ``8`` / ``8.5`` and a year in 1900-2099 found in the text.
    """
    # Numeric hints: a lone digit or d.d rating, and a four-digit 19xx/20xx year.
    rating_hint = re.search(r'\b(\d\.\d|\d)\b', user_input)
    year_hint = re.search(r'\b(19|20)\d{2}\b', user_input)

    # Encode the query first, then the two text columns of the movie table.
    query_vector = model.encode(user_input)
    description_vectors = model.encode(df['description'].tolist())
    genre_vectors = model.encode(df['genre'].tolist())

    # Combine both views of each movie by adding the two embeddings together
    # (presumably element-wise on arrays of identical shape — verify against
    # the encoder's return type).
    movie_vectors = description_vectors + genre_vectors

    return movie_vectors, query_vector, rating_hint, year_hint

In [None]:
# YEAR THRESHOLD FUNCTION

def extract_year_info(user_input):
    """Extract a four-digit year and an optional modifier from the input.

    The first year in the range 1900-2099 found in ``user_input`` is used.
    If it is immediately preceded by "before", "after" or "during"
    (case-insensitive), that word is returned as the modifier.

    Parameters
    ----------
    user_input : str
        Free-text request typed by the user.

    Returns
    -------
    tuple
        ``(modifier, year)`` where ``modifier`` is ``'before'``, ``'after'``,
        ``'during'`` (lower-cased) or ``None``, and ``year`` is an ``int``.
        ``(None, None)`` when no year is present.
    """
    year_match = re.search(
        # Group 1: optional modifier. Group 2: the WHOLE year — the century
        # prefix is a non-capturing group so it is not returned on its own.
        # Word boundaries keep longer digit runs (e.g. "120195") from matching.
        r'(?:\b(before|after|during)\s+)?\b((?:19|20)\d{2})\b',
        user_input,
        re.IGNORECASE,
    )

    if year_match is None:
        return None, None

    modifier = year_match.group(1)
    if modifier is not None:
        # Matching is case-insensitive; normalise so callers can compare
        # against the lowercase keywords directly.
        modifier = modifier.lower()

    return modifier, int(year_match.group(2))

In [84]:
from sklearn.metrics.pairwise import cosine_similarity

def recommend_movies(user_input, df, n):
    """Return the ``n`` movies most similar to a free-text request.

    Movies are scored by cosine similarity between the embedding of
    ``user_input`` and each movie's embedding (both from ``build_vectors``).
    If the request contains a rating and/or a year, movies below that rating
    or released before that year are dropped before ranking.

    Parameters
    ----------
    user_input : str
        Free-text request typed by the user.
    df : pandas.DataFrame
        Movie table with 'title', 'genre', 'rating' and 'year' columns, plus
        whatever columns ``build_vectors`` reads. It is not modified.
    n : int
        Maximum number of recommendations; values below 0 are treated as 0.

    Returns
    -------
    pandas.DataFrame
        Up to ``n`` rows with columns title, genre, rating, year, similarity,
        ordered from most to least similar.
    """
    movie_embeddings, user_embedding, rating_match, year_match = build_vectors(user_input, df)

    # The single query vector is wrapped in a list to form a one-row batch;
    # row 0 of the result holds its similarity to every movie.
    similarities = cosine_similarity([user_embedding], movie_embeddings)[0]

    # Attach the scores to a new frame instead of writing a column into the
    # caller's DataFrame, so the input is left untouched.
    candidates = df.assign(similarity=similarities)

    # Apply rating filter (if specified): keep movies rated at or above it
    if rating_match:
        rating_threshold = float(rating_match.group())
        candidates = candidates[candidates['rating'] >= rating_threshold]

    # Apply year filter (if specified): keep movies from that year or later
    if year_match:
        year_threshold = int(year_match.group())
        candidates = candidates[candidates['year'] >= year_threshold]

    # Rank by similarity and keep the top n. A negative n would make head()
    # return "all but the last |n| rows", so clamp it to zero.
    top_movies = candidates.sort_values(by='similarity', ascending=False).head(max(n, 0))

    return top_movies[['title', 'genre', 'rating', 'year', 'similarity']]

# 3. Test it out!

In [None]:
# Example usage

try:  # in case a mistake is made in the input
    # An empty entry falls back to 5 recommendations
    n = int(input("Enter the number of recommendations you want: ").strip() or 5)
    if n < 1:
        # Zero or negative counts are not meaningful requests; handle them
        # the same way as any other invalid entry.
        raise ValueError("number of recommendations must be at least 1")
except ValueError:
    print("Invalid input. Using default value: 5")
    n = 5  # Default value

user_input = "I like marvel movies"
recommended_movies = recommend_movies(user_input, df, n)
recommended_movies  # displays a DataFrame with the top n recommendations

Invalid input. Using default value: 5


Unnamed: 0,title,genre,rating,year,similarity
66,Spider-Man: Into the Spider-Verse,"['Animation', 'Action', 'Adventure']",8.4,2018,0.374157
75,Avengers: Endgame,"['Action', 'Adventure', 'Drama']",8.4,2019,0.334232
62,Avengers: Infinity War,"['Action', 'Adventure', 'Sci-Fi']",8.4,2018,0.333136
90,Toy Story 3,"['Animation', 'Adventure', 'Comedy']",8.3,2010,0.317053
14,Star Wars: Episode V - The Empire Strikes Back,"['Action', 'Adventure', 'Fantasy']",8.7,1980,0.30308
