## Install Dependencies

In [None]:
!pip install kagglehub
!pip install spacy
!pip install scikit-learn
!pip install pandas
!pip install numpy

## Download Dataset

In [None]:
import kagglehub

# fetch the most recent version of the dataset; `path` is reused below when reading the CSV
path = kagglehub.dataset_download("utkarshx27/movies-dataset")

print(f"Path to dataset files: {path}")

Path to dataset files: /root/.cache/kagglehub/datasets/utkarshx27/movies-dataset/versions/1


## Loading Dataset

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# read the movie table from the downloaded dataset directory
df = pd.read_csv(f"{path}/movie_dataset.csv")

# show the available columns
df.columns

Index(['index', 'budget', 'genres', 'homepage', 'id', 'keywords',
       'original_language', 'original_title', 'overview', 'popularity',
       'production_companies', 'production_countries', 'release_date',
       'revenue', 'runtime', 'spoken_languages', 'status', 'tagline', 'title',
       'vote_average', 'vote_count', 'cast', 'crew', 'director'],
      dtype='object')

## Helper Functions

In [None]:
import spacy

# spaCy pipeline used by parse_query for noun chunks, dependency labels and entity labels.
# NOTE(review): the "en_core_web_sm" model has to be installed separately from the spacy package.
nlp = spacy.load("en_core_web_sm")

In [None]:
def preprocess_data(df):
  """Add the derived ``release_year`` and ``metadata`` columns to the movie table.

  Args:
    df: DataFrame with the columns ``release_date``, ``overview``, ``genres``,
      ``keywords``, ``tagline``, ``title``, ``cast`` and ``director``.

  Returns:
    A new DataFrame (the input is not modified) in which
    - ``release_year`` is the text before the first "-" of ``release_date``,
      or "" when the date is missing,
    - missing ``overview`` values are replaced by "",
    - ``metadata`` is the space-separated concatenation of genres, keywords,
      overview, tagline, title, cast and director (missing values count as "").
  """
  # work on a copy so the caller's frame is never changed behind its back
  df = df.copy()

  # missing dates can be NaN or None; blank them out instead of type-checking each value
  release_dates = df["release_date"].fillna("").astype(str)
  df["release_year"] = release_dates.str.split("-").str[0]

  df["overview"] = df["overview"].fillna("")

  text_columns = ["genres", "keywords", "overview", "tagline", "title", "cast", "director"]
  metadata = df[text_columns[0]].fillna("")
  for column in text_columns[1:]:
    metadata = metadata + " " + df[column].fillna("")
  df["metadata"] = metadata

  return df

In [None]:
# parse user query using dependency parsing

# words that mark the object of a preposition as excluded (e.g. "movies without Spielberg")
_NEGATION_PREPOSITIONS = ("without", "w/o", "no", "not", "excluding")
# words that negate the noun chunk they appear in (e.g. "no sci-fi")
_NEGATION_DETERMINERS = ("no", "not")
# lemmas dropped from an excluded genre phrase ("no action movies" -> "action")
_MOVIE_LEMMAS = ("movie", "film")


def _years_from_text(text):
  """Return the years named in `text`: "1994" -> [1994], "90s" / "1990s" -> 1990..1999."""
  years = []
  for word in text.split():
    word = word.strip("'’.,!?()")
    is_decade = word.endswith("s")
    digits = word[:-1] if is_decade else word
    if not (digits.isascii() and digits.isdigit()):
      continue
    if is_decade and len(digits) == 2:
      # two-digit decade: "00s"/"10s"/"20s" are read as 20xx, everything else as 19xx
      start = int(digits) + (2000 if int(digits) < 30 else 1900)
      years.extend(range(start, start + 10))
    elif is_decade and len(digits) == 4:
      years.extend(range(int(digits), int(digits) + 10))
    elif not is_decade and len(digits) == 4:
      years.append(int(digits))
  return years


def _phrase_without(tokens, drop_movie_words=False):
  """Rebuild the text of `tokens` without negation words (and optionally without "movie" words)."""
  kept = [
    token for token in tokens
    if token.text not in _NEGATION_PREPOSITIONS
    and not (drop_movie_words and token.lemma_ in _MOVIE_LEMMAS)
  ]
  return "".join(token.text_with_ws for token in kept).strip()


def parse_query(query):
  """Turn a free-text query into include/exclude filters.

  Args:
    query: the user's request, e.g. "action movies without tom hanks".

  Returns:
    A dict ``{"include": {...}, "exclude": {...}}``; each side maps ``genres``,
    ``cast`` and ``director`` to lists of strings and ``release_year`` to a list
    of ints. Terms from a negated noun chunk only ever land on the exclude side.
  """
  # NOTE(review): lowercasing probably weakens PERSON detection in the spaCy model — confirm.
  doc = nlp(query.lower())
  filters = {
    "include": {
        "genres": [],
        "cast": [],
        "director": [],
        "release_year": []
      },
    "exclude": {
        "genres": [],
        "cast": [],
        "director": [],
        "release_year": []
      }
  }

  # positions of all tokens inside negated chunks; they must not be re-added as positive terms
  negated_positions = set()

  # extract noun chunks and their dependencies
  for chunk in doc.noun_chunks:
    root = chunk.root
    is_negated = (
      # case 1: object of a negated verb
      (root.dep_ == "dobj" and root.head.dep_ == "neg")
      # case 2: prepositional phrase (e.g., "movies w/o Tom Hanks")
      or (root.dep_ == "pobj" and root.head.text in _NEGATION_PREPOSITIONS)
      # case 3: determiner "no" inside the chunk (e.g., "no sci-fi")
      or any(token.text in _NEGATION_DETERMINERS for token in chunk)
    )
    if not is_negated:
      continue

    negated_positions.update(token.i for token in chunk)
    labels = {ent.label_ for ent in chunk.ents}
    years = _years_from_text(chunk.text)

    # classify the excluded term into a category
    if "PERSON" in labels:
      # a person may be an actor or a director, so exclude from both
      person = _phrase_without(chunk)
      if person:
        filters["exclude"]["cast"].append(person)
        filters["exclude"]["director"].append(person)
    elif "DATE" in labels and years:
      # handle years and decades (e.g., "no 90s movies"); checked before genres
      # because such a chunk also has "movies" as its head
      filters["exclude"]["release_year"].extend(years)
    elif "movies" in root.head.text:
      genre = _phrase_without(chunk, drop_movie_words=True)
      if genre:
        filters["exclude"]["genres"].append(genre)

  # extract positive terms (non-negated)
  for token in doc:
    if token.i in negated_positions:
      continue
    if token.ent_type_ == "PERSON":
      filters["include"]["cast"].append(token.text)
      continue
    # only DATE tokens that actually spell a year/decade count ("last", "year" do not)
    years = _years_from_text(token.text) if token.ent_type_ == "DATE" else []
    if years:
      filters["include"]["release_year"].extend(years)
    elif token.dep_ == "compound" and token.head.lemma_ == "movie":
      filters["include"]["genres"].append(token.text)

  return filters

In [None]:
# filter movies based on parsed query
def _term_mask(frame, category, terms):
  """Return a boolean Series marking the rows of `frame` whose `category` matches any term."""
  if category == "release_year":
    # the column may hold year strings ("1994", "") while the terms are ints: compare numerically
    years = pd.to_numeric(frame[category], errors="coerce")
    return years.isin(list(terms))

  mask = pd.Series(False, index=frame.index)
  for term in terms:
    # literal, case-insensitive substring match; regex=False keeps characters such as "(" or "+" harmless
    mask = mask | frame[category].str.contains(term, case=False, na=False, regex=False)
  return mask


def filter_movies(df, filters):
  """Keep the rows of `df` that satisfy the include filters and none of the exclude filters.

  Args:
    df: movie DataFrame with ``genres``, ``cast``, ``director`` and ``release_year`` columns.
    filters: dict with ``"include"`` and ``"exclude"`` entries, each mapping a column name
      to a list of terms (ints for ``release_year``, strings otherwise).

  Returns:
    A filtered copy of `df`. Within one category the terms are alternatives (any match counts);
    different categories are applied one after another. Empty term lists are ignored.
  """
  filtered_df = df.copy()

  # include filters keep matching rows, exclude filters drop them
  for mode, keep_matches in (("include", True), ("exclude", False)):
    for category, terms in filters.get(mode, {}).items():
      if category != "release_year":
        # a blank term would match every row, so drop it
        terms = [text for text in (str(term).strip() for term in terms) if text]
      if not terms:
        continue
      mask = _term_mask(filtered_df, category, terms)
      filtered_df = filtered_df[mask if keep_matches else ~mask]

  return filtered_df

In [None]:
def recommend_movies(query, top_n=5):
  """Recommend the movies that best match a free-text query.

  The query is parsed into include/exclude filters, the global ``df`` is filtered with them,
  and the remaining movies are ranked by cosine similarity between the query's TF-IDF vector
  and the rows of the global ``tfidf_matrix``.

  Args:
    query: free-text request, e.g. "horror movies with zombie".
    top_n: maximum number of recommendations to return.

  Returns:
    A DataFrame with the columns title, genres, cast, overview, release_date and similarity
    (formatted as a percentage string), best match first; or an apology string when no movie
    passes the filters.
  """
  filters = parse_query(query)
  filtered_df = filter_movies(df, filters)

  if filtered_df.empty:
    return f"Sorry, we can't find movies for {query}"

  # rows of tfidf_matrix are addressed by position, so translate the surviving index labels
  # into positions within df (assumes tfidf_matrix was fitted on df's rows in order — TODO confirm)
  row_positions = df.index.get_indexer(filtered_df.index)

  # compute similarity with TF-IDF
  query_vec = tfidf.transform([query])
  cosine_sim = cosine_similarity(query_vec, tfidf_matrix[row_positions]).flatten()

  # assign() builds new frames instead of writing into a filtered slice
  recommendations = (
    filtered_df.assign(similarity=cosine_sim)
    .sort_values(by="similarity", ascending=False)
    .head(top_n)
  )
  recommendations = recommendations.assign(
    similarity=recommendations["similarity"].apply(lambda x: f"{int(x * 10000) / 100}%")
  )

  return recommendations[["title", "genres", "cast", "overview", "release_date", "similarity"]]

## TF-IDF Generation

In [None]:
# add the derived release_year and metadata columns produced by preprocess_data
df = preprocess_data(df)

# initialize TF-IDF (English stop words are ignored) and fit it on the combined metadata text;
# recommend_movies reads both `tfidf` and `tfidf_matrix` as globals
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(df["metadata"])

In [None]:
# sanity check: dimensions of the fitted TF-IDF matrix
tfidf_matrix.shape

(4803, 31064)

## Result

In [None]:
# recommend_movies filters the catalogue with the parsed query and ranks what is left by TF-IDF similarity
your_query = "horror movies with zombie" # modify this string and re-run the cell to get movie recommendations
res = recommend_movies(your_query)
res

Unnamed: 0,title,genres,cast,overview,release_date,similarity
880,Grindhouse,Thriller Action Horror,Kurt Russell Zo\u00eb Bell Rosario Dawson Vane...,Two full length feature horror movies written ...,2007-04-06,33.21%
3147,Re-Kill,Horror Science Fiction,Roger Cross Scott Adkins Daniella Alonso Bruce...,"Five years after a zombie outbreak, the men an...",2015-10-16,30.73%
1567,Warm Bodies,Horror Comedy Romance,Nicholas Hoult Teresa Palmer Analeigh Tipton R...,After a zombie becomes involved with the girlf...,2013-01-31,29.14%
3737,Night of the Living Dead,Horror,Duane Jones Judith O'Dea Karl Hardman Marilyn ...,A group of people try to survive an attack of ...,1968-10-01,27.99%
1737,Pride and Prejudice and Zombies,Romance Horror Comedy Thriller,Lily James Sam Riley Jack Huston Bella Heathco...,A zombie outbreak has fallen upon the land in ...,2016-02-04,25.45%


In [None]:
# record the version of the `python` executable found on the shell PATH
!python --version

Python 3.11.11
