In [383]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
import ast

import nltk
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\lucas\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\lucas\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [384]:
# import data with necessary columns
movies = pd.read_csv("data/tmdb_500_movies.csv")
credits = pd.read_csv("data/tmdb_500_credits.csv")[['movie_id', 'cast', 'crew']]
movies_credits = movies.merge(credits, left_on='id', right_on='movie_id', how='inner')[['genres', 'keywords','overview', 'original_title', 'tagline', 'title', 'cast']]

# make sure all the columns have consistent data type
cols_to_clean = ['genres', 'keywords', 'cast']
convert = lambda x: ", ".join([dct["name"] for dct in ast.literal_eval(x) if 'name' in dct.keys()])
for col in cols_to_clean:
    movies_credits[col] = movies_credits[col].apply(convert)

#check for nan values
print(movies_credits.isna().sum())

#fill nan values
movies_credits.fillna('', inplace=True)

movies_credits.head()

genres             0
keywords           0
overview           0
original_title     0
tagline           13
title              0
cast               0
dtype: int64


Unnamed: 0,genres,keywords,overview,original_title,tagline,title,cast
0,"Action, Adventure, Fantasy, Science Fiction","culture clash, future, space war, space colony...","In the 22nd century, a paraplegic Marine is di...",Avatar,Enter the World of Pandora.,Avatar,"Sam Worthington, Zoe Saldana, Sigourney Weaver..."
1,"Adventure, Fantasy, Action","ocean, drug abuse, exotic island, east india t...","Captain Barbossa, long believed to be dead, ha...",Pirates of the Caribbean: At World's End,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,"Johnny Depp, Orlando Bloom, Keira Knightley, S..."
2,"Action, Adventure, Crime","spy, based on novel, secret agent, sequel, mi6...",A cryptic message from Bond’s past sends him o...,Spectre,A Plan No One Escapes,Spectre,"Daniel Craig, Christoph Waltz, Léa Seydoux, Ra..."
3,"Action, Crime, Drama, Thriller","dc comics, crime fighter, terrorist, secret id...",Following the death of District Attorney Harve...,The Dark Knight Rises,The Legend Ends,The Dark Knight Rises,"Christian Bale, Michael Caine, Gary Oldman, An..."
4,"Action, Adventure, Science Fiction","based on novel, mars, medallion, space travel,...","John Carter is a war-weary, former military ca...",John Carter,"Lost in our world, found in another.",John Carter,"Taylor Kitsch, Lynn Collins, Samantha Morton, ..."


In [385]:
#combine columns for richer context
def enrichment(df):
    summary = df.apply(lambda x: ", ".join(x).lower(), axis=1)
    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))
    return summary.apply(lambda x: " ".join([lemmatizer.lemmatize(word) for word in re.sub(r'[^\w\s]', '', x).split() if word not in stop_words]))

#vectorize the summary
def vectorize(context):
    vectorizer = TfidfVectorizer(stop_words='english')
    tfidf_matrix = vectorizer.fit_transform(summary)
    return vectorizer, tfidf_matrix

#compute similarity
def compute_similarity(query, vectorizer, tfidf_matrix):
    query_vec = vectorizer.transform([" ".join([lemmatizer.lemmatize(word) for word in re.sub(r'[^\w\s]', '', query).split() if word not in stop_words])])
    return cosine_similarity(query_vec, tfidf_matrix).flatten()

#recommend top 5 movies
def recommend_movies(df): 
    summary = enrichment(df)
    query = input("What kind of movie would you like to watch?: ")
    vectorizer, tfidf_matrix = vectorize(summary)
    similarity = compute_similarity(query, vectorizer, tfidf_matrix)
    top_indices = similarity.argsort()[-5:][::-1]
    
    recommendations = []
    for idx in top_indices:
        recommendations.append({
            'title': df.iloc[idx]['title'],
            'similarity': similarity[idx]
        })

    return recommendations

In [386]:
# give movie recommendation
recommend_movies(movies_credits)

What kind of movie would you like to watch?:  zombies


[{'title': 'World War Z', 'similarity': 0.3644833263302745},
 {'title': 'Creepshow', 'similarity': 0.17011601646063298},
 {'title': 'Hotel Transylvania', 'similarity': 0.1461970825340342},
 {'title': 'Hotel Transylvania 2', 'similarity': 0.11629041208822206},
 {'title': 'Jack and Jill', 'similarity': 0.0}]

# Movie Recommendation System

This project implements a simple **content-based recommendation system** for movies. Given a short text description of a user's preferences, the system processes the input and returns the top 5 most similar movies based on enriched textual features from the dataset.

---

## Overview

The recommendation system leverages TF-IDF vectorization and cosine similarity to compare a user's query against a combined text summary of each movie. The enriched summary is created by merging multiple columns (such as genres, keywords, cast, and overview) into a single text field to provide richer context.

---

## Dataset

- **Movies Data:**  
  - `tmdb_500_movies.csv`: Contains details for 500 movies.
- **Credits Data:**  
  - `tmdb_500_credits.csv`: Contains information on cast and crew for 500 movies.
- **Source:**  
  - Both files are created by taking the first 500 rows from the larger datasets `tmdb_5000_movies.csv` and `tmdb_5000_credits.csv`, respectively, which can be downloaded from [Kaggle: TMDB Movie Metadata](https://www.kaggle.com/datasets/tmdb/tmdb-movie-metadata?select=tmdb_5000_movies.csv).
- **Preprocessing:**  
  - The datasets are merged on movie IDs.
  - Columns that store lists of dictionaries (e.g., genres, keywords, cast) are converted to a comma-separated string format.
  - Missing values are filled, and text is lowercased for consistency.

*The datasets are stored in the `data/` folder.ovided in the repository.*

---

## Approach

1. **Data Enrichment:**  
   - Relevant columns (genres, keywords, overview, title, tagline, cast) are combined into a single text summary for each movie.
   - The text is preprocessed using tokenization, lemmatization, and stopword removal (using NLTK) to enhance its quality.

2. **Vectorization:**  
   - The enriched summaries are transformed into TF-IDF vectors using scikit-learn's `TfidfVectorizer`, which converts the text into a numerical representation.

3. **Similarity Computation:**  
   - The user's query is similarly vectorized.
   - Cosine similarity is computed between the user's query vector and each movie's TF-IDF vector.

4. **Recommendation:**  
   - Movies are ranked based on similarity scores.
   - The top 5 movies with the higitory:**
   ```bash
   git clone <repository-url>
   cd <repository-directory>
