In [19]:
import pandas as pd

# Load movie data
movies = pd.read_csv("movies.csv")
movies.head()


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [20]:
# Optional: Split title and year
movies['year'] = movies['title'].str.extract(r'\((\d{4})\)', expand=False)
movies['clean_title'] = movies['title'].str.replace(r'\(\d{4}\)', '', regex=True).str.strip()
movies.head()

Unnamed: 0,movieId,title,genres,year,clean_title
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995,Toy Story
1,2,Jumanji (1995),Adventure|Children|Fantasy,1995,Jumanji
2,3,Grumpier Old Men (1995),Comedy|Romance,1995,Grumpier Old Men
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,1995,Waiting to Exhale
4,5,Father of the Bride Part II (1995),Comedy,1995,Father of the Bride Part II


In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [22]:
# Replace "(no genres listed)" with empty string
movies['genres'] = movies['genres'].replace("(no genres listed)", "")

# Initialize TF-IDF Vectorizer
tfidf = TfidfVectorizer(stop_words='english')

# Fit and transform genres
tfidf_matrix = tfidf.fit_transform(movies['genres'])

# tfidf_matrix shape (number of movies, number of unique genre terms)
print("TF-IDF matrix shape:", tfidf_matrix.shape)

TF-IDF matrix shape: (9742, 21)


In [23]:
# Compute cosine similarity between all movies
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

In [24]:
# Reset index to map clean titles to indices
movies = movies.reset_index()

# Create a Series mapping movie titles to index
indices = pd.Series(movies.index, index=movies['clean_title'].str.lower())

def recommend_movies(title, num_recommendations=5):
    title = title.lower()
    
    if title not in indices:
        return "Movie not found in database."
    
    idx = indices[title]
    
    # Get pairwise similarity scores
    sim_scores = list(enumerate(cosine_sim[idx]))
    
    # Sort movies by similarity
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
    # Skip the first one (it's the movie itself)
    sim_scores = sim_scores[1:num_recommendations+1]
    
    # Get movie indices
    movie_indices = [i[0] for i in sim_scores]
    
    return movies[['title', 'genres']].iloc[movie_indices]


In [25]:
recommend_movies("Toy Story")

Unnamed: 0,title,genres
1706,Antz (1998),Adventure|Animation|Children|Comedy|Fantasy
2355,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy
2809,"Adventures of Rocky and Bullwinkle, The (2000)",Adventure|Animation|Children|Comedy|Fantasy
3000,"Emperor's New Groove, The (2000)",Adventure|Animation|Children|Comedy|Fantasy
3568,"Monsters, Inc. (2001)",Adventure|Animation|Children|Comedy|Fantasy
