In [3]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load the raw movie metadata table.
df = pd.read_csv('movies_metadata.csv', low_memory=False)

# Keep only the columns the recommender uses and drop rows missing either one.
# Resetting the index makes index labels equal to positional row numbers,
# which the similarity matrix below is addressed by.
movies = df[['title', 'overview']].dropna().reset_index(drop=True)

# Cap the corpus: the pairwise similarity matrix has one row and one column
# per movie. Copy so the column assignment below works on an independent
# frame instead of a slice of the larger one.
movies = movies.head(5000).copy()

movies['overview'] = movies['overview'].astype(str)

# Turn each overview into a TF-IDF vector, ignoring English stop words.
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(movies['overview'])

# cosine_sim[i][j] is the similarity between the overviews of rows i and j.
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Map title -> row position. De-duplicate on the *index* (the titles):
# Series.drop_duplicates() would compare the values, which are unique row
# numbers, so it would remove nothing and a repeated title would resolve to
# several rows. The first occurrence of each title wins.
indices = pd.Series(movies.index, index=movies['title'])
indices = indices[~indices.index.duplicated(keep='first')]

def recommend_movies(title, cosine_sim=cosine_sim, n=5):
    """Return the titles of the movies whose overviews best match *title*.

    Args:
        title: Exact movie title to look up in ``indices``.
        cosine_sim: Square pairwise similarity matrix, addressed by the row
            positions of ``movies``. Defaults to the module-level matrix.
        n: Maximum number of recommendations to return.

    Returns:
        A list of up to ``n`` titles ordered from most to least similar, or
        the string ``"Movie not found in dataset."`` when *title* is unknown.
    """
    if title not in indices:
        return "Movie not found in dataset."

    idx = indices[title]
    # A title that occurs more than once in the index yields a Series of row
    # positions instead of a scalar; fall back to the first occurrence.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    idx = int(idx)

    scores = cosine_sim[idx]

    # Exclude the queried movie by its row position rather than assuming it
    # sorts first: ties (identical overviews, all-zero vectors) can place
    # another row ahead of it. sorted() is stable, so equal scores keep
    # their original row order.
    ranked = sorted(
        (i for i in range(len(scores)) if i != idx),
        key=lambda i: scores[i],
        reverse=True,
    )

    # A negative n would otherwise slice from the wrong end of the ranking.
    top_indices = ranked[:max(n, 0)]

    return movies['title'].iloc[top_indices].tolist()

# Demo: show the recommender's output for a well-known title.
print("Top 5 movie recommendations for 'The Godfather':")
godfather_recommendations = recommend_movies('The Godfather')
print(godfather_recommendations)


Top 5 movie recommendations for 'The Godfather':
['The Godfather: Part II', 'The Godfather: Part III', 'Made', 'Soft Fruit', 'American Movie']
