In [1]:

import ast

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Read the movie metadata from disk
df = pd.read_csv('data/tmdb_5000_movies.csv')


def _extract_names(raw):
    """Parse a stringified list of dicts and return each entry's 'name' value."""
    return [entry['name'] for entry in ast.literal_eval(raw)]


# 'genres' and 'keywords' arrive as strings; replace each with a list of names
df['genres'] = df['genres'].map(_extract_names)
df['keywords'] = df['keywords'].map(_extract_names)

# One text document per movie: its keywords followed by its genres, space-separated
df['combined_features'] = (
    df['keywords'].map(" ".join).str.cat(df['genres'].map(" ".join), sep=" ")
)

# Vectorize the documents with TF-IDF (unigrams + bigrams, capped vocabulary)
tfidf = TfidfVectorizer(
    stop_words='english',
    max_features=5000,
    ngram_range=(1, 2),
)
tfidf_matrix = tfidf.fit_transform(df['combined_features'])

# Pairwise cosine similarity between every pair of movies (row i <-> row i of df)
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Recommendation function
def recommend(title, cosine_sim=cosine_sim, top_n=10):
    """Return the titles of the movies most similar to ``title``.

    Parameters
    ----------
    title : str
        Exact title to look up in ``df['title']``. If several rows share the
        title, the first one is used.
    cosine_sim : array-like, optional
        Pairwise similarity matrix in which row ``i`` holds the similarities
        of the ``i``-th row of ``df`` to every row. Defaults to the matrix
        that existed when this function was defined.
    top_n : int, default 10
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.Series or str
        Titles ordered from most to least similar, or a message string when
        ``title`` is not present in the dataset.
    """
    title_matches = (df['title'] == title).to_numpy()
    if not title_matches.any():
        return f"'{title}' not found in the dataset."

    # Positional row number of the first match. cosine_sim is indexed by
    # position, so this stays correct even if df's index labels are not 0..n-1.
    idx = int(title_matches.argmax())

    ranked = sorted(enumerate(cosine_sim[idx]), key=lambda pair: pair[1], reverse=True)

    # Exclude the query movie explicitly rather than assuming it sorts first:
    # a movie with identical features (or an all-zero feature vector for the
    # query) can otherwise push the query itself into its own recommendations.
    movie_indices = [pos for pos, _ in ranked if pos != idx][:top_n]
    return df['title'].iloc[movie_indices]

# Example usage: look up neighbours for a single title and show them
example_title = 'The Dark Knight Rises'
print(f"Recommended for '{example_title}':")
print(recommend(example_title))


Recommended for 'The Dark Knight Rises':
65           The Dark Knight
119            Batman Begins
428           Batman Returns
210           Batman & Robin
1359                  Batman
2793    The Killer Inside Me
1010              Panic Room
3819                Defendor
299           Batman Forever
813                 Superman
Name: title, dtype: object


In [3]:

# User profile-based recommendation
def personalized_recommend(liked_titles, top_n=10):
    """Recommend movies similar to the average of a user's liked movies.

    Parameters
    ----------
    liked_titles : str or list-like of str
        Titles the user likes. Titles missing from ``df['title']`` are
        ignored. A single string is treated as a one-element list.
    top_n : int, default 10
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.Series or str
        Titles ordered from most to least similar to the user profile
        (liked movies excluded), or a message string when none of the liked
        titles are present in the dataset.
    """
    if isinstance(liked_titles, str):
        # Series.isin rejects a bare string; accept it as a single title.
        liked_titles = [liked_titles]

    # Boolean mask by row position, so it lines up with tfidf_matrix rows
    # regardless of df's index labels.
    liked_mask = df['title'].isin(liked_titles).to_numpy()
    if not liked_mask.any():
        return "No liked titles found in the dataset."

    liked_positions = np.flatnonzero(liked_mask)

    # Averaging rows of a scipy sparse matrix yields np.matrix, which recent
    # scikit-learn versions refuse as input; convert to a plain (1, n_features)
    # ndarray before computing similarities.
    user_profile = np.asarray(tfidf_matrix[liked_positions].mean(axis=0)).reshape(1, -1)
    similarity_scores = cosine_similarity(user_profile, tfidf_matrix).ravel()

    # Highest similarity first, then drop the movies the user already likes.
    ranked = similarity_scores.argsort()[::-1]
    recommended_indices = ranked[~liked_mask[ranked]][:top_n]
    return df.iloc[recommended_indices]['title']
