In [4]:
import numpy as np
import pandas as pd
import ast
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pickle

# Read the two TMDB CSV exports from the working directory.
movies = pd.read_csv('tmdb_5000_movies.csv')
credits = pd.read_csv('tmdb_5000_credits.csv')

# Join both tables on their shared 'title' column, keep only the columns
# used to build the tag text, and discard rows with any missing value.
# NOTE(review): the join key is the title rather than an id, so two films
# sharing a title would be matched against each other - confirm this is
# acceptable for the dataset.
selected_columns = ['movie_id', 'title', 'overview', 'genres', 'keywords', 'cast', 'crew']
movies = pd.merge(movies, credits, on='title')
movies = movies[selected_columns]
movies.dropna(inplace=True)

# Helper functions to process data
def convert(text):
    """Return the 'name' value of every entry in a literal list of dicts.

    ``text`` is a string holding a Python literal (parsed with
    ``ast.literal_eval``) that evaluates to an iterable of mappings, each
    of which has a 'name' key.
    """
    return [entry['name'] for entry in ast.literal_eval(text)]

def convert3(text):
    """Return the 'name' value of at most the first three parsed entries.

    ``text`` is a string holding a Python literal (parsed with
    ``ast.literal_eval``) that evaluates to an iterable of mappings with a
    'name' key. Entries beyond the third are ignored.
    """
    # Pairing with range(3) caps the number of entries consumed at three.
    return [entry['name'] for _, entry in zip(range(3), ast.literal_eval(text))]

def fetch_director(text):
    """Return the names of all parsed entries whose 'job' is 'Director'.

    ``text`` is a string holding a Python literal (parsed with
    ``ast.literal_eval``) that evaluates to an iterable of mappings with
    'job' and 'name' keys.
    """
    return [
        member['name']
        for member in ast.literal_eval(text)
        if member['job'] == 'Director'
    ]

def collapse(L):
    """Return a new list with every space character removed from each string in ``L``."""
    # Splitting on a single space and re-joining drops every ' ' character.
    return ["".join(label.split(" ")) for label in L]

# Parse the literal-encoded string columns into lists of names.
movies['genres'] = movies['genres'].apply(convert)
movies['keywords'] = movies['keywords'].apply(convert)
# Use the dedicated helper to keep at most the first three cast entries
# instead of re-implementing the truncation inline.
movies['cast'] = movies['cast'].apply(convert3)
movies['crew'] = movies['crew'].apply(fetch_director)
# Break the overview text into whitespace-separated words.
movies['overview'] = movies['overview'].apply(str.split)

# Remove spaces inside each name so a multi-word name becomes one token.
for column in ('cast', 'crew', 'genres', 'keywords'):
    movies[column] = movies[column].apply(collapse)

# Concatenate the per-movie word/name lists into a single 'tags' list.
movies['tags'] = movies['overview'] + movies['genres'] + movies['keywords'] + movies['cast'] + movies['crew']

# Work on an explicit copy so the assignment below does not write into a
# view of `movies`, then flatten each tag list into one space-separated string.
new = movies[['movie_id', 'title', 'tags', 'genres']].copy()
new['tags'] = new['tags'].apply(" ".join)


# Vectorize tags (bag of words limited to 5000 features, English stop words
# removed) and compute the pairwise cosine similarity between all movies.
cv = CountVectorizer(max_features=5000, stop_words='english')
vector = cv.fit_transform(new['tags']).toarray()
similarity = cosine_similarity(vector)

# Print movie titles that belong to a requested genre
def recommend_by_genre(genre_input, num_recommendations=10):
    """Print titles from ``new`` whose genre list contains ``genre_input``.

    Args:
        genre_input: Genre name to look for. Spaces are ignored when
            matching, so 'Science Fiction' and 'ScienceFiction' are
            treated as the same genre.
        num_recommendations: Maximum number of titles to print; passed to
            ``head`` on the filtered rows, which keeps their existing order.

    Returns:
        None. Results (or a "no movies found" message) are printed.
    """
    # The genre names stored in `new` had their spaces removed during
    # preprocessing, so the raw input must be normalized the same way or a
    # multi-word genre could never match. Normalizing the stored side too
    # keeps the comparison correct whether or not the names were collapsed.
    wanted = genre_input.replace(" ", "")
    has_genre = new['genres'].apply(
        lambda genres: any(g.replace(" ", "") == wanted for g in genres)
    )
    genre_filtered = new[has_genre]

    if genre_filtered.empty:
        print(f"No movies found for genre: {genre_input}")
        return

    print(f"Top {num_recommendations} movies in the genre '{genre_input}':")
    # Only the title is needed, so iterate that column directly.
    for title in genre_filtered['title'].head(num_recommendations):
        print(title)

# Example: Get recommendations by genre
recommend_by_genre('Comedy', 10)

# Save the processed data and similarity matrix for later use.
# `with` guarantees each file is flushed and closed even if pickling fails.
with open('movie_list.pkl', 'wb') as movie_list_file:
    pickle.dump(new, movie_list_file)
with open('similarity.pkl', 'wb') as similarity_file:
    pickle.dump(similarity, similarity_file)


Top 10 movies in the genre 'Comedy':
Men in Black 3
Cars 2
Toy Story 3
Brave
Rush Hour 3
Up
Wild Wild West
Evan Almighty
Inside Out
Shrek Forever After
