# In [1]:
import numpy as np
import pandas as pd
import ast
import nltk
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Function to convert JSON-like string to a list of names
def convert(obj):
    """Return the 'name' value of every entry in a stringified list of dicts.

    ``obj`` is a string holding a Python literal (parsed with
    ``ast.literal_eval``) whose elements are each indexed with ``'name'``.
    """
    entries = ast.literal_eval(obj)
    return [entry['name'] for entry in entries]

# Function to convert JSON-like string to a list of the first three names
def convert3(obj):
    """Return the 'name' values of at most the first three parsed entries.

    ``obj`` is a string holding a Python literal (parsed with
    ``ast.literal_eval``). Entries after the third are never inspected.
    """
    names = []
    for position, entry in enumerate(ast.literal_eval(obj)):
        # Stop as soon as three names have been collected.
        if position == 3:
            break
        names.append(entry['name'])
    return names

# Function to fetch the director's name from JSON-like string
def fetch_director(obj):
    """Return a one-element list with the first director's name, or [].

    ``obj`` is a string holding a Python literal (parsed with
    ``ast.literal_eval``); entries are scanned in order and the first one
    whose ``'job'`` equals ``'Director'`` supplies the name.
    """
    for member in ast.literal_eval(obj):
        if member['job'] != 'Director':
            continue
        # Only the first matching entry is reported.
        return [member['name']]
    return []

# Initialize a single module-level PorterStemmer; stem() below reuses this
# instance for every word instead of constructing a new stemmer per call.
ps = PorterStemmer()

# Function to perform stemming on text
def stem(text):
    """Stem each whitespace-separated word of ``text`` with the shared stemmer.

    The stemmed words are re-joined with single spaces and returned as one
    string.
    """
    return " ".join(ps.stem(word) for word in text.split())

# Function to recommend movies based on a given movie
def recommend(movie):
    """Print the titles of the five movies most similar to ``movie``.

    Reads the module-level ``movies`` DataFrame (needs a ``'title'`` column)
    and the module-level ``similarity`` matrix, whose rows and columns are
    ordered like the rows of ``movies``.

    Parameters:
        movie: Title to look up; the first row with this exact title is used.

    Returns:
        None. The recommended titles are printed, most similar first.

    Raises:
        IndexError: If no row of ``movies`` has the requested title.
    """
    # ``similarity`` is indexed by row *position*, so locate the movie by
    # position rather than by index label; the two differ whenever the frame's
    # index is not a clean 0..n-1 range (e.g. after rows were dropped).
    matches = np.flatnonzero((movies['title'] == movie).to_numpy())
    if matches.size == 0:
        raise IndexError(f"movie {movie!r} not found in the movies dataset")
    movie_pos = int(matches[0])

    distances = similarity[movie_pos]
    ranked = sorted(enumerate(distances), key=lambda pair: pair[1], reverse=True)

    # Exclude the queried movie explicitly instead of assuming it sorts first;
    # another row with an identical score could otherwise displace it.
    top_positions = [pos for pos, _ in ranked if pos != movie_pos][:5]
    for pos in top_positions:
        print(movies.iloc[pos]['title'])
    return

# Load movies and credits datasets
movies = pd.read_csv('tmdb_5000_movies.csv')
credits = pd.read_csv('tmdb_5000_credits.csv')

# Merge movies and credits datasets on 'title'
movies = movies.merge(credits, on='title')

# Select relevant columns
movies = movies[['movie_id', 'title', 'overview', 'genres', 'keywords', 'cast', 'crew']]

# Drop rows with missing values. A new frame is built (rather than dropping
# in place on the column subset, which can trigger pandas' chained-assignment
# warning) and the index is renumbered 0..n-1 so that index labels equal row
# positions; the similarity matrix built later is ordered by row position.
movies = movies.dropna().reset_index(drop=True)

# Check for duplicates and missing values (the results are only displayed
# when run interactively; as plain statements they are discarded)
movies.duplicated().sum()
movies.isnull().sum()

# Parse the stringified list columns into plain Python lists of names
movies['genres'] = movies['genres'].apply(convert)
movies['keywords'] = movies['keywords'].apply(convert)
movies['cast'] = movies['cast'].apply(convert3)
movies['crew'] = movies['crew'].apply(fetch_director)

# Tokenize overview text
movies['overview'] = movies['overview'].apply(lambda x: str(x).split())

# Remove spaces from genre, keyword, cast, and crew names so that each
# multi-word name becomes a single token
movies['genres'] = movies['genres'].apply(lambda x: [i.replace(" ", "") for i in x])
movies['keywords'] = movies['keywords'].apply(lambda x: [i.replace(" ", "") for i in x])
movies['cast'] = movies['cast'].apply(lambda x: [i.replace(" ", "") for i in x])
movies['crew'] = movies['crew'].apply(lambda x: [i.replace(" ", "") for i in x])

# Create tags by combining overview, genres, keywords, cast, and crew
movies['tags'] = movies['overview'] + movies['genres'] + movies['keywords'] + movies['cast'] + movies['crew']

# Select final columns for the dataset. Take an explicit copy so the column
# assignments below write to an independent frame instead of a slice of the
# wider one (avoids pandas' SettingWithCopyWarning).
movies = movies[['movie_id', 'title', 'tags']].copy()

# Join the tag tokens into a single lowercase string
movies['tags'] = movies['tags'].apply(lambda x: " ".join(x).lower())

# Apply stemming to tags
movies['tags'] = movies['tags'].apply(stem)

# Display the first few rows of the final dataset (only shown interactively)
movies.head()

# Bag-of-words vectorizer: English stop words removed, vocabulary capped at
# 5000 features
cv = CountVectorizer(
    stop_words='english',
    max_features=5000,
)

# Learn the vocabulary from the tags and encode every row as a count vector
vectors = cv.fit_transform(movies['tags']).toarray()

# Pairwise cosine similarity between all tag vectors
similarity = cosine_similarity(vectors)

# Vocabulary terms kept by the vectorizer
feature_names = cv.get_feature_names_out()

# Example query: print recommendations for 'Avatar'
recommend('Avatar')