# In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import ast
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import TruncatedSVD
import pickle
import requests
import streamlit as st
import time
from nltk.stem.porter import PorterStemmer

# Read the movie metadata table and the credits table from disk.
movies = pd.read_csv('/Users/nandhinivijayakumar/Desktop/ADM/Project/tmdb_5000_movies.csv')
credits = pd.read_csv('/Users/nandhinivijayakumar/Desktop/ADM/Project/tmdb_5000_credits.csv')

# Join the two tables on their shared 'title' column, then keep only the
# columns that the rest of the pipeline reads.
# NOTE(review): if several movies share a title, this join pairs every
# matching row from one table with every matching row from the other --
# confirm whether titles are unique in these files.
movies = pd.merge(movies, credits, on='title')
movies = movies[[
    'movie_id', 'title', 'genres', 'original_language', 'release_date',
    'overview', 'runtime', 'keywords', 'cast', 'crew', 'vote_average',
]]

# Discard every row that has a missing value in any of the kept columns.
movies = movies.dropna()

# Helper: turn a stringified list of dicts into the list of their names
def convert(obj):
    """Return the 'name' value of every entry encoded in *obj*.

    *obj* is a string holding a Python-literal list of dicts (parsed with
    ``ast.literal_eval``); each dict must have a 'name' key.
    """
    entries = ast.literal_eval(obj)
    names = []
    for entry in entries:
        names.append(entry['name'])
    return names

# Replace the raw genre and keyword strings with plain lists of names.
for column in ('genres', 'keywords'):
    movies[column] = movies[column].apply(convert)

# Cast helper: keep only the first three entries of the encoded list
def convert3(obj):
    """Return the 'name' values of the first three entries encoded in *obj*.

    *obj* is a string holding a Python-literal list of dicts (parsed with
    ``ast.literal_eval``). Lists shorter than three yield fewer names.
    """
    parsed = ast.literal_eval(obj)
    leading = parsed[:3]
    return [member['name'] for member in leading]

# Replace each movie's raw cast string with the list returned by convert3.
movies['cast'] = movies['cast'].apply(convert3)

# Crew helper: pick out the director's name
def fetch_director(obj):
    """Return the name of the first crew entry whose job is 'Director'.

    *obj* is a string holding a Python-literal list of dicts, each with
    'job' and 'name' keys. Returns None when no entry is a director.
    """
    director_names = []
    for member in ast.literal_eval(obj):
        if member['job'] == 'Director':
            director_names.append(member['name'])
    if not director_names:
        return None
    return director_names[0]

# Replace the raw crew string with the director's name (None when absent).
movies['crew'] = movies['crew'].apply(fetch_director)

# Build one free-text "tags" string per movie: overview, genres, keywords,
# cast and director, separated by single spaces.
movies['tags'] = (
    movies['overview'].fillna('')
    + ' ' + movies['genres'].apply(" ".join)
    + ' ' + movies['keywords'].apply(" ".join)
    + ' ' + movies['cast'].apply(" ".join)
    + ' ' + movies['crew'].fillna('')
)

# Trim surrounding whitespace and lowercase each tag string, then drop the
# rows whose tag string ended up empty.
movies['tags'] = movies['tags'].apply(lambda tag_text: tag_text.strip().lower())
movies = movies[movies['tags'].ne('')]
# Stemming support: one shared Porter stemmer used by stem()
ps = PorterStemmer()
def stem(text):
    """Return *text* with every whitespace-separated word stemmed by ``ps``.

    The stemmed words are re-joined with single spaces.
    """
    stemmed_words = map(ps.stem, text.split())
    return " ".join(stemmed_words)

# Stem every word of every tag string.
movies['tags'] = movies['tags'].apply(stem)

# Defensive second pass: discard rows whose tags are missing or blank
# once stemmed.
movies = movies.dropna(subset=['tags'])
movies = movies[movies['tags'].str.strip().ne('')]

# The earlier dropna/filter steps leave gaps in the DataFrame index, while the
# matrices built below are addressed purely by row position. Renumber the rows
# 0..n-1 so that an index label and a similarity-matrix row always refer to
# the same movie, both here and in the pickled dictionary.
movies = movies.reset_index(drop=True)

# Vectorize the tags using TfidfVectorizer (at most 5000 terms, English stop
# words removed)
tfidf = TfidfVectorizer(max_features=5000, stop_words='english')
tfidf_matrix = tfidf.fit_transform(movies['tags']).toarray()

# Reduce the TF-IDF vectors to 100 components with truncated SVD
# (fixed random_state for reproducible output)
svd = TruncatedSVD(n_components=100, random_state=42)
svd_matrix = svd.fit_transform(tfidf_matrix)

# Pairwise cosine similarity between movies; row/column i corresponds to the
# i-th row of `movies`
similarity = cosine_similarity(svd_matrix)

# Save the movie table (as a dict) and the similarity matrix for later use.
# The `with` blocks make sure each file is flushed and closed once written.
with open('/Users/nandhinivijayakumar/Desktop/ADM/FinalProject_NandhiniVijayakumar/movie_dict.pkl', 'wb') as movie_dict_file:
    pickle.dump(movies.to_dict(), movie_dict_file)
with open('/Users/nandhinivijayakumar/Desktop/ADM/FinalProject_NandhiniVijayakumar/similarity.pkl', 'wb') as similarity_file:
    pickle.dump(similarity, similarity_file)

# Function to fetch movie posters from TMDb API
def fetch_poster(movie_id):
    """Return the TMDb poster image URL for *movie_id*.

    Requests the movie's details from the TMDb API and appends the
    'poster_path' from the JSON response to the w500 image base URL.
    When the response has no 'poster_path', or its value is null/empty,
    the bare base URL is returned.

    Raises:
        requests.exceptions.RequestException: on network failure, or when
            the API does not answer within the timeout.
    """
    # NOTE(review): the API key is hard-coded in source; consider loading it
    # from configuration or the environment instead of committing it.
    api_key = '020b311fe0559698373a16008dc6a672'
    url = f'https://api.themoviedb.org/3/movie/{movie_id}?api_key={api_key}&language=en-US'
    # Without a timeout, requests waits indefinitely on an unresponsive server.
    response = requests.get(url, timeout=10)
    data = response.json()
    # `or ''` also covers a key that is present but null; .get()'s default
    # only applies when the key is missing, and str + None raises TypeError.
    poster_path = data.get('poster_path') or ''
    return "https://image.tmdb.org/t/p/w500/" + poster_path

# Function to get movie recommendations
def recommend(movie):
    """Return the five movies most similar to *movie* and their poster URLs.

    Args:
        movie: Title to look up; it must equal a value in ``movies['title']``.
            When several rows share the title, the first one is used.

    Returns:
        A tuple ``(titles, poster_urls)`` of two parallel lists, ordered from
        most to least similar. The queried movie itself is never included.

    Raises:
        IndexError: if no row of ``movies`` has the given title.
    """
    # `similarity` is addressed by row *position*, so look up the position of
    # the title rather than its index label: the two differ whenever the
    # DataFrame index has gaps (e.g. after rows were dropped).
    titles = list(movies['title'])
    try:
        movie_pos = titles.index(movie)
    except ValueError:
        raise IndexError(f"movie title not found: {movie!r}") from None

    distances = similarity[movie_pos]
    ranked = sorted(enumerate(distances), key=lambda pair: pair[1], reverse=True)
    # Exclude the query by position instead of assuming it sorts first; a row
    # with an identical vector could otherwise push it out of slot 0.
    top_positions = [pos for pos, _ in ranked if pos != movie_pos][:5]

    recommended_movies = []
    recommended_movies_posters = []
    for pos in top_positions:
        row = movies.iloc[pos]
        recommended_movies.append(row['title'])
        recommended_movies_posters.append(fetch_poster(row['movie_id']))

    return recommended_movies, recommended_movies_posters

# Example usage
# recommendation, posters = recommend("The Dark Knight")
# print(recommendation, posters)
