In [15]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from ast import literal_eval
import zipfile
from nltk.stem.snowball import SnowballStemmer

# Input files, resolved relative to the current working directory.
keywords_zip_path = "keywords.csv.zip"
movies_metadata_zip_path = "movies_metadata.csv.zip"
credits_zip_path = "credits.csv.zip"
links_csv_path = "links.csv"

# Each archive is read through its first member; the inner file handle is
# closed explicitly instead of relying on the enclosing ZipFile to do it.
try:
    with zipfile.ZipFile(keywords_zip_path, 'r') as zip_ref:
        with zip_ref.open(zip_ref.namelist()[0]) as csv_file:
            keywords_df = pd.read_csv(csv_file)

    with zipfile.ZipFile(movies_metadata_zip_path, 'r') as zip_ref:
        with zip_ref.open(zip_ref.namelist()[0]) as csv_file:
            movies_metadata_df = pd.read_csv(csv_file, low_memory=False)

    with zipfile.ZipFile(credits_zip_path, 'r') as zip_ref:
        with zip_ref.open(zip_ref.namelist()[0]) as csv_file:
            credits_df = pd.read_csv(csv_file)

    links_df = pd.read_csv(links_csv_path)

except FileNotFoundError as e:
    print("Error: One or more required CSV/ZIP files not found. Please ensure they are in the correct directory.")
    print(f"Missing file: {e.filename}")
    # `exit()` is a helper injected by the `site` module for interactive use;
    # raising SystemExit works everywhere and reports a failing status.
    raise SystemExit(1)

# Coerce the metadata ids to integers, discarding rows whose id is not numeric.
movies_metadata_df['id'] = pd.to_numeric(movies_metadata_df['id'], errors='coerce')
movies_metadata_df.dropna(subset=['id'], inplace=True)
movies_metadata_df['id'] = movies_metadata_df['id'].astype(int)

# Same normalisation for the TMDB ids in the links table.
links_df['tmdbId'] = pd.to_numeric(links_df['tmdbId'], errors='coerce')
links_df.dropna(subset=['tmdbId'], inplace=True)
links_df['tmdbId'] = links_df['tmdbId'].astype(int)

# Restrict every table to ids present in links.csv. Repeated ids are collapsed
# to their first row: a repeated key on either side of a left merge multiplies
# the matching rows, which would put the same film in the dataset several times.
valid_ids = set(links_df['tmdbId'])
movies_metadata_filtered = movies_metadata_df[movies_metadata_df['id'].isin(valid_ids)].drop_duplicates(subset='id')
credits_filtered = credits_df[credits_df['id'].isin(valid_ids)].drop_duplicates(subset='id')
keywords_filtered = keywords_df[keywords_df['id'].isin(valid_ids)].drop_duplicates(subset='id')

# Left merges keep every metadata row even when credits/keywords are missing.
master_dataset = movies_metadata_filtered.merge(credits_filtered, on='id', how='left')
master_dataset = master_dataset.merge(keywords_filtered, on='id', how='left')

def safe_literal_eval(val):
    """Parse a stringified Python literal, falling back to an empty list.

    Args:
        val: The text to parse (for example ``"[{'id': 1, 'name': 'x'}]"``).

    Returns:
        Whatever ``ast.literal_eval`` produces for ``val``, or ``[]`` when
        the input cannot be evaluated as a literal.
    """
    try:
        return literal_eval(val)
    # TypeError is raised for literals that parse but cannot be built, such as
    # a dict with an unhashable key; treat it like any other malformed cell.
    except (ValueError, SyntaxError, TypeError):
        return []

# Turn the stringified list columns into real Python objects; missing cells
# are treated as the text '[]' so they parse to an empty list.
list_like_columns = ('cast', 'crew', 'keywords', 'genres')
for column_name in list_like_columns:
    filled_column = master_dataset[column_name].fillna('[]')
    master_dataset[column_name] = filled_column.apply(safe_literal_eval)

def get_director(x):
    """Return the name of the first crew entry whose job is 'Director'.

    Args:
        x: A list (or tuple) of crew entries; entries that are not dicts are
            skipped.

    Returns:
        The ``'name'`` value of the first matching entry (``None`` if that
        entry has no name), or ``np.nan`` when there is no director or ``x``
        is not a list/tuple.
    """
    # Mirror the isinstance guard used by the other column extractors in this
    # file so a stray None/NaN/scalar cell yields NaN instead of a TypeError.
    if not isinstance(x, (list, tuple)):
        return np.nan
    for member in x:
        if isinstance(member, dict) and member.get('job') == 'Director':
            return member.get('name')
    return np.nan

# Reduce the parsed cast to the names of (at most) its first three entries and
# the parsed keywords to their names; anything that is not a list becomes [].
master_dataset['cast'] = master_dataset['cast'].apply(
    lambda x: [i.get('name') for i in x[:3] if isinstance(i, dict) and i.get('name')] if isinstance(x, list) else []
)
master_dataset['keywords'] = master_dataset['keywords'].apply(
    lambda x: [i.get('name') for i in x if isinstance(i, dict) and i.get('name')] if isinstance(x, list) else []
)
master_dataset['director'] = master_dataset['crew'].apply(get_director)

# Lower-case names and strip spaces so each person becomes a single token.
master_dataset['cast'] = master_dataset['cast'].apply(lambda x: [s.lower().replace(" ", "") for s in x])
# Keep the readable director name for display before tokenising it.
master_dataset['main_director'] = master_dataset['director']
# The director token is repeated three times in the soup (presumably to give
# it more weight than a single cast member). Missing directors become the
# token 'nan' here; those rows are removed further down via main_director.
master_dataset['director'] = master_dataset['director'].astype(str).apply(lambda x: [x.lower().replace(" ", "")] * 3)

# Stem each keyword and collapse it to one token; one-character keywords are dropped.
stemmer = SnowballStemmer('english')
master_dataset['keywords'] = master_dataset['keywords'].apply(
    lambda x: [stemmer.stem(s).lower().replace(" ", "") for s in x if len(s) > 1]
)

master_dataset['genres'] = master_dataset['genres'].apply(
    lambda x: [g.get('name') for g in x if isinstance(g, dict) and g.get('name')] if isinstance(x, list) else []
)

# Defensive normalisation: every feature cell must be a list before concatenation.
for col in ['keywords', 'cast', 'director', 'genres']:
    master_dataset[col] = master_dataset[col].apply(lambda x: x if isinstance(x, list) else [])

# The "soup" is the space-joined bag of all feature tokens for a film.
master_dataset['soup'] = master_dataset['keywords'] + master_dataset['cast'] + master_dataset['director'] + master_dataset['genres']
master_dataset['soup'] = master_dataset['soup'].apply(lambda x: ' '.join(x))

columns_to_keep = ['title', 'main_director', 'release_date', 'soup', 'popularity']
master_dataset_final = master_dataset[columns_to_keep].copy()

# Rows without a numeric popularity cannot be ranked, so they are removed.
master_dataset_final['popularity'] = pd.to_numeric(master_dataset_final['popularity'], errors='coerce')
master_dataset_final.dropna(subset=['popularity'], inplace=True)

# Require a usable director name (a string longer than one character).
master_dataset_final['main_director'] = master_dataset_final['main_director'].apply(
    lambda x: x if isinstance(x, str) and len(x) > 1 else np.nan
)
master_dataset_final.dropna(subset=['main_director'], inplace=True)

# Most popular films first; popularity is only needed for this ordering.
master_dataset_final.sort_values(by='popularity', ascending=False, inplace=True)
master_dataset_final.drop('popularity', axis=1, inplace=True)

# Require a usable release date (a string longer than one character).
master_dataset_final['release_date'] = master_dataset_final['release_date'].apply(
    lambda x: x if isinstance(x, str) and len(x) > 1 else np.nan
)
master_dataset_final.dropna(subset=['release_date'], inplace=True)

# Renumber only after ALL filtering and truncation. The similarity matrix is
# addressed by row position, so the index labels stored in `indices` must be
# exactly 0..n-1; resetting before the release_date filter would leave gaps.
master_dataset_final = master_dataset_final.head(2500).reset_index(drop=True)

print(f"Shape of the final dataset for recommendations: {master_dataset_final.shape}")
print(master_dataset_final.head())

# Unigrams and bigrams over the soup; terms seen in fewer than 5 films are ignored.
count_vectorizer = CountVectorizer(analyzer='word', ngram_range=(1, 2), min_df=5)

count_matrix = count_vectorizer.fit_transform(master_dataset_final['soup'])

print(f"\nShape of the Count Matrix: {count_matrix.shape}")

print("\nComputing Cosine Similarity Matrix...")
cosine_sim = cosine_similarity(count_matrix, count_matrix)
print("Cosine Similarity Matrix Computed.")

np.savetxt('cosine_similarity_matrix.csv', cosine_sim, delimiter=',')
print("Cosine Similarity Matrix saved to 'cosine_similarity_matrix.csv'")

# Title -> row position lookup. Series.drop_duplicates() would compare the
# (already unique) positions, not the titles, so repeated titles are removed
# on the index instead; the first occurrence is the most popular film.
indices = pd.Series(master_dataset_final.index, index=master_dataset_final['title'])
indices = indices[~indices.index.duplicated(keep='first')]

def get_recommendations(movie_title, cosine_sim_matrix, df, num_recommendations=10):
    """Return the films most similar to ``movie_title``.

    The query film is located by row *position* in ``df`` so that it lines up
    with ``cosine_sim_matrix`` (indexed positionally) regardless of the index
    labels ``df`` carries. If the title occurs more than once, the first
    occurrence is used.

    Args:
        movie_title: Exact title to look up in ``df['title']``.
        cosine_sim_matrix: Square similarity matrix whose row/column ``i``
            corresponds to row position ``i`` of ``df``.
        df: DataFrame with ``'title'``, ``'main_director'`` and
            ``'release_date'`` columns.
        num_recommendations: Maximum number of results to return.

    Returns:
        A list of dicts with keys ``'title'``, ``'director'`` and
        ``'release_date'``, ordered from most to least similar. An empty list
        (after printing a message) when the title is not present.
    """
    idx = next(
        (pos for pos, title in enumerate(df['title']) if title == movie_title),
        None,
    )
    if idx is None:
        print(f"Movie '{movie_title}' not found in the dataset.")
        return []

    # Exclude the query film explicitly rather than assuming it sorts first:
    # another film with an equally high score could otherwise displace it and
    # the query itself would be returned as a recommendation.
    ranked = sorted(
        ((pos, score) for pos, score in enumerate(cosine_sim_matrix[idx]) if pos != idx),
        key=lambda pair: pair[1],
        reverse=True,
    )
    top_positions = [pos for pos, _ in ranked[:max(num_recommendations, 0)]]

    recommendations = []
    for pos in top_positions:
        movie_info = df.iloc[pos]
        recommendations.append({
            'title': movie_info['title'],
            'director': movie_info['main_director'],
            'release_date': movie_info['release_date'],
        })
    return recommendations

print("\n--- Movie Recommendations ---")


def _print_recommendations(query_title, results):
    """Print one formatted entry per recommendation; print nothing if empty."""
    if not results:
        return
    print(f"\nRecommended movies for \"{query_title}\":")
    for entry in results:
        print(f"- Title: {entry['title']}")
        print(f"  Director: {entry['director']}")
        print(f"  Release Date: {entry['release_date']}")
        print()


# First demo query.
movie_to_recommend = "Minions"
recommended_movies = get_recommendations(movie_to_recommend, cosine_sim, master_dataset_final)
_print_recommendations(movie_to_recommend, recommended_movies)

# Second demo query.
movie_to_recommend_avatar = "Avatar"
recommended_movies_avatar = get_recommendations(movie_to_recommend_avatar, cosine_sim, master_dataset_final)
_print_recommendations(movie_to_recommend_avatar, recommended_movies_avatar)


Shape of the final dataset for recommendations: (2500, 4)
                  title   main_director release_date  \
0               Minions      Kyle Balda   2015-06-17   
1          Wonder Woman   Patty Jenkins   2017-05-30   
2  Beauty and the Beast     Bill Condon   2017-03-16   
3           Baby Driver    Edgar Wright   2017-06-28   
4            Big Hero 6  Chris Williams   2014-10-24   

                                                soup  
0  assist aftercreditssting duringcreditssting ev...  
1  dccomic hero greekmytholog island worldwari su...  
2  franc magic castl fairytal music curs anthropo...  
3  robberi atlanta music crimeboss romanc tinnitu...  
4  brotherbrotherrelationship hero talent reveng ...  

Shape of the Count Matrix: (2500, 1916)

Computing Cosine Similarity Matrix...
Cosine Similarity Matrix Computed.
Cosine Similarity Matrix saved to 'cosine_similarity_matrix.csv'

--- Movie Recommendations ---

Recommended movies for "Minions":
- Title: Penguins of Madagasc