In [1]:
# 📦 FINAL IMDB FULL ANALYSIS NOTEBOOK 🔥

import pandas as pd
import ast
import matplotlib.pyplot as plt
import seaborn as sns
from google.colab import files

# ✅ Upload Master File
# Ask for the master CSV through Colab's upload widget; `uploaded` is a
# mapping whose keys are the uploaded file names.
print("Upload imdb_master_cleaned.csv file")
uploaded = files.upload()

# Fail with a clear message when nothing was uploaded, instead of the opaque
# IndexError that indexing an empty key list would raise.
if not uploaded:
    raise ValueError("No file was uploaded; expected imdb_master_cleaned.csv")

# Only the first uploaded file is used; any additional uploads are ignored.
master_file = next(iter(uploaded))
final_df = pd.read_csv(master_file)

# ✅ Clean Columns
# Coerce the money columns to numbers; unparseable values become NaN.
for money_col in ('budget', 'revenue'):
    final_df[money_col] = pd.to_numeric(final_df[money_col], errors='coerce')

# Parse release dates (invalid ones become NaT) and derive the release year.
release_dates = pd.to_datetime(final_df['release_date'], errors='coerce')
final_df['release_date'] = release_dates
final_df['year'] = release_dates.dt.year

# ✅ Parse Genres
def parse_genres(genres_json):
    """Return the genre names encoded in a stringified list of dicts.

    Args:
        genres_json: Text holding a Python-literal list of dicts, each with a
            ``'name'`` key, e.g. ``"[{'id': 18, 'name': 'Drama'}]"``.

    Returns:
        A list with the ``'name'`` value of every entry, or an empty list when
        the value is missing (e.g. NaN/None) or cannot be parsed into that
        shape.
    """
    try:
        genres = ast.literal_eval(genres_json)
        return [genre['name'] for genre in genres]
    except (ValueError, SyntaxError, TypeError, KeyError,
            MemoryError, RecursionError):
        # Only failures a malformed cell can cause mean "no genres"; a bare
        # ``except`` would also swallow KeyboardInterrupt and SystemExit.
        return []

# Attach the parsed genre names to every movie, then give each
# (movie, genre) pair its own row so genres can be counted individually.
final_df['genres_list'] = final_df['genres'].apply(parse_genres)
exploded_genres_df = final_df.explode(column='genres_list')

# Report the ten most frequent genres.
print("\nTop 10 Genres:")
genre_frequency = exploded_genres_df['genres_list'].value_counts()
genre_counts = genre_frequency.head(10)
print(genre_counts)

# ✅ Directors
# Keep only movies with a positive revenue and a known director.
has_revenue = final_df['revenue'] > 0
has_director = final_df['director'].notna()
valid_directors = final_df.loc[has_revenue & has_director]

# Per director: summed revenue, mean revenue and number of non-null titles.
director_stats = valid_directors.groupby('director').agg(
    total_revenue=pd.NamedAgg(column='revenue', aggfunc='sum'),
    avg_revenue=pd.NamedAgg(column='revenue', aggfunc='mean'),
    movie_count=pd.NamedAgg(column='title', aggfunc='count'),
)

# Rank by total revenue, highest first, and keep the top ten.
director_revenue = director_stats.sort_values(
    by='total_revenue', ascending=False
).head(10)

print("\nTop 10 Directors by Total Revenue:")
print(director_revenue)

# ✅ Actors
def _to_actor_list(cell):
    """Turn a stringified list such as "['A', 'B']" into a real list.

    Any other value (an actual list, NaN, text that is not a list/tuple
    literal) is returned unchanged.
    """
    if isinstance(cell, str):
        try:
            parsed = ast.literal_eval(cell)
        except (ValueError, SyntaxError, MemoryError, RecursionError):
            return cell
        if isinstance(parsed, (list, tuple)):
            return list(parsed)
    return cell


# NOTE(review): final_df is loaded with pd.read_csv, so 'actors_list' cells
# are presumably text like "['A', 'B']" rather than lists - confirm against
# the CSV. DataFrame.explode leaves non-list values as they are, which would
# group revenue by each movie's whole cast string instead of by actor, so
# list-literal strings are converted first. final_df itself is not modified.
exploded_actors_df = final_df.assign(
    actors_list=final_df['actors_list'].apply(_to_actor_list)
).explode('actors_list')

# Keep only (movie, actor) rows with a positive revenue and a known actor;
# empty cast lists explode to NaN and are dropped here.
valid_actors = exploded_actors_df[
    (exploded_actors_df['revenue'] > 0)
    & (exploded_actors_df['actors_list'].notnull())
]

# Per actor: summed revenue, mean revenue and number of non-null titles,
# ranked by total revenue with the top ten kept.
actor_revenue = valid_actors.groupby('actors_list').agg(
    total_revenue=('revenue', 'sum'),
    avg_revenue=('revenue', 'mean'),
    movie_count=('title', 'count')
).sort_values(by='total_revenue', ascending=False).head(10)

print("\nTop 10 Actors by Total Revenue:")
print(actor_revenue)

# ✅ Year-wise Movies
# Tally movies by release year, then order the tally chronologically.
year_tally = final_df['year'].value_counts()
movies_per_year = year_tally.sort_index()

print("\nMovies Released Per Year:")
print(movies_per_year)


Upload imdb_master_cleaned.csv file


Saving imdb_master_cleaned.csv to imdb_master_cleaned.csv

Top 10 Genres:
genres_list
Drama              20312
Comedy             13196
Thriller            7640
Romance             6746
Action              6607
Horror              4679
Crime               4314
Documentary         3937
Adventure           3508
Science Fiction     3061
Name: count, dtype: int64

Top 10 Directors by Total Revenue:
                   total_revenue   avg_revenue  movie_count
director                                                   
Steven Spielberg    9.256621e+09  3.085540e+08           30
Peter Jackson       6.528245e+09  5.934768e+08           11
Michael Bay         6.437467e+09  4.951898e+08           13
James Cameron       5.900610e+09  7.375763e+08            8
David Yates         5.334563e+09  8.890939e+08            6
Christopher Nolan   4.747409e+09  4.747409e+08           10
Robert Zemeckis     4.138234e+09  2.434255e+08           17
Tim Burton          4.032916e+09  2.240509e+08           18
Ri