In [8]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import os

# --- Setup ---
# Apply seaborn's "whitegrid" theme and a fixed DPI to the figures created below.
sns.set_theme(style="whitegrid")
plt.rcParams['figure.dpi'] = 100

# Create the results directory. exist_ok=True replaces the separate
# exists() check, so there is no window between checking and creating, and
# re-running this cell is a no-op.
os.makedirs('results', exist_ok=True)

# Columns the analysis reads once 'Type' has been renamed to 'Genre'.
_REQUIRED_COLUMNS = ('Title', 'Category', 'Genre', 'Country', 'Release_Date')


def _explode_list_column(frame, column):
    """Return a copy of *frame* with one row per comma-separated entry of *column*."""
    exploded = frame.copy()
    exploded[column] = exploded[column].str.split(',')
    exploded = exploded.explode(column)
    exploded[column] = exploded[column].str.strip()
    # Stray separators (", France" or "Dramas,") produce empty entries that
    # would otherwise be counted as a value of their own. Missing values
    # compare unequal to '' and are dropped later by groupby.
    return exploded[exploded[column] != '']


def _yearly_counts(frame, column, min_year):
    """Count rows per (Release_Year, *column*) as a year-by-value table from *min_year* on."""
    if frame.empty:
        return pd.DataFrame()
    table = frame.groupby(['Release_Year', column]).size().unstack(fill_value=0)
    return table[table.index >= min_year]


def _save_figure(fig, output_dir, filename):
    """Apply a tight layout to *fig*, write it to *output_dir*/*filename* and report it."""
    # Created here as well so the function works even if the directory was
    # removed after module setup or a different output_dir is requested.
    os.makedirs(output_dir, exist_ok=True)
    target = f"{output_dir}/{filename}"
    fig.tight_layout()
    fig.savefig(target)
    print(f"-> Plot saved: {target}")


def _plot_category_split(df_clean, output_dir, min_year):
    """Plot a stacked bar chart of titles per year, split by 'Category'."""
    # Drop rows without a category *before* casting to str; casting first
    # would turn missing values into a literal 'nan' category in the chart.
    categorized = df_clean.dropna(subset=['Category'])
    categorized = categorized.assign(Category=categorized['Category'].astype(str))
    pivot = _yearly_counts(categorized, 'Category', min_year)
    if pivot.empty:
        print(f"-> Skipped: no categorized titles from {min_year} onwards.")
        return

    first_year, last_year = int(pivot.index.min()), int(pivot.index.max())
    fig, ax = plt.subplots(figsize=(12, 6))
    try:
        pivot.plot(kind='bar', stacked=True, color=['#E50914', '#F46522'], ax=ax)
        ax.set_title(
            f'Distribution of Movies vs. TV Shows Added to Netflix ({first_year}-{last_year})',
            fontsize=16,
        )
        ax.set_xlabel('Release Year', fontsize=12)
        ax.set_ylabel('Number of Titles Added', fontsize=12)
        plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
        ax.legend(title='Content Type', loc='upper left')
        _save_figure(fig, output_dir, 'movies_vs_tv_shows_over_years.png')
    finally:
        # Release the figure even when plotting or saving fails.
        plt.close(fig)


def _plot_genre_trends(df_clean, output_dir, min_year, top_n):
    """Plot the *top_n* genres overall and their yearly counts from *min_year* on."""
    exploded = _explode_list_column(df_clean, 'Genre')
    genre_totals = exploded.groupby('Genre')['Title'].count().sort_values(ascending=False)
    leaders = genre_totals.head(top_n)
    if leaders.empty:
        print("-> Skipped: no genre information available.")
        return

    fig, ax = plt.subplots(figsize=(10, 6))
    try:
        # Ascending order puts the largest bar at the top of a horizontal chart.
        leaders.sort_values(ascending=True).plot(kind='barh', color='#221f1f', ax=ax)
        ax.set_title(f'Overall Top {top_n} Most Common Genres in Netflix Catalog', fontsize=16)
        ax.set_xlabel('Total Number of Titles', fontsize=12)
        ax.set_ylabel('Genre', fontsize=12)
        _save_figure(fig, output_dir, f'overall_top_{top_n}_genres.png')
    finally:
        plt.close(fig)

    trend = _yearly_counts(exploded[exploded['Genre'].isin(leaders.index)], 'Genre', min_year)
    if trend.empty:
        print(f"-> Skipped: no genre data from {min_year} onwards.")
        return

    fig, ax = plt.subplots(figsize=(14, 8))
    try:
        trend.plot(kind='line', marker='o', linewidth=2, ax=ax, cmap='tab10')
        ax.set_title(f'Popularity of Top {top_n} Genres Added to Netflix Over the Years', fontsize=16)
        ax.set_xlabel('Release Year', fontsize=12)
        ax.set_ylabel('Number of Titles Added', fontsize=12)
        # One tick per year present in the table.
        ax.set_xticks(list(trend.index))
        plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
        # Legend is anchored outside the axes, to the right of the plot.
        ax.legend(title='Genre', bbox_to_anchor=(1.05, 1), loc='upper left')
        _save_figure(fig, output_dir, 'top_genres_over_years.png')
    finally:
        plt.close(fig)


def _plot_country_contributions(df, output_dir, top_n):
    """Plot the *top_n* countries by number of titles."""
    exploded = _explode_list_column(df, 'Country')
    totals = exploded.groupby('Country')['Title'].count().sort_values(ascending=False)
    leaders = totals.head(top_n)
    if leaders.empty:
        print("-> Skipped: no country information available.")
        return

    fig, ax = plt.subplots(figsize=(10, 6))
    try:
        leaders.sort_values(ascending=True).plot(kind='barh', color='#831010', ax=ax)
        ax.set_title(f'Top {top_n} Country Contributions to Netflix Catalog', fontsize=16)
        ax.set_xlabel('Number of Titles', fontsize=12)
        ax.set_ylabel('Country', fontsize=12)
        _save_figure(fig, output_dir, f'top_{top_n}_country_contributions.png')
    finally:
        plt.close(fig)


def run_project_analysis(csv_path="Netflix Dataset 4.csv", output_dir="results",
                         min_year=2008, top_n=10):
    """Load the dataset, run the three content analyses and save their plots.

    Called with no arguments, it reads the same file, writes the same plot
    files and prints the same progress messages as before.

    Args:
        csv_path: CSV file to read. It must provide the columns 'Title',
            'Category', 'Country', 'Release_Date' and either 'Type' or 'Genre'.
        output_dir: Directory the PNG files are written to (created if needed).
        min_year: First year included in the per-year charts.
        top_n: Number of genres / countries shown in the ranking charts.

    Returns:
        None. If the file or a required column is missing, an error message
        is printed and the function returns without plotting.
    """
    try:
        df = pd.read_csv(csv_path)
    except FileNotFoundError:
        print(f"Error: '{csv_path}' not found. Please ensure the file is in the correct path.")
        return

    # The genre list is stored under 'Type' in the source file.
    df = df.rename(columns={'Type': 'Genre'})

    missing = [name for name in _REQUIRED_COLUMNS if name not in df.columns]
    if missing:
        print(f"Error: '{csv_path}' is missing required column(s): {', '.join(missing)}.")
        return

    # Strip surrounding whitespace before parsing: pandas infers one date
    # format for the whole column, so padded values could otherwise fail to
    # match it and be coerced to NaT (and the row silently dropped).
    raw_dates = df['Release_Date']
    try:
        raw_dates = raw_dates.str.strip()
    except AttributeError:
        # Column holds no strings (e.g. entirely empty); parse it as it is.
        pass
    df['Release_Year'] = pd.to_datetime(raw_dates, errors='coerce').dt.year

    # Rows whose date could not be parsed are excluded from year-based charts.
    df_clean = df.dropna(subset=['Release_Year']).copy()
    df_clean['Release_Year'] = df_clean['Release_Year'].astype(int)

    print("--- Starting Netflix Content Analysis ---")

    # OBJECTIVE 1: Movies vs. TV Shows over the years
    print("\n[1/3] Analyzing Movies vs. TV Shows distribution...")
    _plot_category_split(df_clean, output_dir, min_year)

    # OBJECTIVE 2: Genre popularity and trend analysis
    print("\n[2/3] Analyzing Genre trends...")
    _plot_genre_trends(df_clean, output_dir, min_year, top_n)

    # OBJECTIVE 3: Country-wise contributions. This ranking uses every row
    # of the dataset, including those without a parseable release date.
    print("\n[3/3] Analyzing Country contributions...")
    _plot_country_contributions(df, output_dir, top_n)

    print("\n--- Analysis Complete! ---")

# Run the analysis only when this file is executed directly (not when it is
# imported as a module).
if __name__ == "__main__":
    run_project_analysis()

--- Starting Netflix Content Analysis ---

[1/3] Analyzing Movies vs. TV Shows distribution...
-> Plot saved: results/movies_vs_tv_shows_over_years.png

[2/3] Analyzing Genre trends...
-> Plot saved: results/overall_top_10_genres.png
-> Plot saved: results/top_genres_over_years.png

[3/3] Analyzing Country contributions...
-> Plot saved: results/top_10_country_contributions.png

--- Analysis Complete! ---
