In [None]:
# Importing necessary libraries & modules; checking system compatibility
import sys, pip, numpy as np, pandas as pd, matplotlib.pyplot as plt
from streaming_func import clean_streaming_df, unique_actors_per_genre
from pathlib import Path

In [None]:
# Load the raw Netflix catalogue and inspect its structure & content
DATA_DIR = Path("data")
netflix_df = pd.read_csv(DATA_DIR / "netflix_titles.csv")

print(netflix_df.head(), '\n')

# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() appended a stray "None" line.
netflix_df.info()

In [None]:
# Collect the distinct values of the 'type' and 'listed_in' columns so that
# anomalous media types or genre entries can be spotted by eye
unique_netflix_media_types = netflix_df["type"].unique()
unique_netflix_genre_types = netflix_df["listed_in"].unique()

# Distinct media types present in the Netflix dataset
print(f'Unique media types in Netflix dataset: {list(unique_netflix_media_types)} \n')

# Distinct raw genre strings, shown for additional visual debugging & validation
print(f"Unique media genre lists in Netflix dataset: \n{unique_netflix_genre_types}")

In [None]:
# First look at the distinct raw (uncleaned) values of the 'country' column
unique_netflix_countries = netflix_df["country"].unique()
print(
    "Countries list for Netflix dataset accredited locations of media production: \n",
    unique_netflix_countries,
)

In [None]:
# Clean the Netflix dataset with the shared helper for easier exploration & manipulation
test_netflix_df = clean_streaming_df(netflix_df)

print(test_netflix_df.head(), '\n')

# DataFrame.info() prints its own report and returns None; calling it directly
# avoids the extra "None" line that print(df.info()) produced.
test_netflix_df.info()

In [None]:
# Flatten the 'country' column of the cleaned Netflix dataset into a list of
# distinct countries (each entry is iterated as a collection of country names)
netflix_countries = test_netflix_df["country"]

# dict keys are unique and keep insertion order, so this de-duplicates while
# preserving the order in which countries are first encountered
netflix_countries_unique = list(
    dict.fromkeys(
        country
        for country_list in netflix_countries
        for country in country_list
    )
)

print(netflix_countries_unique, '\n')
print(f'Total number unique countries for Netflix media production: {len(netflix_countries_unique)}')

In [None]:
# Split the cleaned catalogue by media type. Each subset is an independent
# copy, so columns added to it later do not touch test_netflix_df.
test_movies_df = test_netflix_df.loc[test_netflix_df["type"].eq("Movie")].copy()      # movies only
test_tv_shows_df = test_netflix_df.loc[test_netflix_df["type"].eq("TV Show")].copy()  # TV shows only

print("Movies:", test_movies_df.shape[0])
print("TV Shows:", test_tv_shows_df.shape[0])

In [None]:
# Netflix movies: number of titles per distinct runtime value
movies_ax = test_movies_df.groupby("duration_mins").size().plot(kind="bar", figsize=(12,6))

# Thin the x ticks so the labels stay legible. The step is clamped to at
# least 1: with fewer than 20 bars the floor division yields 0, and slicing
# with a zero step raises ValueError.
movie_xticks = movies_ax.get_xticks()
movies_ax.set_xticks(movie_xticks[::max(1, len(movie_xticks) // 20)])

movies_ax.set_title("Distribution of Movie Durations on Netflix")
movies_ax.set_xlabel("Duration (mins)")
movies_ax.set_ylabel("Count")

plt.tight_layout()
plt.show()

In [None]:
# Keep TV shows with 10 seasons or fewer, as few TV shows ever succeed enough to exceed this number
filtered_tv = test_tv_shows_df[test_tv_shows_df["duration_seasons"] <= 10]

shows_ax = filtered_tv.groupby("duration_seasons").size().plot(kind="bar", figsize=(12,6))

# Thin the tick marks on both axes. Each step is clamped to at least 1:
# fewer than 10 x ticks (or fewer than 8 y ticks) would make the floor
# division 0, and slicing with a zero step raises ValueError.
season_xticks = shows_ax.get_xticks()
shows_ax.set_xticks(season_xticks[::max(1, len(season_xticks) // 10)])
season_yticks = shows_ax.get_yticks()
shows_ax.set_yticks(season_yticks[::max(1, len(season_yticks) // 8)])

shows_ax.set_title("Distribution of TV Show Seasons on Netflix")
shows_ax.set_xlabel("Number of seasons")
shows_ax.set_ylabel("Number of TV Shows")
plt.tight_layout()
plt.show()

In [None]:
# Netflix TV shows: histogram of the number of cast members listed per show

# Length of each show's cast collection, stored as a new column
test_tv_shows_df["cast_count"] = test_tv_shows_df["cast"].map(len)

# Series.hist returns the Axes it drew on, which is then labelled directly
tv_cast_ax = test_tv_shows_df["cast_count"].hist(bins=range(0, 25), edgecolor='black', figsize=(12,6))
tv_cast_ax.set_ylabel("Number of TV Shows")
tv_cast_ax.set_xlabel("Number of Cast Members")
tv_cast_ax.set_title("Distribution of Cast Counts in Netflix TV Shows")

plt.tight_layout()
plt.show()

In [None]:
# Movies cast size distribution, focused on casts of 15 members or fewer
# (the most frequently recorded sizes)
test_movies_df["cast_count"] = test_movies_df["cast"].apply(len)

# Half-integer bin edges give every cast size from 0 to 15 its own bar.
# The former range(0, 15) edges stopped at 14 (so size 15 was never drawn)
# and, because the last histogram bin is closed on both sides, merged
# sizes 13 and 14 into a single bar.
small_cast_bins = np.arange(-0.5, 16.5)
test_movies_df["cast_count"].hist(bins=small_cast_bins, edgecolor='black', figsize=(12,6))

plt.xlabel("Number of Cast Members")
plt.ylabel("Number of Movies")
plt.title("Distribution of Cast Sizes in Movies on Netflix")

plt.tight_layout()
plt.show()

In [None]:
# Movies cast size distribution, focused on the larger (less frequent) casts.
# Reads the 'cast_count' column that an earlier cell adds to test_movies_df.

# Half-integer bin edges give every cast size from 15 to 39 its own bar.
# The former range(14, 40) edges also counted casts of 14, contradicting the
# "15 members and above" title, and merged sizes 38 and 39 in the closed
# last bin.
large_cast_bins = np.arange(14.5, 40.5)
test_movies_df["cast_count"].hist(bins=large_cast_bins, edgecolor='black', figsize=(12,6))

plt.xlabel("Number of Cast Members")
plt.ylabel("Number of Movies")
plt.title("Distribution of Larger Cast Sizes (15 members and above) in Movies on Netflix")

plt.tight_layout()
plt.show()

In [None]:
# Movies cast size distribution as a bar plot (one bar per cast size)

# Length of each movie's cast collection
test_movies_df["cast_count"] = test_movies_df["cast"].map(len)

# Several films are recorded with very large casts, which is most likely an
# error in the dataset or an inclusion of all minor roles & extras. Only
# movies with a cast of at most 30 are plotted, for clarity & to keep those
# outliers from skewing the picture.
test_movies_bar_sample = (
    test_movies_df.loc[test_movies_df["cast_count"] <= 30]
    .groupby("cast_count")
    .size()
    .plot(kind="bar", figsize=(12,6))
)

# From here on test_movies_bar_sample is the matplotlib Axes of the bar plot
test_movies_bar_sample.set_title("Distribution of Cast Sizes in Movies on Netflix")
test_movies_bar_sample.set_xlabel("Number of Cast Members")
test_movies_bar_sample.set_ylabel("Number of Movies")

plt.tight_layout()
plt.show()

In [None]:
# Build the list of distinct genres found in the cleaned Netflix dataset
netflix_media_genres = test_netflix_df["listed_in"]

# Every entry of 'listed_in' is iterated as a collection of genres; dict keys
# are unique and insertion-ordered, so each genre is kept once, in the order
# it is first seen.
# NOTE(review): the variable name carries a typo ("netlfix"); it is left
# unchanged in case other cells refer to it.
unique_netlfix_genres = list(
    dict.fromkeys(
        genre
        for films in netflix_media_genres
        for genre in films
    )
)

print(f'List of unique Netflix media genres: {unique_netlfix_genres}\n')
print(f'Number of unique media genres in Netflix dataset: {len(unique_netlfix_genres)}')

In [None]:
test_netflix_casts = test_netflix_df["cast"]    # Series of cast lists

# Collect every distinct, whitespace-trimmed cast member name.
#   - empty cast lists are skipped
#   - non-string entries are ignored (defensive check)
#   - names that are empty after stripping are ignored
# The names are gathered in a set and then sorted: iteration order of a set
# of strings can differ between interpreter runs, so a plain list(set) made
# the printed sample below change from run to run.
test_netflix_unique_cast_members = sorted({
    actor.strip()
    for cast_list in test_netflix_casts
    if cast_list
    for actor in cast_list
    if isinstance(actor, str) and actor.strip()
})

print(f"Sample of 'test_netflix_unique_cast_members' list for inspection: \n{test_netflix_unique_cast_members[:10]}\n")
print(f'Number of unique cast members in Netflix dataset: {len(test_netflix_unique_cast_members)}')

In [None]:
# One row per (movie, genre) pair, annotated with the size of the movie's cast
movies_df_exploded = (
    test_movies_df
    .explode("listed_in")                                           # one genre per row
    .assign(cast_count=lambda frame: frame["cast"].str.len())       # cast size per row
    .dropna(subset=["listed_in"])                                   # empty genre lists explode to NaN
)

# Total number of role appearances in each genre (a movie counts once per genre it lists)
movie_genre_cast_counts = (
    movies_df_exploded
    .groupby("listed_in")["cast_count"]
    .sum()
    .sort_values(ascending=False)
)
print(f'Number of role appearances on Netflix media by genre: \n{movie_genre_cast_counts}')

In [None]:
## Netflix TV-shows & Movies unique actors per genre count

def unique_actors_per_genre(df: pd.DataFrame) -> pd.Series:
    """
    Count the distinct actors credited in each genre.

    The dataframe is expected to be a cleaned 'streaming dataframe' where:
      - df['cast'] is a list of actor names (or [])
      - df['listed_in'] is a list of genres (or [])

    Returns a Series indexed by genre ('listed_in'), holding the number of
    UNIQUE actors that appear in at least one title of that genre, sorted
    from the largest count to the smallest. Titles with no cast or no genre
    contribute nothing to the result.

    NOTE(review): this definition shadows the `unique_actors_per_genre`
    imported from `streaming_func` at the top of the notebook.
    """

    # One row per (title, genre, actor) combination
    exploded = df.explode("listed_in").explode("cast")

    # Empty lists explode to NaN. They must be dropped BEFORE the string
    # conversion below: astype(str) turns NaN into the literal text "nan",
    # which would then be counted as a genre / an actor.
    exploded = exploded.dropna(subset=["listed_in", "cast"])

    # Normalise both columns to whitespace-trimmed strings
    exploded = exploded.assign(
        listed_in=exploded["listed_in"].astype(str).str.strip(),
        cast=exploded["cast"].astype(str).str.strip(),
    )

    # Discard entries that are blank after trimming
    has_genre = exploded["listed_in"] != ""
    has_actor = exploded["cast"] != ""
    exploded = exploded[has_genre & has_actor]

    # Group by genre & count unique actors
    genre_actor_counts = (
        exploded
        .groupby("listed_in")["cast"]
        .nunique()
        .sort_values(ascending=False)
    )

    return genre_actor_counts

# Apply the per-genre unique-actor count to the movie and TV-show subsets
movie_unique_actors_per_genre, tv_unique_actors_per_genre = (
    unique_actors_per_genre(subset)
    for subset in (test_movies_df, test_tv_shows_df)
)


In [None]:
# Top 10 movie genres by their number of unique actors: for each genre, how
# many distinct actors have appeared in at least one movie of that genre
print(movie_unique_actors_per_genre.head(10), '\n')

# The Series has one entry per genre, so its length is the number of movie
# genres in the ranking. The previous label called this a number of actors.
print(f'Number of movie genres in the unique-actor ranking: {len(movie_unique_actors_per_genre)}')

In [None]:
# Top 10 TV-show genres by their number of unique actors: for each genre, how
# many distinct actors have appeared in at least one TV show of that genre
print(tv_unique_actors_per_genre.head(10), '\n')

# The Series has one entry per genre, so its length is the number of TV-show
# genres in the ranking. The previous label called this a number of actors.
print(f'Number of tv-show genres in the unique-actor ranking: {len(tv_unique_actors_per_genre)}')

In [None]:
# Pie chart of unique actors per movie genre (top_n largest genres only).
# Slice percentages are relative to the plotted genres, not to all genres.

top_n = 10
top_genres = movie_unique_actors_per_genre.head(top_n)

fig, ax = plt.subplots(figsize=(8, 8))
top_genres.plot(kind="pie", autopct="%1.1f%%", ax=ax)
ax.set_ylabel("")  # hide y-label
# The title is derived from top_n so it cannot drift from the data shown
ax.set_title(f"Share of Unique Actors by Genre on Netflix (Top {top_n} Movie Genres)")
plt.tight_layout()
plt.show()

In [None]:
# Pie chart of unique actors per TV-show genre (top_n largest genres only).
# Slice percentages are relative to the plotted genres, not to all genres.

top_n = 10
top_genres = tv_unique_actors_per_genre.head(top_n)

fig, ax = plt.subplots(figsize=(8, 8))
top_genres.plot(kind="pie", autopct="%1.1f%%", ax=ax)
ax.set_ylabel("")  # hide y-label
# The title is derived from top_n so it cannot drift from the data shown
ax.set_title(f"Share of Unique Actors by Genre on Netflix (Top {top_n} TV-show Genres)")
plt.tight_layout()
plt.show()

In [None]:
# One row per (TV show, genre) pair, annotated with the size of the show's cast
shows_df_exploded = (
    test_tv_shows_df
    .explode("listed_in")
    .assign(cast_count=lambda frame: frame["cast"].map(len))
)

# Total cast count per TV-show genre, sorted in DESCENDING order so the
# genres with the largest casts come first
show_genre_cast_counts = (
    shows_df_exploded
    .groupby("listed_in")["cast_count"]
    .sum()
    .sort_values(ascending=False)
)
print(show_genre_cast_counts)

In [None]:
# Pie chart of the countries most often credited on Netflix TV shows
# (top_n countries only)

top_n = 10

# Explode list-of-countries into one country per row
show_countries = test_tv_shows_df["country"].explode()

# As a precaution, drop any 'empties' / 'missing' values present in the 'country' column
# (after explode, rows that were [] become NaN & get dropped)
show_countries = show_countries.dropna()
show_countries = show_countries[show_countries != ""]   # remove empty strings

# Compared to the Amazon Prime dataset, Netflix has a larger variety of
# countries where TV shows are produced, so a larger number of countries is
# shown in the pie chart for a better representation of the data.
top_show_countries = show_countries.value_counts().head(top_n)

# Slice percentages are relative to the plotted countries only
fig, ax = plt.subplots(figsize=(14, 9))
top_show_countries.plot(kind="pie", autopct="%1.1f%%", startangle=90, ax=ax)
ax.set_ylabel("")
# The title is derived from top_n so it cannot drift from the data shown
ax.set_title(f"Share of TV Shows by Country on Netflix (Top {top_n} Countries)")
plt.tight_layout()
plt.show()

In [None]:
# Pie chart of the countries most often credited on Netflix movies
# (top_n countries only)

top_n = 8  # limited to 8 for readability of pie chart

# Explode list-of-countries into one country per row
movie_countries = test_movies_df["country"].explode()

# As a precaution, drop any 'empties' / 'missing' values present in the 'country' column
# (after explode, rows that were [] become NaN & get dropped)
movie_countries = movie_countries.dropna()
movie_countries = movie_countries[movie_countries != ""]          # just in case

# Only a limited number of countries can be shown for movies: the dataset
# contains more countries, but the pie chart becomes unreadable with too many
# slices, so only the top_n most frequent ones are kept.
top_movie_countries = movie_countries.value_counts().head(top_n)

# Slice percentages are relative to the plotted countries only
fig, ax = plt.subplots(figsize=(12, 8))
top_movie_countries.plot(kind="pie", autopct="%1.1f%%", startangle=90, ax=ax)
ax.set_ylabel("")
# The title is derived from top_n so it cannot drift from the data shown
ax.set_title(f"Share of Movies by Country on Netflix (Top {top_n} Countries)")
plt.tight_layout()
plt.show()

In [None]:
# Pie chart showing the share of Netflix titles (movies & TV shows together)
# by country of production

top_n = 10  # limited to 10 for readability of pie chart

# Explode converts each list into separate rows so each country is counted individually
media_countries = test_netflix_df["country"].explode()

# As a precaution, remove missing or empty entries that result from empty lists
media_countries = media_countries.dropna()
media_countries = media_countries[media_countries != ""]   # extra safeguard

# Count how many titles are associated with each country & keep the top_n
top_media_countries = media_countries.value_counts().head(top_n)

# Plot pie chart; slice percentages are relative to the plotted countries only
fig, ax = plt.subplots(figsize=(12, 8))
top_media_countries.plot(
    kind="pie",
    autopct="%1.1f%%",
    startangle=90,
    ax=ax,
)
ax.set_ylabel("")  # hide default y-axis label
# The title is derived from top_n so it cannot drift from the data shown
ax.set_title(f"Share of Netflix Titles by Country of Production (Top {top_n})")
plt.tight_layout()
plt.show()