In [None]:
# Importing necessary libraries & modules; checking system compatibility
import sys, pip, numpy as np, pandas as pd, matplotlib.pyplot as plt
from pathlib import Path
from streaming_func import clean_streaming_df, unique_actors_per_genre

In [None]:
# Load the raw Disney+ titles dataset and display a preview plus basic schema info.
DATA_DIR = Path("data")
disney_df = pd.read_csv(DATA_DIR / "disney_plus_titles.csv")

print(disney_df.head(), '\n')
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that would emit a stray "None" line).
disney_df.info()

In [None]:
# Clean the Disney+ dataset with the shared helper for easier exploration & manipulation.
# NOTE(review): later cells iterate "country"/"cast"/"listed_in" entries as
# collections after cleaning - confirm the exact output schema in clean_streaming_df.
test_disney_df = clean_streaming_df(disney_df)
print(test_disney_df.head(), '\n')
# info() prints directly and returns None; calling it bare avoids printing "None".
test_disney_df.info()

In [None]:
# Extracting unique countries from the 'country' column of the cleaned Disney+ dataset
disney_countries = test_disney_df["country"]

# Every entry is iterated as a collection of country names. dict.fromkeys keeps
# the first-seen order (as the nested-loop version did) while deduplicating in
# linear time instead of scanning a growing list for every single country.
disney_countries_unique = list(
    dict.fromkeys(
        country
        for country_list in disney_countries
        for country in country_list
    )
)

print(disney_countries_unique, '\n')
print(f'Total number unique countries for Disney media production: {len(disney_countries_unique)}')

In [None]:
# Unique raw 'listed_in' strings and media types, used to check for anomalous genre/type entries.
# Bracket access is used so the column named "type" can never be confused with an attribute.
unique_disney_genre_types = disney_df["listed_in"].unique()
unique_disney_media_types = disney_df["type"].unique()

# Displaying unique media types in the Disney dataset
# (list(...) replaces a comprehension whose loop variable shadowed the builtin `type`)
print(f'Unique media types in Disney+ dataset: {list(unique_disney_media_types)} \n')

# Displaying unique media genre lists in the dataset for additional visual debugging & validation
print(f"Unique Media genre lists in Disney+ dataset: \n{unique_disney_genre_types}")

In [None]:
# Split the cleaned catalogue by media type. Each subset is an independent copy,
# so columns added to it later do not touch test_disney_df.
is_movie = test_disney_df["type"].eq("Movie")
is_tv_show = test_disney_df["type"].eq("TV Show")

test_movies_df = test_disney_df.loc[is_movie].copy()      # Movies only
test_tv_shows_df = test_disney_df.loc[is_tv_show].copy()  # TV Shows only

print("Number of Movies @ Disney+:", test_movies_df.shape[0])
print("Number of TV Shows @ Disney+:", test_tv_shows_df.shape[0])

In [None]:
# Movies duration distribution plot: one bar per distinct duration value
duration_counts = test_movies_df.groupby("duration_mins").size()
movies_ax = duration_counts.plot(kind="bar", figsize=(12,6))

# Thinning out x & y ticks for readability, due to the large number of unique duration values.
# max(1, ...) keeps the slice step valid when there are fewer ticks than the target count
# (a step of 0 would raise ValueError).
y_ticks = movies_ax.get_yticks()
movies_ax.set_yticks(y_ticks[::max(1, len(y_ticks) // 6)])

# A bar plot labels its ticks from a fixed list, matched by tick position in that list.
# When only every k-th tick is kept, the matching labels must be set explicitly;
# otherwise the remaining ticks would display the first few durations, not their own.
x_ticks = movies_ax.get_xticks()
x_step = max(1, len(x_ticks) // 20)
movies_ax.set_xticks(x_ticks[::x_step])
movies_ax.set_xticklabels([str(duration) for duration in duration_counts.index[::x_step]], rotation=90)

movies_ax.set_title("Distribution of Movie Durations on Disney+")
movies_ax.set_xlabel("Duration (mins)")
movies_ax.set_ylabel("Count")
plt.tight_layout()
plt.show()

In [None]:
# Filtering for TV shows with 10 seasons or less, to avoid skewing the distribution plot.
#
# Author's note: having gone through the unique genre types and entries in the dataset,
# no anomalous entries were found in the 'duration_seasons' column for TV Shows.
# The DataFrame is therefore filtered on the number of seasons for better visualization,
# as the few shows with a high number of seasons (e.g., 10+) skew the distribution plot.

filtered_tv = test_tv_shows_df[test_tv_shows_df["duration_seasons"] <= 10]
shows_ax = filtered_tv.groupby("duration_seasons").size().plot(kind="bar", figsize=(12,6))

# Thinning out y ticks for better readability; max(1, ...) guards against a zero
# slice step (ValueError) when the axis has fewer than 6 ticks.
y_ticks = shows_ax.get_yticks()
shows_ax.set_yticks(y_ticks[::max(1, len(y_ticks) // 6)])

shows_ax.set_xlabel("Number of seasons")
shows_ax.set_ylabel("Number of TV Shows")
shows_ax.set_title("Distribution of Number of Seasons in TV Shows on Disney+")
plt.tight_layout()
plt.show()

In [None]:
# TV-show cast size distribution; 'cast_count' is the length of each show's 'cast' entry.
test_tv_shows_df["cast_count"] = test_tv_shows_df["cast"].apply(len)

# Only shows with 10 cast members or fewer are plotted, for better visualization.
MAX_TV_CAST_SHOWN = 10
tv_cast_counts_shown = test_tv_shows_df.loc[
    test_tv_shows_df["cast_count"] <= MAX_TV_CAST_SHOWN, "cast_count"
]

# Bin edges 0..11 give every integer count 0..10 its own unit-wide bar. The last
# histogram bin is closed on both ends, so edges of range(0, 10) merged the counts
# 8 and 9 into a single bar and left 10 out entirely. Because the data is capped at
# 10 above, the final [10, 11] bin holds only the count 10.
tv_cast_counts_shown.hist(bins=range(0, MAX_TV_CAST_SHOWN + 2), edgecolor='black', figsize=(12,6))
plt.xlabel("Number of Cast Members")
plt.ylabel("Number of TV Shows")
plt.title("Distribution of Cast Sizes in TV Shows on Disney+")
plt.tight_layout()
plt.show()

In [None]:
# Movies cast size distribution; adding a 'cast_count' column to the movies DataFrame
# to facilitate plotting of additional insights.

test_movies_df["cast_count"] = test_movies_df["cast"].apply(len)

# Plot the same 0..8 range as before, but with one bar per integer count: the last
# histogram bin is closed on both ends, so edges of range(0, 9) merged the counts
# 7 and 8 into one bar. Capping the data at 8 keeps the final [8, 9] bin to count 8 only.
MAX_MOVIE_CAST_SHOWN = 8
movie_cast_counts_shown = test_movies_df.loc[
    test_movies_df["cast_count"] <= MAX_MOVIE_CAST_SHOWN, "cast_count"
]
movie_cast_counts_shown.hist(bins=range(0, MAX_MOVIE_CAST_SHOWN + 2), edgecolor='black', figsize=(12,6))
plt.xlabel("Number of Cast Members")
plt.ylabel("Number of Movies")
plt.title("Distribution of Cast Sizes in Movies on Disney+")
plt.tight_layout()
plt.show()

In [None]:
# Collect the distinct genres across all Disney+ titles from the raw 'listed_in' strings.
disney_media = disney_df["listed_in"]

# dict.fromkeys deduplicates in linear time while keeping first-seen order.
# Non-string entries (e.g. NaN for a missing value) are skipped instead of
# crashing on .split(), and names are stripped so stray whitespace around a
# comma cannot create a duplicate genre.
disney_genres = list(
    dict.fromkeys(
        genre.strip()
        for film in disney_media
        if isinstance(film, str)
        for genre in film.split(",")
        if genre.strip()
    )
)
print(f'List of unique Disney+ media genres: {disney_genres}')
print(f'Number of unique media genres in Disney+ dataset: {len(disney_genres)}')

In [None]:
# Extract unique cast members from the raw Disney+ dataset.
# A set is used while collecting so that duplicates are dropped as they are added.
cast_member_set = set()

for cast_entry in disney_df["cast"]:

    # Skip NaN or missing values (anything that is not a string)
    if not isinstance(cast_entry, str):
        continue

    # Split the comma-separated cast string into individual names
    for actor in cast_entry.split(","):
        actor = actor.strip()
        if actor:  # skip empty strings
            cast_member_set.add(actor)

# Sorting gives a list with a deterministic order. Iterating a set of strings
# directly can yield a different order on every kernel start, which made the
# printed sample below change between runs.
disney_unique_cast_members = sorted(cast_member_set)

print(f"Sample of 'disney_unique_cast_members' list for inspection: \n{disney_unique_cast_members[:10]}\n")
print(f'Number of unique cast members in Disney+ dataset: {len(disney_unique_cast_members)}')

In [None]:
# One row per (movie, genre) pair: explode the genre lists, attach the size of each
# row's cast, and discard rows whose genre turned into NaN during the explode.
movies_df_exploded = (
    test_movies_df
    .explode("listed_in")
    .assign(cast_count=lambda frame: frame["cast"].str.len())
    .dropna(subset=["listed_in"])
)

# Total number of role appearances in each genre (a movie counts once per genre),
# largest totals first
cast_totals_by_genre = movies_df_exploded.groupby("listed_in")["cast_count"].sum()
movie_genre_cast_counts = cast_totals_by_genre.sort_values(ascending=False)
print(f'Number of role appearances on Disney+ media by genre: \n{movie_genre_cast_counts}')

In [None]:
## Disney+ TV-shows & Movies unique actors per genre count

movie_unique_actors_per_genre = unique_actors_per_genre(test_movies_df)
tv_unique_actors_per_genre = unique_actors_per_genre(test_tv_shows_df)

# Answers, for each movie & TV-show genre on Disney+, how many distinct actors have
# appeared in at least one title of that genre.
# NOTE(review): the helper is assumed to return a per-genre pandas Series (later cells
# call .head() and .plot(kind="pie") on it) - confirm. Under that assumption len(...)
# is only the number of genres, so the per-genre counts themselves are printed here.
print(f'Number of unique actors per movie genre on Disney+: \n{movie_unique_actors_per_genre}\n')
print(f'Number of unique actors per tv-show genre on Disney+: \n{tv_unique_actors_per_genre}')

In [None]:
# Pie chart of unique actors per movie genre (first 10 genres of the per-genre result)

top_n = 10
top_genres = movie_unique_actors_per_genre.head(top_n)

# Explicit figure/axes instead of relying on pyplot's "current figure" state
fig, ax = plt.subplots(figsize=(8, 8))
top_genres.plot(kind="pie", autopct="%1.1f%%", ax=ax)
ax.set_ylabel("")  # hide y-label
ax.set_title("Share of Unique Actors by Genre on Disney+ (Top 10 Movie Genres)")
fig.tight_layout()
plt.show()

In [None]:
# Pie chart of unique actors per TV-show genre (first 10 genres of the per-genre result)

top_n = 10
top_genres = tv_unique_actors_per_genre.head(top_n)

# Explicit figure/axes instead of relying on pyplot's "current figure" state
fig, ax = plt.subplots(figsize=(8, 8))
top_genres.plot(kind="pie", autopct="%1.1f%%", ax=ax)
ax.set_ylabel("")  # hide y-label
ax.set_title("Share of Unique Actors by Genre on Disney+ (Top 10 TV-show Genres)")
fig.tight_layout()
plt.show()

In [None]:
# Explode Disney+ TV-show genres into separate rows (one row per show/genre pair)
shows_df_exploded = test_tv_shows_df.explode("listed_in").copy()

# Adding cast count column to the exploded dataframe (same approach as the movies cell)
shows_df_exploded["cast_count"] = shows_df_exploded["cast"].str.len()

# Removing empty genres that become NaN after explode, before the groupby
shows_df_exploded = shows_df_exploded[shows_df_exploded["listed_in"].notna()]

# Total number of role appearances in each genre (counting a show once per genre),
# sorted in DESCENDING order so the genres with the largest casts come first
show_genre_cast_counts = shows_df_exploded.groupby("listed_in")["cast_count"].sum().sort_values(ascending=False)
print(f'Number of role appearances on Disney+ TV-shows by genres: \n{show_genre_cast_counts}')

In [None]:
# Pie chart of the countries where Disney+ TV shows are made (top `top_n` countries only)

# Disney+ has a more limited selection of TV shows than other streaming platforms and is
# far younger than them, so only a handful of countries are shown to keep the chart readable.
top_n = 6

# Explode list-of-countries into one country per row
show_countries = test_tv_shows_df["country"].explode()

# As a precaution, drop any 'empties' / 'missing' values present in the 'country' column
# (after explode, rows that were [] become NaN & get dropped)
show_countries = show_countries.dropna()
show_countries = show_countries[show_countries != ""]   # remove empty strings

# value_counts() sorts by frequency (most common first), so head() keeps the top countries.
# Note: the pie percentages are shares among these plotted countries only.
top_show_countries = show_countries.value_counts().head(top_n)

plt.figure(figsize=(14, 9))
top_show_countries.plot(kind="pie", autopct="%1.1f%%", startangle=90)
plt.ylabel("")
# The title is derived from top_n so it cannot drift from the number of slices shown
plt.title(f"Share of TV Shows by Country on Disney+ (Top {top_n} Countries)")
plt.tight_layout()
plt.show()

In [None]:
# Pie chart of the countries where Disney+ movies are produced (top `top_n` countries only)

# Once again, due to the limited selection of movies on Disney+, only a small number of
# countries is shown. There are more countries in the dataset, but the chart would become
# unreadable with too many small slices next to the few dominant production countries.
top_n = 5  # limited to 5 for readability of pie chart

# Explode list-of-countries into one country per row
movie_countries = test_movies_df["country"].explode()

# As a precaution, drop any 'empties' / 'missing' values present in the 'country' column
# (after explode, rows that were [] become NaN & get dropped)
movie_countries = movie_countries.dropna()
movie_countries = movie_countries[movie_countries != ""]          # just in case

# value_counts() sorts by frequency (most common first), so head() keeps the top countries.
# Note: the pie percentages are shares among these plotted countries only.
top_movie_countries = movie_countries.value_counts().head(top_n)

plt.figure(figsize=(12, 8))
top_movie_countries.plot(kind="pie", autopct="%1.1f%%", startangle=90)
plt.ylabel("")
# The title is derived from top_n so it cannot drift from the number of slices shown
plt.title(f"Share of Movies by Country on Disney+ (Top {top_n} Countries)")
plt.tight_layout()
plt.show()

In [None]:
# Pie chart showing the share of Disney+ titles by country of production

# The Disney+ dataset is significantly smaller than those of other streaming platforms,
# owing to its relatively recent launch & more limited global reach as a streaming service
# (whilst predominantly being focused on the US market). Only a few countries are shown
# so the chart stays readable while still capturing the main production countries.
top_n = 5  # limited to 5 for readability of pie chart

# Explode converts each list into separate rows so each country is counted individually
media_countries = test_disney_df["country"].explode()

# As a precaution, remove missing or empty entries that result from empty lists
media_countries = media_countries.dropna()
media_countries = media_countries[media_countries != ""]   # extra safeguard

# Count how many titles are associated with each country and keep the most frequent ones.
# Note: the pie percentages are shares among these plotted countries only.
top_media_countries = media_countries.value_counts().head(top_n)

# Plot pie chart
plt.figure(figsize=(12, 8))
top_media_countries.plot(
    kind="pie",
    autopct="%1.1f%%",
    startangle=90
)
plt.ylabel("")  # hide default y-axis label
# The title previously claimed "Top 8" while only top_n (5) countries were plotted;
# deriving it from top_n keeps the label and the chart in agreement.
plt.title(f"Share of Disney+ Titles by Country of Production (Top {top_n})")
plt.tight_layout()
plt.show()