# Import the Required Libraries

In [1]:
import html

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
import altair as alt

import folium
from folium.features import DivIcon

# Dataset

## Title: Global YouTube Statistics Dataset 2023

## Description:

The dataset sourced from Kaggle, titled "Global YouTube Statistics 2023," provides a comprehensive exploration of YouTube stardom by unveiling key statistics related to the most subscribed YouTube channels. The dataset is meticulously curated, offering a rich source for analysis and insights into the luminaries of the platform. It encompasses a diverse range of information, including subscriber counts, video views, upload frequency, country of origin, earnings, and more. Aspiring content creators, data enthusiasts, and those intrigued by the dynamic landscape of online content will find this dataset to be a valuable resource.

Reference Link: https://www.kaggle.com/datasets/nelgiriyewithana/global-youtube-statistics-2023
Data Source: The dataset was meticulously compiled from various reputable sources, ensuring accuracy and reliability of the information presented.

# Data Ingestion

In [2]:
# Load the raw Kaggle export; the text encoding is passed explicitly (ISO-8859-1)
youtube_data = pd.read_csv(
    "Global YouTube Statistics.csv",
    encoding='ISO-8859-1',
)
youtube_data.head()

Unnamed: 0,rank,Youtuber,subscribers,video views,category,Title,uploads,Country,Abbreviation,channel_type,...,subscribers_for_last_30_days,year,month,date,Gross tertiary education enrollment (%),Population,Unemployment rate,Urban_population,Latitude,Longitude
0,1,T-Series,245000000,228000000000.0,Music,T-Series,20082,India,IN,Music,...,2000000.0,2006.0,Mar,13.0,28.1,1366418000.0,5.36,471031528.0,20.593684,78.96288
1,2,YouTube Movies,170000000,0.0,Film & Animation,youtubemovies,1,United States,US,Games,...,,2006.0,Mar,5.0,88.2,328239500.0,14.7,270663028.0,37.09024,-95.712891
2,3,MrBeast,166000000,28368840000.0,Entertainment,MrBeast,741,United States,US,Entertainment,...,8000000.0,2012.0,Feb,20.0,88.2,328239500.0,14.7,270663028.0,37.09024,-95.712891
3,4,Cocomelon - Nursery Rhymes,162000000,164000000000.0,Education,Cocomelon - Nursery Rhymes,966,United States,US,Education,...,1000000.0,2006.0,Sep,1.0,88.2,328239500.0,14.7,270663028.0,37.09024,-95.712891
4,5,SET India,159000000,148000000000.0,Shows,SET India,116536,India,IN,Entertainment,...,1000000.0,2006.0,Sep,20.0,28.1,1366418000.0,5.36,471031528.0,20.593684,78.96288


In [3]:
youtube_data.columns

Index(['rank', 'Youtuber', 'subscribers', 'video views', 'category', 'Title',
       'uploads', 'Country', 'Abbreviation', 'channel_type',
       'video_views_rank', 'country_rank', 'channel_type_rank',
       'video_views_for_the_last_30_days', 'lowest_monthly_earnings',
       'highest_monthly_earnings', 'lowest_yearly_earnings',
       'highest_yearly_earnings', 'subscribers_for_last_30_days', 'year',
       'month', 'date', 'Gross tertiary education enrollment (%)',
       'Population', 'Unemployment rate', 'Urban_population', 'Latitude',
       'Longitude'],
      dtype='object')

In [4]:
youtube_data.shape

(995, 28)

# Data Cleaning

In [5]:
# Keep only channels whose creation year is 2005 or later (YouTube launched in 2005).
# Rows with a missing 'year' fail the comparison and are dropped as well.
# .copy() makes the result an independent frame, so the column assignments in
# later cells do not raise SettingWithCopyWarning.
youtube_data = youtube_data[youtube_data['year'] >= 2005].copy()
youtube_data.shape

(989, 28)

### Data Type Conversion:

In [6]:
# Overview of data types
data_types_overview = youtube_data.dtypes
data_types_overview

rank                                         int64
Youtuber                                    object
subscribers                                  int64
video views                                float64
category                                    object
Title                                       object
uploads                                      int64
Country                                     object
Abbreviation                                object
channel_type                                object
video_views_rank                           float64
country_rank                               float64
channel_type_rank                          float64
video_views_for_the_last_30_days           float64
lowest_monthly_earnings                    float64
highest_monthly_earnings                   float64
lowest_yearly_earnings                     float64
highest_yearly_earnings                    float64
subscribers_for_last_30_days               float64
year                           

-- Convert any date-related columns to a date-time format for easier handling in time series analysis.
Other type conversions can be considered based on the specific analysis needs.

In [7]:
youtube_data['date'].head(5), youtube_data['month'].head(5), youtube_data['year'].head(5)

(0    13.0
 1     5.0
 2    20.0
 3     1.0
 4    20.0
 Name: date, dtype: float64,
 0    Mar
 1    Mar
 2    Feb
 3    Sep
 4    Sep
 Name: month, dtype: object,
 0    2006.0
 1    2006.0
 2    2012.0
 3    2006.0
 4    2006.0
 Name: year, dtype: float64)

In [8]:
# Lookup from three-letter month abbreviation to month number
months_mapping = {
    'Jan': 1, 'Feb': 2, 'Mar': 3, 'Apr': 4, 'May': 5, 'Jun': 6,
    'Jul': 7, 'Aug': 8, 'Sep': 9, 'Oct': 10, 'Nov': 11, 'Dec': 12
}

# Build the three date components as integer Series. They are kept as standalone
# Series instead of being written back column-by-column into the filtered frame,
# which avoids chained-assignment warnings.
year_part = youtube_data['year'].astype(int)
month_part = youtube_data['month'].map(months_mapping).astype(int)
day_part = youtube_data['date'].astype(int)

# Assemble zero-padded 'YYYY-MM-DD' strings and parse them;
# combinations that are not valid calendar dates become NaT (errors='coerce').
date_full = pd.to_datetime(
    year_part.astype(str) + '-' +
    month_part.astype(str).str.zfill(2) + '-' +
    day_part.astype(str).str.zfill(2),
    errors='coerce'
)

# Attach the parsed date as 'date_full' and drop the original component columns
youtube_data = (
    youtube_data
    .assign(date_full=date_full)
    .drop(columns=['year', 'month', 'date'])
)

In [9]:
youtube_data.head()

Unnamed: 0,rank,Youtuber,subscribers,video views,category,Title,uploads,Country,Abbreviation,channel_type,...,lowest_yearly_earnings,highest_yearly_earnings,subscribers_for_last_30_days,Gross tertiary education enrollment (%),Population,Unemployment rate,Urban_population,Latitude,Longitude,date_full
0,1,T-Series,245000000,228000000000.0,Music,T-Series,20082,India,IN,Music,...,6800000.0,108400000.0,2000000.0,28.1,1366418000.0,5.36,471031528.0,20.593684,78.96288,2006-03-13
1,2,YouTube Movies,170000000,0.0,Film & Animation,youtubemovies,1,United States,US,Games,...,0.04,0.58,,88.2,328239500.0,14.7,270663028.0,37.09024,-95.712891,2006-03-05
2,3,MrBeast,166000000,28368840000.0,Entertainment,MrBeast,741,United States,US,Entertainment,...,4000000.0,64700000.0,8000000.0,88.2,328239500.0,14.7,270663028.0,37.09024,-95.712891,2012-02-20
3,4,Cocomelon - Nursery Rhymes,162000000,164000000000.0,Education,Cocomelon - Nursery Rhymes,966,United States,US,Education,...,5900000.0,94800000.0,1000000.0,88.2,328239500.0,14.7,270663028.0,37.09024,-95.712891,2006-09-01
4,5,SET India,159000000,148000000000.0,Shows,SET India,116536,India,IN,Entertainment,...,5500000.0,87500000.0,1000000.0,28.1,1366418000.0,5.36,471031528.0,20.593684,78.96288,2006-09-20


### Handle Missing Values:

In [10]:
# Count the null entries in every column
missing_values = youtube_data.isna().sum()

# Keep only the columns that actually contain nulls
has_missing = missing_values.gt(0)
missing_values_overview = missing_values.loc[has_missing]
missing_values_overview

category                                    46
Country                                    120
Abbreviation                               120
channel_type                                27
country_rank                               114
channel_type_rank                           29
video_views_for_the_last_30_days            51
subscribers_for_last_30_days               332
Gross tertiary education enrollment (%)    121
Population                                 121
Unemployment rate                          121
Urban_population                           121
Latitude                                   121
Longitude                                  121
dtype: int64

-- For columns critical to our analysis (like Country, channel_type), we should carefully handle missing values. We might drop rows where these key values are missing.
For socio-economic indicators, we can consider imputation based on country averages or other logical methods if needed in our analysis.
<br></br>

In [11]:
# Key columns for this analysis: date, subscribers, video views, Country,
# channel_type and highest_yearly_earnings. Of these, only Country and
# channel_type contain nulls, so rows missing either of them are removed.
key_columns = ['Country', 'channel_type']
youtube_data = youtube_data.dropna(axis='index', how='any', subset=key_columns)

In [12]:
youtube_data.columns

Index(['rank', 'Youtuber', 'subscribers', 'video views', 'category', 'Title',
       'uploads', 'Country', 'Abbreviation', 'channel_type',
       'video_views_rank', 'country_rank', 'channel_type_rank',
       'video_views_for_the_last_30_days', 'lowest_monthly_earnings',
       'highest_monthly_earnings', 'lowest_yearly_earnings',
       'highest_yearly_earnings', 'subscribers_for_last_30_days',
       'Gross tertiary education enrollment (%)', 'Population',
       'Unemployment rate', 'Urban_population', 'Latitude', 'Longitude',
       'date_full'],
      dtype='object')

In [13]:
# Rename columns to consistent snake_case names that are easy to read and use.
# 'date_full' keeps its name: it already follows the convention and the charts
# below reference it. Every key listed here is an existing column.
column_renames = {
    'Youtuber': 'youtuber',
    'video views': 'video_views',
    'Title': 'title',
    'Country': 'country',
    'Abbreviation': 'abbreviation',
    'video_views_for_the_last_30_days': 'video_views_last_30_days',
    'subscribers_for_last_30_days': 'subscribers_last_30_days',
    'Gross tertiary education enrollment (%)': 'tertiary_education_enrollment',
    'Population': 'population',
    'Unemployment rate': 'unemployment_rate',
    'Urban_population': 'urban_population',
    'Latitude': 'latitude',
    'Longitude': 'longitude',
}

# Reassign rather than rename in place: the frame was derived from a filtered
# one, and in-place mutation of such frames is a common source of pandas warnings.
youtube_data = youtube_data.rename(columns=column_renames)

# Display the updated columns
youtube_data.columns

Index(['rank', 'youtuber', 'subscribers', 'video_views', 'category', 'title',
       'uploads', 'country', 'abbreviation', 'channel_type',
       'video_views_rank', 'country_rank', 'channel_type_rank',
       'video_views_last_30_days', 'lowest_monthly_earnings',
       'highest_monthly_earnings', 'lowest_yearly_earnings',
       'highest_yearly_earnings', 'subscribers_last_30_days',
       'tertiary_education_enrollment', 'population', 'unemployment_rate',
       'urban_population', 'latitude', 'longitude', 'date_full'],
      dtype='object')

# Visualizations

# Comprehensive Insights into YouTube Channel Dynamics: Growth Trends, Top Genres, and Evolution of Key Types

# 1. Youtube Channel's Growth Over Years (Simple)

This line plot illustrates the trend in the creation of YouTube channels from 2005 to 2022, providing a visual representation of the evolving landscape of channel inception over the years. The x-axis represents the creation year, while the y-axis indicates the frequency of channel creation. The chart effectively captures the trend of channel creation, offering insights into the changing dynamics of YouTube's content creator community.

In [14]:
# Horizontal red rule at 50 channels per year, used as a visual reference
reference_line = (
    alt.Chart(pd.DataFrame({'y_value': [50]}))
    .mark_rule(color='red')
    .encode(y='y_value:Q')
)

# Number of channels created in each year, drawn as a smoothed line
chart = (
    alt.Chart(youtube_data)
    .mark_line(interpolate='basis')
    .encode(
        x=alt.X('year(date_full):O', title=None),
        y=alt.Y('count():Q', title='Frequency Of Channels Created'),
    )
    .properties(
        title='Yearly Channel Creation Frequency',
        width=800,
        height=400,
    )
)

# Layer the line with the reference rule and apply the shared font sizes
final_chart = (
    alt.layer(chart, reference_line)
    .configure_axis(labelFontSize=12, titleFontSize=16)
    .configure_title(fontSize=24)
)

final_chart

# 2. Top 10 YouTube Channel Types by Quantity (Simple)

This bar chart showcases the distribution of YouTube channels based on their types, highlighting the top 10 channel types with the highest number of channels. The horizontal bars represent the count of channels in that category, while the vertical axis denotes the channel types. The chart provides valuable insights into the diversity of channel types on the platform, emphasizing the prominence of the top categories in terms of channel quantity. This plot makes it easy to explore the dominance of specific channel types and their prevalence within the YouTube content creator community.

In [15]:
# Count the channels of each type and keep the ten most common types
channel_counts = youtube_data.groupby("channel_type").size().reset_index(name="Count")
top_10_channels_data = channel_counts.sort_values(by="Count", ascending=False).head(10)
# The three most common types are highlighted in the chart
top_3_channel_types = top_10_channels_data.head(3)['channel_type'].tolist()

# Horizontal bar chart built in a single pass: the colour condition is the only
# source of bar colour, and title/size are set once.
chart = alt.Chart(top_10_channels_data).mark_bar().encode(
    alt.X("Count:Q", title="Number of Channels", axis=None),  # axis hidden; counts are shown as labels
    alt.Y("channel_type:N", title="Channel Type", sort='-x'),
    color=alt.condition(
        alt.FieldOneOfPredicate(field='channel_type', oneOf=top_3_channel_types),
        alt.value('#1f77b4'),  # Highlight color
        alt.value('lightblue')  # Non-highlight color
    )
).properties(
    title="Top YouTube Channel Types by Quantity",
    width=600,
    height=500
)

# Text layer showing each bar's count just past the end of the bar
text = chart.mark_text(
    align='left',
    baseline='middle',
    dx=3
).encode(
    text='Count:Q'
)

# Combine the bar chart and text labels
plot = (chart + text).configure_axis(
    labelFontSize=12,
    titleFontSize=16,
).configure_title(
    fontSize=24
)
plot

# 3. Evolution of Top YouTube Channel Types Over Time 

This line plot provides a captivating visual representation of the growth trends among the most popular YouTube channel types: Entertainment, Music, and Games. Each line in the chart vividly traces the frequency of new channel creations over various years. The x-axis represents the year of channel formation, while the y-axis quantifies the burgeoning number of channels.

Distinctively, the plot employs a tailored color scheme to enhance clarity and distinction among the channel types: Entertainment channels are marked in a striking coral (#FF6F61), Music channels in a deep blue (#5E77FF), and Games channels in a vibrant green (#4CAF50). This color differentiation facilitates an intuitive comparison and analysis of each category's growth trajectory.

The chart forgoes traditional legends. Instead, the name of each channel type is annotated directly on the plot, positioned at the last available data point of each respective line. This decision not only streamlines the visual presentation but also emphasizes the most recent data points, making it easier to track the latest trends.

The design of this plot serves more than just an aesthetic purpose; it provides a clear and comprehensive exploration into how these top channel types have evolved and influenced the dynamic landscape of YouTube's content creator community. This chart is not just a visual treat but also a testament to the ever-changing and growing world of digital content creation.

In [16]:
# Define a custom color scheme (also reused by later cells)
custom_color_scheme = {
    'Entertainment': '#FF6F61',
    'Music': '#5E77FF',
    'Games': '#4CAF50'
}
top_channel_types = list(custom_color_scheme.keys())

# Shared base for both layers: same data filter and the same x / y / color
# encodings, so the line and its labels are aggregated identically and neither
# layer requests a legend.
top_types_base = alt.Chart(youtube_data).transform_filter(
    alt.FieldOneOfPredicate(field='channel_type', oneOf=top_channel_types)
).encode(
    alt.X('year(date_full):O', title=None),
    alt.Y('count():Q', title='Number of Channels Created'),
    alt.Color(
        'channel_type:N',
        scale=alt.Scale(domain=top_channel_types, range=list(custom_color_scheme.values())),
        legend=None
    )
)

# Create the line chart
line_chart = top_types_base.mark_line(interpolate='basis').properties(
    title="Evolution of Top YouTube Channel Types Over Time",
    width=800,
    height=400
)

# Create text annotations for the channel types.
# Keep every row from each type's most recent creation year (not just the single
# newest row), so count() for the label equals the value of the line's last point.
text_annotations = top_types_base.transform_calculate(
    created_year='year(datum.date_full)'
).transform_joinaggregate(
    last_year='max(created_year)',
    groupby=['channel_type']
).transform_filter(
    alt.datum.created_year == alt.datum.last_year
).mark_text(
    size=15,
    align='left',
    baseline='middle',
    dx=3,
    dy=-15
).encode(
    text='channel_type:N'
)

# Combine the line chart with the text annotations
plot = (line_chart + text_annotations).configure_axis(
    labelFontSize=12,
    titleFontSize=16
).configure_title(
    fontSize=24
)

# Display the combined plot
plot

# Channels and Different Genres Distribution Over Countries

# 4. Top 10 Countries with the Highest Number of YouTube Channels

This bar chart illustrates the distribution of YouTube channels across countries, highlighting the top 10 nations with the largest number of channels. The horizontal bars represent the count of channels, while the vertical axis denotes the respective countries. The chart provides valuable insights into the geographic distribution of YouTube content creators, emphasizing the prominence of the top countries in terms of channel quantity. It helps to explore the global landscape of YouTube contributions and understand the concentration of content creation within specific nations with this informative visualization.

In [17]:
# Number of channels per country
country_counts = youtube_data.groupby("country").size().reset_index(name="Count")

# Ten countries with the most channels, and the names of the leading three
top_10_countries_data = country_counts.sort_values(by="Count", ascending=False).head(10)
top_3_countries_data = top_10_countries_data.head(3)['country'].tolist()

# Darker blue for the three leading countries, light blue for the rest
highlight_top_3 = alt.condition(
    alt.FieldOneOfPredicate(field='country', oneOf=top_3_countries_data),
    alt.value('#1f77b4'),  # Highlight color
    alt.value('lightblue')  # Non-highlight color
)

# Horizontal bars sorted by count; the x axis is hidden because counts are labelled
chart = (
    alt.Chart(top_10_countries_data)
    .mark_bar()
    .encode(
        x=alt.X("Count:Q", title="Number of Youtube Channels", axis=None),
        y=alt.Y("country:N", title=None, sort='-x'),
        color=highlight_top_3,
    )
    .properties(
        title="Top Countries with Highest Number of YouTube Channels",
        width=600,
        height=500,
    )
)

# Count label placed just past the end of each bar
text = chart.mark_text(align='left', baseline='middle', dx=3).encode(text='Count:Q')

plot = (
    alt.layer(chart, text)
    .configure_axis(labelFontSize=12, titleFontSize=16)
    .configure_title(fontSize=24)
)
plot

# 5. Types of Channels in Top Countries with Highest Channels

This faceted bar chart delves into the distribution of YouTube channel types within the top three countries—United States, India, and Brazil—each recognized for hosting the highest number of channels. The horizontal bars illustrate the count of channels, categorized by type, while the vertical axis represents the respective channel types. The custom color scheme distinguishes between the three countries: United States (blue), India (orange), and Brazil (deep orange). The faceted layout allows for a side-by-side comparison of channel types across the selected countries, providing a comprehensive view of the content creation landscape. This visualization aids in understanding the content preferences and diversity within the top YouTube-contributing nations.

In [18]:
# Define the custom sorting order for countries as per the channel count in each country.
custom_sort_order = ["United States", "India", "Brazil"]

# Filter data for the top 3 countries with the highest number of YouTube channels
# (United States, India, Brazil)
selected_countries = custom_sort_order
filtered_data = youtube_data[youtube_data["country"].isin(selected_countries)]

# Calculate the count of channel types for each country
channel_counts_by_country = filtered_data.groupby(["country", "channel_type"]).size().reset_index(name="Count")

# Define the custom colors for countries
# Used the same colors for these countries throughout the project
custom_colors = {'United States': '#3498db', 'India': '#f39c12', 'Brazil': '#d35400'}

# Create the bar chart
bars = alt.Chart(channel_counts_by_country).mark_bar().encode(
    alt.Y("channel_type:N", title="Channel Type", sort='-x'),
    alt.X("Count:Q", axis = None),
    alt.Color("country:N", title="Country", scale=alt.Scale(domain=list(custom_colors.keys()), range=list(custom_colors.values())), legend=None)
).properties(
    width=250,
    height=400
)

# Create the text labels
text = bars.mark_text(
    align='left',
    baseline='middle',
    dx=2
).encode(
    text='Count:Q'
)

# One panel per country, laid out as a wrapped facet with three columns.
# alt.Facet is the channel class for a wrapped facet (the one `columns` applies to).
chart = alt.layer(bars, text).facet(
    alt.Facet('country:N',
              header=alt.Header(title = 'Country',titleFontSize=20, labelFontSize=16),
              sort=custom_sort_order),
    columns=3
).properties(
    title={
        "text": ["Channel Types by Quantity for Top Countries"],
        "dx": 200,
        "fontSize": 24,
        "offset": 20,
    }
).configure_axis(
    labelFontSize=12,
    titleFontSize=16
).configure_title(
    fontSize=24
)

chart

# 6. Top YouTuber in Top Countries with Highest Number of Channels (Complex)

This map offers a geographic perspective on the most prominent YouTubers within the top countries boasting the highest number of channels. By pinpointing the top content creators in each country, the map enhances our understanding of the global YouTube landscape. Each red star icon represents a leading YouTuber, and the marker's popup gives the creator's name along with their country of origin. A text label with the creator's name is always displayed next to the marker, so the top YouTubers can be identified without interacting with the map. The map allows users to explore the distribution of top YouTubers across different regions, emphasizing the international influence of these content creators. This visualization seamlessly integrates geographical and subscriber data, offering a captivating glimpse into the world of YouTube stardom on a global scale.

In [19]:
df = youtube_data

# The ten countries with the most channels
top_countries = df['country'].value_counts().head(10).index.tolist()

# Most-subscribed channel in each of those countries.
# idxmax returns the index label of the first row holding the group's maximum,
# and the result is ordered by country. This avoids groupby.apply, which depends
# on the grouping column being passed to the applied function.
in_top_countries = df[df['country'].isin(top_countries)]
top_youtubers = in_top_countries.loc[
    in_top_countries.groupby('country')['subscribers'].idxmax()
].reset_index(drop=True)

# Select relevant data for mapping; rows without coordinates cannot be placed on the map
top_youtubers_map_data = (
    top_youtubers[['youtuber', 'country', 'latitude', 'longitude']]
    .dropna(subset=['latitude', 'longitude'])
)

# Initialize a map
m = folium.Map(location=[20, 0], zoom_start=2)

def add_marker(map_obj, location, popup_text, label_text):
    """Add a red star marker with a popup, plus an always-visible text label.

    Parameters
    ----------
    map_obj : the folium map the two markers are added to.
    location : (latitude, longitude) pair used for both markers.
    popup_text : text shown in the popup of the star marker.
    label_text : text rendered next to the marker via a DivIcon.
    """
    # Marker with popup (parse_html=True is passed so folium handles the popup text)
    folium.Marker(
        location=location,
        popup=folium.Popup(popup_text, parse_html=True),
        icon=folium.Icon(color='red', icon='star')
    ).add_to(map_obj)

    # The label comes from the dataset and is embedded in raw HTML below,
    # so escape it to keep markup characters in a channel name inert.
    safe_label = html.escape(str(label_text))

    # Custom text label
    folium.map.Marker(
        location=location,
        icon=DivIcon(
            icon_size=(150,36),
            icon_anchor=(0,0),
            html=f'<div style="font-size: 10pt">{safe_label}</div>',
        )
    ).add_to(map_obj)

# Place one star marker (with popup and visible label) per top YouTuber
for row in top_youtubers_map_data.itertuples(index=False):
    add_marker(
        m,
        (row.latitude, row.longitude),
        f"{row.youtuber} from {row.country}",
        row.youtuber,
    )

m

# Subscriber Analysis, Channel Activeness, Subscribers and Growth

# 7. Average Subscribers Evolution Over Channel Creation Years

This line plot provides a compelling insight into the evolving landscape of YouTube channels by illustrating the average number of subscribers over the years of their creation. The x-axis represents the creation year, offering a chronological view, while the y-axis quantifies the average number of subscribers. The smooth line, generated using basis interpolation, captures trends in subscriber engagement over time. This visualization enables a nuanced understanding of how audience reach has evolved across the platform, highlighting periods of notable growth or shifts in subscriber dynamics.

*Note: The data is based on the mean number of subscribers for channels created each year.*

In [20]:
# Mean subscriber count of the channels created in each year, as a smoothed line
chart = (
    alt.Chart(youtube_data)
    .mark_line(interpolate='basis')
    .encode(
        x=alt.X('year(date_full):O', title=None),
        y=alt.Y(
            'mean(subscribers):Q',
            title="Average Subscribers (Millions)",
            axis=alt.Axis(format='.2s'),
        ),
    )
    .properties(
        title="Average Subscribers For Channels Created Over Years",
        width=800,
        height=400,
    )
    .configure_axis(labelFontSize=12, titleFontSize=16)
    .configure_title(fontSize=24)
)

chart

# 8. Average Subscribers Evolution Over Channel Creation Years for Top YouTube Channel Types

This line plot provides a comprehensive view of the evolving landscape of YouTube channels by depicting the average number of subscribers over the years of their creation. The x-axis showcases the chronological progression of channel creation years, while the y-axis quantifies the average number of subscribers in millions. The smooth lines, distinguished by a custom color scheme for Entertainment (coral), Music (deep blue), and Games (vibrant green), offer insights into the subscriber engagement trends over time. This visualization enables a nuanced understanding of how audience reach has evolved for the top channel types—Entertainment, Music, and Games—highlighting periods of notable growth or shifts in subscriber dynamics.

In [21]:
# Channel types to plot, taken from the custom color scheme defined earlier in the notebook
top_channel_types = list(custom_color_scheme.keys())

# Shared base for both layers: same data filter and the same x / y / color
# encodings, so the line and its labels are aggregated identically and neither
# layer requests a legend.
top_types_base = alt.Chart(youtube_data).transform_filter(
    alt.FieldOneOfPredicate(field='channel_type', oneOf=top_channel_types)
).encode(
    alt.X('year(date_full):O', title=None),
    alt.Y('mean(subscribers):Q', title="Average Subscribers (Millions)", axis=alt.Axis(format='.2s')),
    alt.Color(
        'channel_type:N',
        scale=alt.Scale(domain=top_channel_types, range=list(custom_color_scheme.values())),
        legend=None
    )
)

# Creating altair chart
chart = top_types_base.mark_line(interpolate='basis').properties(
    title='Average Subscribers For Frequent Channel Types Created Each Year',
    width=800,
    height=400
)

# Create text annotations for the channel types.
# Keep every row from each type's most recent creation year (not just the single
# newest row), so mean(subscribers) for the label equals the line's last point.
text_annotations = top_types_base.transform_calculate(
    created_year='year(datum.date_full)'
).transform_joinaggregate(
    last_year='max(created_year)',
    groupby=['channel_type']
).transform_filter(
    alt.datum.created_year == alt.datum.last_year
).mark_text(
    size=15,
    align='left',
    baseline='middle',
    dx=3,
    dy=-15
).encode(
    text='channel_type:N'
)

plot = (chart + text_annotations).configure_axis(
    labelFontSize=12,
    titleFontSize=16
).configure_title(
    fontSize=24
)

# Display the combined plot
plot

# 9. Average Video Views Evolution Over Channel Creation Years for Top YouTube Channel Types

This line plot offers a comprehensive overview of the evolution of average video views for top YouTube channel types—Entertainment, Music, and Games—across different years of their creation. The x-axis represents the chronological progression of channel creation years, while the y-axis quantifies the mean total video views, displayed in a user-friendly format. The smooth lines, distinguished by a custom color scheme, provide insights into how video views have changed over time for each channel type. The plot allows for a nuanced exploration of trends, highlighting the impact of content creators in the Entertainment, Music, and Games categories on the platform.

In [22]:
# Channel types to plot, taken from the custom color scheme defined earlier in the notebook
top_channel_types = list(custom_color_scheme.keys())

# Shared base for both layers: same data filter and the same x / y / color
# encodings, so the line and its labels are aggregated identically and neither
# layer requests a legend.
top_types_base = alt.Chart(youtube_data).transform_filter(
    alt.FieldOneOfPredicate(field='channel_type', oneOf=top_channel_types)
).encode(
    alt.X('year(date_full):O', title=None),
    alt.Y('mean(video_views):Q', axis=alt.Axis(format='~s', title='Video Views(in Billions)')),
    alt.Color(
        'channel_type:N',
        scale=alt.Scale(domain=top_channel_types, range=list(custom_color_scheme.values())),
        legend=None
    )
)

# Mean total video views of the channels created in each year, one line per type
chart = top_types_base.mark_line(interpolate='monotone').properties(
    title='Average Video Views For Frequent Channel Types Created Each Year',
    width=800,
    height=400
)

# Create text annotations for the channel types.
# Keep every row from each type's most recent creation year (not just the single
# newest row), so mean(video_views) for the label equals the line's last point.
text_annotations = top_types_base.transform_calculate(
    created_year='year(datum.date_full)'
).transform_joinaggregate(
    last_year='max(created_year)',
    groupby=['channel_type']
).transform_filter(
    alt.datum.created_year == alt.datum.last_year
).mark_text(
    align='left',
    baseline='middle',
    dx=3,
    dy=-8
).encode(
    text='channel_type:N'
)


plot = (chart + text_annotations).configure_axis(
    labelFontSize=12,
    titleFontSize=16,
    labelAngle=360
).configure_title(
    fontSize=24
)

plot

# 10. Unveiling the Dynamics: Video Views vs Subscribers in Channel Growth

This captivating scatter plot explores the intricate relationship between the number of video views and channel subscribers on YouTube. The x-axis represents the total number of video views, while the y-axis illustrates the count of channel subscribers. Each circle on the plot signifies a unique YouTube channel, and the scatter plot is augmented by a revealing regression line that captures the overarching trend.

The position of each point in relation to the regression line provides insights into how video views correlate with subscriber growth. Content creators and analysts can use this visualization to inform strategies, emphasizing the critical interplay between views and subscriber acquisition. This visual analysis serves as a powerful tool for creators and enthusiasts alike, shedding light on the underlying dynamics that contribute to the growth of YouTube channels.

In [23]:
# A channel counts as an outlier when it has more than 100M subscribers
# but fewer than 40B video views; the same test drives both colouring and labelling
is_outlier = (alt.datum.subscribers > 100e6) & (alt.datum.video_views < 40e9)

# One circle per channel: total views on x, subscribers on y, outliers in red
scatter_plot = (
    alt.Chart(youtube_data)
    .mark_circle()
    .encode(
        x=alt.X('video_views', title="Number of Video Views (Billions)", axis=alt.Axis(format='~s')),
        y=alt.Y('subscribers', title="Number of Subscribers (Millions)", axis=alt.Axis(format='.2s')),
        color=alt.condition(
            is_outlier,
            alt.value('red'),  # The color for outliers
            alt.value('#1f77b4')  # The color for non-outliers
        ),
        # Tooltip shows the YouTuber's name together with both plotted measures
        tooltip=['youtuber', 'video_views', 'subscribers'],
    )
    .properties(
        title="Views vs Subs: Unveiling the Magic Behind Channel Growth",
        width=600,
        height=500,
    )
)

# Restrict the scatter data to the outliers so they can be labelled
outliers = scatter_plot.transform_filter(is_outlier)

# Name labels for the outliers, nudged slightly right (dx) and down (dy) of the point
text_labels = outliers.mark_text(
    align='left',
    baseline='middle',
    dx=6.5,
    dy=1.25,
).encode(
    x='video_views',
    y='subscribers',
    text='youtuber',
)

# Regression of subscribers on video views, drawn as a line
regression_line = scatter_plot.transform_regression('video_views', 'subscribers').mark_line()

# Layer scatter, regression line and labels, then apply the shared font sizes
plot = (
    (scatter_plot + regression_line + text_labels)
    .configure_axis(labelFontSize=12, titleFontSize=16)
    .configure_title(fontSize=24)
)


plot


# 11. Content Marathon: Charting the Top 10 Most Prolific YouTube Channels! (Moderate)

Dive into the realm of content creation as we unveil the most active YouTube channels based on sheer volume. This bar chart showcases the champions of uploads, revealing the top 10 channels that have embarked on a content marathon. The y-axis proudly displays the names of these prolific channels, sorted in descending order of their upload counts, while the x-axis quantifies the impressive number of uploads.
It helps in identifying the YouTube giants consistently feeding the platform with a high volume of content and exploring the dedication and work ethic of these top channels through their prolific upload counts. We can understand how these channels contribute to the vibrant and ever-growing landscape of YouTube.

This visualization offers a captivating glimpse into the world of YouTube's most prolific uploaders.

In [24]:
# Ten channels with the highest upload counts, ordered from most to fewest uploads
top_10_uploads = (
    youtube_data
    .sort_values(by='uploads', ascending=False)
    .head(10)
)

# Horizontal bars: channel names on y (sorted by descending x value), uploads on x.
# The x axis itself is hidden; the text layer below prints each bar's value instead.
chart = (
    alt.Chart(top_10_uploads)
    .mark_bar()
    .encode(
        y=alt.Y('youtuber:N', title='Youtube Channel', sort='-x'),
        x=alt.X('uploads:Q', title='Number of Uploads', axis=None),
    )
    .properties(
        width=600,
        height=400,
        title='Most Active Channels by Number of Uploads',
    )
)

# Value labels: reuse the bar encoding and add the upload count as text,
# shifted horizontally by dx so it does not sit on the bar end
text = chart.mark_text(align='center', baseline='bottom', dx=20).encode(
    text=alt.Text('uploads:Q'),
)

# Layer bars and labels, then apply the shared font sizing
plot = (
    alt.layer(chart, text)
    .configure_axis(labelFontSize=12, titleFontSize=16)
    .configure_title(fontSize=24)
)

plot

# 12. Top 10 Channels by Subscriber Count (Moderate)

This dynamic bar chart unveils the power players of the YouTube landscape by showcasing the top 10 channels based on their staggering subscriber counts. The y-axis proudly lists the channel names, meticulously sorted for visual clarity, while the x-axis quantifies the sheer magnitude of subscribers each channel commands.

The accompanying text on each bar reveals the exact subscriber count, formatted for enhanced readability. Whether you're an enthusiast keeping tabs on digital trends or a creator seeking inspiration, this visualization provides a snapshot of YouTube's subscriber royalty.

In [25]:
# Identify the ten channels with the largest subscriber counts
top_channels = youtube_data.nlargest(10, 'subscribers')

# Create the Altair chart from `top_channels` (the subscriber ranking computed
# above), NOT from the upload ranking left over from the previous section, so
# that the bars actually match the "Top 10 Channels by Subscriber Count" title.
chart = alt.Chart(top_channels).mark_bar().encode(
    # Channel names on the y axis, ordered by descending subscriber count
    alt.Y('youtuber:N', title='Youtube Channel', sort = '-x'),
    # The x axis is hidden; the text layer below prints each bar's value instead
    alt.X('subscribers:Q', title='Number of Subscribers', axis=None)
).properties(
    title='Top 10 Channels by Subscriber Count',
    width=600,
    height=400
)

# Text layer showing the subscriber count on each bar in SI notation
text = chart.mark_text(
    align='center',
    baseline='bottom',
    dx=15  # Adjust the text position
).encode(
    text=alt.Text('subscribers:Q', format='.2s')
)

# Combine the chart and text layers
plot = (chart + text).configure_axis(
    labelFontSize=12,
    titleFontSize=16
).configure_title(
    fontSize=24
)


plot

**We can see that some of the channels that upload videos most actively also have more subscribers.**

# 13. Maximum Yearly Earnings for Each YouTube Channel Type (Moderate)

This insightful bar chart unveils the maximum yearly earnings achieved by YouTube channels across various channel types. Each bar represents a specific channel category, showcasing the pinnacle of financial success in the realm of content creation. The y-axis quantifies the maximum yearly earnings in a visually digestible format, allowing for easy comparison between different channel types. Whether exploring the lucrative heights of Entertainment, Music, or Gaming channels, this visualization provides a snapshot of the exceptional financial milestones attained by diverse content creators. The chart's intuitive design emphasizes the diversity of earnings within the YouTube ecosystem, highlighting the standout performers in each channel category. Explore the financial peaks of YouTube content creation with this compelling representation of Maximum Yearly Earnings for Each YouTube Channel Type.

In [26]:
# Horizontal bars with one bar per channel type; the bar length is the largest
# `highest_yearly_earnings` value found within that type. Types are sorted by
# descending x value and the x axis is hidden in favour of per-bar labels.
chart = (
    alt.Chart(youtube_data)
    .mark_bar()
    .encode(
        y=alt.Y("channel_type:N", title="Channel Type", sort='-x'),
        x=alt.X("max(highest_yearly_earnings):Q", title="Maximum Yearly Earnings", axis=None),
    )
    .properties(
        title="Maximum Yearly Earnings for Each YouTube Channel Type",
        width=800,
        height=500,
    )
)

# Value labels: the same aggregate rendered as dollar amounts in SI notation,
# shifted horizontally by dx
text = chart.mark_text(align='center', baseline='bottom', dx=20).encode(
    text=alt.Text('max(highest_yearly_earnings):Q', format='$.2s'),
)

# Layer bars and labels, then apply the shared font sizing
plot = (
    alt.layer(chart, text)
    .configure_axis(labelFontSize=12, titleFontSize=16)
    .configure_title(fontSize=24)
)

plot

# 14. Yearly Earnings Comparison Across Top 3 Countries: United States, India, Brazil (Complex)

This visual representation provides a comprehensive overview of the maximum yearly earnings for various YouTube channel types in the top three countries by number of channels—United States, India, and Brazil. Each panel of the faceted chart corresponds to one country, and each bar within a panel shows the highest yearly earnings recorded for a specific channel type. The distinctive colors differentiate between the countries, allowing for a clear comparison of how the earning potential of each channel type varies from country to country. Explore the financial dynamics of the top YouTube-contributing nations and gain insights into the lucrative domains within the diverse content creation ecosystem.

In [27]:
# Keep only the rows whose country is one of the three selected below
# (described in this notebook as the countries with the most YouTube channels)
selected_countries = ["United States", "India", "Brazil"]
country_mask = youtube_data["country"].isin(selected_countries)
filtered_data = youtube_data[country_mask]

# Panel ordering for the facet and the fixed colour assigned to each country
custom_sort_order = ["United States", "India", "Brazil"]
custom_colors = {'United States': '#3498db', 'India': '#f39c12', 'Brazil': '#d35400'}

# Bars: per channel type, the largest `highest_yearly_earnings` value, coloured
# by country. The legend is switched off because each facet panel is already
# headed by its country name; the x axis is hidden in favour of per-bar labels.
chart = (
    alt.Chart(filtered_data)
    .mark_bar()
    .encode(
        y=alt.Y("channel_type:N", title="Channel Type", sort='-x'),
        x=alt.X("max(highest_yearly_earnings):Q", title="Yearly Earnings", axis=None),
        color=alt.Color(
            "country:N",
            title="Country",
            scale=alt.Scale(
                domain=list(custom_colors),
                range=list(custom_colors.values()),
            ),
            legend=None,
        ),
    )
    .properties(width=250, height=400)
)

# Value labels: the same aggregate rendered as dollar amounts in SI notation
text = chart.mark_text(align='center', baseline='bottom', dx=15).encode(
    text=alt.Text('max(highest_yearly_earnings):Q', format='$.2s'),
)

# Header styling shared by the country panels
country_header = alt.Header(title='Country', titleFontSize=20, labelFontSize=16)

# One panel per country, in the custom order.
# NOTE(review): a Row channel is passed as the positional `facet` argument
# together with columns=3 -- confirm whether alt.Facet was the intended channel.
plot = (
    (chart + text)
    .facet(
        alt.Row('country:N', header=country_header, sort=custom_sort_order),
        columns=3,
    )
    .properties(
        title={
            "text": ["Maximum Yearly Earnings Comparison Across Top 3 Countries"],
            "dx": 150,
            "fontSize": 24,
            "offset": 20,
        }
    )
    .configure_axis(labelFontSize=12, titleFontSize=16)
    .configure_title(fontSize=24)
)

plot