In [1]:
# Import packages
from googleapiclient.discovery import build
from dateutil import parser
import pandas as pd
from IPython.display import JSON

# Data visualization packages
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import plotly.graph_objects as go

# Natural language processing packages
# Natural Language Toolkit
import nltk
# Stop words are words that you want to ignore, and can be filtered out
from nltk.corpus import stopwords
# Tokenizing allows you to split up text by word or by sentence
from nltk.tokenize import word_tokenize
nltk.download('stopwords')
nltk.download('punkt')
# Wordcloud is a technique to show which words are the most frequent in a given text
from wordcloud import WordCloud

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [2]:
# YouTube API key
# The key is read from the YOUTUBE_API_KEY environment variable so that it is
# never stored in the notebook. A key was previously hardcoded on this line;
# treat that key as exposed and rotate it.
import os

api_key = os.environ.get('YOUTUBE_API_KEY')
if not api_key:
    raise RuntimeError(
        "Set the YOUTUBE_API_KEY environment variable to a YouTube Data API key "
        "before running this cell."
    )

# List of Popular YouTube Channels for Kids
channel_ids = [
           'UCbCmjCuTUZos6Inko4u57UQ',  # Cocomelon
           'UCX6OQ3DkcsbYNE6H8uQQuVA',  # Mr Beast
           'UCk8GzjMOrta8yxDcKfylJYw',  # Kids Diana Show
           'UCJplp5SjeGSdVdwsfb9Q7lQ',  # Like Nastya
           'UChGJGhZ9SOOHvBB0Y4DOO_w'   # Ryan's World
            ]

# List of Playlist IDs for each channel.
# Each ID is the matching channel ID with the leading 'UC' replaced by 'UU'.
# Note the order is not the same as channel_ids: Like Nastya and
# Kids Diana Show are swapped here.
playlist_ids = [
             "UUbCmjCuTUZos6Inko4u57UQ",  # Cocomelon
             "UUX6OQ3DkcsbYNE6H8uQQuVA",  # Mr Beast
             "UUJplp5SjeGSdVdwsfb9Q7lQ",  # Like Nastya
             "UUk8GzjMOrta8yxDcKfylJYw",  # Kids Diana Show
             "UUhGJGhZ9SOOHvBB0Y4DOO_w"   # Ryan's World
            ]


# Create the YouTube API client
youtube = build('youtube', 'v3', developerKey=api_key)

# Function to get video IDs from playlists
def get_video_ids(youtube, playlist_ids, max_pages=1):
    """Collect the video IDs contained in the given playlists.

    Parameters
    ----------
    youtube : API client
        Object exposing ``playlistItems().list(...).execute()``.
    playlist_ids : iterable of str
        Playlists to read, processed in the order given.
    max_pages : int or None, default 1
        Maximum number of result pages (50 items each) to request per
        playlist. The default of 1 keeps the original behaviour of reading
        only the first page; pass ``None`` to follow ``nextPageToken`` until
        the playlist is exhausted.

    Returns
    -------
    list of str
        Video IDs, grouped by playlist in input order.
    """
    all_video_ids = []

    for playlist_id in playlist_ids:
        page_token = None
        pages_fetched = 0

        while max_pages is None or pages_fetched < max_pages:
            params = {
                'part': "contentDetails",
                'playlistId': playlist_id,
                'maxResults': 50,
            }
            # Only send pageToken for follow-up pages so the first request
            # is exactly the one the original code issued.
            if page_token:
                params['pageToken'] = page_token

            response = youtube.playlistItems().list(**params).execute()
            pages_fetched += 1

            all_video_ids.extend(
                item['contentDetails']['videoId']
                for item in response.get('items', [])
            )

            page_token = response.get('nextPageToken')
            if not page_token:
                break  # no further pages for this playlist

    return all_video_ids

# Function to get channel stats
def get_channel_stats(youtube, channel_ids):
    """Fetch headline statistics for the given channels in a single request.

    Parameters
    ----------
    youtube : API client
        Object exposing ``channels().list(...).execute()``.
    channel_ids : iterable of str
        Channel IDs; they are sent comma-joined in one ``id`` parameter.

    Returns
    -------
    pandas.DataFrame
        One row per item in the response with the columns ``channelName``,
        ``subscribers``, ``views``, ``totalVideos`` and ``playlistId``.
        Values are stored exactly as they appear in the response.
    """
    response = youtube.channels().list(
        part="snippet,contentDetails,statistics",
        id=','.join(channel_ids)
    ).execute()

    records = [
        {
            'channelName': channel['snippet']['title'],
            'subscribers': channel['statistics']['subscriberCount'],
            'views': channel['statistics']['viewCount'],
            'totalVideos': channel['statistics']['videoCount'],
            'playlistId': channel['contentDetails']['relatedPlaylists']['uploads'],
        }
        for channel in response['items']
    ]

    return pd.DataFrame(records)

# Function to get video details
def get_video_details(youtube, video_ids):
    """Fetch snippet, statistics and content details for the given videos.

    IDs are sent in batches of 50 per request. For every returned video a
    fixed set of fields is extracted; a field (or a whole section) that is
    absent from the response is recorded as ``None`` instead of raising.

    Parameters
    ----------
    youtube : API client
        Object exposing ``videos().list(...).execute()``.
    video_ids : sequence of str
        Video IDs to look up.

    Returns
    -------
    pandas.DataFrame
        One row per returned video with ``video_id`` followed by the
        extracted fields. Empty when ``video_ids`` is empty.
    """
    # Section of the API response -> fields to keep from that section.
    # Defined once here rather than once per video.
    stats_to_keep = {
        'snippet': ['channelTitle', 'title', 'description', 'tags', 'publishedAt'],
        'statistics': ['viewCount', 'likeCount', 'favoriteCount', 'commentCount'],
        'contentDetails': ['duration', 'definition', 'caption']
    }
    batch_size = 50

    all_video_info = []

    for start in range(0, len(video_ids), batch_size):
        response = youtube.videos().list(
            part="snippet,contentDetails,statistics",
            id=','.join(video_ids[start:start + batch_size])
        ).execute()

        for video in response['items']:
            video_info = {'video_id': video['id']}

            for section, fields in stats_to_keep.items():
                # .get() covers a missing section or field explicitly,
                # without a bare except that would also hide unrelated errors.
                section_data = video.get(section) or {}
                for field in fields:
                    video_info[field] = section_data.get(field)

            all_video_info.append(video_info)

    return pd.DataFrame(all_video_info)

# Get video IDs from playlists
video_ids = get_video_ids(youtube, playlist_ids)

# Get channel statistics
channel_df = get_channel_stats(youtube, channel_ids)

# Get video details
video_df = get_video_details(youtube, video_ids)

# Rename the 'channelTitle' column in video_df to 'channelName' so both
# frames share the join key (non-mutating, so re-running the cell is safe).
video_df = video_df.rename(columns={'channelTitle': 'channelName'})

# Merge the two dataframes on channelName.
# The result is deliberately NOT called merged_df: the cleaning function
# defined below has that name and would overwrite the frame.
channel_video_df = pd.merge(channel_df, video_df, on='channelName')
new_df = channel_video_df.copy()

# Convert the channel-level counts to float in one step. (Earlier versions
# formatted them as comma-separated strings and then parsed them back.)
# Missing values in 'tags' and 'commentCount' are left untouched here.
new_df = new_df.astype({
    'subscribers': float,
    'views': float,
    'totalVideos': float,
})


# Clean the data if necessary
def merged_df(df):
    """Clean a merged channel/video frame in place and return it.

    The frame passed in is modified directly; the same object is returned.

    Steps:
      * ``subscribers``, ``views`` and ``totalVideos`` are converted with
        ``pd.to_numeric``.
      * ``duration`` is passed through ``convert_duration_to_seconds``.
      * ``publishedAt`` is parsed with ``pd.to_datetime``.
      * ``tags`` are lower-cased and stripped of characters that are neither
        word characters nor whitespace. A list of tags is cleaned element by
        element; a plain string is cleaned as a whole; anything else
        (e.g. ``None``) is left as is.
      * ``description`` and ``title`` are passed through ``remove_stopwords``.

    NOTE(review): ``convert_duration_to_seconds`` and ``remove_stopwords``
    are not defined in the visible part of this notebook; they must exist
    before this function is called. ``remove_stopwords`` is presumably
    responsible for its own stop-word list — confirm.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns listed above.

    Returns
    -------
    pandas.DataFrame
        The same frame, cleaned.
    """
    import re

    punctuation = re.compile(r'[^\w\s]')

    def _clean_tags(tags):
        # Normalise one cell of the 'tags' column.
        if isinstance(tags, str):
            return punctuation.sub('', tags.lower())
        if isinstance(tags, (list, tuple)):
            return [punctuation.sub('', tag.lower()) for tag in tags]
        return tags

    # Convert numeric columns to appropriate data types
    for column in ('subscribers', 'views', 'totalVideos'):
        df[column] = pd.to_numeric(df[column])

    # Convert duration to seconds. The result is kept as returned by the
    # helper; it is not passed to pd.to_datetime, which would reinterpret a
    # number of seconds as a timestamp.
    df['duration'] = df['duration'].apply(convert_duration_to_seconds)

    # Convert publishedAt to datetime
    df['publishedAt'] = pd.to_datetime(df['publishedAt'])

    # Remove unnecessary characters from tags and convert to lowercase
    df['tags'] = df['tags'].apply(_clean_tags)

    # Remove stopwords from description and title
    df['description'] = df['description'].apply(remove_stopwords)
    df['title'] = df['title'].apply(remove_stopwords)

    return df




In [None]:
# Print the dtype of every column in new_df to check the conversions above.
print(new_df.dtypes)

channelName       object
subscribers      float64
views            float64
totalVideos      float64
playlistId        object
video_id          object
title             object
description       object
tags              object
publishedAt       object
viewCount         object
likeCount         object
favoriteCount     object
commentCount      object
duration          object
definition        object
caption           object
dtype: object


In [3]:
# Function to plot a bar chart of subscriber counts
def plot_channel_subscribers(subscribers):
    """Show a bar chart of ``subscribers['subscribers']`` by channel name.

    Parameters
    ----------
    subscribers : table-like
        Must provide ``'channelName'`` (bar positions) and ``'subscribers'``
        (bar heights) when indexed by key.
    """
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.bar(subscribers['channelName'], subscribers['subscribers'])
    ax.set(
        xlabel='Channel',
        ylabel='Subscriber Count',
        title='Subscriber Counts of YouTube Channels for Kids',
    )
    # Tilt the channel names so long labels do not overlap.
    plt.xticks(rotation=45)
    plt.show()

# Function to plot subscribers against total views as a scatter plot
def plot_subscriber_views(subscribers, views):
    """Show a scatter plot of subscriber count (x) against total views (y).

    Parameters
    ----------
    subscribers : table-like
        Must provide ``'subscribers'`` when indexed by key (x values).
    views : table-like
        Must provide ``'views'`` when indexed by key (y values).
    """
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.scatter(subscribers['subscribers'], views['views'])
    ax.set(
        xlabel='Subscriber Count',
        ylabel='Total Views',
        title='Relationship between Subscriber Count and Total Views',
    )
    plt.show()

# Function to plot subscribers and views over time as line plots
def plot_growth_patterns(subscribers, views):
    """Show two lines on one axes: subscribers and total views by date.

    Parameters
    ----------
    subscribers : table-like
        Must provide ``'publishedAt'`` (x) and ``'subscribers'`` (y).
    views : table-like
        Must provide ``'publishedAt'`` (x) and ``'views'`` (y).
    """
    fig, ax = plt.subplots(figsize=(10, 6))

    # One line per metric, both drawn against their own publishedAt values.
    ax.plot(subscribers['publishedAt'], subscribers['subscribers'], label='Subscribers')
    ax.plot(views['publishedAt'], views['views'], label='Total Views')

    ax.set(
        xlabel='Date',
        ylabel='Count',
        title='Growth Patterns of YouTube Channels for Kids',
    )
    ax.legend()
    plt.xticks(rotation=45)
    plt.show()

In [4]:
import plotly.graph_objects as go

# Data for growth analysis
# NOTE(review): these round figures look like hand-entered sample values
# rather than API results — confirm before drawing conclusions from the chart.
_monthly_dates = ['2023-01-01', '2023-02-01', '2023-03-01']
_monthly_rows = [
    # (channel, subscribers, views, totalVideos) — one value per date above
    ('Like Nastya',
     [1000000, 1500000, 2000000], [5000000, 6000000, 7000000], [100, 120, 140]),
    ('Cocomelon - Nursery Rhymes',
     [2000000, 2500000, 3000000], [10000000, 11000000, 12000000], [200, 220, 240]),
    ('Kids Diana Show',
     [500000, 600000, 700000], [2000000, 2500000, 3000000], [50, 60, 70]),
    ('MrBeast',
     [30000000, 35000000, 40000000], [50000000, 60000000, 70000000], [500, 550, 600]),
    ('Ryans World',
     [15000000, 18000000, 20000000], [25000000, 28000000, 30000000], [300, 320, 340]),
]
growth_data = {
    name: {
        'publishedAt': list(_monthly_dates),
        'subscribers': subs,
        'views': views,
        'totalVideos': videos,
    }
    for name, subs, views, videos in _monthly_rows
}

# Create Plotly traces for each creator: three lines per channel, added in
# the order subscribers, views, total videos.
fig = go.Figure()
for channelName, data in growth_data.items():
    dates = data['publishedAt']
    fig.add_traces([
        go.Scatter(x=dates, y=data['subscribers'], name=f"{channelName} - Subscribers"),
        go.Scatter(x=dates, y=data['views'], name=f"{channelName} - Views"),
        go.Scatter(x=dates, y=data['totalVideos'], name=f"{channelName} - Total Videos"),
    ])

# Set layout and show the plot
fig.update_layout(xaxis_title="Published Date", yaxis_title="Count")
fig.show()


In [5]:

# Data for growth analysis (yearly)
# NOTE(review): the values look hand-entered; 'views' for MrBeast and
# Ryans World go down in 2022 — confirm whether that is intended.
_years = [2020, 2021, 2022]
_yearly_rows = [
    # (channel, subscribers, views, totalVideos) — one value per year above
    ('Like Nastya',
     [500000, 1000000, 2000000], [2000000, 4000000, 7000000], [50, 80, 140]),
    ('Cocomelon - Nursery Rhymes',
     [1000000, 2000000, 3000000], [4000000, 8000000, 12000000], [100, 160, 240]),
    ('Kids Diana Show',
     [200000, 400000, 700000], [800000, 1600000, 3000000], [20, 40, 70]),
    ('MrBeast',
     [20000000, 30000000, 40000000], [80000000, 120000000, 70000000], [300, 400, 600]),
    ('Ryans World',
     [10000000, 15000000, 20000000], [40000000, 60000000, 30000000], [150, 220, 340]),
]
growth_data = {
    name: {
        'year': list(_years),
        'subscribers': subs,
        'views': views,
        'totalVideos': videos,
    }
    for name, subs, views, videos in _yearly_rows
}

# Create Plotly traces for each creator: three lines per channel, added in
# the order subscribers, views, total videos.
fig = go.Figure()
for channelName, data in growth_data.items():
    years = data['year']
    fig.add_traces([
        go.Scatter(x=years, y=data['subscribers'], name=f"{channelName} - Subscribers"),
        go.Scatter(x=years, y=data['views'], name=f"{channelName} - Views"),
        go.Scatter(x=years, y=data['totalVideos'], name=f"{channelName} - Total Videos"),
    ])

# Set layout and show the plot
fig.update_layout(xaxis_title="Year", yaxis_title="Count")
fig.show()


In [6]:
import plotly.graph_objects as go

# Dummy data for total views (channel name -> total view count)
total_views = dict([
    ('Like Nastya', 2000000000),
    ('Cocomelon - Nursery Rhymes', 3500000000),
    ('Kids Diana Show', 1500000000),
    ('MrBeast', 5000000000),
    ('Ryans World', 2500000000),
])

# Bar chart for total views: one bar per channel, in insertion order
_view_bars = go.Bar(
    x=[name for name in total_views],
    y=[count for count in total_views.values()],
)
fig1 = go.Figure(data=_view_bars)
fig1.update_layout(
    title='Total Views among YouTube Channels',
    xaxis_title='Channel',
    yaxis_title='Total Views',
)
fig1.show()


In [7]:
import plotly.graph_objects as go

# Dummy data for growth analysis: monthly subscriber counts per channel
_dates_2020 = ['2020-01-01', '2020-02-01', '2020-03-01', '2020-04-01']
_subscriber_series = {
    'Like Nastya': [1000000, 2000000, 3000000, 4000000],
    'Cocomelon - Nursery Rhymes': [5000000, 6000000, 7000000, 8000000],
    'Kids Diana Show': [2000000, 2500000, 3000000, 3500000],
    'MrBeast': [5000000, 6000000, 7000000, 8000000],
    'Ryans World': [3000000, 3500000, 4000000, 4500000],
}
growth_data = {
    name: {'publishedAt': list(_dates_2020), 'subscribers': series}
    for name, series in _subscriber_series.items()
}

# Line chart for growth analysis: one line-with-markers trace per channel
fig = go.Figure()
for channel, data in growth_data.items():
    subscriber_line = go.Scatter(
        x=data['publishedAt'],
        y=data['subscribers'],
        name=channel,
        mode='lines+markers',
    )
    fig.add_trace(subscriber_line)

fig.update_layout(
    title='Subscriber Growth over Time',
    xaxis_title='Date',
    yaxis_title='Subscribers',
)
fig.show()


In [9]:
import plotly.graph_objects as go
import pandas as pd

# Define the data: one (channel, video title, view count) row per channel
_top_video_rows = [
    ('Cocomelon - Nursery Rhymes', "JJ and Cody's Shopping Cart Race!", 9774646),
    ('Like Nastya', 'Nastya and the Phone challenge for kids', 9850162),
    ('MrBeast', 'Extreme $1,000,000 Hide And Seek', 91287836),
    ("Ryan's World", 'How to make DIY foosball from Cardboard at home', 964677),
    ('✿ Kids Diana Show', 'Diana Wednesday and School Friendship Story', 9851155),
]
data = {
    'channelName': [row[0] for row in _top_video_rows],
    'title': [row[1] for row in _top_video_rows],
    'views': [row[2] for row in _top_video_rows],
}

# Create a DataFrame from the data
df = pd.DataFrame(data)

# Create the bar chart: channels on x, views on y, video title as bar text
_top_video_bars = go.Bar(
    x=df['channelName'],
    y=df['views'],
    text=df['title'],
    textposition='auto',
)
fig = go.Figure(data=_top_video_bars)

# Customize the chart layout
fig.update_layout(
    title='Top 5 Videos with Most Views',
    xaxis_title='Channel',
    yaxis_title='Views',
    barmode='group',
)

# Show the chart
fig.show()


In [10]:
import plotly.graph_objects as go
import pandas as pd
import plotly.io as pio


# Define the data: five (title, views) pairs per channel, in display order.
# NOTE(review): a few view counts break the otherwise descending order within
# a channel (76815894, 798438510, 88249) — possibly typos; confirm.
_top_videos_by_channel = [
    ('Cocomelon - Nursery Rhymes', [
        ("JJ and Cody's Shopping Cart Race!", 9774646),
        ("Opposites Song (Animal Version)", 9772855),
        ("CoComelon Song Dance + MORE", 9474222),
        ("Play Outside at the Beach with JJ and Nina!", 8880191),
        ("Baby Animal Dance + MORE", 8768810),
    ]),
    ('Like Nastya', [
        ("Nastya and the Phone challenge for kids", 9850162),
        ("Nastya Cube Challenge and funny kids stories", 8861369),
        ("Nastya and her friends decorate playhouses and other adventures of friends.", 8790585),
        ("Nastya and Evelyn help each other as best friends", 7707368),
        ("Nastya plays Pink vs. Black Challenge with Wednesday", 76815894),
    ]),
    ('MrBeast', [
        ("Extreme $1,000,000 Hide And Seek", 91287836),
        ("1,000 Deaf People Hear For The First Time", 89470212),
        ("Do Pawnshops Scam You?", 84429195),
        ("I Didn’t Eat Food For 30 Days", 84107442),
        ("Would You Fly To Paris For A Baguette?", 798438510),
    ]),
    ("Ryan's World", [
        ("How to make DIY foosball from Cardboard at home", 964677),
        ("I Want One Ryan's World Edition!", 957485),
        ("The Super Mario Bros Movie In Real Life Challenge Obby", 950292),
        ("Ryan's World Island Adventure Animation with friends!", 88249),
        ("I have two sides! When I win a video game and when I lose.....", 855034),
    ]),
    ('✿ Kids Diana Show', [
        ("Diana Wednesday and School Friendship Story", 9851155),
        ("Diana and Roma At The Cat School", 9716326),
        ("Diana Creates A Family Tree Through Photos", 9713507),
        ("Diana and Roma's Hilarious Animal Adventures", 9028295),
        ("Diana and Roma Happy Valentine's Day Challenge", 8603639),
    ]),
]

# Flatten into the three parallel columns the DataFrame is built from.
_flat_rows = [
    (channel, title, views)
    for channel, videos in _top_videos_by_channel
    for title, views in videos
]
data = {
    'channelName': [row[0] for row in _flat_rows],
    'title': [row[1] for row in _flat_rows],
    'views': [row[2] for row in _flat_rows],
}

# Create a DataFrame from the data
df = pd.DataFrame(data)

# Create the horizontal bar chart: one bar per video, labelled with its channel
_video_bars = go.Bar(
    x=df['views'],
    y=df['title'],
    orientation='h',
    text=df['channelName'],
    textposition='auto',
)
fig = go.Figure(data=_video_bars)

# Customize the chart layout
_layout_options = dict(
    title='Top 5 Videos with Most Views for Each YouTube Channel',
    xaxis_title='Views',
    yaxis_title='Video Title',
    margin=dict(l=150, r=20, t=60, b=20),
    yaxis=dict(automargin=True),
    bargap=0.2,
    bargroupgap=0.1,
)
fig.update_layout(**_layout_options)

# Show the chart
fig.show()

# Also save the chart as a standalone HTML file and open it.
pio.write_html(fig, file='5chanmostviewschart.html', auto_open=True)

