In [1]:
# You may not need to install this, but for some reason it wasn't working for me without it.
#!pip install --upgrade google-api-python-client google-auth-httplib2 google-auth-oauthlib

In [2]:
# Import necessary libraries
import os
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from googleapiclient.discovery import build
import re
from datetime import datetime
import numpy as np

In [3]:
# Read API key from a file
# The context manager closes the handle even if read() raises; strip() removes the
# trailing newline most editors append, which would otherwise become part of the key.
with open("API_GOOGLE.txt", "r") as file:
    api_key = file.read().strip()

In [4]:
# Define a list of YouTube channel IDs
# NOTE: despite the singular name, channel_id is a list; it is iterated channel by
# channel when passed to get_all_video_data_for_channels.
channel_id = ['UChBEbMKI1eCcejTtmI32UEw', 'UCfE5Cz44GlZVyoaYTHJbuZw', 'UCMyOj6fhvKFMjxUCp3b_3gA']

# Initialize YouTube API client
# Builds a client for the 'youtube' service, version 'v3', authenticated with api_key.
youtube = build('youtube', 'v3', developerKey=api_key)

In [5]:
# Function to retrieve channel statistics
def get_channel_stats(youtube, channel_id):
    """Fetch snippet, contentDetails and statistics for the given channel(s).

    Args:
        youtube: API client exposing ``channels().list(...).execute()``.
        channel_id: Value forwarded as the ``id`` filter of the request
            (callers in this notebook pass a one-element list).

    Returns:
        The list stored under ``'items'`` in the response, or an empty list
        when the response carries no ``'items'`` key, so callers can simply
        test the result for truthiness.
    """
    request = youtube.channels().list(
        part="snippet,contentDetails,statistics",
        id=channel_id,
    )
    response = request.execute()
    # A response without 'items' is treated as "nothing found" instead of
    # surfacing as a KeyError.
    return response.get('items', [])

In [6]:
# Function to retrieve a list of videos from a channel
def get_video_list(youtube, upload_id):
    """Collect the unique video IDs of a playlist, following pagination.

    Args:
        youtube: API client exposing ``playlistItems().list(...).execute()``.
        upload_id: Playlist ID to walk (the caller passes a channel's
            "uploads" playlist).

    Returns:
        A list of video IDs in first-seen order, without duplicates.
    """
    video_list = []
    seen_ids = set()  # O(1) membership test; video_list keeps the order
    page_token = None

    while True:
        params = dict(
            part="snippet,contentDetails",
            playlistId=upload_id,
            maxResults=50,
        )
        # Only the follow-up requests carry a page token.
        if page_token:
            params['pageToken'] = page_token

        response = youtube.playlistItems().list(**params).execute()

        for video in response.get('items', []):
            video_id = video['contentDetails']['videoId']
            if video_id not in seen_ids:
                seen_ids.add(video_id)
                video_list.append(video_id)

        # No token in the response means this was the last page.
        page_token = response.get('nextPageToken')
        if not page_token:
            break

    return video_list

In [7]:
# Function to aggregate video data for multiple channels
def get_all_video_data_for_channels(youtube, channel_ids):
    """Gather per-video records for every channel in ``channel_ids``.

    For each channel ID the channel is looked up with get_channel_stats; the
    channel's title and uploads playlist ID are read from the first result,
    the playlist is expanded with get_video_list, and the videos are detailed
    with get_video_details. Channels whose lookup returns nothing are skipped.

    Args:
        youtube: API client forwarded to the helper functions.
        channel_ids: Iterable of channel IDs, processed one at a time.

    Returns:
        A single flat list with the records of all channels, in input order.
    """
    combined_records = []

    for single_id in channel_ids:
        lookup = get_channel_stats(youtube, [single_id])

        if lookup:
            channel = lookup[0]
            channel_title = channel['snippet']['title']
            uploads_playlist = channel['contentDetails']['relatedPlaylists']['uploads']

            video_ids = get_video_list(youtube, uploads_playlist)
            # The channel title is passed along so each record is labelled with it.
            combined_records.extend(
                get_video_details(youtube, video_ids, channel_title)
            )

    return combined_records


In [8]:
# Function to retrieve detailed statistics for each video
def get_video_details(youtube, video_list, channel_name):
    """Fetch metadata and statistics for the given videos, 50 IDs per request.

    Args:
        youtube: API client exposing ``videos().list(...).execute()``.
        video_list: List of video IDs.
        channel_name: Label stored in every returned record.

    Returns:
        A list with one dict per returned video, holding: channel_name, title,
        published, description, length (raw duration string), tag
        ("Have tags" / "No tags"), and view_count / like_count /
        comment_count as integers (0 when the statistic is absent).
    """
    stats_list = []

    # Requests are issued in slices of 50 IDs.
    for i in range(0, len(video_list), 50):
        request = youtube.videos().list(
            part="snippet,contentDetails,statistics",
            id=video_list[i:i+50]
        )

        data = request.execute()

        for video in data.get('items', []):
            snippet = video['snippet']
            statistics = video.get('statistics', {})

            tag_count = len(snippet.get('tags', []))

            stats_list.append(dict(
                channel_name=channel_name,
                title=snippet['title'],
                published=snippet['publishedAt'],
                description=snippet['description'],
                length=video['contentDetails']['duration'],
                tag="No tags" if tag_count == 0 else "Have tags",
                # Counters arrive as strings when present; int() gives the
                # columns one numeric type instead of a str/int mix with the
                # 0 fallback.
                view_count=int(statistics.get('viewCount', 0)),
                like_count=int(statistics.get('likeCount', 0)),
                comment_count=int(statistics.get('commentCount', 0)),
            ))
    return stats_list

In [9]:
# Collecting data and creating a DataFrame
# get_all_video_data_for_channels returns a flat list of per-video dicts for every
# channel in channel_id; each dict becomes one DataFrame row.
video_data = get_all_video_data_for_channels(youtube, channel_id)
df = pd.DataFrame(video_data)
# Bare expression so the notebook renders the DataFrame as the cell output.
df

Unnamed: 0,channel_name,title,published,description,length,tag,view_count,like_count,comment_count
0,Joshua Weissman,Perfect Steak Au Poivre,2023-11-15T17:16:05Z,,PT30S,No tags,68400,4890,64
1,Joshua Weissman,Every Way to Cook Steak (34 Ways),2023-11-12T15:30:10Z,The steak recipe to end all recipes. Special t...,PT29M18S,Have tags,982065,38534,2111
2,Joshua Weissman,I Made The Easiest Ramen Ever,2023-11-08T16:00:23Z,Easiest Ramen (that's not instant ramen) 3 dif...,PT10M56S,Have tags,1300983,54963,1260
3,Joshua Weissman,MCRIB At Home With Terry Crews,2023-11-04T19:30:00Z,,PT31S,No tags,854163,70695,696
4,Joshua Weissman,I Tried Every Fast Food Fried Chicken Sandwich...,2023-11-01T14:30:16Z,Get MY NEW Cookbook: https://bit.ly/TextureOve...,PT23M1S,Have tags,2659431,85543,6267
...,...,...,...,...,...,...,...,...,...
1788,Nick DiGiovanni,How To Cook Arctic Char,2020-10-13T01:56:10Z,"The perfect combination of salmon and trout, s...",PT4M55S,Have tags,430582,13068,379
1789,Nick DiGiovanni,How To Cook Salmon,2020-09-29T00:12:38Z,This is salmon made easy. Follow my 80-20 rule...,PT2M31S,Have tags,444421,14634,282
1790,Nick DiGiovanni,Beef Wellington,2020-09-14T18:41:27Z,Beef Wellington doesn't have to be scary. \n\n...,PT8M5S,Have tags,830320,27475,1116
1791,Nick DiGiovanni,Japanese A5 Wagyu Beef,2020-08-31T21:42:22Z,You can almost eat this steak with a spoon. \n...,PT10M29S,Have tags,795692,20420,592


In [10]:
# Data cleaning and transformation
# Flag collaborations: the title must contain "ft" as a standalone token (e.g. "ft.",
# "Ft"), not merely as letters inside a longer word such as "after" or "leftover".
# case=False already covers every capitalisation; na=False keeps the column strictly
# boolean when a title is missing.
df['collaboration'] = df['title'].str.contains(r'\bft\b', case=False, regex=True, na=False)

In [11]:
#Change length to minutes
def convert_to_minutes(iso_duration):
    """Convert an ISO-8601 style duration string to a number of minutes.

    Recognises the day (``D``), hour (``H``), minute (``M``) and second
    (``S``) components, e.g. ``"PT30S"``, ``"PT29M18S"``, ``"PT1H2M3S"``,
    ``"P1DT2H"``. A component that is absent counts as zero.

    Args:
        iso_duration: Duration string such as ``"PT10M56S"``.

    Returns:
        Total duration in minutes; a float, because seconds are added as a
        fraction of a minute.
    """
    def _component(unit):
        # Digits immediately preceding the unit letter, or 0 if the unit is missing.
        match = re.search(r'(\d+)' + unit, iso_duration)
        return int(match.group(1)) if match else 0

    days = _component('D')
    hours = _component('H')
    minutes = _component('M')
    seconds = _component('S')

    # Hours and days were previously dropped, under-reporting any video of an hour or more.
    return days * 1440 + hours * 60 + minutes + seconds/60


# Add a numeric duration column by applying convert_to_minutes to every 'length' value.
df['duration_in_minutes'] = df['length'].apply(convert_to_minutes)

In [12]:
#Get day of publication
# Parse the timestamp strings in one vectorized call instead of a row-wise strptime.
# Unlike strptime, pd.to_datetime also accepts a column that is already datetime,
# so re-running this cell does not fail.
df['published'] = pd.to_datetime(df['published'], format="%Y-%m-%dT%H:%M:%SZ")

def get_weekday(date_string):
    """Return the lowercase weekday name for a date.

    Args:
        date_string: Despite the name, an object exposing ``strftime``
            (a datetime-like value), not a plain string.

    Returns:
        The ``%A`` weekday name in lowercase, e.g. ``"wednesday"``.
    """
    weekday_name = date_string.strftime("%A")
    return weekday_name.lower()

# Derive the lowercase weekday name of each row from the 'published' column.
df['day_published'] = df['published'].apply(get_weekday)
# Bare expression so the notebook renders the updated DataFrame.
df

Unnamed: 0,channel_name,title,published,description,length,tag,view_count,like_count,comment_count,collaboration,duration_in_minutes,day_published
0,Joshua Weissman,Perfect Steak Au Poivre,2023-11-15 17:16:05,,PT30S,No tags,68400,4890,64,False,0.500000,wednesday
1,Joshua Weissman,Every Way to Cook Steak (34 Ways),2023-11-12 15:30:10,The steak recipe to end all recipes. Special t...,PT29M18S,Have tags,982065,38534,2111,False,29.300000,sunday
2,Joshua Weissman,I Made The Easiest Ramen Ever,2023-11-08 16:00:23,Easiest Ramen (that's not instant ramen) 3 dif...,PT10M56S,Have tags,1300983,54963,1260,False,10.933333,wednesday
3,Joshua Weissman,MCRIB At Home With Terry Crews,2023-11-04 19:30:00,,PT31S,No tags,854163,70695,696,False,0.516667,saturday
4,Joshua Weissman,I Tried Every Fast Food Fried Chicken Sandwich...,2023-11-01 14:30:16,Get MY NEW Cookbook: https://bit.ly/TextureOve...,PT23M1S,Have tags,2659431,85543,6267,False,23.016667,wednesday
...,...,...,...,...,...,...,...,...,...,...,...,...
1788,Nick DiGiovanni,How To Cook Arctic Char,2020-10-13 01:56:10,"The perfect combination of salmon and trout, s...",PT4M55S,Have tags,430582,13068,379,False,4.916667,tuesday
1789,Nick DiGiovanni,How To Cook Salmon,2020-09-29 00:12:38,This is salmon made easy. Follow my 80-20 rule...,PT2M31S,Have tags,444421,14634,282,False,2.516667,tuesday
1790,Nick DiGiovanni,Beef Wellington,2020-09-14 18:41:27,Beef Wellington doesn't have to be scary. \n\n...,PT8M5S,Have tags,830320,27475,1116,False,8.083333,monday
1791,Nick DiGiovanni,Japanese A5 Wagyu Beef,2020-08-31 21:42:22,You can almost eat this steak with a spoon. \n...,PT10M29S,Have tags,795692,20420,592,False,10.483333,monday


In [13]:
# Persist the DataFrame to youtube_data.csv in the working directory.
# With to_csv's defaults the row index is written as the first (unnamed) column.
df.to_csv("youtube_data.csv")