- ## Parse hours in time column

In [12]:
import pandas as pd
import os
from googleapiclient.discovery import build
import re

In [13]:
# API Parameters
# The key is read from the environment so it never appears in the notebook.
# The original hyphenated name is checked first; 'YOUTUBE_API_KEY' is accepted
# as a fallback because POSIX shells cannot export names containing '-'.
api_key = os.environ.get('YOUTUBE-API-KEY') or os.environ.get('YOUTUBE_API_KEY')
api = build('youtube', 'v3', developerKey=api_key)

In [14]:
def get_playlist_videos(api, playlist_id, max_results=50):
    """
    Collect every item of a YouTube playlist, following pagination.

    Args:
    - api: YouTube Data API service object.
    - playlist_id: ID of the playlist to read.
    - max_results: Page size passed to each playlistItems request.

    Returns:
    - A list with the 'items' entries of all result pages, in page order.
    """
    request = api.playlistItems().list(
        playlistId=playlist_id,
        part='snippet',
        maxResults=max_results,
    )

    collected = []
    # list_next() hands back a falsy value once the last page was consumed.
    while request:
        page = request.execute()
        collected += page['items']
        request = api.playlistItems().list_next(request, page)

    return collected



In [15]:
def extract_publish_dates(playlist_videos):
    """Return the 'publishedAt' value of each playlist entry that has a snippet and is not titled 'Private video'."""
    return [
        entry['snippet'].get('publishedAt')
        for entry in playlist_videos
        if 'snippet' in entry and entry['snippet'].get('title') != 'Private video'
    ]

def extract_video_ids(playlist_videos):
    """Return snippet.resourceId.videoId (None when absent) of each playlist entry that has a snippet and is not titled 'Private video'."""
    return [
        entry['snippet'].get('resourceId', {}).get('videoId')
        for entry in playlist_videos
        if 'snippet' in entry and entry['snippet'].get('title') != 'Private video'
    ]

def extract_titles(playlist_videos):
    """Return the title of each playlist entry that has a snippet and is not titled 'Private video'."""
    return [
        entry['snippet']['title']
        for entry in playlist_videos
        if 'snippet' in entry and entry['snippet'].get('title') != 'Private video'
    ]

def extract_descriptions(playlist_videos):
    """Return the description (None when absent) of each playlist entry that has a snippet and is not titled 'Private video'."""
    return [
        entry['snippet'].get('description')
        for entry in playlist_videos
        if 'snippet' in entry and entry['snippet'].get('title') != 'Private video'
    ]

def extract_video_duration(video_content_details_responses):
    """Return contentDetails['duration'] (None when absent) for every response item, in input order."""
    return [
        item['contentDetails'].get('duration')
        for item in video_content_details_responses
    ]


def extract_video_definition(video_content_details_responses):
    """Return contentDetails['definition'] (None when absent) for every response item, in input order."""
    return [
        item['contentDetails'].get('definition')
        for item in video_content_details_responses
    ]


def extract_view_count(video_statistics_response):
    """Return statistics['viewCount'] (None when absent) for every response item, in input order."""
    return [item['statistics'].get('viewCount') for item in video_statistics_response]

def extract_like_count(video_statistics_responses):
    """Return statistics['likeCount'] (None when absent) for every response item, in input order."""
    return [item['statistics'].get('likeCount') for item in video_statistics_responses]

def extract_favorite_count(video_statistics_response):
    """Return statistics['favoriteCount'] (None when absent) for every response item, in input order."""
    return [item['statistics'].get('favoriteCount') for item in video_statistics_response]

def extract_comment_count(video_statistics_response):
    """Return statistics['commentCount'] (None when absent) for every response item, in input order."""
    return [item['statistics'].get('commentCount') for item in video_statistics_response]

def extract_topic_categories(video_topic_response):
    """
    Extract topic categories from video topic response items.

    Args:
    - video_topic_response: Iterable of response items (dicts).

    Returns:
    - A list with one entry per item, in input order: the item's
      topicDetails['topicCategories'] value, or None when the item has no
      usable 'topicDetails' mapping or no 'topicCategories' key.
    """
    topic_categories = []
    for item in video_topic_response:
        # Explicit lookups instead of a blanket `except Exception`, so that
        # only the "no topic details" case maps to None and unrelated errors
        # are not silently hidden.
        topic_details = item.get('topicDetails') if isinstance(item, dict) else None
        if isinstance(topic_details, dict):
            topic_categories.append(topic_details.get('topicCategories'))
        else:
            topic_categories.append(None)

    return topic_categories

In [16]:
def split_list(lst, batch_size):
    """Yield consecutive slices of `lst`, each at most `batch_size` items long."""
    total = len(lst)
    for start in range(0, total, batch_size):
        stop = start + batch_size
        yield lst[start:stop]

def get_video_statistics(video_ids):
    """
    Fetch the 'statistics' part for a list of video IDs.

    Uses the module-level `api` service object and requests the IDs in
    batches. A batch whose request fails is reported with print() and
    skipped, so the result can hold fewer items than `video_ids`.
    """
    batch_size = 50  # Adjust batch size based on API limits
    collected = []

    for batch in split_list(video_ids, batch_size):
        request = api.videos().list(part='statistics', id=','.join(batch))
        try:
            collected.extend(request.execute()['items'])
        except Exception as e:
            print(f"Error getting statistics for batch {batch}: {e}")

    return collected

def get_video_topic_details(video_ids):
    """
    Fetch the 'topicDetails' part for a list of video IDs.

    Uses the module-level `api` service object and requests the IDs in
    batches. A batch whose request fails is reported with print() and
    skipped, so the result can hold fewer items than `video_ids`.
    """
    batch_size = 50  # Adjust batch size based on API limits
    collected = []

    for batch in split_list(video_ids, batch_size):
        request = api.videos().list(part='topicDetails', id=','.join(batch))
        try:
            collected.extend(request.execute()['items'])
        except Exception as e:
            print(f"Error getting topic details for batch {batch}: {e}")

    return collected

def get_video_content_details(video_ids):
    """
    Fetch the 'contentDetails' part for a list of video IDs.

    Uses the module-level `api` service object and requests the IDs in
    batches. A batch whose request fails is reported with print() and
    skipped, so the result can hold fewer items than `video_ids`.
    """
    batch_size = 50  # Adjust batch size based on API limits
    collected = []

    for batch in split_list(video_ids, batch_size):
        request = api.videos().list(part='contentDetails', id=','.join(batch))
        try:
            collected.extend(request.execute()['items'])
        except Exception as e:
            print(f"Error getting content details for batch {batch}: {e}")

    return collected

In [17]:
def get_playlist_df(api, playlist_id):
    """
    Build a DataFrame with one row per non-private video of a playlist.

    Args:
    - api: YouTube Data API service object used to list the playlist.
    - playlist_id: ID of the playlist to load.

    Returns:
    - A DataFrame with the columns PublishDate, VideoId, Title, Description,
      Duration, HD, Views, Likes, Favorites, Comments and TopicCategories.
      Per-video values that could not be fetched are None.

    NOTE(review): the get_video_* helpers read the module-level `api` object
    rather than the `api` argument passed here.
    """
    # Get Playlist Videos
    playlist_videos = get_playlist_videos(api, playlist_id)

    # Get Video Info (all four lists share the same filter, so they line up)
    publish_dates = extract_publish_dates(playlist_videos)
    video_ids = extract_video_ids(playlist_videos)
    titles = extract_titles(playlist_videos)
    descriptions = extract_descriptions(playlist_videos)

    # Only real, distinct IDs are sent to the API: a None entry would make
    # ','.join() in the fetch helpers raise, and repeats add nothing.
    lookup_ids = list(dict.fromkeys(vid for vid in video_ids if vid))

    def align_to_playlist(items, part):
        """Return one response item per entry of video_ids, matched by item id."""
        # The fetch helpers skip failed batches and the API may leave out IDs
        # it cannot return, so positions in `items` are not guaranteed to
        # match positions in `video_ids`. Missing videos get an empty `part`
        # so the extract_* helpers yield None for them.
        by_id = {item.get('id'): item for item in items}
        return [by_id.get(video_id, {part: {}}) for video_id in video_ids]

    # Get Video Content Details
    content_details = align_to_playlist(
        get_video_content_details(lookup_ids), 'contentDetails'
    )

    video_durations = extract_video_duration(content_details)
    video_hd = extract_video_definition(content_details)

    # Get Video Stats
    stats = align_to_playlist(get_video_statistics(lookup_ids), 'statistics')

    video_views = extract_view_count(stats)
    video_likes = extract_like_count(stats)
    video_favorites = extract_favorite_count(stats)
    video_comments = extract_comment_count(stats)

    # Get Topic Details
    topic_details = align_to_playlist(
        get_video_topic_details(lookup_ids), 'topicDetails'
    )

    topic_categories = extract_topic_categories(topic_details)

    # Create DataFrame
    df = pd.DataFrame({
        'PublishDate': publish_dates,
        'VideoId': video_ids,
        'Title': titles,
        'Description': descriptions,
        'Duration': video_durations,
        'HD': video_hd,
        'Views': video_views,
        'Likes': video_likes,
        'Favorites': video_favorites,
        'Comments': video_comments,
        'TopicCategories': topic_categories
    })

    return df

In [18]:
def parse_date(df):
    """
    Split 'PublishDate' into 'Date' and 'Time' columns.

    The value is split on 'T'; trailing 'Z' characters are stripped from the
    time part. The frame is modified in place ('PublishDate' is removed) and
    the same object is returned.
    """
    date_and_time = df['PublishDate'].str.split('T', expand=True)
    df[['Date', 'Time']] = date_and_time
    df['Time'] = df['Time'].str.rstrip('Z')
    df.drop(columns='PublishDate', inplace=True)
    return df


def parse_duration(df):
    """
    Replace the 'Duration' column with its length in seconds.

    Args:
    - df: DataFrame with a 'Duration' column of duration strings such as
      'PT1H24M23S' or 'P1DT2H'.

    Returns:
    - A copy of `df` without 'Duration' and with a new 'TotalSeconds' column.
      The input frame is left untouched. Rows whose duration is not a string
      (e.g. None) get NaN, which makes the column float-typed.
    """
    # Seconds per unit letter. 'D' is included so that durations of one day
    # or more are not undercounted.
    unit_seconds = (('D', 86400), ('H', 3600), ('M', 60), ('S', 1))

    def duration_to_seconds(duration):
        """Sum every '<number><unit>' component found in one duration string."""
        if not isinstance(duration, str):
            # Missing value: keep the row instead of raising in re.search().
            return float('nan')

        total_seconds = 0
        for unit, factor in unit_seconds:
            match = re.search(rf'(\d+){unit}', duration)
            if match:
                total_seconds += int(match.group(1)) * factor
        return total_seconds

    new_df = df.copy()
    new_df['TotalSeconds'] = df['Duration'].apply(duration_to_seconds)
    return new_df.drop(columns=['Duration'])


def parse_topic_categories(df):
    """
    Strip the Wikipedia URL prefix from every entry of 'TopicCategories'.

    List cells are rewritten element by element (None elements stay None);
    any cell that is not a list becomes None. The frame is modified in place
    and the same object is returned.
    """
    wiki_prefix = 'https://en.wikipedia.org/wiki/'

    def strip_prefix(categories):
        if not isinstance(categories, list):
            return None
        return [
            None if url is None else url.replace(wiki_prefix, '')
            for url in categories
        ]

    df['TopicCategories'] = df['TopicCategories'].apply(strip_prefix)
    return df


def preprocess_df(df):
    """
    Run the date, duration and topic-category parsers on a copy of `df`.

    The steps are applied in that order and the resulting frame is returned;
    the copy means the caller's frame is not handed to the parsers directly.
    """
    return (
        df.copy()
        .pipe(parse_date)
        .pipe(parse_duration)
        .pipe(parse_topic_categories)
    )

In [19]:
# Playlist to analyse, named so it is easy to spot and swap.
PLAYLIST_ID = 'PLjkZIuJPz3rOwDyDazAKJIaniwTxwZ970'
df = get_playlist_df(api, playlist_id=PLAYLIST_ID)

In [20]:
# preprocess_df needs the raw 'PublishDate' column and removes it, so calling
# it on an already-preprocessed frame raises KeyError. The guard makes this
# cell safe to re-run without re-fetching the playlist.
if 'PublishDate' in df.columns:
    df = preprocess_df(df)

In [21]:
# Display the preprocessed playlist frame as this cell's output.
df

Unnamed: 0,VideoId,Title,Description,HD,Views,Likes,Favorites,Comments,TopicCategories,Date,Time,TotalSeconds
0,ZR9R4I7dLw8,20 WOMEN VS 2 SIDEMEN: ANGRY GINGE & DANNY AAR...,🎥: Watch the 20v2 BTS: https://watch.sideplus....,hd,16107996,504415,0,9697,[Entertainment],2024-01-18,11:24:15,5063
1,m7YSTtiPMl4,20 WOMEN VS 1 SIDEMEN: SPEED EDITION,🎥: WATCH THE SPEED BTS here: https://watch.sid...,hd,55621360,2143633,0,46924,[Lifestyle_(sociology)],2024-01-08,12:31:05,4529
2,NNePgYyI-Ns,SIDEMEN REVERSE 20 VS 1: TANA MONGEAU EDITION,🍗: Order food NOW at: https://www.eatsides.com...,hd,11853625,409853,0,11650,"[Entertainment, Film, Television_program]",2024-01-08,12:30:58,3684
3,gzJND7rlajM,20 WOMEN VS 1 SIDEMEN: DEJI EDITION,Deji speaks to females\n🎥: Catch ALL of the ac...,hd,14075149,565822,0,16640,[Entertainment],2024-01-08,12:30:51,3577
4,lz4R4FHFr90,20 WOMEN VS 1 SIDEMEN: KAI CENAT EDITION,🎥: WATCH THE KAI BTS here: https://watch.sidep...,hd,64072436,1812020,0,44069,"[Entertainment, Humour, Television_program]",2024-01-08,12:30:39,3148
5,M0zEjvvTsoc,20 WOMEN VS 1 SIDEMEN: LOGAN PAUL EDITION,Today Logan Paul takes on Sidemen 20 vs 1. Enj...,hd,24678738,956098,0,22378,"[Entertainment, Film]",2024-01-08,12:30:29,3259
6,WB4LhvhLzlw,20 WOMEN VS 1 SIDEMEN: JIDION EDITION,JiDion's Channel: https://www.youtube.com/c/Ji...,hd,19771971,850394,0,58728,"[Entertainment, Television_program]",2024-01-08,12:30:20,3635
7,DUrBIxB1q0o,20 WOMEN VS 1 SIDEMEN: FILLY EDITION,Filly's Channel: https://www.youtube.com/c/Yun...,hd,41625671,1395657,0,33136,"[Humour, Lifestyle_(sociology)]",2024-01-08,12:30:14,4366
8,qG3AS3RKlF0,20 WOMEN VS 1 SIDEMEN: CALLUX EDITION,Callux Channel: https://www.youtube.com/channe...,hd,15203055,667242,0,22440,[Lifestyle_(sociology)],2024-01-08,12:29:58,3682
9,hiehLFrTlRs,20 WOMEN VS 1 SIDEMEN: KSI EDITION,olajide olatunji dates some lovely ladies\n🍗: ...,hd,26822630,941601,0,24223,[Lifestyle_(sociology)],2024-01-08,12:29:48,3110
