In [22]:
import copy
import http.client
import os
import sys
from random import sample
from typing import List, Union

import moviepy
import numpy as np
import pandas as pd
import pretty_midi
import pytube
from googleapiclient.discovery import build
from pytube.exceptions import PytubeError

from constants import YOUTUBE_API_KEY
from data_mining import data_mining_source

  warn("Couldn't import ipywidgets properly, progress bar will use console behavior")


In [30]:
def robust_download(video_url: str, max_length_seconds: int = 360) -> List[Union[str, int]]:
    """Download a YouTube video, keep only its audio as an mp3, and return the video metadata.

    The video is fetched as the lowest-resolution progressive mp4, its audio track is written
    to an mp3 next to it, and the mp4 is then deleted. Files are stored under
    ``dataset/<GENRE_TAG>/<title>/``, where ``GENRE_TAG`` is a module-level variable that must
    be set before calling this function. If the mp3 already exists, nothing is downloaded.

    Args:
        video_url: The URL of the YouTube video to download.
        max_length_seconds: Videos longer than this many seconds are skipped. Defaults to 360.

    Returns:
        A flat list with, in order: video ID, channel ID, title, description, length,
        publish date, rating, keywords, and the path to the mp3 file.
        An empty list is returned when the video is skipped (too long, no suitable stream,
        no audio track) or when pytube / the HTTP transfer fails.
    """
    try:
        video = pytube.YouTube(video_url)
        # Access the title before anything else: the notebook's logged failures all come from
        # title access, and those surface as PytubeError, which is handled below.
        title = video.title

        # Skip long videos before touching the file system.
        if video.length > max_length_seconds:
            return []

        # Use the same sanitized title for the folder and both file names so that a '/' in
        # the title cannot create unintended nested directories.
        safe_title = title.replace('/', '')
        video_name = safe_title + '.mp4'
        audio_name = safe_title + '.mp3'
        output_path = os.path.join('dataset', GENRE_TAG, safe_title)
        video_path = os.path.join(output_path, video_name)
        audio_path = os.path.join(output_path, audio_name)

        os.makedirs(output_path, exist_ok=True)

        if not os.path.isfile(audio_path):
            stream = (
                video.streams
                .filter(progressive=True, file_extension='mp4')
                .order_by('resolution')
                .asc()
                .first()
            )
            # first() yields None when the query matched no stream.
            if stream is None:
                print('Error: ', 'no progressive mp4 stream available')
                print(video_url)
                return []
            stream.download(filename=video_name, output_path=output_path)

            # Function-scope import: a bare `import moviepy` does not guarantee that the
            # `editor` submodule has been loaded.
            from moviepy.editor import VideoFileClip

            clip = VideoFileClip(video_path)
            try:
                if clip.audio is None:
                    print('Error: ', 'video has no audio track')
                    print(video_url)
                    return []
                clip.audio.write_audiofile(audio_path, logger=None)
            finally:
                # Release the file handles held by the clip before deleting the mp4.
                clip.close()
                os.remove(video_path)

        return [
            video.video_id, video.channel_id,
            title, video.description,
            video.length, video.publish_date,
            video.rating, video.keywords,
            audio_path,
        ]

    except (PytubeError, http.client.IncompleteRead) as e:
        print('Error: ', e)
        print(video_url)
        return []

In [33]:
def song_playlists(playlist_id: str) -> pd.DataFrame:
    """Download every video of a YouTube playlist and save its metadata table as CSV.

    Each video URL of the playlist is passed to ``robust_download``. Videos for which it
    returns an empty list (skipped or failed) are left out of the table. The table is written
    to ``dataset/<GENRE_TAG>/<playlist_id>_META_DATA.csv``, where ``GENRE_TAG`` is a
    module-level variable that must be set before calling this function.

    Args:
        playlist_id: The playlist ID (the value of ``list=`` in the playlist URL).

    Returns:
        A DataFrame with one row per successfully processed video and the columns
        VideoID, ChannelID, PlaylistID, videoTitle, descriptions, length, publishedDate,
        videoRating, videoKeyWords and pathToRaw.
    """
    url = f'https://www.youtube.com/playlist?list={playlist_id}'
    playlist = pytube.Playlist(url)

    data = []
    for video_url in playlist.video_urls:
        row = robust_download(video_url)
        if not row:
            # An empty row means the video was skipped or failed; inserting the playlist id
            # into it would create a malformed one-element record.
            continue
        row.insert(2, playlist_id)
        data.append(row)

    videos_table = pd.DataFrame(data, columns=[
        'VideoID', 'ChannelID', 'PlaylistID',
        'videoTitle', 'descriptions', 'length', 'publishedDate',
        'videoRating', 'videoKeyWords', 'pathToRaw']
    )

    # The genre folder may not exist yet if no video of this playlist was downloaded.
    output_dir = os.path.join('dataset', GENRE_TAG)
    os.makedirs(output_dir, exist_ok=True)
    output_path = os.path.join(output_dir, f'{playlist_id}_META_DATA.csv')
    videos_table.to_csv(output_path)
    return videos_table

In [34]:
# Example use. GENRE_TAG is read by robust_download and song_playlists to pick the
# dataset sub-folder, so it must be assigned before the loop below runs.
GENRE_TAG = 'electronic'

# Playlist IDs can be found at the end of the playlist link.
playlists = [
    'PLEb0_nNrLidAcRNKYubWmueQKGbhp9cUf',
    'PLh5yQEh2F60v3bjv5M1PlD-DO9V1D7GkI',
]
for playlist_id in playlists:
    song_playlists(playlist_id)

Error:  Exception while accessing title of https://youtube.com/watch?v=GDnU6nbL1uQ. Please file a bug report at https://github.com/pytube/pytube
https://www.youtube.com/watch?v=GDnU6nbL1uQ
Error:  Exception while accessing title of https://youtube.com/watch?v=GaYzv2cvGNc. Please file a bug report at https://github.com/pytube/pytube
https://www.youtube.com/watch?v=GaYzv2cvGNc
Error:  Exception while accessing title of https://youtube.com/watch?v=w6x6TdOI_GE. Please file a bug report at https://github.com/pytube/pytube
https://www.youtube.com/watch?v=w6x6TdOI_GE
Error:  Exception while accessing title of https://youtube.com/watch?v=sLyrooXOGkU. Please file a bug report at https://github.com/pytube/pytube
https://www.youtube.com/watch?v=sLyrooXOGkU
Error:  Exception while accessing title of https://youtube.com/watch?v=cOfa6poaRU8. Please file a bug report at https://github.com/pytube/pytube
https://www.youtube.com/watch?v=cOfa6poaRU8
Error:  Exception while accessing title of https://yout

In [35]:
# Switch the genre: robust_download and song_playlists read GENRE_TAG to pick the
# dataset sub-folder, so it must be reassigned before the loop below runs.
GENRE_TAG = 'movie'

playlists = [
    'PLzZfeWyGFlb-dre7blTNpEu4kG_PdGddw',
    'PLh5yQEh2F60uqmOl9oRWD6SyEarU2kqo2',
]
for playlist_id in playlists:
    song_playlists(playlist_id)

Error:  Exception while accessing title of https://youtube.com/watch?v=9Tao0pjhWlY. Please file a bug report at https://github.com/pytube/pytube
https://www.youtube.com/watch?v=9Tao0pjhWlY
Error:  Exception while accessing title of https://youtube.com/watch?v=PjftUHqOueI. Please file a bug report at https://github.com/pytube/pytube
https://www.youtube.com/watch?v=PjftUHqOueI
Error:  Exception while accessing title of https://youtube.com/watch?v=84JJDtesQdE. Please file a bug report at https://github.com/pytube/pytube
https://www.youtube.com/watch?v=84JJDtesQdE
Error:  Exception while accessing title of https://youtube.com/watch?v=girEeLHsGEE. Please file a bug report at https://github.com/pytube/pytube
https://www.youtube.com/watch?v=girEeLHsGEE
Error:  Exception while accessing title of https://youtube.com/watch?v=2StnCCog7IY. Please file a bug report at https://github.com/pytube/pytube
https://www.youtube.com/watch?v=2StnCCog7IY
Error:  Exception while accessing title of https://yout