In [18]:
from googleapiclient.discovery import build
import json
import pandas as pd
import re

In [19]:
API_KEY = %env API_KEY

In [20]:
YOUTUBE_API_SERVICE_NAME = 'youtube'
YOUTUBE_API_VERSION = 'v3'

youtube = build(YOUTUBE_API_SERVICE_NAME, YOUTUBE_API_VERSION, developerKey=API_KEY)


In [21]:
def clean_channel_title(channel_title):
    # Remove 'VEVO' from the channel title
    channel_title = re.sub(r'vevo', '', channel_title, flags=re.IGNORECASE)
    # Remove '- Topic' from the channel title
    channel_title = re.sub(r'- Topic', '', channel_title, flags=re.IGNORECASE)
    # Insert spaces before capital letters (if words are concatenated)
    channel_title = re.sub(r'(?<=[a-z])(?=[A-Z])', ' ', channel_title)
    # Remove extra spaces
    channel_title = channel_title.strip()
    return channel_title

In [22]:
def clean_title(title):
    # List of patterns to remove
    patterns = [
        r'\(.*?official music video.*?\)',
        r'\[.*?official music video.*?\]',
        r'\(.*?official video.*?\)',
        r'\[.*?official video.*?\]',
        r'\(.*?official mv.*?\)',
        r'\[.*?official mv.*?\]', 
        r'\(.*?lyrics.*?\)',
        r'\[.*?lyrics.*?\]',
        r'\(.*?audio.*?\)',
        r'\[.*?audio.*?\)',
        r'\(.*?lyric video.*?\)',
        r'\[.*?lyric video.*?\)',
        r'\(.*?official visualizer.*?\)',
        r'\[.*?official visualizer.*?\]',
        r'\(.*?lyric.*?\)',
        r'\[.*?lyric.*?\]',
        r'-\s*visualizer',
        r'-\s*visualiser',
        r'\(.*?mv.*?\)',
        r'\[.*?mv.*?\]',
        r'\(.*?hd.*?\)',
        r'\[.*?hd.*?\]',
        r'\(.*?hq.*?\)',
        r'\[.*?hq.*?\]',
        r'official mv',
        r'lyrics in video \+ description',    # Added pattern
        r'lyric video',                        # Added pattern
        r'lyrics hd/hq',                       # Added pattern
        r'lyrics hq/hd',                       # Added pattern (reverse order)
        r'lyrics',                             # Added pattern
        r'lyric',                              # Added pattern
        r'ft\.',                               # We'll handle 'ft.' and 'feat.' in another function
        r'feat\.',
        r'vevo',
        r'\"',
        r'\'',
        r'\|.*',
    ]

    # Remove unwanted patterns
    for pattern in patterns:
        title = re.sub(pattern, '', title, flags=re.IGNORECASE)
    # Remove extra spaces and special characters at the ends
    title = title.strip(' -–—_|')
    title = re.sub(r'\s+', ' ', title)
    return title.strip()

In [23]:
def split_artists(artist_str):
    # Define separators
    separators = [',', '&', ' and ']
    # Create a regex pattern to split on any of the separators
    regex_pattern = '|'.join(map(re.escape, separators))
    # Split the artist string
    artists_list = [a.strip() for a in re.split(regex_pattern, artist_str) if a.strip()]
    return artists_list


In [24]:
def extract_featured_artists(text):
    ft_pattern = r'(?i)\b(ft\.?|feat\.?|featuring)\b\s*(.*)'
    match = re.search(ft_pattern, text)
    if match:
        main_text = text[:match.start()].strip()
        featured_artists_str = match.group(2).strip()
        featured_artists = split_artists(featured_artists_str)
        return main_text, featured_artists
    else:
        return text.strip(), []


In [25]:
def get_playlist_videos(youtube, playlist_id):
    videos = []
    next_page_token = None
    seen_page_tokens = set()

    while True:
        # Build the request parameters
        request_params = {
            'part': 'contentDetails',
            'playlistId': playlist_id,
            'maxResults': 50,
        }
        if next_page_token:
            request_params['pageToken'] = next_page_token

        try:
            pl_request = youtube.playlistItems().list(**request_params)
            pl_response = pl_request.execute()
        except Exception as e:
            print(f"An error occurred: {e}")
            break

        # Extract video IDs
        video_ids = [item['contentDetails']['videoId'] for item in pl_response.get('items', [])]
        videos.extend(video_ids)

        # Get the next page token
        previous_page_token = next_page_token
        next_page_token = pl_response.get('nextPageToken')

        # Debugging: Print the page tokens
        # page_number = len(videos) // 50 + 1
        # print(f'Page {page_number}: Retrieved {len(video_ids)} videos.')
        # print(f'Previous page token: {previous_page_token}')
        # print(f'Next page token: {next_page_token}')

        # Check if we've seen this page token before
        if next_page_token in seen_page_tokens:
            # print("Next page token has been seen before. Exiting loop to prevent infinite execution.")
            break

        # If next_page_token is None, we've reached the last page
        if not next_page_token:
            # print("No more pages. Exiting loop.")
            break

        # Add the current next_page_token to the set of seen tokens
        seen_page_tokens.add(next_page_token)

    return videos


In [26]:
def get_video_details(youtube, video_ids):
    video_details = []
    special_char_patterns = r'[\(\[\{]([^)\]\}]+)[\)\]\}]'

    for i in range(0, len(video_ids), 50):
        request = youtube.videos().list(
            part='snippet',
            id=','.join(video_ids[i:i+50])
        )
        response = request.execute()

        for item in response['items']:
            video_id = item['id']
            snippet = item['snippet']
            full_title = snippet['title']
            channel_title = snippet['channelTitle']

            # Clean the title
            cleaned_title = clean_title(full_title)

            # Remove any text in parentheses or brackets
            cleaned_title = re.sub(r'\(.*?\)', '', cleaned_title)
            cleaned_title = re.sub(r'\[.*?\]', '', cleaned_title)
            cleaned_title = re.sub(r'\{.*?\}', '', cleaned_title)
            cleaned_title = cleaned_title.strip()

            # Split the title to extract artist and song title
            separators = [' - ', '-', ':', '|', '–', '—']
            regex_pattern = '|'.join(map(re.escape, separators))
            split_title = re.split(regex_pattern, cleaned_title, maxsplit=1)

            if len(split_title) == 2:
                artist, song_title = split_title
            elif ' by ' in cleaned_title.lower():
                song_title, artist = cleaned_title.rsplit(' by ', 1)
            else:
                artist = ''
                song_title = cleaned_title

            artist = artist.strip()
            song_title = song_title.strip()

            # If artist is empty, use the cleaned channel title
            if not artist:
                artist = clean_channel_title(channel_title)

            # Initialize singers_list
            singers_list = []

            # Extract featured artists from artist
            artist, featured_artists_from_artist = extract_featured_artists(artist)
            # Extract featured artists from song_title
            song_title, featured_artists_from_title = extract_featured_artists(song_title)

            # Combine all artists
            main_artists = split_artists(artist)
            singers_list.extend(main_artists)
            singers_list.extend(featured_artists_from_artist)
            singers_list.extend(featured_artists_from_title)

            # Remove duplicates and normalize singers_list
            singers_list = list(set(singer.strip() for singer in singers_list if singer.strip()))

            video_details.append({
                "url": f"https://www.youtube.com/watch?v={video_id}",
                "title": song_title,
                "singers": singers_list
            })

    return video_details


In [27]:
# Get all video IDs from the playlist
playlist_ids = [
    'RDCLAK5uy_lBGRuQnsG37Akr1CY4SxL0VWFbPrbO4gs', # On Everything: Today's Hip-Hop Hits
    'RDCLAK5uy_kmPRjHDECIcuVwnKsx2Ng7fyNgFKWNJFs', # The Hit List
    'RDCLAK5uy_lBNUteBRencHzKelu5iDHwLF6mYqjL-JU', # Pop Certified
    'RDCLAK5uy_nZiG9ehz_MQoWQxY5yElsLHCcG0tv9PRg', # Classic Rock's Greatest Hits
    'RDCLAK5uy_nmS3YoxSwVVQk9lEQJ0UX4ZCjXsW_psU8', # Pop's Biggest Hits
    # 'RDCLAK5uy_nKsmIDujCJTYRInvAJirUkn0KjgwKNiZE', # Disney Songs
    # 'RDCLAK5uy_lMzXQA761IIDTLJJwgpD67INZ8lL6UsVU', # Christian Music's Biggest Hits
    'PLB5Ac5TbLc2OHUC5uaAuFdbxMXQK3ZaAF', # Most Iconic Songs of All Time
    # 'RDCLAK5uy_k5vcGRXixxemtzK1eKDS7BeHys7mvYOdk' # Maximum Decibels: Today's Rock Hits
    'RDCLAK5uy_mGYde2Wyx9INZd6GbPcMWkxDOu6Utmedw' # The Hits: '10s
    'PLOhV0FrFphUfHqxfhIBju7zu_2CTqG01F' # Best Indie Rock Songs/ Alternative Playlist
    'RDCLAK5uy_mplKe9BIYCO3ZuNWSHZr48bm9DUDzbWnE' # The Millennial Mixtape
    # 'PL6Lt9p1lIRZ311J9ZHuzkR5A3xesae2pk' # Alternative rock of the 2000s
]
video_details = []

# Get all video IDs from the playlists
for playlist_id in playlist_ids:
    video_ids = get_playlist_videos(youtube, playlist_id)
    print(f'Number of videos retrieved from playlist {playlist_id}: {len(video_ids)}')
    # Get video details and extend the list
    video_details.extend(get_video_details(youtube, video_ids))

print(f'Total number of videos before removing duplicates: {len(video_details)}')

# Remove duplicates based on 'url' (video_id)
unique_videos = {video['url']: video for video in video_details}
video_details = list(unique_videos.values())
print(f'Number of unique videos after removing duplicates by URL: {len(video_details)}')

# Now remove duplicates based on song title and singers
unique_songs = {}
for video in video_details:
    # Create a key using the song title and sorted list of singers
    key = (
        video['title'].strip().lower(),  # Normalize the song title
        tuple(sorted(singer.strip().lower() for singer in video['singers']))  # Normalize and sort singers
    )
    if key not in unique_songs:
        unique_songs[key] = video  # Add the video if the key is not present

# Update video_details with the unique songs
video_details = list(unique_songs.values())
print(f'Number of unique songs after removing duplicates by title and singers: {len(video_details)}')

# Save data to CSV and JSON
data = pd.DataFrame(video_details)
data.to_csv('playlist_videos_cleaned.csv', index=False)

with open('playlist_videos.json', 'w', encoding='utf-8') as f:
    json.dump(video_details, f, ensure_ascii=False, indent=4)


Number of videos retrieved from playlist RDCLAK5uy_lBGRuQnsG37Akr1CY4SxL0VWFbPrbO4gs: 199
Number of videos retrieved from playlist RDCLAK5uy_kmPRjHDECIcuVwnKsx2Ng7fyNgFKWNJFs: 94
Number of videos retrieved from playlist RDCLAK5uy_lBNUteBRencHzKelu5iDHwLF6mYqjL-JU: 89
Number of videos retrieved from playlist RDCLAK5uy_nZiG9ehz_MQoWQxY5yElsLHCcG0tv9PRg: 199
Number of videos retrieved from playlist RDCLAK5uy_nmS3YoxSwVVQk9lEQJ0UX4ZCjXsW_psU8: 299
Number of videos retrieved from playlist PLB5Ac5TbLc2OHUC5uaAuFdbxMXQK3ZaAF: 114
An error occurred: <HttpError 404 when requesting https://youtube.googleapis.com/youtube/v3/playlistItems?part=contentDetails&playlistId=RDCLAK5uy_mGYde2Wyx9INZd6GbPcMWkxDOu6UtmedwPLOhV0FrFphUfHqxfhIBju7zu_2CTqG01FRDCLAK5uy_mplKe9BIYCO3ZuNWSHZr48bm9DUDzbWnE&maxResults=50&key=AIzaSyCoop3L4OhBNltsNFYaZb2CNBTXstnqjTA&alt=json returned "The playlist identified with the request's <code>playlistId</code> parameter cannot be found.". Details: "[{'message': "The playlist ide