# Data collection for music video analysis

To analyze music videos and their lyrics, the music videos first need to be found and downloaded from YouTube. The main approach is to search YouTube for "<song name> <artist> music video" and scrape the first search result.

First, we import the lyrics dataset.

In [1]:
# This code is from tutorial 3.1
import pickle
import pandas as pd

# CSV file with the lyrics dataset; read with pandas in load_dataset
PATH_DF = 'english_cleaned_lyrics.csv' 
# Pickled lookup from a row's 'index' value to its corrected release date
# (first element is a date string whose first four characters are the year)
PATH_CORRECTION = 'indx2newdate.p' 

def load_dataset(data_path, path_correction):
    """Load the lyrics CSV and attach a corrected release year to every song.

    Parameters
    ----------
    data_path : path of the lyrics CSV; it must contain the columns
        'index', 'song', 'artist', 'genre' and 'lyrics'.
    path_correction : path of a pickled mapping from a row's 'index' value to
        a sequence whose first element is a date string starting with the
        four-digit year ('' when the date is unknown).

    Returns
    -------
    DataFrame with the columns song, year, artist, genre and lyrics, restricted
    to songs whose corrected year is later than 1960. Songs with an unknown
    date get year 0 and are therefore dropped.
    """
    df = pd.read_csv(data_path)

    # Read the correction table from the path that was passed in (not from a
    # module-level constant) and close the file as soon as it is loaded.
    # NOTE: pickle can execute arbitrary code while loading; only use trusted files.
    with open(path_correction, 'rb') as handle:
        indx2newdate = pickle.load(handle)

    def corrected_year(idx):
        date = indx2newdate[idx][0]
        return int(date[:4]) if date != '' else 0

    df['year'] = df['index'].apply(corrected_year)
    return df[df.year > 1960][['song', 'year', 'artist', 'genre', 'lyrics']]

# Load the lyrics with corrected release years (only songs after 1960 are kept)
df = load_dataset(PATH_DF, PATH_CORRECTION)

In [27]:
# Quick sanity check: display the first rows (at most 20) of the loaded data
df.head(20)

Unnamed: 0,song,year,artist,genre,lyrics
0,ego-remix,2009,beyonce,Pop,Oh baby how you doing You know I'm gonna cut r...
5,all-i-could-do-was-cry,2008,beyonce,Pop,I heard Church bells ringing I heard A choir s...
6,once-in-a-lifetime,2008,beyonce,Pop,This is just another day that I would spend Wa...
9,why-don-t-you-love-me,2009,beyonce,Pop,N n now honey You better sit down and look aro...
16,poison,2009,beyonce,Pop,You're bad for me I clearly get it I don't see...
18,beautiful-liar,2007,beyonce,Pop,Ay Ay Ay Nobody likes to be played Oh Beyonce ...
19,beautiful-liar-spanish,2007,beyonce,Pop,Ay Ay Nobody likes being played Ay Oh Beyonc B...
20,beautiful-liar-spanglish-version,2007,beyonce,Pop,Ay Ay Nobody likes being played Ay Oh Beyonce ...
27,if,2008,beyonce,Pop,He is always laughin and flirting with me And ...
29,flaws-all,2007,beyonce,Pop,I'm a train wreck in the morning I'm a bitch i...


In [None]:
import urllib.request
import re

import urllib.parse  # quote_plus is used below to build a valid query string

# For every song, search YouTube for "<song> <artist> music video" and keep the
# id of the first video link found in the result page (None when there is none).
video_path = []
for song, artist in zip(df.song.values, df.artist.values):
    # Song and artist are hyphenated slugs; turn the hyphens into spaces and let
    # quote_plus encode spaces as '+' and escape any other special character.
    query = f"{song} {artist} music video".replace('-', ' ')
    search_url = "https://www.youtube.com/results?search_query=" + urllib.parse.quote_plus(query)
    # The context manager closes the connection once the page has been read.
    with urllib.request.urlopen(search_url) as response:
        page = response.read().decode()
    video_ids = re.findall(r"watch\?v=(\S{11})", page)
    # A result page without any video link must not abort the whole run.
    video_path.append(video_ids[0] if video_ids else None)

In [39]:
# The following code checks, for each unique artist in the dataset, whether they have a page on IMVDb (the Internet
# Music Video Database). If the request returns an HTTP error, the artist is added to a list.
import urllib3.request
import urllib.error
import re
from tqdm import tqdm
import threading

import urllib.parse
import urllib.request  # urlopen lives in urllib.request; import it here so this cell does not depend on an earlier one

# Collect every artist whose IMVDb page request fails with an HTTP error
# (for example 404), i.e. artists that presumably have no music videos listed.
artist_novids = []
for artist in tqdm(df.artist.unique()):
    # Percent-encode the artist so spaces or non-ASCII characters give a valid URL.
    imvdb_url = 'http://imvdb.com/n/' + urllib.parse.quote(artist)
    try:
        # Only the status matters; the context manager closes the connection
        # without reading the page body.
        with urllib.request.urlopen(imvdb_url):
            pass
    except urllib.error.HTTPError:
        artist_novids.append(artist)

100%|██████████| 8498/8498 [1:37:38<00:00,  1.45it/s] 


In [41]:
# Keep only the songs whose artist is not in the list of artists without music videos
df_cl = df[~df['artist'].isin(artist_novids)]

In [49]:
# Count the non-null entries per column for each genre, i.e. how many songs are left in each genre
df_cl.groupby('genre').count()

Unnamed: 0_level_0,song,year,artist,lyrics
genre,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Country,8753,8753,8753,8753
Electronic,4539,4539,4539,4539
Folk,901,901,901,901
Hip-Hop,12775,12775,12775,12775
Indie,1827,1827,1827,1827
Jazz,4436,4436,4436,4436
Metal,12879,12879,12879,12879
Other,936,936,936,936
Pop,19977,19977,19977,19977
R&B,1949,1949,1949,1949


In [59]:
# Split the cleaned dataframe into one dataframe per genre, keyed by genre name
df_genre = {genre_name: genre_songs for genre_name, genre_songs in df_cl.groupby('genre')}
# Draw a reproducible random sample of 500 songs from every genre
df_sample = {}
for genre_name, genre_songs in df_genre.items():
    df_sample[genre_name] = genre_songs.sample(n=500, random_state=42)

In [74]:
import urllib.request
import urllib.error
import re
from tqdm import tqdm
import numpy as np
import requests

# This function checks whether a web page exists. I tried various methods, and this one seems to be the fastest
def uri_exists_stream(uri: str, timeout: float = 10.0) -> bool:
    """Return True when a GET request to *uri* succeeds, False otherwise.

    The request is streamed, so only the status line and headers are fetched
    and the body is never downloaded.

    Parameters
    ----------
    uri : address to probe.
    timeout : seconds to wait for the server before giving up. Without a
        timeout a single unresponsive server can stall the caller forever.

    Returns
    -------
    True if the final response has a non-error status (below 400). False for
    4xx/5xx responses and for any request failure (connection error, timeout,
    invalid URL, too many redirects, ...).
    """
    try:
        with requests.get(uri, stream=True, timeout=timeout) as response:
            # response.ok is False exactly when raise_for_status() would raise.
            return response.ok
    except requests.exceptions.RequestException:
        # Base class of all requests errors: an unreachable or malformed
        # address counts as "does not exist" instead of aborting the caller.
        return False

# This loop checks for each song in the df_sample df whether a music video exists. If it exists, it retrieves the first youtube
# link after a search on youtube with the song name and artist.
import urllib.parse  # quote_plus is used below to build a valid query string

# For every sampled song: if IMVDb has a page for it, store the first YouTube
# search hit for "<song> <artist> music video"; otherwise store NaN. The links
# end up in a new 'video_path' column of each genre's dataframe.
for genre, songs in df_sample.items():
    video_path = []
    counter = 0
    song_artist_pairs = zip(songs.song.values, songs.artist.values)
    for song, artist in tqdm(song_artist_pairs, total=len(songs)):
        imvdb_url = 'http://imvdb.com/video/' + artist + '/' + song
        video_url = np.nan
        if uri_exists_stream(imvdb_url):
            # Slugs are hyphenated; search with spaces and let quote_plus encode
            # them as '+' while escaping any other special character.
            query = f"{song} {artist} music video".replace('-', ' ')
            search_url = "https://www.youtube.com/results?search_query=" + urllib.parse.quote_plus(query)
            # The context manager closes the connection once the page is read.
            with urllib.request.urlopen(search_url) as response:
                page = response.read().decode()
            video_ids = re.findall(r"watch\?v=(\S{11})", page)
            # A result page without any video link is treated as "no video"
            # instead of raising IndexError and losing the whole run.
            if video_ids:
                video_url = "https://www.youtube.com/watch?v=" + video_ids[0]
                counter += 1
        video_path.append(video_url)
    print(f'{genre} has {counter} videos')
    df_sample[genre]['video_path'] = video_path

100%|██████████| 500/500 [05:36<00:00,  1.49it/s]
  0%|          | 0/500 [00:00<?, ?it/s]

Country has 16 videos


100%|██████████| 500/500 [06:52<00:00,  1.21it/s]
  0%|          | 0/500 [00:00<?, ?it/s]

Electronic has 66 videos


100%|██████████| 500/500 [05:24<00:00,  1.54it/s]
  0%|          | 0/500 [00:00<?, ?it/s]

Folk has 9 videos


100%|██████████| 500/500 [06:03<00:00,  1.37it/s]
  0%|          | 0/500 [00:00<?, ?it/s]

Hip-Hop has 36 videos


100%|██████████| 500/500 [18:11<00:00,  2.18s/it]    
  0%|          | 0/500 [00:00<?, ?it/s]

Indie has 58 videos


100%|██████████| 500/500 [05:21<00:00,  1.56it/s]
  0%|          | 0/500 [00:00<?, ?it/s]

Jazz has 3 videos


100%|██████████| 500/500 [05:27<00:00,  1.53it/s]
  0%|          | 0/500 [00:00<?, ?it/s]

Metal has 16 videos


100%|██████████| 500/500 [27:43<00:00,  3.33s/it]    
  0%|          | 0/500 [00:00<?, ?it/s]

Other has 34 videos


100%|██████████| 500/500 [09:23<00:00,  1.13s/it]  
  0%|          | 0/500 [00:00<?, ?it/s]

Pop has 41 videos


100%|██████████| 500/500 [05:26<00:00,  1.53it/s]
  0%|          | 0/500 [00:00<?, ?it/s]

R&B has 18 videos


100%|██████████| 500/500 [05:51<00:00,  1.42it/s]

Rock has 26 videos





In [52]:
# This function downloads a video from YouTube to your downloads folder. I tried it once with a video in the video_path list
import youtube_dl

def download(url: str, options: dict):
    """Download a single video with youtube_dl.

    url: address (or id) of the video to fetch.
    options: settings dictionary handed to the YoutubeDL constructor.
    """
    downloader = youtube_dl.YoutubeDL(options)
    with downloader:
        # youtube_dl expects a list of URLs, even for a single video.
        downloader.download([url])
        
# Format code handed to youtube_dl; presumably '133' selects YouTube's
# low-resolution video-only MP4 stream - TODO confirm
options = {'format':'133'}  

# Trial run: download the first entry of video_path
download(video_path[0], options)

[youtube] iW5EzxFR4SM: Downloading webpage
[youtube] iW5EzxFR4SM: Downloading player f93a7034
[download] Destination: Beyoncé - Ego (Remix) ft. Kanye West-iW5EzxFR4SM.mp4
[download] 100% of 6.52MiB in 00:0194MiB/s ETA 00:009


In [77]:
# Write the cleaned_songs dataset (all songs except those by artists without music videos) to CSV
df_cl.to_csv('cleaned_songs.csv')

In [78]:
# Concatenate all genres after finding video links; because df_sample is a dict,
# its genre keys become the outer level of the resulting index
df_sample_concat = pd.concat(df_sample)

In [87]:
# Keep only the songs for which a video link was found and write them to CSV
df_sample_concat[df_sample_concat.video_path.notnull()].to_csv('music_videos.csv')