# PROJECT NOTEBOOK PART 1
#### THIS NOTEBOOK IS DEDICATED TO PREPARATION OF THE AUDIO SAMPLE DATASET AND WEBSCRAPING.

##### 1) Download Songs From Youtube. https://pypi.org/project/yt-dlp
##### 2) Take Sample From a song & Save For Acoustic Feature Extraction,  https://ffmpeg.org/
                                          setx /m PATH "C:\ffmpeg\bin;%PATH%"
                                          pip install -r requirements.txt
##### 3) Webscraping video_title, length , tags using BeautifulSoup https://www.crummy.com/software/BeautifulSoup/
##### 4) Use Regular Expressions to get Lists of Music Genres and sub-Genres from https://www.musicgenreslist.com/
##### 5) READ_EXCEL and Use Data as Input to Our Functions and Automate the Entire Process.

In [1]:
import os
import sys
from time import time
import yt_dlp
from pydub import AudioSegment
# FUNCTION #1
def song_download_split(web_link,destination,sample_name,t_sec, duration = 10):
    """
    Download and store 10 second audio sample from YOUTUBE link.

    Parameters
    ----------
    web_link : string
        The youtube link that we get the audio sample from.
    destination : string
        The destination which we will store the sample .wav file
            >>>> f"{str(os.getcwd())}\datasets\AUDIO\{destination}\{sample_name}.wav"  
    sample_name : string
        Name of the sample for future references.
    t_sec : int
        Timestamp in seconds which is the moment 10 second sample begins.
        
    duration : int
        In seconds, how long the sample is going to be.

    Returns
    -------
    True

    """
    web_link = check_for_http(web_link)
    options = {
      'format': 'bestaudio/best',
      'extractaudio' : True,    # only keep the audio
      'audioformat' : "wav",    # convert to wav
      'outtmpl': 'temp.wav',    # '%(id)s' == name the file the ID of the video
      'noplaylist' : True,      # only download single song, not playlist
    }
    with yt_dlp.YoutubeDL(options) as ydl:
        ydl.download([web_link])
        
    t1 = t_sec * 1000 #Works in milliseconds
    t2 = (t_sec +duration)* 1000
    AudioSegment.converter = r"ffmpeg/bin/ffmpeg.exe" #r"C:\ffmpeg\bin\ffmpeg.exe"
    newAudio = AudioSegment.from_file(r'temp.wav')
    newAudio = newAudio[t1:t2]
    
    sample_name = "".join(i for i in sample_name if i not in "\/:*?<>|") # DROP ILLEGAL CHARS FOR FILE NAMING
    newAudio.export(f"{str(os.getcwd())}\\datasets\\AUDIO\\{destination}\\{sample_name}.wav", format="wav")
    os.remove(r'temp.wav')
    return f"Downloaded:{destination}\\{sample_name}.wav"

# EXTRA FUNCTIONS
def check_for_http(url):
    if url[0:8] == "https://":
        new_url = url
    else:
        new_url = r"https://" + url
    return new_url

# Disable / enable prints to keep the output clean
temp_stdout = None
# Disable
def disablePrint():
    global temp_stdout
    temp_stdout = sys.stdout
    sys.stdout = open(os.devnull, 'w')
def enablePrint():
    global temp_stdout
    sys.stdout = temp_stdout

In [2]:
# TESTING OUR FUNCTION
web_link = "https://www.youtube.com/watch?v=ySk8BaXiHv0"
destination = r'Piano Sample'
sample_name = "piano d3"
t_sec = 16
duration = 10

start_t =time()
output_msg = song_download_split(web_link, destination, sample_name, t_sec,duration)

print(f"{output_msg}\nTime taken : {(time() - start_t):.3f} seconds")

[youtube] ySk8BaXiHv0: Downloading webpage
[youtube] ySk8BaXiHv0: Downloading android player API JSON
[youtube] ySk8BaXiHv0: Downloading MPD manifest
[youtube] ySk8BaXiHv0: Downloading MPD manifest
[info] ySk8BaXiHv0: Downloading 1 format(s): 251
[download] Destination: temp.wav
[download] 100% of 549.49KiB in 00:00                      
Downloaded:Piano Sample\piano d3.wav
Time taken : 3.602 seconds


### WEBSCRAPING FOR GENRE & SUBGENRE NAMES
##### Obtaining the genre names from https://www.musicgenreslist.com/ 
##### Since bottom two cells will store the scraped data, we don't have to run them again after the first time. 
##### We also don't have to use html request and beautifulsoup, we may as well just go ahead and download the source & use regex.

In [3]:
import re
f = open('datasets/web_source.txt','r',encoding ='utf-8')
text=f.readlines()
text="".join(text)
genre_labels =re.findall(r'<li><a title="([^<]*) - Music Genre"',text) # regex to find string of any length between text and text
genre_labels =[s.upper() for s in genre_labels]
print("genre_labels =",f"{genre_labels}\n")
genre_labels= set(genre_labels)

comprehensive_subgenres = re.findall(r"<li>([^<]*)<\/li>",text)
comprehensive_subgenres =[s.upper() for s in comprehensive_subgenres]
comprehensive_subgenres = [re.sub(r'\([^()]*\)', '', s) for s in comprehensive_subgenres] #regex remove parantheses descriptors
comprehensive_subgenres = [s[:-1] if s[-1] == " " else s for s in comprehensive_subgenres] # remove last char if == " "
print("some_sub_genres = ",f"{comprehensive_subgenres[10:20]}\nNumber of sub_genres = {len(comprehensive_subgenres)}")
comprehensive_subgenres = set(comprehensive_subgenres)

genre_labels = ['ALTERNATIVE', 'ANIME', 'BLUES', "CHILDREN'S MUSIC", 'CLASSICAL', 'COMEDY', 'COUNTRY', 'DANCE', 'DISNEY', 'EASY LISTENING', 'ELECTRONIC', 'ENKA', 'FRENCH POP', 'GERMAN FOLK', 'GERMAN POP', 'FITNESS &AMP; WORKOUT', 'HIP-HOP/RAP', 'HOLIDAY', 'INDIE POP', 'INDIE POP', 'CHRISTIAN &AMP; GOSPEL', 'INSTRUMENTAL', 'J-POP', 'JAZZ', 'K-POP', 'KARAOKE', 'KAYOKYOKU', 'LATINO', 'NEW AGE', 'OPERA', 'POP', 'R&AMP;B/SOUL', 'REGGAE', 'ROCK', 'SINGER/SONGWRITER', 'SOUNDTRACK', 'SPOKEN WORD', 'VOCAL', 'WORLD']

some_sub_genres =  ['GRUNGE', 'HARDCORE PUNK', 'HARD ROCK', 'INDIE ROCK', 'LO-FI', 'MUSIQUE CONCRÈTE', 'NEW WAVE', 'PROGRESSIVE ROCK', 'PUNK', 'SHOEGAZE']
Number of sub_genres = 819


In [4]:
# Save the file for easy access in another project 
# We are also going to use pickle again when the ML model is trained.
import pickle
outfile = open(r"datasets/genre_set",'wb') #write binary
pickle.dump(genre_labels,outfile)
outfile.close()

outfile = open(r"datasets/subgenre_set",'wb')
pickle.dump(comprehensive_subgenres,outfile)
outfile.close()

# FUNCTIONS FOR WEBSCRAPING THE METADATA

In [5]:
from requests_html import AsyncHTMLSession 
from bs4 import BeautifulSoup as bs
import re
import asyncio
import nest_asyncio # we need nested async because jupyter notebook has its own event loop running
nest_asyncio.apply()

#FUNCTION #2 
filter_genre_labels = lambda song_tags : {tag for tag in song_tags if tag in genre_labels}

#FUNCTION #3
filter_sub_genre_labels = lambda song_tags : {tag for tag in song_tags if tag in comprehensive_subgenres}

#sub-FUNCTION  of #4
async def create_soup(video_url):
    # init an HTML Session
    asession = AsyncHTMLSession()
    # get the html content
    response = await asession.get(video_url)
    # create bs object to parse HTML
    soup = bs(response.html.html, "html.parser")
    #soup.find_all("meta")
    return soup

# FUNCTION #4
def get_youtube_data(video_url):
    """
    Parameters
    ----------
    video_url : string
        The youtube link that we scrape the data from.
        
    Returns
    -------
    title : string
        Title of the video.
        
    tags : set
        Every tag of the video in uppercase.(additionally splits and adds multiple word tags.)
        
    duration_in_seconds : int
        Duration of the video calculated from meta data.
    """
    video_url = check_for_http(video_url)
    soup = asyncio.run(create_soup(video_url))
    
    # GETTING THE INFORMATION WE NEED
    title = soup.find("meta", itemprop="name")["content"]
    
    # TAGS
    tags =[meta.attrs.get("content") for meta in soup.find_all("meta", {"property": "og:video:tag"})]
    for item in tags: # IF A TAG CONSISTS OF MULTIPLE WORDS, SPLIT AND ADD THEM AS NEW TAGS 
        item= item.split()
        if len(item) > 1:
            tags += item
    tags= {tag.upper() for tag in tags}
    
    # DURATION
    # duration is stored in a weird string
    duration_string = soup.find("meta",itemprop = "duration")["content"]
    list_dur= re.findall("\\d+",duration_string) #regex to find decimals in string  1,23,33
    
    coefs=[1,60,3600] # second,minute,hour >> in seconds 
    duration_in_seconds = 0
    for item,second in zip(list_dur[::-1],coefs[0:len(list_dur)]):
        duration_in_seconds += int(item)*second
        
    return (title, tags, duration_in_seconds)  #P4M43ST

In [6]:
# Testing our functions
title, tags, length =  get_youtube_data("https://www.youtube.com/watch?v=tKi9Z-f6qX4")
print(filter_genre_labels(tags))
print(filter_sub_genre_labels(tags))
print(F"LENGTH IN SECONDS: {length}")
print(title)

{'DANCE', 'POP', 'ELECTRONIC'}
{'MINIMAL', 'DISCO', 'PROGRESSIVE', 'ELECTRONIC DANCE MUSIC'}
LENGTH IN SECONDS: 637
deadmau5 - Strobe


### ALL OF OUR FUNCTIONS APPLIED TO THE EXCEL TABLE IN ONE LOOP

In [7]:
import pandas as pd
excel = pd.read_excel("datasets/songs_urls.xlsx")
excel.columns = excel.iloc[0]
excel = excel[1:]
excel=excel.dropna()


to_download = excel
#to_download = excel[138:]
#to_download =excel.iloc[-16:]
#to_download = excel[excel["Genre"] == "Turkish Rap"]
print(excel.Genre.value_counts())


Turkish Pop                99
Turkish Rock               95
Turkish Slow               80
Turkish Classical Music    62
Turkish Folk Music         61
Turkish Rap                51
Piano Sample                1
Name: Genre, dtype: int64


In [8]:
start = time()
# FOR EVERY COLUMN VALUE IN A ROW
for idx,row in to_download.iterrows():
    #print("----------------------------------")
    name, url, s, genre = row.T.to_list()         # unpack the values in a row
    title, tags, length =  get_youtube_data(url)  #4
    title = re.sub('[^\w_.)( -]', '', title)
    
    songs = os.listdir(str(os.getcwd()) + f"/datasets/AUDIO/{genre}")
    if f'{title}.wav' in songs:
        song_download_split(url, genre , title+'2', s) #1
    else:
        song_download_split(url, genre , title, s) # we dont use more than 2 sample from one song so this logic is enough
    print(idx)
    print(title)
    #print(length)
    #print("--TAGS--")
    #print(filter_genre_labels(tags))               #2
    #print(filter_sub_genre_labels(tags))           #3
    
print(f"--------------------------------\nIt took : {time()-start:.2f} seconds to download & get info about {len(to_download)} songs")

[youtube] 12zcvCdtp4Q: Downloading webpage
[youtube] 12zcvCdtp4Q: Downloading android player API JSON
[info] 12zcvCdtp4Q: Downloading 1 format(s): 251
[download] Destination: temp.wav
[download] 100% of 4.85MiB in 00:01                  
1
Leifur James - Mumma Dont Tell
[youtube] HGC1cprNels: Downloading webpage
[youtube] HGC1cprNels: Downloading android player API JSON
[info] HGC1cprNels: Downloading 1 format(s): 251
[download] Destination: temp.wav
[download] 100% of 2.77MiB in 00:00                  
2
Ece Mumay - Vanilya
[youtube] J9DvDxM_ur8: Downloading webpage
[youtube] J9DvDxM_ur8: Downloading android player API JSON
[info] J9DvDxM_ur8: Downloading 1 format(s): 251
[download] Destination: temp.wav
[download] 100% of 3.57MiB in 00:00                  
3
Demet Akalın - Bensiz Olsun
[youtube] BebQwqnk0hM: Downloading webpage
[youtube] BebQwqnk0hM: Downloading android player API JSON
[info] BebQwqnk0hM: Downloading 1 format(s): 251
[download] Destination: temp.wav
[download] 100% o

In [9]:
tags

{'(OFFICIAL',
 'ALBÜM',
 'DINLE',
 'GÖKSEL',
 'GÖKSEL DINLE',
 'GÖKSEL SON ALBÜM',
 'GÖKSEL SON ALBÜM DINLE',
 'GÖKSEL YENI ALBÜM',
 'GÖKSEL YENI ALBÜM DINLE',
 'GÖKSEL YENI KLIBI',
 'GÖKSEL YENI KLIP',
 'GÖKSEL YENI VIDEO',
 'GÖKSEL YENI ŞARKI',
 'GÖKSEL YENI ŞARKISI',
 'GÖRKSEL',
 'GÖRKSEL YENI ŞARKI DINLE',
 'KLIBI',
 'KLIP',
 'MÜZIK',
 'MÜZIK DINLE',
 'ORDA',
 'POP',
 'SEN',
 'SEN ORDA YOKSUN',
 'SEN ORDA YOKSUN (OFFICIAL VIDEO)',
 'SEN ORDA YOKSUN KLIP',
 'SEN ORDA YOKSUN VIDEO',
 'SON',
 'TÜRKÇE',
 'TÜRKÇE POP',
 'TÜRKÇE POP DINLE',
 'VIDEO',
 'VIDEO)',
 'YENI',
 'YOKSUN',
 'ŞARKI',
 'ŞARKISI'}