Importing Libraries

In [1]:
import os
import datetime
import re

# for the lyrics scrape section
import requests
import time
from bs4 import BeautifulSoup
from collections import defaultdict, Counter
import random


Lyrics Scrape

In [2]:
artists = {'taylor':"https://www.azlyrics.com/t/taylorswift.html",
           'ed':"https://www.azlyrics.com/e/edsheeran.html"}
# we'll use this dictionary to hold both the artist name and the link on AZlyrics

A Note on Rate Limiting

Part 1: Finding Links to Songs Lyrics

Q: Take a look at the robots.txt page on www.azlyrics.com. (You can read more about these pages here.) Is the scraping we are about to do allowed or disallowed by this page? How do you know?

A: It's disallowed by this page. Search https://www.azlyrics.com/robots.txt, we can get the info as below:

User-agent: *

Disallow: /lyricsdb/

Disallow: /song/

Allow: /

User-agent: 008

Disallow: /

In [4]:
# Let's set up a dictionary of lists to hold our link

lyrics_pages = defaultdict(list)

for artist, artist_page in artists.items():
    # request the page and sleep
    r = requests.get(artist_page)
    time.sleep(5 + 10 * random.random())

    # now extract the links to lyrics pages from this page
    # store the links `lyrics_pages` where the key is the artist and the
    # value is a list of links.
    if r.status_code == 200:
        soup = BeautifulSoup(r.text, 'html.parser')
        links =soup.find_all("a", href=True)

        if links:
            # Extract the href attribute from each link and store in lyrics_pages
            lyrics_pages[artist] = [link['href'] for link in links]
            print(f"Found {len(lyrics_pages[artist])} lyrics pages for {artist}")
        else:
            print(f"No lyrics links found for {artist}")

    # now extract the links to lyrics pages from this page
    # store the links `lyrics_pages` where the key is the artist and the
    # value is a list of links.
    # Check if the request was successful (status code 200)


Found 1 lyrics pages for taylor
Found 1 lyrics pages for ed


In [5]:
for artist, lp in lyrics_pages.items() :
    assert(len(set(lp)) > 20)

AssertionError: 

In [6]:
# Let's see how long it's going to take to pull these lyrics
# if we're waiting `5 + 10*random.random()` seconds
for artist, links in lyrics_pages.items() :
    print(f"For {artist} we have {len(links)}.")
    print(f"The full pull will take for this artist will take {round(len(links)*10/3600,2)} hours.")

For taylor we have 1.
The full pull will take for this artist will take 0.0 hours.
For ed we have 1.
The full pull will take for this artist will take 0.0 hours.


Part 2: Pulling Lyrics

In [7]:
def generate_filename_from_link(link) :

    if not link :
        return None

    # drop the http or https and the html
    name = link.replace("https","").replace("http","")
    name = link.replace(".html","")

    name = name.replace("/lyrics/","")

    # Replace useless chareacters with UNDERSCORE
    name = name.replace("://","").replace(".","_").replace("/","_")

    # tack on .txt
    name = name + ".txt"

    return(name)

In [8]:
# Make the lyrics folder here. If you'd like to practice your programming, add functionality
# that checks to see if the folder exists. If it does, then use shutil.rmtree to remove it and create a new one.
import shutil
if os.path.isdir("lyrics") :
    shutil.rmtree("lyrics/")

os.mkdir("lyrics")

In [9]:
from urllib.parse import urljoin

url_stub = "https://www.azlyrics.com"
start = time.time()
output_folder = "lyrics"

total_pages = 0

for artist, artist_page in artists.items():
    artist_folder = os.path.join(output_folder, artist)

    # 1. Build a subfolder for the artist
    if not os.path.exists(artist_folder):
        os.makedirs(artist_folder)

    # 2. Iterate over the lyrics pages
    for lyrics_page in lyrics_pages:
        # 3. Request the lyrics page.
        lyrics_url = urljoin(url_stub, lyrics_page[2:])  # Use urljoin for correct URL formatting
        lyrics_request = requests.get(artist_page)
        time.sleep(5 + 10 * random.random())

        if lyrics_request.status_code == 200:
            lyrics_soup = BeautifulSoup(lyrics_request.text, 'html.parser')

            # 4. Extract the title and lyrics from the page.
            title = lyrics_soup.find('b', class_=None).text.strip()

            # Find all div elements containing the lyrics
            lyrics_elements = lyrics_soup.find_all('div', class_=None, recursive=False)

            # Check if there are any elements in the list
if lyrics_elements:
    # Extract text from each tag inside the lyrics elements
      lyrics = '\n'.join([tag.get_text(separator='\n').strip() for lyrics_element in lyrics_elements for tag in lyrics_element.find_all(['br', 'p'])])

            # 5. Write out the title, two returns ('\n'), and the lyrics.
      filename = generate_filename_from_link(lyrics_page)
      filepath = os.path.join(artist_folder, filename)
      with open(filepath, 'w', encoding='utf-8') as file:
          file.write(title + '\n\n' + lyrics)
else:
    print(f"No lyrics elements found for {lyrics_page}")
            # Add a break statement if you only want to pull a certain number of songs per artist
            # if total_pages >= 20:
            #     break



    # Use this space to carry out the following steps:

    # 1. Build a subfolder for the artist
    # 2. Iterate over the lyrics pages
    # 3. Request the lyrics page.
        # Don't forget to add a line like `time.sleep(5 + 10*random.random())`
        # to sleep after making the request
    # 4. Extract the title and lyrics from the page.
    # 5. Write out the title, two returns ('\n'), and the lyrics. Use `generate_filename_from_url`
    #    to generate the filename.

    # Remember to pull at least 20 songs per artist. It may be fun to pull all the songs for the artist

No lyrics elements found for ed


In [10]:
print(f"Total run time was {round((time.time() - start)/3600,2)} hours.")

Total run time was 0.01 hours.



Evaluation

In [11]:
# Simple word extractor from Peter Norvig: https://norvig.com/spell-correct.html
def words(text):
    return re.findall(r'\w+', text.lower())

Checking Lyrics

In [12]:
artist_folders = os.listdir("lyrics/")
artist_folders = [f for f in artist_folders if os.path.isdir("lyrics/" + f)]

for artist in artist_folders :
    artist_files = os.listdir("lyrics/" + artist)
    artist_files = [f for f in artist_files if 'txt' in f or 'csv' in f or 'tsv' in f]

    print(f"For {artist} we have {len(artist_files)} files.")

    artist_words = []

    for f_name in artist_files :
        with open("lyrics/" + artist + "/" + f_name) as infile :
            artist_words.extend(words(infile.read()))


    print(f"For {artist} we have roughly {len(artist_words)} words, {len(set(artist_words))} are unique.")

For ed we have 0 files.
For ed we have roughly 0 words, 0 are unique.
For taylor we have 0 files.
For taylor we have roughly 0 words, 0 are unique.
