#### Import Necessary Modules/Libraries

In [3]:
import os
import requests
import csv
from bs4 import BeautifulSoup
import time
import re

#### Define Functions

In [6]:
def scrape_billboard(year, top_n=10, timeout=30):
    """
    Scrapes the Billboard Year-End Hot 100 singles for a given year.

    Args:
        year (str): The year for which to scrape the Billboard data.
        top_n (int): How many table rows after the header row to read (default is 10).
        timeout (float): Seconds to wait for the server before giving up (default is 30).

    Returns:
        list: A list of tuples containing the year, rank, song title, and artist name
            for each of the rows read that has more than two cells.

    Raises:
        requests.exceptions.RequestException: If the request fails, times out,
            or the server answers with an HTTP error status.
        ValueError: If the page contains no table with the 'wikitable' class.
    """
    # Construct URL using year
    url = f'https://en.wikipedia.org/wiki/Billboard_Year-End_Hot_100_singles_of_{year}'

    # Without a timeout a stalled connection would block the whole run indefinitely
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()  # Raise an error for bad responses
    document = BeautifulSoup(response.text, 'html.parser')

    # Find list of songs on webpage
    songs_table = document.find('table', class_='wikitable')
    if songs_table is None:
        # find() returns None when nothing matches; fail with a message that names the page
        raise ValueError(f"No 'wikitable' table found at {url}")

    song_data = []

    # Get the top songs; index 0 is the header row, so start at 1
    for row in songs_table.find_all('tr')[1:top_n + 1]:
        columns = row.find_all('td')
        if len(columns) > 2:
            rank = columns[0].text.strip()
            song_title = columns[1].text.strip().replace('"', '')  # Remove quotes
            artist_name = columns[2].text.strip()
            song_data.append((year, rank, song_title, artist_name))

    return song_data  # Returns list of tuples (year, rank, song_title, artist_name)


# Bracketed annotations such as "(Put a Ring on It)" or "[a]" are dropped before slugging
_BRACKETED_TEXT = re.compile(r'\(.*?\)|\[.*?\]')
# Everything that is not a word character: punctuation and every kind of whitespace
# (plain spaces, but also tabs, newlines and non-breaking spaces)
_NON_WORD = re.compile(r'\W+')

# Artist slugs that are swapped for a hand-picked alternative after cleaning
_ARTIST_REPLACEMENTS = {
    'matchboxtwenty': 'matchbox20',
    'thecalling': 'calling',
    'beyoncé': 'beyonceknowles',
    'thepussycatdolls': 'pussycatdolls',
    'theblackeyedpeas': 'blackeyedpeas',
    'theallamericanrejects': 'allamericanrejects',
    'kesha': 'keha',
    'thewanted': 'wanted',
    'macklemoreryanlewis': 'macklemore',
    'silentó': 'silento',
    'theweeknd': 'weeknd',
    'thechainsmokers': 'chainsmokers',
    'cardibbadbunny': 'cardi-b',
    'zeddmarenmorris': 'zedd',
}

# Title slugs that are swapped for a hand-picked alternative after cleaning
_TITLE_REPLACEMENTS = {
    'independentwomenparti': 'independentwomen',
    'buyuadrank': 'buyuadrankshawtysnappin',
    'singleladies': 'singleladiesputaringonit',
    'igottafeeling': 'igottafeelin',
    'stronger': 'whatdoesntkillyoustronger',
    'watchme': 'watchmewhipnaenae',
}


def _slugify(text):
    """Removes bracketed text, punctuation and all whitespace from text and lowercases it."""
    return _NON_WORD.sub('', _BRACKETED_TEXT.sub('', text)).lower()


def format_song_data(song_data):
    """
    Formats the song data by cleaning the artist names and song titles.

    Each artist name is cut down to the first credited artist (the text before
    " featuring " and before " and "). Artist and title are then stripped of
    bracketed text, punctuation and whitespace, lowercased, and finally passed
    through the replacement tables above.

    Args:
        song_data (list): A list of tuples containing the year, rank, song title, and artist name.

    Returns:
        list: A list of formatted tuples containing the year, rank, cleaned song title, and cleaned artist name.
    """
    formatted_data = []

    for year, rank, song_title, artist_name in song_data:
        # Keep only the first credited artist
        first_artist = artist_name.split(" featuring ")[0].split(" and ")[0]
        formatted_artist = _slugify(first_artist)
        # Replace the artist name if it matches any key in the replacements dictionary
        formatted_artist = _ARTIST_REPLACEMENTS.get(formatted_artist, formatted_artist)

        formatted_title = _slugify(song_title)
        # Replace the title if it matches any key in the replacements dictionary
        formatted_title = _TITLE_REPLACEMENTS.get(formatted_title, formatted_title)

        formatted_data.append((year, rank, formatted_title, formatted_artist))

    return formatted_data


def scrape_lyrics(formatted_title, formatted_artist, timeout=30):
    """
    Scrapes the lyrics for a given song and artist from AZLyrics.

    Args:
        formatted_title (str): The cleaned song title, used as the file name in the URL.
        formatted_artist (str): The cleaned artist name, used as the directory name in the URL.
        timeout (float): Seconds to wait for the server before giving up (default is 30).

    Returns:
        str: The text of every <div> that has neither a class nor an id attribute,
            joined with newlines (an empty string if the page has no such <div>),
            or None if the request fails.
    """
    # Construct URL using formatted title and artist
    url = f'https://www.azlyrics.com/lyrics/{formatted_artist}/{formatted_title}.html'

    try:
        # Without a timeout a stalled connection would block the whole run indefinitely;
        # a timeout is raised as a RequestException and handled below
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # Raise an error for bad responses
    except requests.exceptions.RequestException as e:
        # Print error message if issue fetching the lyrics
        print(f"Error fetching lyrics for {formatted_title} by {formatted_artist}: {e}")
        return None

    document = BeautifulSoup(response.text, 'html.parser')

    # Find all <div> elements where lyrics are found
    lyrics_tag = document.find_all("div", attrs={"class": None, "id": None})

    # Extract text from each lyric tag, strip leading/trailing whitespace and join into one string
    return "\n".join(content.get_text(separator=" ").strip() for content in lyrics_tag)


# Years whose lyrics go to 'primary_set' (the 00s); any other year goes to 'secondary_set'
_PRIMARY_SET_YEARS = frozenset(str(y) for y in range(2000, 2010))


def save_lyrics_to_file(year, lyrics):
    """
    Appends the given lyrics to the text file for the given year.

    The file is '<directory>/<year>.txt', where the directory is 'primary_set'
    for the years 2000-2009 and 'secondary_set' for any other year. The
    directory is created if needed, and the lyrics are appended followed by a
    blank line, so repeated calls for one year accumulate in the same file.

    Args:
        year (str or int): The year for which the lyrics are being saved.
        lyrics (str): The lyrics to save in the text file.
    """
    # Decide from the year itself rather than from a list defined elsewhere,
    # so the function also works when called on its own
    directory = 'primary_set' if str(year) in _PRIMARY_SET_YEARS else 'secondary_set'

    os.makedirs(directory, exist_ok=True)  # Create directory if it doesn't exist
    with open(os.path.join(directory, f'{year}.txt'), 'a', encoding='utf-8') as lyric_file:
        lyric_file.write(lyrics + "\n\n")  # Append lyrics for the year


def create_csv_file(songs, filename='overview.csv'):
    """
    Writes the given songs to a CSV file, one row per song below a header row.

    An existing file with the same name is overwritten. Each row is also
    printed before it is written.

    Args:
        songs (list): A list of tuples containing the year, rank, song title, and artist name.
        filename (str): The name of the CSV file to create (default is 'overview.csv').
    """
    header = ['Year', 'Rank', 'Song Title', 'Artist']

    with open(filename, 'w', newline='', encoding='utf-8') as out:
        writer = csv.writer(out)
        writer.writerow(header)

        for entry in songs:
            year, rank, song_title, artist_name = entry
            # Debugging line: echo the row about to be written
            print(f"Writing to CSV: Year={year}, Rank={rank}, Title={song_title}, Artist={artist_name}")
            writer.writerow([year, rank, song_title, artist_name])

#### Scraping and Saving Lyrics for Both Decades

In [16]:
# Years of each decade, as strings
decade_1 = list(map(str, range(2000, 2010)))
decade_2 = list(map(str, range(2010, 2020)))

# Formatted song tuples collected per decade
songs_decade_1 = []
songs_decade_2 = []

# Scrape the Billboard chart for every year and store the formatted rows
# in the list belonging to that year's decade
for years, collected in ((decade_1, songs_decade_1), (decade_2, songs_decade_2)):
    for year in years:
        collected.extend(format_song_data(scrape_billboard(year)))

**2000s**

In [122]:
# Scrape and save the lyrics of every 2000s song already collected in songs_decade_1.
# The Billboard chart is not scraped or appended here again: songs_decade_1 was
# filled when the year lists were created, and extending it a second time would
# put every song into the CSV twice.
for year, rank, formatted_title, formatted_artist in songs_decade_1:
    lyrics = scrape_lyrics(formatted_title, formatted_artist)
    if lyrics:
        save_lyrics_to_file(year, lyrics)
        print(f"Lyrics for {formatted_title} by {formatted_artist} have been saved in {year} file.\n")
    else:
        print(f"Failed to scrape lyrics for: {formatted_title} by {formatted_artist} from {year}")

    time.sleep(10) # Use sleep to avoid triggering anti-bot measures

Lyrics for breathe by faithhill have been saved in 2000 file.

Lyrics for smooth by santana have been saved in 2000 file.

Lyrics for mariamaria by santana have been saved in 2000 file.

Lyrics for iwannaknow by joe have been saved in 2000 file.

Lyrics for everythingyouwant by verticalhorizon have been saved in 2000 file.

Lyrics for saymyname by destinyschild have been saved in 2000 file.

Lyrics for iknewilovedyou by savagegarden have been saved in 2000 file.

Lyrics for amazed by lonestar have been saved in 2000 file.

Lyrics for bent by matchbox20 have been saved in 2000 file.

Lyrics for hewasntmanenough by tonibraxton have been saved in 2000 file.

Lyrics for hangingbyamoment by lifehouse have been saved in 2001 file.

Lyrics for fallin by aliciakeys have been saved in 2001 file.

Lyrics for allforyou by janetjackson have been saved in 2001 file.

Lyrics for dropsofjupiter by train have been saved in 2001 file.

Lyrics for imreal by jenniferlopez have been saved in 2001 file.

L

**2010s**

In [109]:
# Scrape and save the lyrics of every 2010s song already collected in songs_decade_2.
# The Billboard chart is not scraped or appended here again: songs_decade_2 was
# filled when the year lists were created, and extending it a second time would
# put every song into the CSV twice.
for year, rank, formatted_title, formatted_artist in songs_decade_2:
    lyrics = scrape_lyrics(formatted_title, formatted_artist)
    if lyrics:
        save_lyrics_to_file(year, lyrics)
        print(f"Lyrics for {formatted_title} by {formatted_artist} have been saved in {year} file.\n")
    else:
        print(f"Failed to scrape lyrics for: {formatted_title} by {formatted_artist} from {year}")

    time.sleep(10) # Use sleep to avoid triggering anti-bot measures


Lyrics for tiktok by keha have been saved in 2010 file.

Lyrics for needyounow by ladyantebellum have been saved in 2010 file.

Lyrics for heysoulsister by train have been saved in 2010 file.

Lyrics for californiagurls by katyperry have been saved in 2010 file.

Lyrics for omg by usher have been saved in 2010 file.

Lyrics for airplanes by bob have been saved in 2010 file.

Lyrics for lovethewayyoulie by eminem have been saved in 2010 file.

Lyrics for badromance by ladygaga have been saved in 2010 file.

Lyrics for dynamite by taiocruz have been saved in 2010 file.

Lyrics for breakyourheart by taiocruz have been saved in 2010 file.

Lyrics for rollinginthedeep by adele have been saved in 2011 file.

Lyrics for partyrockanthem by lmfao have been saved in 2011 file.

Lyrics for firework by katyperry have been saved in 2011 file.

Lyrics for et by katyperry have been saved in 2011 file.

Lyrics for givemeeverything by pitbull have been saved in 2011 file.

Lyrics for grenade by brunoma

In [19]:
# Merge the songs of both decades (2000s first) and write them to the overview CSV
all_songs = [*songs_decade_1, *songs_decade_2]
create_csv_file(all_songs)

Writing to CSV: Year=2000, Rank=1, Title=breathe, Artist=faithhill
Writing to CSV: Year=2000, Rank=2, Title=smooth, Artist=santana
Writing to CSV: Year=2000, Rank=3, Title=mariamaria, Artist=santana
Writing to CSV: Year=2000, Rank=4, Title=iwannaknow, Artist=joe
Writing to CSV: Year=2000, Rank=5, Title=everythingyouwant, Artist=verticalhorizon
Writing to CSV: Year=2000, Rank=6, Title=saymyname, Artist=destinyschild
Writing to CSV: Year=2000, Rank=7, Title=iknewilovedyou, Artist=savagegarden
Writing to CSV: Year=2000, Rank=8, Title=amazed, Artist=lonestar
Writing to CSV: Year=2000, Rank=9, Title=bent, Artist=matchbox20
Writing to CSV: Year=2000, Rank=10, Title=hewasntmanenough, Artist=tonibraxton
Writing to CSV: Year=2001, Rank=1, Title=hangingbyamoment, Artist=lifehouse
Writing to CSV: Year=2001, Rank=2, Title=fallin, Artist=aliciakeys
Writing to CSV: Year=2001, Rank=3, Title=allforyou, Artist=janetjackson
Writing to CSV: Year=2001, Rank=4, Title=dropsofjupiter, Artist=train
Writing to