# Web crawler & song pre-processing

In [5]:
import os
import requests
from selenium import webdriver
from bs4 import BeautifulSoup
import time

# Crawl the songs

In [3]:
songs_url = "https://genius.com/artists/The-weeknd/songs"

# Drive a real browser and keep scrolling until the page stops growing,
# then capture the resulting HTML in `page_source`.
driver = webdriver.Chrome()
try:
    driver.get(songs_url)

    # Give the initial page a moment to render before measuring it.
    time.sleep(2)

    last_height = driver.execute_script("return document.body.scrollHeight")

    scroll_offset = 1000
    # Scroll at most fifty times; stop early once the page height is stable.
    for _ in range(50):
        # Scroll to `scroll_offset` pixels above the bottom of the page.
        # NOTE(review): presumably this keeps the lazy-load trigger in view -
        # confirm against the site's behaviour.
        driver.execute_script(f"window.scrollTo(0, {last_height - scroll_offset});")
        # Wait for any newly requested entries to be appended.
        time.sleep(2)
        new_height = driver.execute_script("return document.body.scrollHeight")

        if new_height == last_height:
            break
        last_height = new_height

    page_source = driver.page_source
finally:
    # Always close the browser, even when one of the steps above raised.
    driver.quit()
print("Finish crawling")

Finish crawling


In [4]:
# Parse the captured page and collect the href of every anchor that carries
# the song-list link classes.
soup = BeautifulSoup(page_source, "html.parser")
song_anchors = soup.find_all("a", class_="ListItem__Link-sc-122yj9e-1 klWOzg")
song_links = [anchor.get("href") for anchor in song_anchors]

# Quick sanity check: how many links were found, and a small sample.
print(len(song_links))
print(song_links[:5])

780
['https://genius.com/The-weeknd-the-hills-lyrics', 'https://genius.com/Lil-uzi-vert-xo-tour-llif3-lyrics', 'https://genius.com/The-weeknd-starboy-lyrics', 'https://genius.com/The-weeknd-cant-feel-my-face-lyrics', 'https://genius.com/The-weeknd-reminder-lyrics']


In [5]:
links_path = "links.txt"

# Keep only the links whose lower-cased URL mentions both of these parts.
required_parts = ('the-weeknd', 'lyrics')
filtered_links = [
    url for url in song_links
    if all(part in url.lower() for part in required_parts)
]
print(len(filtered_links))

# Persist the kept links, one per line.
with open(links_path, 'w', encoding='utf-8') as f:
    f.writelines(url + '\n' for url in filtered_links)
print(filtered_links[:5])

436
['https://genius.com/The-weeknd-the-hills-lyrics', 'https://genius.com/The-weeknd-starboy-lyrics', 'https://genius.com/The-weeknd-cant-feel-my-face-lyrics', 'https://genius.com/The-weeknd-reminder-lyrics', 'https://genius.com/The-weeknd-call-out-my-name-lyrics']


In [8]:
import re

# Load the saved links, one per line, ignoring blank lines. The file is read
# with the same encoding it was written with.
with open('links.txt', 'r', encoding='utf-8') as file:
    links = [line.strip() for line in file if line.strip()]

# Create the output folder once, before any song is written.
os.makedirs('Songs', exist_ok=True)

# Captures the "<artist>-<title>" slug of a link that ends in "-lyrics".
title_pattern = re.compile(r'genius\.com/(.*?)-lyrics$')

for link in links:
    match = title_pattern.search(link)
    if match is None:
        # A link that does not end in "-lyrics" has no usable title; skip it
        # instead of crashing the whole run.
        print(f"Skipped {link}: it does not look like a lyrics page.")
        continue
    song_title = match.group(1).replace('-', ' ')
    song_path = f'Songs/{song_title}.txt'

    try:
        # The timeout keeps one stalled request from hanging the crawl.
        response = requests.get(link, timeout=30)
    except requests.RequestException:
        print("Can\'t visit the page!")
        continue
    if response.status_code != 200:
        print("Can\'t visit the page!")
        continue

    soup = BeautifulSoup(response.text, 'html.parser')

    # The page keeps the lyric inside div elements with the class
    # "Lyrics__Container-sc-1ynbvzw-5 Dzxov"; concatenate the text of all of them.
    text = "".join(
        tag.get_text()
        for tag in soup.find_all('div', class_="Lyrics__Container-sc-1ynbvzw-5 Dzxov")
    )

    with open(song_path, 'w', encoding='utf-8') as f:
        f.write(text)

    print(f"{song_title} has been written into {song_path}.")

print("All lyrics have been written into the song path")

The weeknd the hills has been written into Songs/The weeknd the hills.txt.
The weeknd starboy has been written into Songs/The weeknd starboy.txt.
The weeknd cant feel my face has been written into Songs/The weeknd cant feel my face.txt.
The weeknd reminder has been written into Songs/The weeknd reminder.txt.
The weeknd call out my name has been written into Songs/The weeknd call out my name.txt.
The weeknd die for you has been written into Songs/The weeknd die for you.txt.
The weeknd blinding lights has been written into Songs/The weeknd blinding lights.txt.
The weeknd often has been written into Songs/The weeknd often.txt.
The weeknd heartless has been written into Songs/The weeknd heartless.txt.
The weeknd i feel it coming has been written into Songs/The weeknd i feel it coming.txt.
The weeknd wicked games has been written into Songs/The weeknd wicked games.txt.
The weeknd after hours has been written into Songs/The weeknd after hours.txt.
The weeknd party monster has been written in

The weeknd nothing without you has been written into Songs/The weeknd nothing without you.txt.
Swedish house mafia and the weeknd moth to a flame has been written into Songs/Swedish house mafia and the weeknd moth to a flame.txt.
The weeknd out of time has been written into Songs/The weeknd out of time.txt.
The weeknd dd has been written into Songs/The weeknd dd.txt.
The weeknd till dawn here comes the sun has been written into Songs/The weeknd till dawn here comes the sun.txt.
The weeknd is there someone else has been written into Songs/The weeknd is there someone else.txt.
The weeknd the town has been written into Songs/The weeknd the town.txt.
The weeknd gasoline has been written into Songs/The weeknd gasoline.txt.
The weeknd attention has been written into Songs/The weeknd attention.txt.
The weeknd less than zero has been written into Songs/The weeknd less than zero.txt.
The weeknd until i bleed out has been written into Songs/The weeknd until i bleed out.txt.
The weeknd sacrifice 

The weeknd i wanna feel you has been written into Songs/The weeknd i wanna feel you.txt.
The weeknd heartless remix has been written into Songs/The weeknd heartless remix.txt.
The weeknd die for you sza remix has been written into Songs/The weeknd die for you sza remix.txt.
The weeknd take me back to la myself has been written into Songs/The weeknd take me back to la myself.txt.
The weeknd where do i go has been written into Songs/The weeknd where do i go.txt.
The weeknd wanderlust pharrell remix has been written into Songs/The weeknd wanderlust pharrell remix.txt.
Sabrina claudio and the weeknd christmas blues has been written into Songs/Sabrina claudio and the weeknd christmas blues.txt.
The weeknd wanna see has been written into Songs/The weeknd wanna see.txt.
Metro boomin the weeknd and diddy creepin remix has been written into Songs/Metro boomin the weeknd and diddy creepin remix.txt.
The weeknd get in there has been written into Songs/The weeknd get in there.txt.
The weeknd be go

The weeknd loft music og has been written into Songs/The weeknd loft music og.txt.
The weeknd never go home alone again demo has been written into Songs/The weeknd never go home alone again demo.txt.
The weeknd the weeknd music videos has been written into Songs/The weeknd the weeknd music videos.txt.
The weeknd take my breath remix has been written into Songs/The weeknd take my breath remix.txt.
The weeknd old news has been written into Songs/The weeknd old news.txt.
The weeknd ynp has been written into Songs/The weeknd ynp.txt.
The weeknd one of those nights demo has been written into Songs/The weeknd one of those nights demo.txt.
The weeknd keep it on that xo interlude has been written into Songs/The weeknd keep it on that xo interlude.txt.
The weeknd tongue has been written into Songs/The weeknd tongue.txt.
The weeknd can we get high has been written into Songs/The weeknd can we get high.txt.
The weeknd blinding lights major lazer remix has been written into Songs/The weeknd blindi

The weeknd faith live has been written into Songs/The weeknd faith live.txt.
The weeknd kiss land live has been written into Songs/The weeknd kiss land live.txt.
The weeknd i feel it coming live has been written into Songs/The weeknd i feel it coming live.txt.
The weeknd the morning live has been written into Songs/The weeknd the morning live.txt.
The weeknd out of time live has been written into Songs/The weeknd out of time live.txt.
The weeknd less than zero live has been written into Songs/The weeknd less than zero live.txt.
The weeknd take my breath live has been written into Songs/The weeknd take my breath live.txt.
The weeknd earned it fifty shades of grey marian hill remix has been written into Songs/The weeknd earned it fifty shades of grey marian hill remix.txt.
The weeknd die for you mixed has been written into Songs/The weeknd die for you mixed.txt.
The weeknd cant feel my face live has been written into Songs/The weeknd cant feel my face live.txt.
The weeknd heartless live 

Genius korean translations the weeknd less than zero has been written into Songs/Genius korean translations the weeknd less than zero.txt.
Genius korean translations the weeknd take my breath has been written into Songs/Genius korean translations the weeknd take my breath.txt.
Genius russian translations the weeknd alone again live has been written into Songs/Genius russian translations the weeknd alone again live.txt.
Genius korean translations the weeknd here we go again has been written into Songs/Genius korean translations the weeknd here we go again.txt.
Genius korean translations the weeknd how do i make you love me has been written into Songs/Genius korean translations the weeknd how do i make you love me.txt.
The weeknd ok event 6507 has been written into Songs/The weeknd ok event 6507.txt.
Genius korean translations the weeknd is there someone else has been written into Songs/Genius korean translations the weeknd is there someone else.txt.
Genius korean translations the weeknd

In [1]:
def clean_lyrics(input_text):
    """Return *input_text* with a fixed set of punctuation characters removed.

    The removed characters are the double quote, the single quote, both
    parentheses, the full stop, the ellipsis character, the semicolon and
    the em dash. Every other character is kept unchanged and in order.

    Args:
        input_text: The text to clean.

    Returns:
        A new string without the characters listed above.
    """
    # One translation pass replaces the character-by-character index loop and
    # its repeated string concatenation.
    removal_table = str.maketrans('', '', '"\'().…;—')
    return input_text.translate(removal_table)

In [2]:
def insert_newlines(input_text):
    """Drop bracketed tags and break the text before upper-case letters.

    Everything from a '[' up to the next ']' (brackets included) is removed;
    this strips tags such as [intro] or [verse]. For the remaining text, a
    newline is inserted in front of every upper-case character unless the
    character immediately before it in *input_text* is a space or a hyphen.
    No newline is inserted in front of the very first character.

    Args:
        input_text: The raw lyric text.

    Returns:
        The text without tags and with the inserted newlines.
    """
    pieces = []
    inside_tag = False
    # Raw predecessor of the current character; empty at the start of the
    # text so the first character is never compared with the last one.
    previous_char = ""

    for current_char in input_text:
        if current_char == '[':
            inside_tag = True
        elif current_char == ']':
            inside_tag = False
        elif not inside_tag:
            # Insert newlines based on whether the character is upper case.
            starts_new_line = (
                current_char.isupper()
                and previous_char != ""
                and previous_char not in (' ', '-')
            )
            if starts_new_line:
                pieces.append('\n')
            pieces.append(current_char)
        previous_char = current_char

    return "".join(pieces)

In [3]:
def merge_short_lines(input_text):
    """Merge lines of at most two space-separated parts into a neighbour.

    The text is split on newlines. A line is "short" when splitting it on
    single spaces yields two parts or fewer. Short lines are appended, each
    preceded by one space, to the closest preceding long line. Short lines
    that come before the first long line are put in front of that line, and
    a text that has no long line at all is returned as one merged line.

    Args:
        input_text: Newline-separated text.

    Returns:
        The text with the short lines merged, joined by newlines.
    """
    merged_lines = []
    pending = ""  # short lines collected since the last long line

    for line in input_text.split('\n'):
        if len(line.split(' ')) <= 2:
            pending += " " + line
            continue

        if pending:
            if merged_lines:
                merged_lines[-1] += pending
            else:
                # There is no earlier long line to attach to, so keep the
                # text in reading order by prefixing the first long line.
                leading = pending.strip()
                if leading:
                    line = f"{leading} {line}"
            pending = ""
        merged_lines.append(line)

    # Short lines left over after the last long line.
    if pending:
        if merged_lines:
            merged_lines[-1] += pending
        else:
            # The whole text consisted of short lines only.
            leftover = pending.strip()
            if leftover:
                merged_lines.append(leftover)

    return '\n'.join(merged_lines)

# Write all lyrics into one file

In [2]:
folder_path = 'Songs'
file_list = os.listdir(folder_path)

# Read every file in the folder and end each one with a newline so that
# consecutive songs do not run together.
song_texts = []
for file_name in file_list:
    with open(os.path.join(folder_path, file_name), 'r', encoding='utf-8') as f:
        song_texts.append(f.read() + '\n')
content = "".join(song_texts)

# Pre-processing pipeline: strip punctuation, insert line breaks, then merge
# the short lines.
content = merge_short_lines(insert_newlines(clean_lyrics(content)))

lyric_path = "The Weeknd.txt"
with open(lyric_path, 'w', encoding='utf-8') as f:
    f.write(content)

print("Wrote all the lyrics into " + lyric_path)

Wrote all the lyrics into The Weeknd.txt
