# Initialize the framework

Install the `lyricsgenius` library, which provides a simple interface to the song, artist, and lyrics data stored on Genius.com.

In [None]:
!pip install lyricsgenius



Import the package and initialize the Genius client

In [None]:
import lyricsgenius

# Genius API client access token (replace the placeholder with your own)
client_token  = "<YOUR_TOKEN>"

# Extra terms for flagging results as non-lyrics
excluded_terms = ["(Remix)", "(Live)", "(Edit)", "(Remastered)", "(Mix)", "(Acoustic)"]

genius = lyricsgenius.Genius(
  client_token,
  # If True, removes [Chorus], [Bridge], etc. headers from lyrics
  remove_section_headers=True,
  excluded_terms=excluded_terms,
  # NOTE(review): presumably the pause between requests, in seconds -
  # confirm against the lyricsgenius documentation
  sleep_time=0.3,
  # Number of retries in case of timeouts and errors with a >= 500 response code
  retries=5,
)

Mount Google Drive to load the downloaded dataset of lyrics

In [None]:
# Input dataset and output file, both stored in the project folder on Google Drive
dataset_path  = '/content/drive/MyDrive/DM project - NLP lyrics generation/english_cleaned_lyrics.csv'
out_path      = '/content/drive/MyDrive/DM project - NLP lyrics generation/genius_lyrics.csv'

# Make Google Drive reachable under /content/drive
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Read the dataset

Read the loaded dataset to retrieve artists & genres

In [None]:
import pandas as pd

# Show every column and every row when a frame is printed
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)
# To also stop long cell values from being truncated:
#   pd.set_option("display.max_colwidth", None)

# Positional window of the artists handled in this run, counted after the
# de-duplication and the genre filter below; change it to process another batch
batch_start = 8800
batch_stop = 8900

# Build the artist/genre table as one chain ending in an explicit copy: nothing
# is mutated in place, and later cells can add columns to `data` without pandas
# warning about writing to a slice of another frame
data = (
  pd.read_csv(dataset_path)
  # Columns of interest
  .loc[:, ['artist', 'genre']]
  # Keep one entry per artist (the first one encountered)
  .drop_duplicates(subset='artist')
  # Remove 'Other' genre
  .loc[lambda frame: frame.genre != "Other"]
  .iloc[batch_start:batch_stop]
  .copy()
)

print("Artists & genres:")
print(data[:20])

Artists & genres:
                      artist       genre
210420         art-in-manila        Rock
210428              cinerama         Pop
210474           days-divide        Folk
210480               faktion        Rock
210504  accidental-superhero        Rock
210530                 bibio  Electronic
210536          buddy-miller        Rock
210595         10000-maniacs        Rock
210686                  amos       Metal
210694            ballydowse        Rock
210701          bourbon-crow        Rock
210702                  coin        Rock
210705                  alus  Electronic
210713              demi-vie       Indie
210714                   bwo         Pop
210717        cloud-nothings        Rock
210755                  erup  Electronic
210756                 flyte        Rock
210759         elena-gheorge         Pop
210760                    fm        Rock


Get up to N lyrics (N = 35, set by `max_songs`) for each artist in the dataset from Genius

In [None]:
import time
import re
import datetime

# Check if the string is a pyong (e.g. 2.2K)
def is_pyong(string):
  """Tell whether ``string`` is an abbreviated pyong counter such as ``2.2K``.

  The accepted form is: one or more digits, a literal dot, one digit and a
  final ``k``/``K``. The dot is escaped on purpose: unescaped it matches any
  character, so strings such as ``2x2K`` were taken for counters.

  Args:
    string: the text to check.

  Returns:
    The ``re.Match`` object (truthy) when ``string`` is a pyong counter,
    ``None`` otherwise.
  """
  return re.match(r"^[0-9]+\.[0-9][kK]$", string)

# Check if the string matches the trash ending
def is_trash_ending(string):
  """Return True only when ``string`` is exactly the known trash ending text."""
  trash_ending = "EmbedShare URLCopyEmbedCopy"
  return string == trash_ending

# Turn a number of seconds into a printable duration
def format_time(seconds):
  """Wrap ``seconds`` in a ``datetime.timedelta``.

  The returned object is not a string; printing it (e.g. with ``%s``) gives the
  ``H:MM:SS`` rendering of ``timedelta``.
  """
  duration = datetime.timedelta(seconds=seconds)
  return duration

# Remove the trash ending, and the pyong counter right before it, from the lyrics
def _strip_trash_ending(lyrics):
  """Return ``lyrics`` without the trailing trash text and pyong counter.

  Lyrics that do not end with the trash ending are returned unchanged.
  Otherwise the ending is cut off together with the counter that precedes it:
  a run of digits (e.g. ``200``) or an abbreviated value (e.g. ``2.2K``).

  Relies on the module-level ``lyrics_trash_ending_len``, ``is_trash_ending``
  and ``is_pyong``.
  """
  body_len = len(lyrics) - lyrics_trash_ending_len

  # Too short to hold the trash ending, or not ending with it: nothing to clean
  if body_len < 0 or not is_trash_ending(lyrics[body_len:]):
    return lyrics

  # Last four characters before the trash ending: room for a value like 2.2K
  pyong = lyrics[max(body_len - 4, 0):body_len]
  has_pyong = bool(is_pyong(pyong))

  # Walk backwards over the counter. Digits are always dropped; the 'K' (first
  # step) and the '.' (third step) are dropped only for an abbreviated counter.
  # The bound keeps the index from wrapping around to the end of the string.
  stripped = 0
  while stripped < body_len:
    char = lyrics[body_len - 1 - stripped]
    if char.isdigit() or (stripped in (0, 2) and has_pyong):
      stripped += 1
    else:
      break

  cleaned = lyrics[:body_len - stripped]

  # A repetition marker (e.g. " x2") loses its digit above: give the first
  # stripped character back. Skipped when nothing was stripped, otherwise the
  # first character of the trash ending would be appended instead.
  if stripped > 0 and cleaned.endswith(" x"):
    cleaned += lyrics[body_len - stripped]

  return cleaned


# Return the name of the artist and a list of songs for the given artist
def search_songs(artist, max_songs=35):
  """Look an artist up on Genius and collect the sanitized lyrics of its songs.

  Prints the overall progress and an estimate of the remaining time, then
  searches the artist through the module-level ``genius`` client, retrying
  until the call succeeds.

  Reads the module-level ``data_size`` and updates the module-level counters
  ``i`` (number of artists started so far) and ``exe_time`` (rounded running
  mean, in seconds, of the time taken per artist).

  Args:
    artist: name of the artist to search.
    max_songs: maximum number of songs requested for the artist.

  Returns:
    ``(artist_name, lyrics_list)`` with the name reported by Genius and one
    string per song that has lyrics, or ``(None, None)`` when the artist is
    not found or the search is interrupted from the keyboard.
  """
  global i, exe_time

  # Get start time for this execution
  start_time = time.time()

  i += 1
  print("\n" + "*"*50 + "\n")
  print("Progress: %d/%d = %.2f%%" %(i, data_size, (i/data_size)*100))
  # The current artist is still to be processed, so it counts as remaining
  # work; the clamp keeps the estimate from ever going negative
  remaining_sec = max(data_size - i + 1, 0) * exe_time
  print("Estimated remaining time: %s\n" %format_time(remaining_sec))

  # Kept apart from the `artist` argument, so that an aborted search can never
  # be mistaken for a result
  found_artist = None

  # Try to search the artist on Genius (repeat on errors)
  while True:
    try:
      found_artist = genius.search_artist(artist, max_songs=max_songs)
      break
    except KeyboardInterrupt:
      # Manual way out of the retry loop: give up on this artist only
      found_artist = None
      break
    except Exception as error:
      # Deliberate best-effort retry, but no longer silent
      print("Search failed (%s), retrying..." % error)
      time.sleep(0.1)

  # Check if the artist has been found
  if found_artist is None:
    return None, None

  print("\n" + "="*50 + "\n")

  songs_lyrics = []

  # Get artist's songs, skipping the ones without lyrics
  for song in found_artist.songs:
    print("Song title: ", song.title)

    if not song.lyrics:
      continue

    songs_lyrics.append(_strip_trash_ending(song.lyrics))

  # Reinitialize the execution time if this is the first execution, otherwise update the mean
  elapsed = time.time() - start_time
  if i == 1:
    exe_time = round(elapsed)
  else:
    exe_time = round(exe_time + (elapsed - exe_time) / i)

  return found_artist.name, songs_lyrics




# State shared with search_songs
i = 0            # artists started so far
exe_time = 120 # seconds
data_size = len(data.index)
lyrics_trash_ending_len = len("EmbedShare URLCopyEmbedCopy")

# Search artists' songs: each item is the (artist name, lyrics list) tuple
# returned by search_songs
print("Searching lyrics...")
artists_and_lyrics = data['artist'].apply(search_songs)

# Remove null rows
print("\nRemoving null rows...")
size_with_na = artists_and_lyrics.size
# A missing artist comes back as the tuple (None, None). A tuple is never
# treated as a missing value, so dropna() would keep it: test the name instead.
found_mask = artists_and_lyrics.apply(lambda x: x[0] is not None).astype(bool)
artists_and_lyrics = artists_and_lyrics[found_mask]
removed_lines = size_with_na - artists_and_lyrics.size
# Nothing to compute a percentage from when no artist was searched
removed_pct = (removed_lines/size_with_na)*100 if size_with_na else 0.0
print("Removed lines: %d/%d => %.2f%% " %(removed_lines, size_with_na, removed_pct))

# Get artist name and corresponding list of lyrics
print("\nUpdating the dataset...")
# Keep only the artists that were found; the explicit copy makes the column
# assignments below act on an independent frame
data = data.loc[artists_and_lyrics.index].copy()
data['artist'] = artists_and_lyrics.apply(lambda x: x[0])
data['lyrics'] = artists_and_lyrics.apply(lambda x: x[1])

# Transform each item in the list of lyrics to a row
data = data.explode('lyrics', ignore_index=True)

print("\nArtists, genres & lyrics:")
print(data[:20])

[1;30;43mOutput streaming troncato alle ultime 5000 righe.[0m
Song title:  Cuts Like a Knife
Song title:  Christmas Time
Song title:  I Will Always Return (Finale)
Song title:  Somebody
Song title:  Diana
Song title:  I Will Always Return
Song title:  Nothing I’ve Ever Known
Song title:  The Only Thing That Looks Good on Me Is You
Song title:  Cloud Number Nine
Song title:  You Belong to Me
Song title:  Brothers Under the Sun
Song title:  Get Off My Back
Song title:  Let’s Make a Night to Remember
Song title:  Can’t Stop This Thing We Started
Song title:  Here I Am (End Title)
Song title:  Do I Have to Say the Words?

**************************************************

Progress: 23/100 = 23.00%
Estimated remaining time: 1:41:20

Searching for songs by black-buddafly...

Changing artist name to 'Black Buddafly'
Song 1: "Bad Girl"
Song 2: "Rock-A-Bye"
Song 3: "First Date"
Song 4: "Sheets and Pillows"
Song 5: "U Could Be"
Song 6: "Lucky Night"
Song 7: "If You Want It"
Song 8: "No Matter

Store downloaded lyrics into a CSV file

In [None]:
# Write the artists, genres and lyrics to the output file
data.to_csv(
  out_path,
  mode="w+",
  encoding='utf-8',
)