Given by prof

## Genius API Code

This notebook explores the retrieval of song lyrics using the **lyricsgenius library and Genius API**, providing a simple guide for accessing and processing lyrics data within the context of music analysis or related applications.

**lyricsgenius Setup** - https://lyricsgenius.readthedocs.io/en/master/setup.html

**lyricsgenius.Genius** - https://lyricsgenius.readthedocs.io/en/master/reference/genius.html

[**Note**: The genius.search_song() call within the get_spot_lyrics function defined below may cause a 403 Client Error when running in Colab. You should execute the code on a **local** machine instead. For more details, refer to the discussion on [GitHub](https://github.com/johnwmillr/LyricsGenius/issues/220)]

**Version:** This notebook file has been edited by Shi Yingfei shi_yingfei@u.nus.edu in January 2024.

In [None]:
import ast
import os
import re
import string

import lyricsgenius as lg
import numpy as np
import pandas as pd
from langdetect import detect

In [None]:
# Register a new API client at https://genius.com/api-clients/new
# (APP WEBSITE URL: https://example.com/), then expose the credentials to this
# notebook through environment variables rather than pasting them into the cell,
# so that secrets never end up in the saved notebook.
client_id = os.environ.get("GENIUS_CLIENT_ID")
client_secret = os.environ.get("GENIUS_CLIENT_SECRET")
access_token = os.environ.get("GENIUS_ACCESS_TOKEN")

# Only the access token is passed to the client below; fail early with a clear
# message instead of letting the client constructor fail on a missing token.
if not access_token:
    raise RuntimeError(
        "Genius access token not found: set the GENIUS_ACCESS_TOKEN environment variable."
    )

genius = lg.Genius(access_token=access_token)

In [None]:
import gdown

# Spotify dataset: fetch the CSV from Google Drive, then load it with pandas.
file_id = '1iEG4sF-ujvtnvOk2nrrnfuQD-Sg9jlXP'
url = f'https://drive.google.com/uc?id={file_id}'
output = 'main_dataset.csv'
gdown.download(url=url, output=output, quiet=False)

# Read back the file that was just written and preview the first rows.
spotify_df = pd.read_csv(output)
spotify_df.head()

In [None]:
def join_artists(artists_list):
    """Turn a stringified list of artist names into a comma-separated string.

    Parameters
    ----------
    artists_list : str
        Text of a Python list literal, e.g. "['Artist A', 'Artist B']".

    Returns
    -------
    str
        The artist names joined with ", " (e.g. "Artist A, Artist B").
    """
    # Parse the literal properly so that names which themselves contain a
    # comma (e.g. "['Earth, Wind & Fire']") are not split in the middle.
    try:
        artists = ast.literal_eval(artists_list)
    except (ValueError, SyntaxError):
        artists = None

    if isinstance(artists, (list, tuple)):
        return ', '.join(str(artist) for artist in artists)

    # Fallback for text that is not a valid list literal: drop the outer
    # brackets, split on ", " and strip one quote character from each end.
    pieces = artists_list[1:-1].split(", ")
    return ', '.join(piece[1:-1] for piece in pieces)

# Two-column working frame: the song title plus a readable artist string.
spot_df = pd.DataFrame(
    {
        'name': spotify_df['name'].values,
        'artists_names': spotify_df['artists_names'].apply(join_artists),
    }
)
spot_df.head()

In [None]:
# Running this function in Colab may result in a 403 Client Error.
# You should try to execute the code on a local machine instead.
# For more details, refer to the discussion on GitHub: https://github.com/johnwmillr/LyricsGenius/issues/220

def get_spot_lyrics(row):
    # Extracting song and artist names from the DataFrame row
    song_name = row['name']
    artist_name = row['artists_names']

    # For names like 'XXX Song - Live at XXX'
    song_name = song_name.split(' - ', 1)[0]

    # For names like 'XXX (feat YYY)'
    song_name = re.sub(r"[\(].*?[\)]", "", song_name)
    artist_name = re.sub(r"[\(].*?[\)]", "", artist_name)

    # Remove all punctuation from song and artist names using str.translate() and str.maketrans()
    # https://docs.python.org/3/library/stdtypes.html#str.translate
    song_name = song_name.translate(str.maketrans('', '', string.punctuation))
    artist_name = artist_name.translate(str.maketrans('', '', string.punctuation))

    try:
        # Search for the song on Genius using the Genius API
        song = genius.search_song(song_name, artist=artist_name, get_full_info=False)

    except Exception as e:
        print(f"An error occurred: {e}")
        print(song_name)
        return ''

    if song:
        # Extract lyrics and remove any metadata or tags
        return song.lyrics.split(']', 1)[-1]

    return ''

In [None]:
# Small test sample: an independent copy of the first 20 rows of spot_df
spot_df_short = spot_df.head(20).copy()

# Start with an empty 'lyrics' column, to be filled row by row
spot_df_short['lyrics'] = None

# Uncomment to fetch the lyrics for every row of spot_df_short
# for index, row in spot_df_short.iterrows():
#     try:
#         # Attempt to get lyrics using the get_spot_lyrics function
#         spot_df_short.at[index, 'lyrics'] = get_spot_lyrics(row)
#     except Exception as e:
#         # If an exception occurs, store the error message in the 'lyrics' column
#         spot_df_short.at[index, 'lyrics'] = f"Error: {str(e)}"

In [None]:
# Simple example code to process lyrics
def process_lyrics(row):
    """Normalise the lyrics of one row into lowercase, punctuation-free text.

    Parameters
    ----------
    row : mapping (e.g. a pandas Series)
        Must provide 'lyrics', 'name' and 'artists_names'.

    Returns
    -------
    str
        The lyrics with bracketed parts, line breaks and punctuation removed,
        in lowercase. When the lyrics are missing or empty, the song name and
        artist names (separated by a space) are processed instead.
    """
    # Get the 'lyrics' value from the row
    lyrics = row['lyrics']

    # Missing lyrics may be '' or a non-string placeholder such as None/NaN;
    # in either case fall back to the title and artist text.
    if not isinstance(lyrics, str) or not lyrics:
        fallback_parts = (row['name'], row['artists_names'])
        # Join with a space so the last word of the title and the first word
        # of the artist do not fuse into one token.
        lyrics = ' '.join(part for part in fallback_parts if isinstance(part, str))

    # Remove words in brackets and square brackets (raw string: the pattern
    # contains backslash escapes that are not valid Python string escapes)
    lyrics = re.sub(r"[\(\[].*?[\)\]]", "", lyrics)

    # Remove line breaks
    lyrics = lyrics.replace("\n", " ")

    # Remove all punctuation
    lyrics = lyrics.translate(str.maketrans('', '', string.punctuation))

    # Convert to lowercase
    return lyrics.lower()

In [None]:
# Detect and remove non-English songs
def is_english(text):
    """Return True when `detect` labels `text` as English ('en').

    Any error raised while detecting the language is treated as "not
    English" and yields False.
    """
    try:
        return detect(text) == 'en'
    except Exception:
        # A bare `except:` would also swallow KeyboardInterrupt/SystemExit,
        # making a long-running `.apply(is_english)` impossible to interrupt.
        return False

# spot_df_short['lyrics_pro'] = spot_df_short.apply(lambda x : process_lyrics(x), axis = 1)
# is_eng = spot_df_short['lyrics_pro'].apply(lambda x : is_english(x))
# spot_df_short = spot_df_short[is_eng].reset_index(drop = True)
# spot_df_short.shape

In [None]:
# Save the DataFrame 'spot_df_short' to a JSON file
# spot_df_short.to_json('spot_lyrics_short.json')

In [None]:
# Check sample output
# Fetch the pre-computed sample output from Google Drive
file_id = '1qx9jt0QOICdKBnFt0fYrtsbYWIes0Ecn'
url = f'https://drive.google.com/uc?id={file_id}'
output = 'spot_lyrics_short.json'
gdown.download(url=url, output=output, quiet=False)

In [None]:
# Load the sample output from the JSON file in the working directory.
# This rebinds `spot_df_short`, replacing any frame of that name built in
# earlier cells.
spot_df_short = pd.read_json('spot_lyrics_short.json')
# Preview the first rows of the loaded frame.
spot_df_short.head()