# Lyrics Processing
Extracting and processing the scraped raw lyrics data.

## Importing Libraries


In [14]:
import json
import os
import re

## Defining Functions

In [31]:
# The scraped lyrics can start with a header such as "1 Contributor" or
# "12 Contributors". Anchored with \A so only the header at the very start of
# the text is removed, and \d+ so multi-digit counts are removed completely.
_CONTRIBUTORS_HEADER = re.compile(r"\A\d+ Contributors?")

# The header can continue with "<title> Lyrics". `.` does not match newlines,
# so the anchored pattern can never reach past the first line of the text and
# lyric lines that merely contain the word "Lyrics" are left untouched.
_TITLE_HEADER = re.compile(r"\A.* Lyrics")

# Trailing marker that can be left at the very end of the scraped lyrics.
_EMBED_SUFFIX = "Embed"


def extract_song_data(raw_lyrics):
    """
    Extracts the song data from the raw lyrics data.

    For every song the title, language, release date, artist, album name and
    lyrics are kept. The lyrics are cleaned by removing the leading
    "<n> Contributor(s)" and "<title> Lyrics" headers, a trailing "Embed"
    marker and any leading newlines.

    Parameters
    ----------
    raw_lyrics : dict
        The raw lyrics data. Must contain a 'songs' list whose items have the
        keys 'title', 'language', 'release_date', 'artist' and 'lyrics'.
        The 'album' entry is optional and may be missing or None.

    Returns
    -------
    extracted_song_data : list
        A list of dictionaries containing the song data, one per song, with
        the keys 'title', 'language', 'release_date', 'artist', 'album_name'
        and 'lyrics'.

    Raises
    ------
    KeyError
        If 'songs' or one of the required song keys is missing.
    """
    extracted_song_data = []
    for song in raw_lyrics['songs']:
        # A song without album information (missing key or null value) gets
        # an empty album name instead of aborting the whole extraction.
        try:
            album_name = song['album']['name']
        except (KeyError, TypeError):
            album_name = ""

        lyrics = song['lyrics']

        # Remove the "<n> Contributor(s)" header from the start of the lyrics
        lyrics = _CONTRIBUTORS_HEADER.sub("", lyrics, count=1)

        # Remove the trailing "Embed" marker
        if lyrics.endswith(_EMBED_SUFFIX):
            lyrics = lyrics[:-len(_EMBED_SUFFIX)]

        # Remove the "<title> Lyrics" header from the first line
        lyrics = _TITLE_HEADER.sub("", lyrics, count=1)

        # Remove all \n from the beginning of the string
        lyrics = lyrics.lstrip('\n')

        extracted_song_data.append({
            'title': song['title'],
            'language': song['language'],
            'release_date': song['release_date'],
            'artist': song['artist'],
            'album_name': album_name,
            'lyrics': lyrics,
        })

    return extracted_song_data

def store_to_processed(filename, data, subfolder=None):
    """
    Stores the processed data to the processed folder.

    The data is written as UTF-8 encoded, indented JSON to
    ``<cwd>/../../data/processed[/<subfolder>]/<filename>.json``. The target
    folder is created if it does not exist yet.

    Parameters
    ----------
    filename : str
        The filename of the processed data, without the '.json' extension.
    data : list
        The processed data. Must be JSON serializable.
    subfolder : str, optional
        Name of a subfolder inside the processed folder to store the file in.
        If omitted, the file is stored directly in the processed folder.

    Returns
    -------
    None
    """
    # Processed folder, resolved relative to the current working directory
    target_dir = os.path.abspath(os.path.join(os.getcwd(), '..', '..', 'data', 'processed'))
    if subfolder:
        target_dir = os.path.join(target_dir, subfolder)

    # exist_ok avoids the check-then-create race of a separate exists() test
    os.makedirs(target_dir, exist_ok=True)
    full_filepath = os.path.join(target_dir, filename + '.json')

    # Serialize before opening the file so that a serialization error does not
    # leave a truncated file behind.
    json_data = json.dumps(data, ensure_ascii=False, indent=4)

    # Write to file encoded as UTF-8
    with open(full_filepath, "w", encoding="utf-8") as file:
        file.write(json_data)


## Processing 1
Parsing the raw data and extracting only relevant data.


In [32]:
# Get path to raw lyrics data folder
current_dir = os.getcwd()
raw_genius_lyrics_folder = os.path.abspath(os.path.join(current_dir, '..', '..', 'data', 'raw', 'genius_lyrics'))

# Get list of files in raw lyrics data folder
raw_genius_lyrics_files = os.listdir(raw_genius_lyrics_folder)

for file in raw_genius_lyrics_files:
    raw_file_path = os.path.join(raw_genius_lyrics_folder, file)

    # os.listdir also returns directories, which cannot be opened as files
    if not os.path.isfile(raw_file_path):
        continue

    # Load file. The encoding is given explicitly so that non-ASCII lyrics are
    # read correctly regardless of the platform's default encoding.
    with open(raw_file_path, 'r', encoding='utf-8') as f:
        raw_lyrics = json.load(f)

    song_data = extract_song_data(raw_lyrics)

    # The output file is named after the artist of the first song, so a file
    # without any songs has nothing to store.
    if not song_data:
        continue

    store_to_processed(song_data[0]['artist'], song_data, subfolder='processing1')
