<a href="https://colab.research.google.com/github/MatthewGuile/tool-lyrics-nlp/blob/main/tool_lyrics_processing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Introduction**

In [36]:
import json 
import requests

url = "https://raw.githubusercontent.com/MatthewGuile/tool-lyrics-nlp/main/files/Lyrics_Tool.json"
resp = requests.get(url)
data = json.loads(resp.text)

In [37]:
import pandas as pd

# load data as a pandas dataframe
df0 = pd.DataFrame(data['songs'])

In [38]:
df0.keys()

Index(['annotation_count', 'api_path', 'artist_names', 'full_title',
       'header_image_thumbnail_url', 'header_image_url', 'id',
       'lyrics_owner_id', 'lyrics_state', 'path', 'pyongs_count',
       'song_art_image_thumbnail_url', 'song_art_image_url', 'stats', 'title',
       'title_with_featured', 'url', 'primary_artist', 'apple_music_id',
       'apple_music_player_url', 'description', 'embed_content',
       'featured_video', 'lyrics_placeholder_reason', 'recording_location',
       'release_date', 'release_date_for_display', 'current_user_metadata',
       'album', 'custom_performances', 'description_annotation',
       'featured_artists', 'lyrics_marked_complete_by', 'media',
       'producer_artists', 'song_relationships', 'verified_annotations_by',
       'verified_contributors', 'verified_lyrics_by', 'writer_artists',
       'artist', 'lyrics'],
      dtype='object')

In [39]:
# select relevant columns 
df = df0[['title', 'lyrics', 'release_date']]

In [40]:
# scrape albums and create column 
album_list = []

for i in range(len(df0['album'])):
    try:
        album_list.append(df0['album'][i]['name'])
    except:
        album_list.append(None)
        continue

df['album'] = album_list


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()


In [None]:
# demonstrate that error had no impact

In [41]:
# select the albums required for analysis and drop irrelevant data
relevant_albums = ['Opiate', 'Undertow', 'Ænima', '10,000 Days', 'Fear Inoculum']

df = df[df['album'].isin(relevant_albums)]

In [42]:
# data cleaning time wooohooo
import re
# remove whitespace

def remove_whitespace(text):
    """ This function will remove
        extra whitespaces from the text
    arguments:
        input_text: "text" of type "String".

    return:
        value: "text" after extra whitespaces removed .

    Example:
    Input : How   are   you   doing   ?
    Output : How are you doing ?

    """
    pattern = re.compile(r'\s+')
    Without_whitespace = re.sub(pattern, ' ', text)
    # There are some instances where there is no space after '?' & ')',
    # So I am replacing these with one space so that It will not consider two words as one token.
    text = Without_whitespace.replace('?', ' ? ').replace(')', ') ')
    return text

df['lyrics'] = df['lyrics'].apply(lambda x: remove_whitespace(x))

df

Unnamed: 0,title,lyrics,release_date,album
0,Sober,There's a shadow just behind me Shrouding ever...,1993-04-06,Undertow
2,Ænema,"Hey, hey, hey, hey, hey, hey, hey, hey, hey So...",1996-09-17,Ænima
3,Forty Six & 2,"Join in my— Join in, my child, and listen Digg...",1996-09-17,Ænima
4,The Pot,Who are you to wave your finger ? You must've...,2006-05-02,"10,000 Days"
6,Fear Inoculum,"Immunity, long overdue Contagion, I exhale you...",2019-08-07,Fear Inoculum
7,"10,000 Days (Wings Pt 2)",We listen to the tales and romanticize How we'...,2006-05-02,"10,000 Days"
8,Stinkfist,Something has to change Undeniable dilemma Bor...,1996-09-17,Ænima
9,Rosetta Stoned,"Alrighty, then Picture this if you will 10 to ...",2006-05-02,"10,000 Days"
11,Right in Two,Angels on the sideline Puzzled and amused Why ...,2006-05-02,"10,000 Days"
12,Invincible,Long in tooth and soul Longing for another win...,2019-05-07,Fear Inoculum


In [43]:
def lower_casing_text(text):
    """
    The function will convert text into lower case.

    arguments:
         input_text: "text" of type "String".

    return:
         value: text in lowercase

    Example:
    Input : The World is Full of Surprises!
    Output : the world is full of surprises!

    """
    # Convert text to lower case
    # lower() - It converts all upperase letter of given string to lowercase.
    text = text.lower()
    return text

df['lyrics'] = df['lyrics'].apply(lambda x: lower_casing_text(x))

df.head()

Unnamed: 0,title,lyrics,release_date,album
0,Sober,there's a shadow just behind me shrouding ever...,1993-04-06,Undertow
2,Ænema,"hey, hey, hey, hey, hey, hey, hey, hey, hey so...",1996-09-17,Ænima
3,Forty Six & 2,"join in my— join in, my child, and listen digg...",1996-09-17,Ænima
4,The Pot,who are you to wave your finger ? you must've...,2006-05-02,"10,000 Days"
6,Fear Inoculum,"immunity, long overdue contagion, i exhale you...",2019-08-07,Fear Inoculum


In [44]:
# fix contractions 

CONTRACTION_MAP = {
    "'bout": "about",
    "ain't": "is not",
    "'round": "around",
    "aren't": "are not",
    "can't": "cannot",
    "can't've": "cannot have",
    "'cause": "because",
    "could've": "could have",
    "couldn't": "could not",
    "couldn't've": "could not have",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "get'chu": "get you",
    "gonna": "going to",
    "hadn't": "had not",
    "hadn't've": "had not have",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'd've": "he would have",
    "he'll": "he will",
    "he'll've": "he he will have",
    "he's": "he is",
    "how'd": "how did",
    "how'd'y": "how do you",
    "how'll": "how will",
    "how's": "how is",
    "i'd": "i would",
    "i'd've": "i would have",
    "i'll": "i will",
    "i'll've": "i will have",
    "i'm": "i am",
    "i've": "i have",
    "isn't": "is not",
    "kinda": "kind of",
    "it'd": "it would",
    "it'd've": "it would have",
    "it'll": "it will",
    "it'll've": "it will have",
    "it's": "it is",
    "l.a": "los angeles",
    "let's": "let us",
    "ma'am": "madam",
    "mayn't": "may not",
    "might've": "might have",
    "mightn't": "might not",
    "mightn't've": "might not have",
    "must've": "must have",
    "mustn't": "must not",
    "mustn't've": "must not have",
    "needn't": "need not",
    "needn't've": "need not have",
    "o'clock": "of the clock",
    "oughtn't": "ought not",
    "oughtn't've": "ought not have",
    "shan't": "shall not",
    "sha'n't": "shall not",
    "shan't've": "shall not have",
    "she'd": "she would",
    "she'd've": "she would have",
    "she'll": "she will",
    "she'll've": "she will have",
    "she's": "she is",
    "should've": "should have",
    "shouldn't": "should not",
    "shouldn't've": "should not have",
    "so've": "so have",
    "so's": "so as",
    "that'd": "that would",
    "that'd've": "that would have",
    "that's": "that is",
    "there'd": "there would",
    "there'd've": "there would have",
    "there's": "there is",
    "they'd": "they would",
    "they'd've": "they would have",
    "they'll": "they will",
    "they'll've": "they will have",
    "they're": "they are",
    "they've": "they have",
    "to've": "to have",
    "'til": "until",
    "wanna": "want to",
    "wasn't": "was not",
    "we'd": "we would",
    "we'd've": "we would have",
    "we'll": "we will",
    "we'll've": "we will have",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "whatcha": "what are you",
    "what'll": "what will",
    "what'll've": "what will have",
    "what're": "what are",
    "what's": "what is",
    "what've": "what have",
    "when's": "when is",
    "when've": "when have",
    "where'd": "where did",
    "where's": "where is",
    "where've": "where have",
    "who'll": "who will",
    "who'll've": "who will have",
    "who's": "who is",
    "who've": "who have",
    "why's": "why is",
    "why've": "why have",
    "will've": "will have",
    "won't": "will not",
    "won't've": "will not have",
    "would've": "would have",
    "wouldn't": "would not",
    "wouldn't've": "would not have",
    "y'all": "you all",
    "y'all'd": "you all would",
    "y'all'd've": "you all would have",
    "y'all're": "you all are",
    "y'all've": "you all have",
    "you'd": "you would",
    "you'd've": "you would have",
    "you'll": "you will",
    "you'll've": "you will have",
    "you're": "you are",
    "you've": "you have",
}


# The code for expanding contraction words
def expand_contractions(text, contraction_mapping=CONTRACTION_MAP):
    """expand shortened words to the actual form.
       e.g. don't to do not

       arguments:
            input_text: "text" of type "String".

       return:
            value: Text with expanded form of shorthened words.

       Example:
       Input : ain't, aren't, can't, cause, can't've
       Output :  is not, are not, cannot, because, cannot have

     """
    # Tokenizing text into tokens.
    list_Of_tokens = text.split(' ')

    # Checking for whether the given token matches with the Key & replacing word with key's value.

    # Check whether Word is in lidt_Of_tokens or not.
    for Word in list_Of_tokens:
        # Check whether found word is in dictionary "Contraction Map" or not as a key.
        if Word in CONTRACTION_MAP:
            # If Word is present in both dictionary & list_Of_tokens, replace that word with the key value.
            list_Of_tokens = [item.replace(Word, CONTRACTION_MAP[Word]) for item in list_Of_tokens]

    # Converting list of tokens to String.
    String_Of_tokens = ' '.join(str(e) for e in list_Of_tokens)
    return String_Of_tokens

df['lyrics'] = df['lyrics'].apply(lambda x: expand_contractions(x, contraction_mapping=CONTRACTION_MAP))

df.head(5)




Unnamed: 0,title,lyrics,release_date,album
0,Sober,there is a shadow just behind me shrouding eve...,1993-04-06,Undertow
2,Ænema,"hey, hey, hey, hey, hey, hey, hey, hey, hey so...",1996-09-17,Ænima
3,Forty Six & 2,"join in my— join in, my child, and listen digg...",1996-09-17,Ænima
4,The Pot,who are you to wave your finger ? you must ha...,2006-05-02,"10,000 Days"
6,Fear Inoculum,"immunity, long overdue contagion, i exhale you...",2019-08-07,Fear Inoculum


In [45]:
# The code for removing special characters
def removing_special_characters(text):
    """Removing all the special characters except the one that is passed within
       the regex to match, as they have imp meaning in the text provided.


    arguments:
         input_text: "text" of type "String".

    return:
        value: Text with removed special characters that don't require.

    Example:
    Input : Hello, K-a-j-a-l. Thi*s is $100.05 : the payment that you will recieve! (Is this okay?)
    Output :  Hello, Kajal. This is $100.05 : the payment that you will recieve! Is this okay?

   """
    # The formatted text after removing not necessary punctuations.
    Formatted_Text = re.sub(r"[^a-zA-Z0-9:$-,%.?!]+", ' ', text)
    Remove_Characters = re.sub(r"[0-9]+[a-zA-Z]+ [a-zA-Z]+",'', Formatted_Text)
    # In the above regex expression,I am providing necessary set of punctuations that are frequent in this particular dataset.
    return Remove_Characters

df['lyrics'] = df['lyrics'].apply(lambda x: removing_special_characters(x))

df.head()

Unnamed: 0,title,lyrics,release_date,album
0,Sober,there is a shadow just behind me shrouding eve...,1993-04-06,Undertow
2,Ænema,"hey, hey, hey, hey, hey, hey, hey, hey, hey so...",1996-09-17,Ænima
3,Forty Six & 2,"join in my join in, my child, and listen diggi...",1996-09-17,Ænima
4,The Pot,who are you to wave your finger ? you must hav...,2006-05-02,"10,000 Days"
6,Fear Inoculum,"immunity, long overdue contagion, i exhale you...",2019-08-07,Fear Inoculum


In [None]:
import string as st

# Remove punctuation
def remove_punct(text):
    return ("".join([ch for ch in text if ch not in st.punctuation]))

df['lyrics'] = df['lyrics'].apply(lambda x: remove_punct(x))

df.head()

In [None]:
# feature engineering

