# Fetch Data + Clean Data + Define Functions for Analysis:

In [7]:
import pandas as pd
import lyricsgenius as genius

import string

import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer 

### Notes:
- Creates a Pandas DataFrame with one row per song for a given artist (song title, album, artist, year and lyrics).
- The "search" parameter is the artist name to look up through the Genius API; "token" is the Genius API access token.

In [3]:
def findMusic(search, token):
    """Build a DataFrame of an artist's songs fetched from the Genius API.

    Parameters
    ----------
    search : str
        Artist name handed to ``Genius.search_artist``.
    token : str
        Genius API access token.

    Returns
    -------
    pandas.DataFrame
        One row per song with the columns "Song", "Album", "Artist",
        "Year" and "Lyrics". The frame is empty (but keeps those columns)
        when the search yields no artist or the artist has no songs.
    """
    columns = ["Song", "Album", "Artist", "Year", "Lyrics"]

    # Use a distinct local name for the client: assigning to ``genius`` inside
    # the function would make it a local variable and the module lookup on the
    # right-hand side would raise UnboundLocalError.
    api = genius.Genius(token)

    found = api.search_artist(search, sort="popularity", include_features=False)

    # NOTE(review): search_artist is expected to return None when nothing
    # matches - confirm against the installed lyricsgenius version.
    if found is None:
        return pd.DataFrame(columns=columns)

    # getattr with a default keeps the row even if a given Song object does
    # not expose one of the fields (presumably varies between lyricsgenius
    # versions - verify).
    rows = [
        {
            "Song": getattr(track, "title", None),
            "Album": getattr(track, "album", None),
            "Artist": getattr(track, "artist", None),
            "Year": getattr(track, "year", None),
            "Lyrics": getattr(track, "lyrics", None),
        }
        for track in found.songs
    ]

    return pd.DataFrame(rows, columns=columns)

### Notes:
- Cleans the lyrics by lowercasing them and removing square brackets, section labels (e.g. Verse 1, Chorus, Bridge) and Genius page artefacts (e.g. URLCopyEmbedCopy).

In [5]:
def fixLyrics(music, l):
    """Strip Genius markup from a lyrics column and lowercase it.

    Parameters
    ----------
    music : pandas.DataFrame
        Frame holding the lyrics; the column is modified in place.
    l : str
        Name of the lyrics column, i.e. ``music[l]`` holds the lyric strings.

    Returns
    -------
    pandas.DataFrame
        The same ``music`` object, with ``music[l]`` cleaned.
    """
    # Based on observations made in EDA, will potentially need to add more.
    # The labels are capitalised, so they must be removed BEFORE lowercasing;
    # otherwise none of them would ever match.
    markers = (
        "[",
        "]",
        "Verse 1",
        "Verse 2",
        "Verse 3",
        "Refrain",
        "Pre-Chorus",  # must precede "Chorus", or a stray "Pre-" is left behind
        "Chorus",
        "Bridge",
        "Outro",
        "Spoken",
        "URLCopyEmbedCopy",
        "EmbedShare",
    )

    cleaned = music[l]
    for marker in markers:
        # regex=False: every marker is literal text ("[" is not a pattern).
        cleaned = cleaned.str.replace(marker, "", regex=False)

    music[l] = cleaned.str.lower()

    return music

### Notes:
- https://www.nltk.org/api/nltk.tokenize.html
- Normalisation of lyrics.
- Removal of stopwords.
- Lemmatisation vs stemming of words: lemmatisation (used here) reduces each word to its dictionary form.
- https://stackoverflow.com/questions/1787110/what-is-the-difference-between-lemmatization-vs-stemming

In [10]:
def stopwordLyrics(l):
    """Normalise one lyrics string: drop stopwords and punctuation, lemmatise.

    Parameters
    ----------
    l : str
        Lyrics text. Stopword matching is case-sensitive against NLTK's
        English list, so the text is presumably expected to be lowercased
        already - verify against the caller.

    Returns
    -------
    str
        The remaining words, lemmatised and joined by single spaces.
    """
    nltk.download('stopwords')
    # The WordNet lemmatiser looks its data up in the "wordnet" corpus, so
    # fetch it the same way as the stopword list.
    nltk.download('wordnet')

    # "Valid" English:
    stops = set(stopwords.words('english'))

    withoutStops = " ".join(word for word in l.split() if word not in stops)

    # "Valid" Punctuation:
    # Delete punctuation characters in place. Joining the surviving
    # characters with spaces would split every word into single letters.
    withoutPunctuation = withoutStops.translate(
        str.maketrans("", "", string.punctuation)
    )

    # Lemmatisation:
    lemmatise = WordNetLemmatizer()

    # Normalisation:
    normalise = " ".join(
        lemmatise.lemmatize(word) for word in withoutPunctuation.split()
    )

    return normalise