# Fetch Data + Clean Data + Define Functions for Analysis:

In [20]:
import pandas as pd
import lyricsgenius as genius

import string

import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer 

from googletrans import Translator
from google_trans_new import google_translator  

### Notes:
- Creates a pandas DataFrame with information about an artist and their music.
- The "search" parameter is the search term (e.g. the artist's name) passed to the Genius API.

In [2]:
def findMusic(search, token):
    """Fetch an artist's songs from the Genius API into a DataFrame.

    Parameters
    ----------
    search : str
        Search term handed to ``Genius.search_artist``.
    token : str
        Genius API access token used to build the client.

    Returns
    -------
    pandas.DataFrame
        One row per song with the columns "Title", "Artist" and "Lyrics".
        The frame is empty (same columns) when the search finds no artist.
    """
    geniusAPI = genius.Genius(token)

    artist = geniusAPI.search_artist(search, sort="popularity", include_features=False)

    # lyricsgenius returns None when the search has no match; treat that as
    # "no songs" so the caller gets an empty frame rather than an
    # AttributeError on ``artist.songs``.
    songs = artist.songs if artist is not None else []

    # Album and year are deliberately left out of the DataFrame for now.
    return pd.DataFrame(
        {
            "Title": [song.title for song in songs],
            "Artist": [song.artist for song in songs],
            "Lyrics": [song.lyrics for song in songs],
        },
        columns=["Title", "Artist", "Lyrics"],
    )

### Notes:
- Cleans the data by removing unwanted substrings (e.g. "[" and "]", section labels such as "verse 1", artist names) and fixing other irregularities in the song lyrics.
- Still to do: handle informal spellings and slang (e.g. "ur", "ya", "xo").

In [3]:
def fixLyrics(music, l):
    """Lower-case a lyrics column and strip known noise from it.

    The column is modified in place on ``music`` and the same DataFrame is
    returned.

    Parameters
    ----------
    music : pandas.DataFrame
        Frame holding the lyrics.
    l : str
        Name of the lyrics column in ``music``.

    Returns
    -------
    pandas.DataFrame
        ``music`` itself, with column ``l`` cleaned.
    """
    # Punctuation characters that are dropped outright.
    punctuation = ("[", "]", "(", ")", ":", '"', "-", "&")

    # Literal substrings removed after lower-casing, in this order.
    # Everything is lower-case because the text has already been lowered.
    # Longer labels come before the shorter ones they contain
    # ("prechorus" before "chorus", "6embedshare" before "embedshare"),
    # otherwise the shorter removal leaves "pre"/"post"/a digit behind.
    # The hyphen is already gone by this point, hence "prechorus".
    # Based on observations made in EDA; more may need to be added.
    removals = (
        # Section labels
        "verse 1",
        "verse 2",
        "verse 3",
        "verse 4",
        "refrain",
        "prechorus",
        "postchorus",
        "chorus",
        "bridge",
        "outro",
        "spoken",
        "original",
        "translated",
        "hm",
        "intro",
        "feat.",
        "ft.",
        # Genius page residue
        "urlcopyembedcopy",
        "6embedshare",
        "1embedshare",
        "embedshare",
        # Miscellaneous
        "english",
        "fxxk wit",
        # Names
        "lee hi",
        "b.i",
        "jennie",
        "choi hyun suk",
        "dok2",
        "mino",
        "tablo",
        # Korean
        "korean",
        "이하이",
        "가사",
        "피처링.",
        "김제니",
        "최현석",
        "윤미래",
        "원슈타인",
        "송민호",
    )

    cleaned = music[l].str.lower()

    # regex=False everywhere: the patterns are plain text, and without it
    # older pandas versions treat "." in "feat.", "ft." and "b.i" as a
    # wildcard and delete unrelated text.
    for mark in punctuation:
        cleaned = cleaned.str.replace(mark, "", regex=False)

    cleaned = cleaned.str.replace("\n", " ", regex=False)

    for token in removals:
        cleaned = cleaned.str.replace(token, "", regex=False)

    music[l] = cleaned

    return music

### Notes:
- https://www.nltk.org/api/nltk.tokenize.html
- Normalisation of lyrics.
- Removal of stopwords.
- Lemmatisation vs stemming of words: lemmatisation maps a word to its dictionary form, whereas stemming only strips affixes (see the link below).
- https://stackoverflow.com/questions/1787110/what-is-the-difference-between-lemmatization-vs-stemming

In [6]:
# List the file ids of the NLTK stopwords corpus (one per available language).
stopwords.fileids()

['arabic',
 'azerbaijani',
 'bengali',
 'danish',
 'dutch',
 'english',
 'finnish',
 'french',
 'german',
 'greek',
 'hungarian',
 'indonesian',
 'italian',
 'kazakh',
 'nepali',
 'norwegian',
 'portuguese',
 'romanian',
 'russian',
 'slovene',
 'spanish',
 'swedish',
 'tajik',
 'turkish']

### Notes:
- NLTK does not ship a Korean stopword list (see the output above), so Korean stopwords must come from another source:
- https://github.com/6/stopwords-json/blob/master/dist/ko.json
- https://konlpy.org/en/latest/
- https://www.lucypark.kr/courses/2015-ba/text-mining.html#python-packages-for-text-mining-and-nlp
- https://github.com/konlpy/konlpy

### Options:
- Option 1: Translate all lyrics into English, then proceed with the text analysis.
- Option 2: Use KoNLPy to analyse the Korean lyrics separately, alongside the English lyrics.

In [21]:
# Quick check of Option 1: translate a short Korean sentence into English
# with google_trans_new.
translator = google_translator()

sentenceKR = "한국 노래 분석"
# NOTE(review): the recorded output of this cell is a JSONDecodeError, so this
# call did not succeed when the notebook was last run.
translateKR = translator.translate(sentenceKR, lang_tgt = "en") 

print(translateKR)

JSONDecodeError: Extra data: line 1 column 377 (char 376)

### Notes:
- Define stopwords and punctuation.
- Remove stopwords and punctuation.
- Lemmatisation and normalisation of tokens.

In [5]:
def stopwordLyrics(l):
    """Normalise a lyrics string for text analysis.

    Steps, in order: drop English stopwords, strip ASCII punctuation
    characters, then lemmatise each remaining word with WordNet.

    Parameters
    ----------
    l : str
        Lyrics text; words are taken to be whitespace-separated.

    Returns
    -------
    str
        The remaining lemmatised words joined by single spaces.
    """
    # Download the corpus only when it is missing, instead of contacting the
    # NLTK server on every call.
    try:
        stops = set(stopwords.words('english'))
    except LookupError:
        nltk.download('stopwords')
        stops = set(stopwords.words('english'))

    # "Valid" Punctuation:
    punctuation = set(string.punctuation)

    # Lemmatisation:
    lemmatise = WordNetLemmatizer()

    # Remove Stopwords
    stopsRemove = " ".join(i for i in l.split() if i not in stops)

    # Remove Punctuation
    # The filter runs character by character, so the survivors are rejoined
    # with "" to keep each word intact.
    punctuationRemove = "".join(j for j in stopsRemove if j not in punctuation)

    # Normalisation:
    words = punctuationRemove.split()
    try:
        normalise = " ".join(lemmatise.lemmatize(k) for k in words)
    except LookupError:
        # The lemmatiser needs the WordNet corpus; fetch it once if absent.
        nltk.download('wordnet')
        normalise = " ".join(lemmatise.lemmatize(k) for k in words)

    return normalise