## Load data

In [156]:
import pandas as pd
df = pd.read_csv("songs.tsv", sep="\t", encoding="utf-8")

In [157]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 187 entries, 0 to 186
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   album      187 non-null    object 
 1   year       156 non-null    float64
 2   song_name  187 non-null    object 
 3   url        187 non-null    object 
 4   lyric      187 non-null    object 
 5   file_name  187 non-null    object 
dtypes: float64(1), object(5)
memory usage: 8.9+ KB


In [158]:
# Solo los que tienen año de lanzamiento son albumes
df = df.dropna(subset=["year"])
df.head()

Unnamed: 0,album,year,song_name,url,lyric,file_name
0,Speak To Me,2008.0,Pistol Whip,https://www.azlyrics.com/lyrics/imaginedragons...,"I can see clearly, I can see clearly I can see...",Pistol_Whip
1,Speak To Me,2008.0,Living Musical,https://www.azlyrics.com/lyrics/imaginedragons...,It all begins with a look in the eyes That goe...,Living_Musical
2,Speak To Me,2008.0,The Pit,https://www.azlyrics.com/lyrics/imaginedragons...,I would rather stay here Where the flowers blo...,The_Pit
3,Speak To Me,2008.0,Speak To Me,https://www.azlyrics.com/lyrics/imaginedragons...,Draw a rumor Flashing lights and Let all whisp...,Speak_To_Me
4,Speak To Me,2008.0,Boots,https://www.azlyrics.com/lyrics/imaginedragons...,Always be careful Don't ever feel good How is ...,Boots


## Special Characters

In [159]:
## Remove special characters from lyrics
df["lyric"] = df["lyric"].str.replace(r"\[.*\]", "")
df["lyric"] = df["lyric"].str.replace(r"\(.*\)", "")
df["lyric"] = df["lyric"].str.replace(r"\{.*\}", "")
df["lyric"] = df["lyric"].str.replace(r"\*.*\*", "")

## Remove _ and - from lyrics
df["lyric"] = df["lyric"].str.replace("_", "")
df["lyric"] = df["lyric"].str.replace("-", "")
df["lyric"] = df["lyric"].str.replace("-", "")


In [160]:
df[df['song_name']=="Boots"]['lyric'].values[0]

"Always be careful Don't ever feel good How is JUDGMENT  Kiss, lights, make news It's open places Question rose Stop tryin' Are you vagabond  What you see is what you get Stop trying to be somebody else  So come on boots Oh walk me home Where Daddy works And Momma sews Just wanna live my life Don't wanna hear one thing Oh I plug my ears And live the dream (yeah)  Wait, explain you're zippin' off Don't let it off so easy What did you mean?  Always, be careful Don't ever feel good How is JUDGMENT?  Kiss, lights, make news It's open places Question rose Stop tryin' Are you vagabond  What you see is what you get Stop trying to be somebody else  So come on boots Oh walk me home Where Daddy works And Momma sews Just wanna live my life Don't wanna hear one thing Oh I plug my ears And live the dream (yeah)  Oh I don't want to see Your face in this broken town (Oh I don't want to see your face in this broken town)  I'm runnin' down South To bury my head in the ground (Runnin' down South, bury m

### Lower Case

In [161]:
# Lowercase
df["lyric"] = df["lyric"].str.lower()

## Exportar sin lemmatizar ni quitar stopwords

In [162]:
df.to_csv("songs_basic_clean.tsv", sep="\t", index=False, encoding="utf-8")

## Lemmatization

In [163]:
#!python -m spacy download en_core_web_sm
#!pip install spacy

In [164]:
import spacy
nlp = spacy.load("en_core_web_sm")

In [165]:
def lemmatize(text):
    doc = nlp(text)
    return " ".join([token.lemma_ for token in doc])

In [166]:
df["lyric"] = df["lyric"].apply(lemmatize)

In [167]:
df['lyric'].head()

0    I can see clearly , I can see clearly I can se...
1    it all begin with a look in the eye that go be...
2    I would rather stay here where the flower bloo...
3    draw a rumor flash light and let all whisper d...
4    always be careful do not ever feel good how be...
Name: lyric, dtype: object

## Stop Words

In [168]:
# Remove stopwords
from spacy.lang.en.stop_words import STOP_WORDS
def remove_stopwords(text):
    doc = nlp(text)
    return " ".join([token.text for token in doc if token.text.lower() not in STOP_WORDS])

In [169]:
df['lyric'] = df['lyric'].apply(remove_stopwords)

In [170]:
df['lyric'].head()

0    clearly , clearly clearly , clearly clearly , ...
1    begin look eye body feeling try forget sudden ...
2    stay flower bloom sun , shine shine light eye ...
3    draw rumor flash light let whisper die    sixt...
4    careful feel good judgment    kiss , light , n...
Name: lyric, dtype: object

In [171]:
## Poner año como entero
df["year"] = df["year"].astype(int)

## Exportar lemmatizado y sin stopwords

In [172]:
df.to_csv("songs_clean_advance.tsv", sep="\t", index=False, encoding="utf-8")