## Load data

In [66]:
import pandas as pd
df = pd.read_csv("songs.tsv", sep="\t", encoding="utf-8")

In [67]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 187 entries, 0 to 186
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   album      187 non-null    object 
 1   year       156 non-null    float64
 2   song_name  187 non-null    object 
 3   url        187 non-null    object 
 4   lyric      187 non-null    object 
 5   file_name  187 non-null    object 
dtypes: float64(1), object(5)
memory usage: 8.9+ KB


In [68]:
# Solo los que tienen año de lanzamiento son albumes
df = df.dropna(subset=["year"])
df.head()

Unnamed: 0,album,year,song_name,url,lyric,file_name
0,Speak To Me,2008.0,Pistol Whip,https://www.azlyrics.com/lyrics/imaginedragons...,"I can see clearly, I can see clearly I can see...",Pistol_Whip
1,Speak To Me,2008.0,Living Musical,https://www.azlyrics.com/lyrics/imaginedragons...,It all begins with a look in the eyes That goe...,Living_Musical
2,Speak To Me,2008.0,The Pit,https://www.azlyrics.com/lyrics/imaginedragons...,I would rather stay here Where the flowers blo...,The_Pit
3,Speak To Me,2008.0,Speak To Me,https://www.azlyrics.com/lyrics/imaginedragons...,Draw a rumor Flashing lights and Let all whisp...,Speak_To_Me
4,Speak To Me,2008.0,Boots,https://www.azlyrics.com/lyrics/imaginedragons...,Always be careful Don't ever feel good How is ...,Boots


## Special Characters

In [69]:
## Remove special characters from lyrics
df["lyric"] = df["lyric"].str.replace(r"\[.*\]", "")
df["lyric"] = df["lyric"].str.replace(r"\(.*\)", "")
df["lyric"] = df["lyric"].str.replace(r"\{.*\}", "")
df["lyric"] = df["lyric"].str.replace(r"\*.*\*", "")

## Remove _ and - from lyrics
df["lyric"] = df["lyric"].str.replace("_", "")
df["lyric"] = df["lyric"].str.replace("-", "")
df["lyric"] = df["lyric"].str.replace("-", "")

import re
def limpiar_caracteres(texto):
    return re.sub(r'[^\x00-\x7F]+', '', texto)
# Remove no ascii characters
df['lyric'] = df['lyric'].apply(limpiar_caracteres)
#df["lyric"] = df["lyric"].str.replace(r"[^\x00-\x7F]+", "")



In [70]:
df[df['song_name']=="Tokyo"]['lyric'].values[0]

"Baby's got a lot of baggage It don't seem to matter, it don't seem to matter Bought a ticket to the Midwest, just for business But dismissed it from the interest It was a risk just for a mistress Who was a hostess on the West Coast Don't you see it in my eyes tonight? Baby's got a lot of baggage  Where do I have to go To find a honey with a little soul? Tokyo, Tokyo  Turn out the lights and let it go Oh, it's nice to meet you  [Kimi tanoshimu] Tokyo, Tokyo  Pay in the palms of the rock'n'roll  Get that dynamite Chickaboom on the town tonight I never seem to get it right But you can show me so politely Now don't you see it in my eyes tonight? Get that dynamite  Where do I have to go To find a honey with a little soul? Tokyo, Tokyo  Turn out the lights and let it go Oh, it's nice to meet you  Tokyo, Tokyo  Pay in the palms of the rock'n'roll  Hey! I come back to where you are Hey! I come back to where you are Get that dynamite  Where do I have to go To find a honey with a little soul? T

### Lower Case

In [71]:
# Lowercase
df["lyric"] = df["lyric"].str.lower()

In [72]:
## Poner año como entero
df["year"] = df["year"].astype(int)

## Exportar sin lemmatizar ni quitar stopwords

In [73]:
df.to_csv("songs_basic_clean.tsv", sep="\t", index=False, encoding="utf-8")

## Lemmatization

In [74]:
#!python -m spacy download en_core_web_sm
#!pip install spacy

In [75]:
import spacy
nlp = spacy.load("en_core_web_sm")

In [76]:
def lemmatize(text):
    doc = nlp(text)
    return " ".join([token.lemma_ for token in doc])

In [77]:
df["lyric"] = df["lyric"].apply(lemmatize)

In [78]:
df['lyric'].head()

0    I can see clearly , I can see clearly I can se...
1    it all begin with a look in the eye that go be...
2    I would rather stay here where the flower bloo...
3    draw a rumor flash light and let all whisper d...
4    always be careful do not ever feel good how be...
Name: lyric, dtype: object

## Stop Words

In [79]:
# Remove stopwords
# No uso stop words de spacy porque elimina mas palabras de las que quiero
"""
from spacy.lang.en.stop_words import STOP_WORDS
def remove_stopwords(text):
    doc = nlp(text)
    return " ".join([token.text for token in doc if token.text.lower() not in STOP_WORDS])
df['lyric'] = df['lyric'].apply(remove_stopwords)
df['lyric'].head()
"""

'\nfrom spacy.lang.en.stop_words import STOP_WORDS\ndef remove_stopwords(text):\n    doc = nlp(text)\n    return " ".join([token.text for token in doc if token.text.lower() not in STOP_WORDS])\ndf[\'lyric\'] = df[\'lyric\'].apply(remove_stopwords)\ndf[\'lyric\'].head()\n'

In [80]:
# Stopwords de nltk
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stopwords = set(stopwords.words('english'))

def remove_stopwords_2(text):
    return " ".join([word for word in text.split() if word.lower() not in stopwords])

df['lyric'] = df['lyric'].apply(remove_stopwords_2)
df['lyric'].head()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


0    see clearly , see clearly see clearly , see cl...
1    begin look eye go beyond body feeling try forg...
2    would rather stay flower bloom sun , shine shi...
3    draw rumor flash light let whisper die sixteen...
4    always careful ever feel good judgment kiss , ...
Name: lyric, dtype: object

## Exportar lemmatizado y sin stopwords

In [81]:
df.to_csv("songs_clean_advance.tsv", sep="\t", index=False, encoding="utf-8")