# Music Genre Classification - Preprocessing (380K)

### Import necessary libraries

In [1]:
import sys
sys.path.append("../")

In [2]:
import string
import os
import math

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
%matplotlib inline

In [4]:
from nltk.util import ngrams
from nltk.tokenize import RegexpTokenizer

In [5]:
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

In [6]:
from lib.utils.preprocess import *

In [7]:
stop_words = stopwords.words('english')
stemmer = SnowballStemmer('english')

In [8]:
# Parameters and definitions
RANDOM_SEED = 0
VAL_SET_SIZE = 0.2

In [9]:
np.random.seed(RANDOM_SEED)

### File Paths

In [10]:
DATASET = "../data/380000.csv"
OUT_DATASET = "../data/380000_clean.csv"
FIGURES_DIR = "../figures/"

### Helper Methods

In [11]:
def load_data():
    """Loads the training and testing sets into the memory.
    """
    return pd.read_csv(DATASET).drop("index", axis=1)

### Data Wrangling

In [12]:
df = load_data()

In [13]:
df.head()

Unnamed: 0,song,year,artist,genre,lyrics
0,ego-remix,2009,beyonce-knowles,Pop,"Oh baby, how you doing?\nYou know I'm gonna cu..."
1,then-tell-me,2009,beyonce-knowles,Pop,"playin' everything so easy,\nit's like you see..."
2,honesty,2009,beyonce-knowles,Pop,If you search\nFor tenderness\nIt isn't hard t...
3,you-are-my-rock,2009,beyonce-knowles,Pop,"Oh oh oh I, oh oh oh I\n[Verse 1:]\nIf I wrote..."
4,black-culture,2009,beyonce-knowles,Pop,"Party the people, the people the party it's po..."


### Exploratory Data Analysis

In [14]:
# Number of records
prev_len = len(df)
print("Len. of data set: {}".format(prev_len))

Len. of data set: 362237


### Sanitize data

In [15]:
# Remove rows with N/A values
df.dropna(inplace=True)

# Number of records after removal
print("Len. of data set: {}".format(len(df)))

# Records removed
print("Number of elements removed: {}".format(prev_len - len(df)))

Len. of data set: 266556
Number of elements removed: 95681


In [16]:
df_s = df.copy()

df_subset = preprocess_data(df_s.iloc[0]["lyrics"])

In [17]:
df_subset

'oh babi know im gonna cut right chase women made like think creat special purpos know what special feel babi let get lost dont need call work caus your boss real want show feel consid lucki that big deal well got key heart aint gonna need id rather open bodi show secret didnt know insid need lie big wide strong wont fit much tough talk like caus back got big ego huge ego love big ego much walk like caus back usual im humbl right dont choos leav could blue call arrog call confid decid find im work damn know im kill leg better yet thigh matter fact smile mayb eye boy site see kind someth like big wide strong wont fit much tough talk like caus back got big ego huge ego love big ego much walk like caus back walk like caus back talk like caus back back back walk like caus back big wide strong wont fit much tough talk like caus back got big ego huge ego huge ego love big ego much walk like caus back ego big must admit got everi reason feel like im bitch ego strong aint know dont need beat s

### Example of custom preprocessing

'preprocess_data' takes 2 arguments: 
1. The string to be processed
2. A list of filters to be applied. For a complete list of possible filters, see preprocess.py

In [18]:
# For entire dataset: 
# preprocess_data(df_s, col='lyrics')

In [19]:
# For one entry in the dataset:
preprocess_data(df_s.iloc[1]['lyrics'], filters=['custom','contractions', 'punct', 'typos', 'digits'])

Number of punctuation removed: 36


'playing everything so easy it is like you seem so sure still your ways you do not see I am not sure if they are for me then things come right along our way though we did not truly ask it seems as if they are going to linger with every delight they bring just like what you have truly seemed I am trying to think of what you really want to say even through my darkest day you might want to leave me feeling strange about you like you are going to let me know when words then slipped out of you when words do not come so easy to say you just leave me feeling come what may though i want things coming from your way i say to you you bore me all the time when you seem to hold back all in you all that you want to let me know why do not you have the courage speak up and i will listen if you truly want me to know then tell me is there something wrong with you and you seem fastened there it sounds as if there will be a melody if things in you are let out and then i will feel alright when you sleep do

In [20]:
preprocess_data(df_s.iloc[19]['lyrics'], filters=['lang','custom','contractions', 'punct', 'typos', 'digits'])

Number of punctuation removed: 37
Beyonc was corrected to: Beyond
Beyonc was corrected to: Beyond
caer was corrected to: care
caer was corrected to: care


'Nobody likes being played Oh Beyond Beyond Hey He said I am worth it his one desire He kissed me he is a one and only beautiful liar A mi tambien why are we the ones who suffer No hay que care he will not be the one to cry Ay let us not kill the karma Ay let us not start a fight Ay Its not worth the drama I did not know about you then til I saw you with him when yeah You stole everything how can you say I did you wrong A mi tambien when the pain and heartbreaks over No hay que care the innocence is gone Ay let us not kill the karma Ay let us not start a fight Ay Its not worth the drama And I wish I could free you of the hurt and the pain Hey Hey Ay let us not kill the karma Ay let us not start a fight Ay Its not worth the drama '

### Export sanitized dataset to csv file

In [21]:
# df_s.drop(labels=["index","song","year","artist"], axis=1, inplace=True)
# df_s["genre"] = df_s["genre"].apply(to_lower)
# df_s.to_csv(OUT_DATASET, index=False)

### Display the sanitized dataset

In [22]:
# pd.read_csv(OUT_DATASET)

In [23]:
df.head()

Unnamed: 0,song,year,artist,genre,lyrics
0,ego-remix,2009,beyonce-knowles,Pop,"Oh baby, how you doing?\nYou know I'm gonna cu..."
1,then-tell-me,2009,beyonce-knowles,Pop,"playin' everything so easy,\nit's like you see..."
2,honesty,2009,beyonce-knowles,Pop,If you search\nFor tenderness\nIt isn't hard t...
3,you-are-my-rock,2009,beyonce-knowles,Pop,"Oh oh oh I, oh oh oh I\n[Verse 1:]\nIf I wrote..."
4,black-culture,2009,beyonce-knowles,Pop,"Party the people, the people the party it's po..."


### Exclude artists in multiple genres: 
#### Note: No such artist was found

In [24]:
# # build a dictionary with artist as key and genre as value
# d = dict()
# for x,row in df_s.iterrows():
#     if row['artist'] not in d.keys():
#         d[row['artist']] = [row['genre']]
#     else:
#         if row['genre'] not in d[row['artist']]:
#             d[row['artist']].append(row['genre'])
#     if len(d[row['artist']])>1:
#         print('MULTIPLE GENRE ARTIST: ' + row['artist'])


### Metrics for preprocessing

In [25]:
y = df[0:5]

In [26]:
y

Unnamed: 0,song,year,artist,genre,lyrics
0,ego-remix,2009,beyonce-knowles,Pop,"Oh baby, how you doing?\nYou know I'm gonna cu..."
1,then-tell-me,2009,beyonce-knowles,Pop,"playin' everything so easy,\nit's like you see..."
2,honesty,2009,beyonce-knowles,Pop,If you search\nFor tenderness\nIt isn't hard t...
3,you-are-my-rock,2009,beyonce-knowles,Pop,"Oh oh oh I, oh oh oh I\n[Verse 1:]\nIf I wrote..."
4,black-culture,2009,beyonce-knowles,Pop,"Party the people, the people the party it's po..."


In [27]:
preprocess_data(y, filters=['punct', 'typos', 'digits'], col='lyrics', out=True)

Number of punctuation removed: 87
Number of punctuation removed: 46
playin was corrected to: plain
Number of punctuation removed: 14
nothin was corrected to: nothing
Number of punctuation removed: 66
1 was corrected to: a
2 was corrected to: a
lovin was corrected to: loving
Number of punctuation removed: 82
freshin was corrected to: freshen
freshin was corrected to: freshen
freshin was corrected to: freshen
freshin was corrected to: freshen


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  data[col] = data[col].apply(custom_preprocessing, args=(filters,), out=True)


Unnamed: 0,song,year,artist,genre,lyrics
0,ego-remix,2009,beyonce-knowles,Pop,Oh baby how you doing You know Im gonna cut ri...
1,then-tell-me,2009,beyonce-knowles,Pop,plain everything so easy its like you seem so ...
2,honesty,2009,beyonce-knowles,Pop,If you search For tenderness It isnt hard to f...
3,you-are-my-rock,2009,beyonce-knowles,Pop,Oh oh oh I oh oh oh I Verse a If I wrote a boo...
4,black-culture,2009,beyonce-knowles,Pop,Party the people the people the party its popp...


In [28]:
y['lyrics'][0]

'Oh baby how you doing You know Im gonna cut right to the chase Some women were made but me myself I like to think that I was created for a special purpose You know whats more special than you You feel me Its on baby lets get lost You dont need to call into work cause youre the boss For real want you to show me how you feel I consider myself lucky thats a big deal Why Well you got the key to my heart But you aint gonna need it Id rather you open up my body And show me secrets you didnt know was inside No need for me to lie Its too big its too wide Its too strong it wont fit Its too much its too tough He talk like this cause he can back it up He got a big ego such a huge ego I love his big ego its too much He walk like this cause he can back it up Usually Im humble right now I dont choose You can leave with me or you could have the blues Some call it arrogant I call it confident You decide when you find on what Im working with Damn I know Im killing you with them legs Better yet them th