# Music Genre Classification - Preprocessing (380K)

### Import necessary libraries

In [1]:
import sys
sys.path.append("../lib/utils/")

In [2]:
import string
import os
import math

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
%matplotlib inline

In [4]:
from wordcloud import WordCloud
from nltk.util import ngrams
from nltk.tokenize import RegexpTokenizer

In [5]:
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

In [6]:
from preprocess import *

In [7]:
# English stop-word list taken from NLTK's stopwords corpus.
stop_words = stopwords.words('english')
# Snowball stemmer configured for English.
stemmer = SnowballStemmer('english')
# NOTE(review): neither name is used in the visible cells — confirm whether
# the helpers pulled in by `from preprocess import *` rely on them.

In [8]:
# Parameters and definitions
RANDOM_SEED = 0  # passed to np.random.seed() in the next cell
VAL_SET_SIZE = 0.2  # NOTE(review): presumably the validation-split fraction; unused in the visible cells — confirm

In [9]:
# Seed NumPy's global random number generator with the configured seed.
np.random.seed(RANDOM_SEED)
# Select seaborn's "darkgrid" plotting style for the notebook.
sns.set(style="darkgrid")

### File Paths

In [10]:
# All paths are relative, so they resolve against the notebook's working directory.
DATASET = "../data/380000.csv"  # input CSV read by load_data()
OUT_DATASET = "../data/380000_clean.csv"  # destination of the sanitized CSV
FIGURES_DIR = "../figures/"  # NOTE(review): unused in the visible cells; presumably where plots are saved — confirm

### Helper Methods

In [11]:
def load_data(path=None):
    """Load the raw lyrics data set from a single CSV file.

    Parameters
    ----------
    path : str or path-like, optional
        Location of the CSV file to read. When omitted, the notebook-level
        ``DATASET`` path is used, so existing ``load_data()`` calls keep
        working unchanged.

    Returns
    -------
    pandas.DataFrame
        The file contents as parsed by ``pandas.read_csv`` with its default
        options.
    """
    # Resolve the default at call time so that editing DATASET and re-running
    # the configuration cell takes effect without redefining this function.
    if path is None:
        path = DATASET
    return pd.read_csv(path)

### Data Wrangling

In [12]:
# Read the raw CSV into a DataFrame using the helper defined above.
df = load_data()

### Exploratory Data Analysis

In [13]:
# Remember the row count of the freshly loaded frame; it is compared against
# the post-cleaning size later on.
prev_len = df.shape[0]
print("Len. of data set: {}".format(prev_len))

Len. of data set: 362237


### Sanitize data

In [14]:
# Discard every row that has a missing value in any column.
# NOTE(review): this also considers columns that are dropped before export
# (e.g. "song", "artist") — consider dropna(subset=[...]) if those rows matter.
df = df.dropna()

# Report the size after cleaning and how many rows were discarded
print("Len. of data set: {}".format(df.shape[0]))
print("Number of elements removed: {}".format(prev_len - df.shape[0]))

Len. of data set: 266556
Number of elements removed: 95681


In [15]:
# Run the project's preprocess_data helper over the "lyrics" column of a copy,
# leaving `df` available for the before/after comparison below.
df_s = preprocess_data(df.copy(), col="lyrics")

In [16]:
# Lyrics of the first row (by position) before preprocessing.
df.iloc[0]["lyrics"]

"Oh baby, how you doing?\nYou know I'm gonna cut right to the chase\nSome women were made but me, myself\nI like to think that I was created for a special purpose\nYou know, what's more special than you? You feel me\nIt's on baby, let's get lost\nYou don't need to call into work 'cause you're the boss\nFor real, want you to show me how you feel\nI consider myself lucky, that's a big deal\nWhy? Well, you got the key to my heart\nBut you ain't gonna need it, I'd rather you open up my body\nAnd show me secrets, you didn't know was inside\nNo need for me to lie\nIt's too big, it's too wide\nIt's too strong, it won't fit\nIt's too much, it's too tough\nHe talk like this 'cause he can back it up\nHe got a big ego, such a huge ego\nI love his big ego, it's too much\nHe walk like this 'cause he can back it up\nUsually I'm humble, right now I don't choose\nYou can leave with me or you could have the blues\nSome call it arrogant, I call it confident\nYou decide when you find on what I'm working 

In [17]:
# Lyrics of the first row (by position) of the preprocessed frame, for comparison.
df_s.iloc[0]["lyrics"]

'oh babi know im gonna cut right chase women made like think creat special purpos know what special feel babi let get lost dont need call work caus your boss real want show feel consid lucki that big deal well got key heart aint gonna need id rather open bodi show secret didnt know insid need lie big wide strong wont fit much tough talk like caus back got big ego huge ego love big ego much walk like caus back usual im humbl right dont choos leav could blue call arrog call confid decid find im work damn know im kill leg better yet thigh matter fact smile mayb eye boy site see kind someth like big wide strong wont fit much tough talk like caus back got big ego huge ego love big ego much walk like caus back walk like caus back talk like caus back back back walk like caus back big wide strong wont fit much tough talk like caus back got big ego huge ego huge ego love big ego much walk like caus back ego big must admit got everi reason feel like im bitch ego strong aint know dont need beat s

### Export sanitized dataset to csv file

In [18]:
# Remove the metadata columns that are not exported. Reassigning the result
# (rather than inplace=True) together with errors="ignore" lets this cell be
# re-run without a KeyError once the columns are already gone.
df_s = df_s.drop(columns=["index", "song", "year", "artist"], errors="ignore")
# Pass every genre label through to_lower (not defined in the visible cells;
# presumably provided by `from preprocess import *`).
df_s["genre"] = df_s["genre"].apply(to_lower)
# index=False keeps the DataFrame index out of the exported file.
df_s.to_csv(OUT_DATASET, index=False)

### Display the sanitized dataset

In [19]:
# Read the exported CSV back from disk to inspect what was actually written.
pd.read_csv(OUT_DATASET)

Unnamed: 0,genre,lyrics
0,pop,oh babi know im gonna cut right chase women ma...
1,pop,playin everyth easi like seem sure still way d...
2,pop,search tender isnt hard find love need live lo...
3,pop,oh oh oh oh oh oh vers 1 wrote book stand titl...
4,pop,parti peopl peopl parti pop sit around see loo...
5,pop,heard church bell ring heard choir sing saw lo...
6,pop,anoth day would spend waitin right one stare n...
7,pop,wait wait wait wait wait wait wait wait wait w...
8,pop,vers 1 read magazin wait around said couldnt w...
9,pop,nnnow honey better sit look around caus mustv ...
