# Music Genre Classification - Preprocessing (380K)

### Import necessary libraries

In [1]:
import sys
sys.path.append("../lib/utils/")

In [2]:
import string
import os
import math

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
%matplotlib inline

In [4]:
from wordcloud import WordCloud
from nltk.util import ngrams
from nltk.tokenize import RegexpTokenizer

In [5]:
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

In [6]:
from preprocess import *

In [7]:
# English stop-word list taken from the NLTK stopwords corpus.
stop_words = stopwords.words('english')
# Snowball stemmer configured for English.
# NOTE(review): neither name is referenced by the visible cells; presumably
# preprocess_data handles stop words and stemming itself - confirm before removing.
stemmer = SnowballStemmer('english')

In [8]:
# Notebook-wide tunables.
# VAL_SET_SIZE is presumably the fraction of rows reserved for validation;
# it is not used in the visible cells - TODO confirm.
VAL_SET_SIZE = 0.2
# Seed value passed to np.random.seed below so runs are repeatable.
RANDOM_SEED = 0

In [9]:
# Seed NumPy's global random generator so NumPy-based randomness is repeatable.
np.random.seed(RANDOM_SEED)
# Use seaborn's "darkgrid" style for the plots drawn after this cell.
sns.set(style="darkgrid")

### File Paths

In [10]:
# Input/output locations, written relative to the notebook's working directory.

# Directory intended for saved figures (not used in the visible cells).
FIGURES_DIR = "../figures/"
# Raw CSV that load_data() reads.
DATASET = "../data/380000.csv"
# Destination of the sanitized CSV (the export cell is currently commented out).
OUT_DATASET = "../data/380000_clean.csv"

### Helper Methods

In [11]:
def load_data(path=None):
    """Loads the data set CSV into memory as a single DataFrame.

    Parameters
    ----------
    path : str or path-like, optional
        CSV file to read. When omitted, the module-level ``DATASET`` path
        is used, which is what the original zero-argument call did.

    Returns
    -------
    pandas.DataFrame
        The file contents as parsed by ``pd.read_csv`` with default options.
    """
    # Resolve the default at call time so a later change to DATASET is honoured.
    if path is None:
        path = DATASET
    return pd.read_csv(path)

### Data Wrangling

In [12]:
# Read the raw CSV (the DATASET path) into the working DataFrame.
df = load_data()

### Exploratory Data Analysis

In [13]:
# Capture the row count before sanitizing; the clean-up cell below reports
# how many rows were removed relative to this number.
prev_len = df.shape[0]
print("Len. of data set: {}".format(prev_len))

Len. of data set: 362237


### Sanitize data

In [14]:
# Remove rows with N/A values in any column.
# The result is rebound to `df` instead of passing inplace=True, so the
# frame object returned by load_data() is never mutated behind other
# references to it.
df = df.dropna()

# Number of records after removal
print("Len. of data set: {}".format(len(df)))

# Records removed (relative to the count captured before sanitizing)
print("Number of elements removed: {}".format(prev_len - len(df)))

Len. of data set: 266556
Number of elements removed: 95681


In [15]:
# Work on a copy so the sanitized frame `df` is left as it is.
df_s = df.copy()

# Run preprocess_data with its default settings on the lyrics of the first
# row only. Despite the `df_` prefix the input is a single lyrics value, and
# the displayed result below is a plain string rather than a DataFrame.
df_subset = preprocess_data(df_s.iloc[0]["lyrics"])

# Whole-frame variant, currently disabled:
# df_s = preprocess_data(df_s, col="lyrics")

In [16]:
# Display the preprocessed lyrics of the first row.
df_subset

'oh babi know im gonna cut right chase women made like think creat special purpos know what special feel babi let get lost dont need call work caus your boss real want show feel consid lucki that big deal well got key heart aint gonna need id rather open bodi show secret didnt know insid need lie big wide strong wont fit much tough talk like caus back got big ego huge ego love big ego much walk like caus back usual im humbl right dont choos leav could blue call arrog call confid decid find im work damn know im kill leg better yet thigh matter fact smile mayb eye boy site see kind someth like big wide strong wont fit much tough talk like caus back got big ego huge ego love big ego much walk like caus back walk like caus back talk like caus back back back walk like caus back big wide strong wont fit much tough talk like caus back got big ego huge ego huge ego love big ego much walk like caus back ego big must admit got everi reason feel like im bitch ego strong aint know dont need beat s

In [17]:
# df_test = preprocess_data(df_s.iloc[0:2], function_list=['lower','punct','custom'], col="lyrics")

# Same first-row lyrics, restricted to the 'punct' and 'lower' filters.
# Presumably these strip punctuation and lowercase the text; the displayed
# result is consistent with that - confirm against preprocess.py.
df_test = preprocess_data(df_s.iloc[0]['lyrics'], filters=['punct', 'lower'])

In [18]:
# Display the first row's lyrics after only the 'punct' and 'lower' filters.
df_test

'oh baby how you doing\nyou know im gonna cut right to the chase\nsome women were made but me myself\ni like to think that i was created for a special purpose\nyou know whats more special than you you feel me\nits on baby lets get lost\nyou dont need to call into work cause youre the boss\nfor real want you to show me how you feel\ni consider myself lucky thats a big deal\nwhy well you got the key to my heart\nbut you aint gonna need it id rather you open up my body\nand show me secrets you didnt know was inside\nno need for me to lie\nits too big its too wide\nits too strong it wont fit\nits too much its too tough\nhe talk like this cause he can back it up\nhe got a big ego such a huge ego\ni love his big ego its too much\nhe walk like this cause he can back it up\nusually im humble right now i dont choose\nyou can leave with me or you could have the blues\nsome call it arrogant i call it confident\nyou decide when you find on what im working with\ndamn i know im killing you with them

In [19]:
# df.iloc[0]["lyrics"]

In [20]:
# df_s.iloc[0]["lyrics"]

### Export sanitized dataset to csv file

In [21]:
# df_s.drop(labels=["index","song","year","artist"], axis=1, inplace=True)
# df_s["genre"] = df_s["genre"].apply(to_lower)
# df_s.to_csv(OUT_DATASET, index=False)

### Display the sanitized dataset

In [22]:
# pd.read_csv(OUT_DATASET)