In [12]:
# imports
import pandas as pd
import numpy as np
import random
from collections import defaultdict
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC

In [13]:
# Read the raw lyrics table from disk and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='lyrics.csv')
df.head(n=5)

Unnamed: 0,index,song,year,artist,genre,lyrics
0,0,ego-remix,2009,beyonce-knowles,Pop,"Oh baby, how you doing?\nYou know I'm gonna cu..."
1,1,then-tell-me,2009,beyonce-knowles,Pop,"playin' everything so easy,\nit's like you see..."
2,2,honesty,2009,beyonce-knowles,Pop,If you search\nFor tenderness\nIt isn't hard t...
3,3,you-are-my-rock,2009,beyonce-knowles,Pop,"Oh oh oh I, oh oh oh I\n[Verse 1:]\nIf I wrote..."
4,4,black-culture,2009,beyonce-knowles,Pop,"Party the people, the people the party it's po..."


In [14]:
# Turn every embedded line break ('\n') into a single space so each lyric
# becomes one line of text. The pattern is applied as a regex to the whole frame.
df = df.replace(to_replace='\n', value=' ', regex=True)
df.head()

Unnamed: 0,index,song,year,artist,genre,lyrics
0,0,ego-remix,2009,beyonce-knowles,Pop,"Oh baby, how you doing? You know I'm gonna cut..."
1,1,then-tell-me,2009,beyonce-knowles,Pop,"playin' everything so easy, it's like you seem..."
2,2,honesty,2009,beyonce-knowles,Pop,If you search For tenderness It isn't hard to ...
3,3,you-are-my-rock,2009,beyonce-knowles,Pop,"Oh oh oh I, oh oh oh I [Verse 1:] If I wrote a..."
4,4,black-culture,2009,beyonce-knowles,Pop,"Party the people, the people the party it's po..."


In [15]:
# Drop every row that has a missing value in any column.
df = df.dropna()

# Exclude songs whose genre is the 'Not Available' placeholder.
# The analogous filters for 'Rock', 'Pop' and 'Hip-Hop' are not applied.
df = df[df['genre'].ne('Not Available')]

In [16]:
# shuffle data
# Seeding Python's `random` module is kept for any code that relies on it,
# but it has no effect on pandas: DataFrame.sample draws from NumPy's RNG.
random.seed(10)
# Pass the seed explicitly so the shuffle (and therefore the positional
# train/test split built from it) is identical on every run.
df = df.sample(frac=1, random_state=10).reset_index(drop=True)
df.head()

Unnamed: 0,index,song,year,artist,genre,lyrics
0,133119,trespassin,2012,adam-lambert,Pop,Well I was walkin' for some time When I came a...
1,110964,c-mon,2007,go-betty-go,Rock,Next week came so suddenly I tried to see the ...
2,179919,tears-of-bitterness,2007,extol,Metal,"Divine security, heartfelt certainty But my fa..."
3,146485,sissy-s-song,2008,alan-jackson,Country,Why did she have to go So young I just don't k...
4,18649,funky-beat,2006,everlast,Rock,"Check, uh huh Check check y'all Yo Whitey Ford..."


In [17]:
# Pull the two columns the models use into plain Python lists:
# the lyric text as-is, and the genre label lower-cased.
lyrics = list(df['lyrics'])
genres = [label.lower() for label in df['genre']]

In [18]:
# Count the distinct genre labels.
total_genres = len(set(genres))
print("Num genres: %d" % (total_genres))

# Split by position: the first half of the (already shuffled) songs is the
# training set, the remainder is the test set.
half = len(lyrics) // 2
training_x, test_x = lyrics[:half], lyrics[half:]
training_y, test_y = genres[:half], genres[half:]

Num genres: 11


In [19]:
# Word count per song: number of pieces obtained by splitting on single spaces.
num_words = [len(text.split(" ")) for text in lyrics]

# Accumulate, per genre, the total word count and the number of songs.
num_words_by_genre = defaultdict(int)
songs_by_genre = defaultdict(int)
for n_words, label in zip(num_words, genres):
    num_words_by_genre[label] += n_words
    songs_by_genre[label] += 1

# Average words per song for each genre.
avg_num_words_by_genre = defaultdict(float)
avg_num_words_by_genre.update(
    (label, total / songs_by_genre[label])
    for label, total in num_words_by_genre.items()
)

# Displayed result: genres ranked from wordiest to tersest.
sorted(avg_num_words_by_genre.items(), key=lambda pair: pair[1], reverse=True)

[('hip-hop', 489.93360160965796),
 ('pop', 245.64797607868334),
 ('r&b', 224.9376653925316),
 ('other', 223.89169396800926),
 ('indie', 196.85138139091777),
 ('electronic', 193.35952799397438),
 ('rock', 190.6831418501396),
 ('country', 186.08389518315147),
 ('folk', 181.01471243869818),
 ('jazz', 172.06361355081555),
 ('metal', 169.05496864346142)]

____________________________________________________________________
## SVM balanced classifier, using unigrams (words)

In [45]:
# svm classifier with words
# Pipeline: English-stop-word-filtered word counts -> TF-IDF weighting ->
# one-vs-rest linear SVM.
# class_weight='balanced' replaces the legacy 'auto' spelling, which current
# scikit-learn releases no longer accept; it reweights classes inversely to
# their frequency in the training labels.
classifier = Pipeline([
    ('vectorizer', CountVectorizer(stop_words='english', analyzer='word')),
    ('tfidf', TfidfTransformer()),
    ('clf', OneVsRestClassifier(LinearSVC(C=1, class_weight='balanced')))])
classifier.fit(training_x, training_y)
predictions = classifier.predict(test_x)
# get accuracy: share of test songs whose predicted genre equals the true one
correct = 0
for label, prediction in zip(test_y, predictions):
    if label == prediction:
        correct += 1
print("Accuracy: %f" % (float(correct)/float(len(test_y))))



Accuracy: 0.492243


In [46]:
# performance by genre
# For each true genre in the test set, count how many songs were classified
# correctly vs. incorrectly, then report the fraction classified correctly.
correct_by_genre = defaultdict(int)
incorrect_by_genre = defaultdict(int)
pct_correct_by_genre = defaultdict(float)
for label, prediction in zip(test_y, predictions):
    if label == prediction:
        correct_by_genre[label] += 1
    else:
        incorrect_by_genre[label] += 1
# Walk over every genre that occurs in the test labels, not only those with at
# least one hit, so a genre that was never predicted correctly shows up as 0.0
# instead of silently vanishing from the report.
for genre in sorted(set(correct_by_genre) | set(incorrect_by_genre)):
    hits = correct_by_genre.get(genre, 0)
    misses = incorrect_by_genre.get(genre, 0)
    pct_correct_by_genre[genre] = hits / (hits + misses)
sorted(pct_correct_by_genre.items(), key=lambda x: x[1], reverse=True)

[('hip-hop', 0.758507748841668),
 ('metal', 0.6178177841339278),
 ('country', 0.5072805436139232),
 ('rock', 0.4944214041912871),
 ('pop', 0.4069352118265701),
 ('jazz', 0.391272370679088),
 ('electronic', 0.2745839636913767),
 ('other', 0.2164009111617312),
 ('r&b', 0.21264028352037803),
 ('folk', 0.20607175712971482),
 ('indie', 0.1461100569259962)]

____________________________________________________________________
## Logistic Regression balanced classifier, using unigrams (words)

In [47]:
# logistic regressor
# Pipeline: English-stop-word-filtered word counts -> TF-IDF weighting ->
# one-vs-rest logistic regression.
# class_weight='balanced' replaces the legacy 'auto' spelling, which current
# scikit-learn releases no longer accept; it reweights classes inversely to
# their frequency in the training labels.
classifier = Pipeline([
    ('vectorizer', CountVectorizer(stop_words='english')),
    ('tfidf', TfidfTransformer()),
    ('clf', OneVsRestClassifier(LogisticRegression(class_weight='balanced')))])
classifier.fit(training_x, training_y)
predictions = classifier.predict(test_x)
# get accuracy: share of test songs whose predicted genre equals the true one
correct = 0
for label, prediction in zip(test_y, predictions):
    if label == prediction:
        correct += 1
print("Accuracy: %f" % (float(correct)/float(len(test_y))))



Accuracy: 0.480719


In [48]:
# performance by genre
# For each true genre in the test set, count how many songs were classified
# correctly vs. incorrectly, then report the fraction classified correctly.
correct_by_genre = defaultdict(int)
incorrect_by_genre = defaultdict(int)
pct_correct_by_genre = defaultdict(float)
for label, prediction in zip(test_y, predictions):
    if label == prediction:
        correct_by_genre[label] += 1
    else:
        incorrect_by_genre[label] += 1
# Walk over every genre that occurs in the test labels, not only those with at
# least one hit, so a genre that was never predicted correctly shows up as 0.0
# instead of silently vanishing from the report.
for genre in sorted(set(correct_by_genre) | set(incorrect_by_genre)):
    hits = correct_by_genre.get(genre, 0)
    misses = incorrect_by_genre.get(genre, 0)
    pct_correct_by_genre[genre] = hits / (hits + misses)
sorted(pct_correct_by_genre.items(), key=lambda x: x[1], reverse=True)

[('hip-hop', 0.7639399265058316),
 ('metal', 0.6813325481618575),
 ('country', 0.5454167244487589),
 ('rock', 0.46216097185011196),
 ('jazz', 0.391272370679088),
 ('pop', 0.37786486754638354),
 ('electronic', 0.2528996469994957),
 ('folk', 0.22447102115915363),
 ('other', 0.2076689445709947),
 ('r&b', 0.18606024808033078),
 ('indie', 0.1347248576850095)]

In [20]:
# Linear SVM on TF-IDF-weighted word counts, this time without any class
# weighting. fit() returns the pipeline itself, so it can be chained here.
classifier = Pipeline(steps=[
    ('vectorizer', CountVectorizer(analyzer='word', stop_words='english')),
    ('tfidf', TfidfTransformer()),
    ('clf', OneVsRestClassifier(LinearSVC(C=1))),
]).fit(training_x, training_y)
predictions = classifier.predict(test_x)

# Accuracy: share of test songs whose predicted genre equals the true one.
correct = sum(1 for label, prediction in zip(test_y, predictions) if label == prediction)
print("Accuracy: %f" % (float(correct) / float(len(test_y))))

Accuracy: 0.597240


In [21]:
# performance by genre
# For each true genre in the test set, count how many songs were classified
# correctly vs. incorrectly, then report the fraction classified correctly.
correct_by_genre = defaultdict(int)
incorrect_by_genre = defaultdict(int)
pct_correct_by_genre = defaultdict(float)
for label, prediction in zip(test_y, predictions):
    if label == prediction:
        correct_by_genre[label] += 1
    else:
        incorrect_by_genre[label] += 1
# Walk over every genre that occurs in the test labels, not only those with at
# least one hit, so a genre that was never predicted correctly shows up as 0.0
# instead of silently vanishing from the report.
for genre in sorted(set(correct_by_genre) | set(incorrect_by_genre)):
    hits = correct_by_genre.get(genre, 0)
    misses = incorrect_by_genre.get(genre, 0)
    pct_correct_by_genre[genre] = hits / (hits + misses)
sorted(pct_correct_by_genre.items(), key=lambda x: x[1], reverse=True)

[('rock', 0.8402536006156778),
 ('hip-hop', 0.7546957778134532),
 ('metal', 0.5214847015549239),
 ('pop', 0.3505651105651106),
 ('country', 0.30868838763575607),
 ('jazz', 0.21788990825688073),
 ('folk', 0.1253430924062214),
 ('other', 0.08503134796238244),
 ('r&b', 0.07042253521126761),
 ('electronic', 0.06377032520325203),
 ('indie', 0.014621741894469168)]

________________________________________________________________________________
## Data visualization

In [29]:
# x-axis labels for the accuracy charts. 'hip-hop' was missing, leaving 10
# labels for the 11 per-genre accuracy values plotted against them.
# NOTE: this rebinds `genres`, which earlier cells use for the per-song labels.
genres = ['hip-hop', 'metal', 'country', 'rock', 'jazz', 'pop', 'electronic', 'folk', 'other', 'r&b', 'indie']
import plotly.plotly as py
import plotly.graph_objs as go

In [23]:
# Per-genre accuracy figures, hard-coded as (genre, fraction correct) pairs.
# Each list is ordered from highest to lowest accuracy, so the genre order
# differs from one list to the next.

# Class-balanced linear SVM.
svm_stats = [
    ('hip-hop', 0.758507748841668),
    ('metal', 0.6178177841339278),
    ('country', 0.5072805436139232),
    ('rock', 0.4944214041912871),
    ('pop', 0.4069352118265701),
    ('jazz', 0.391272370679088),
    ('electronic', 0.2745839636913767),
    ('other', 0.2164009111617312),
    ('r&b', 0.21264028352037803),
    ('folk', 0.20607175712971482),
    ('indie', 0.1461100569259962),
]

# Class-balanced logistic regression.
log_regression_stats = [
    ('hip-hop', 0.7639399265058316),
    ('metal', 0.6813325481618575),
    ('country', 0.5454167244487589),
    ('rock', 0.46216097185011196),
    ('jazz', 0.391272370679088),
    ('pop', 0.37786486754638354),
    ('electronic', 0.2528996469994957),
    ('folk', 0.22447102115915363),
    ('other', 0.2076689445709947),
    ('r&b', 0.18606024808033078),
    ('indie', 0.1347248576850095),
]

# Linear SVM without class balancing.
svm_no_balance_stats = [
    ('rock', 0.8402536006156778),
    ('hip-hop', 0.7546957778134532),
    ('metal', 0.5214847015549239),
    ('pop', 0.3505651105651106),
    ('country', 0.30868838763575607),
    ('jazz', 0.21788990825688073),
    ('folk', 0.1253430924062214),
    ('other', 0.08503134796238244),
    ('r&b', 0.07042253521126761),
    ('electronic', 0.06377032520325203),
    ('indie', 0.014621741894469168),
]

In [10]:
# Plot balanced SVM vs balanced logistic regression as grouped bars,
# one pair of bars per genre.

# The *_stats lists are sorted by accuracy, each in its own genre order, so
# their values cannot be paired with `genres` by position. Look every value up
# by genre name instead so each bar sits under its own label.
svm_accuracy = dict(svm_stats)
log_regression_accuracy = dict(log_regression_stats)

trace1 = go.Bar(
    x=genres,
    y=[svm_accuracy[genre] for genre in genres],
    name='SVM'
)
trace2 = go.Bar(
    x=genres,
    y=[log_regression_accuracy[genre] for genre in genres],
    name='Log Regression'
)

data = [trace1, trace2]
layout = go.Layout(
    title="Accuracy by Genre",
    barmode='group',
    yaxis=dict(
        # tickmode='linear' is the current spelling of the legacy
        # autotick=False: ticks start at tick0 and repeat every dtick.
        tickmode='linear',
        ticks='outside',
        tick0=0,
        dtick=0.1,
        ticklen=8,
        tickwidth=4,
        tickcolor='#000'
    )
)

fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='grouped-bar')

High five! You successfully sent some data to your account on plotly. View your plot in your browser at https://plot.ly/~hle000/0 or inside your plot.ly account where it is named 'grouped-bar'


In [30]:
# Plot balanced SVM vs unbalanced SVM as grouped bars, one pair per genre.

# The *_stats lists are sorted by accuracy, each in its own genre order, so
# their values cannot be paired with `genres` by position. Look every value up
# by genre name instead so each bar sits under its own label.
svm_accuracy = dict(svm_stats)
svm_no_balance_accuracy = dict(svm_no_balance_stats)

trace1 = go.Bar(
    x=genres,
    y=[svm_accuracy[genre] for genre in genres],
    name='SVM-Balanced'
)
trace2 = go.Bar(
    x=genres,
    y=[svm_no_balance_accuracy[genre] for genre in genres],
    name='SVM-Unbalanced',
    marker=dict(
        color='rgb(235, 40, 40)'
    )
)

data = [trace1, trace2]
layout = go.Layout(
    title="Accuracy by Genre",
    barmode='group',
    yaxis=dict(
        # tickmode='linear' is the current spelling of the legacy
        # autotick=False: ticks start at tick0 and repeat every dtick.
        tickmode='linear',
        ticks='outside',
        tick0=0,
        dtick=0.1,
        ticklen=8,
        tickwidth=4,
        tickcolor='#000'
    )
)

fig = go.Figure(data=data, layout=layout)
# NOTE(review): 'grouped-bar' is also the filename used by the SVM vs logistic
# regression chart, so this upload presumably replaces that one in the plotly
# account - confirm that is intended.
py.iplot(fig, filename='grouped-bar')

High five! You successfully sent some data to your account on plotly. View your plot in your browser at https://plot.ly/~hle000/0 or inside your plot.ly account where it is named 'grouped-bar'
