In [1]:
# imports
import pandas as pd
import numpy as np
from collections import defaultdict
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC

In [2]:
df = pd.read_csv('lyrics.csv')
df.head()

Unnamed: 0,index,song,year,artist,genre,lyrics
0,0,ego-remix,2009,beyonce-knowles,Pop,"Oh baby, how you doing?\nYou know I'm gonna cu..."
1,1,then-tell-me,2009,beyonce-knowles,Pop,"playin' everything so easy,\nit's like you see..."
2,2,honesty,2009,beyonce-knowles,Pop,If you search\nFor tenderness\nIt isn't hard t...
3,3,you-are-my-rock,2009,beyonce-knowles,Pop,"Oh oh oh I, oh oh oh I\n[Verse 1:]\nIf I wrote..."
4,4,black-culture,2009,beyonce-knowles,Pop,"Party the people, the people the party it's po..."


In [3]:
#replace carriage returns
df = df.replace({'\n': ' '}, regex=True)
df.head()

Unnamed: 0,index,song,year,artist,genre,lyrics
0,0,ego-remix,2009,beyonce-knowles,Pop,"Oh baby, how you doing? You know I'm gonna cut..."
1,1,then-tell-me,2009,beyonce-knowles,Pop,"playin' everything so easy, it's like you seem..."
2,2,honesty,2009,beyonce-knowles,Pop,If you search For tenderness It isn't hard to ...
3,3,you-are-my-rock,2009,beyonce-knowles,Pop,"Oh oh oh I, oh oh oh I [Verse 1:] If I wrote a..."
4,4,black-culture,2009,beyonce-knowles,Pop,"Party the people, the people the party it's po..."


In [4]:
#remove nans
df = df.dropna()

In [5]:
# get lyrics
genres, lyrics = [], []
for x1, x2 in zip(df['lyrics'], df['genre']):
    genres.append(x2)
    lyrics.append(x1)

In [7]:
# get number of total genres
total_genres = len(set(genres))
print("Num genres: %d" % (total_genres))

Num genres: 12


In [14]:
# get distribution of genres
np.unique(genres, return_counts=True)

# performance by genre
genre_count = defaultdict(int)
genre_dist = defaultdict(float)
for genre in genres:
    genre_count[genre] += 1
for genre in genre_count:
    genre_dist[genre] = genre_count[genre] / len(genres)

sorted(genre_dist.items(), key=lambda x:x[1], reverse=True)

[('Rock', 0.4098013175467819),
 ('Pop', 0.1518105013580636),
 ('Hip-Hop', 0.09322618886838038),
 ('Not Available', 0.08981602364981467),
 ('Metal', 0.0891332402947223),
 ('Country', 0.05397364906436171),
 ('Jazz', 0.029899908462011734),
 ('Electronic', 0.029884902234427288),
 ('Other', 0.01946682873392458),
 ('R&B', 0.012759045003676526),
 ('Indie', 0.011813652665856331),
 ('Folk', 0.008414742117978961)]

In [None]:
# create training and test set
training_x = lyrics[:len(lyrics)//2]
training_y = genres[:len(lyrics)//2]

test_x = lyrics[len(lyrics)//2:]
test_y = genres[len(lyrics)//2:]

In [8]:
# create classifier for pipeline
classifier = Pipeline([
    ('vectorizer', CountVectorizer(stop_words='english')),
    ('tfidf', TfidfTransformer()),
    ('clf', OneVsRestClassifier(LinearSVC(C=1)))])
classifier.fit(training_x, training_y)
predictions = classifier.predict(test_x)
# get accuracy
correct = 0
for label, prediction in zip(test_y, predictions):
    if label == prediction:
        correct += 1
print("Accuracy: %f" % (float(correct)/float(len(test_y))))

Accuracy: 0.503474


In [13]:
# performance by genre
correct_by_genre = defaultdict(int)
incorrect_by_genre = defaultdict(int)
pct_incorrect_by_genre = defaultdict(float)
for label, prediction in zip(test_y, predictions):
    if (label == prediction):
        correct_by_genre[label] += 1
    else:
        incorrect_by_genre[label] += 1
for genre in correct_by_genre:
    pct_incorrect_by_genre[genre] = correct_by_genre[genre] / (correct_by_genre[genre]+incorrect_by_genre[genre])
sorted(pct_incorrect_by_genre.items(), key=lambda x:x[1], reverse=True)

NameError: name 'predictions' is not defined

In [30]:
# create classifier for pipeline
classifier = Pipeline([
    ('vectorizer', CountVectorizer(stop_words='english')),
    ('tfidf', TfidfTransformer()),
    ('clf', OneVsRestClassifier(LogisticRegression()))])
classifier.fit(training_x, training_y)
predictions = classifier.predict(test_x)
# get accuracy
correct = 0
for label, prediction in zip(test_y, predictions):
    if label == prediction:
        correct += 1
print("Accuracy: %f" % (float(correct)/float(len(test_y))))

Accuracy: 0.521639
