In [0]:
# Project
# github restricts file size, dataset can be downloaded from https://www.kaggle.com/gyani95/380000-lyrics-from-metrolyrics/data


<h1 align = "center">Genre Classification by Lyric Analysis</h1>
<h2 align = "center">Data Mining Project</h2>
<h4 align = "center">
Nathan Jenkins<br>
Vincent Potrykus<br>
Jordan Sandberg<br>
</h4>

<h2>Introduction</h2>
<p>
Online streaming platforms like Apple Music and Spotify add new songs to their collection daily. Each platform gives users the ability to listen to and download millions unque of songs. These songs are catalogued and categorized so that music listeners may find new songs that match their unique tastes. 

Organizing songs by genre is an old and familiar technique to categorize music and make it easier for listeners to find songs they like. To classify songs into genres manually, someone would need to listen to it and select the genre that best applies. This is hard and very time-consuming job. They would require a knowledge of many different genres, types of music and the nuances between them. 

Machine learning and text lyric analysis can aid in this pursuit. 
</p>

<p>
This report serves to identify if there is a link between word frequency within the lyrics of a song and genre that the song belongs to. By using different machine learning techniques and word frequency analysis ....
</p>

<h2>The Dataset</h2>
<p>
    The dataset can be obtained at: 
    <a href=https://www.kaggle.com/gyani95/380000-lyrics-from-metrolyrics/data>
        https://www.kaggle.com/gyani95/380000-lyrics-from-metrolyrics/data
    </a>
    <br>
    <font color = "red">
        <strong>IMPORTANT: Place the <i>lyrics.csv</i> file in the same folder 
            as the <i>project.ipynb</i> file (this file).</strong>
    </font>
</p>

<h2>Data Preprocessing</h2>
<h3>Removing unnecessary data</h3>
<p>
    The first step of the process is preprocessing the data so that we can use it in the algorithm. First, we delete all invalid rows from the database, that is, rows where the genre is "Not Available" or "Other". Second, we delete all the rows that do not contain lyrics. In the next step, Count Vectorizer, we remove all "non-words", that is emojis and all tokens that arent composed solely of the letters of the alphabet, and "stop words", that is common words that are unlikely to be specific to a document, such as "and", "the", "a", etc.
</p>

In [3]:
# import the dataset
import io
import pandas as pd

data = pd.read_csv('lyrics.csv')
data.head()

# temp so that it runs during testing
data = data[:10000]

# preprocess the data

# remove all the genres with not avaliable and other
data = data[data.genre != 'Not Available']
data = data[data.genre != 'Other']


# remove all the data with no lyrics
data.dropna(subset=['lyrics'], inplace=True)
#print(data.shape)

In [2]:
data.head()

Unnamed: 0,index,song,year,artist,genre,lyrics
0,0,ego-remix,2009,beyonce-knowles,Pop,"Oh baby, how you doing?\nYou know I'm gonna cu..."
1,1,then-tell-me,2009,beyonce-knowles,Pop,"playin' everything so easy,\nit's like you see..."
2,2,honesty,2009,beyonce-knowles,Pop,If you search\nFor tenderness\nIt isn't hard t...
3,3,you-are-my-rock,2009,beyonce-knowles,Pop,"Oh oh oh I, oh oh oh I\n[Verse 1:]\nIf I wrote..."
4,4,black-culture,2009,beyonce-knowles,Pop,"Party the people, the people the party it's po..."


<h3>Count Vectorizer</h3>
<p>
    This steps converts the initial database to a matrix of token counts. Here we also remove all "stop words" and "non-words".
</p>

In [2]:
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer

# max_features : a maximum of 1500 columns
# min_df : the word must occur in a mininum of 5 documents
# max_df : the word occurs in less than 70 percent of the documents (words appearing in all are useless)
# nltk.download('stopwords')
vectorizer = CountVectorizer(max_features=1500, min_df=2, max_df=0.7, stop_words=stopwords.words('english'), token_pattern = '[a-zA-Z]+')

# creates a bag of words
X = vectorizer.fit_transform(data.lyrics.values.astype('str'))

# creates y, which is our genre class
y = data.genre

# visualize the transformed data 
pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names())

Unnamed: 0,across,act,admit,afraid,age,ago,ah,ahead,aim,aint,...,yesterday,yet,yippie,yo,york,young,youre,youth,yuh,z
0,0,0,1,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6207,0,0,0,0,0,0,0,0,6,0,...,0,0,0,0,0,0,0,0,0,0
6208,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6209,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6210,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [0]:
,# find the TFID reference: http://www.tfidf.com/
# mitigates the fact that the word may have a high frequency in other documents

# TF(t) = (Number of times term t appears in a document) / (Total number of terms in the document).
# IDF(t) = log_e(Total number of documents / Number of documents with term t in it). <-- appro 
    # if all documents have the term in it then it has a score of 0 TF * log(1) = 0
    # it will weigh rare words heigher 

# TFIDF = TF * IDF

from sklearn.feature_extraction.text import TfidfTransformer
tfidfconverter = TfidfTransformer()
X = tfidfconverter.fit_transform(X)

In [0]:
# notice how the data is now altered based on the word frequency
pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names())

In [0]:
# split X and y into training a test sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y)


In [0]:

def train_predict(classifiers):
    for classifier in classifiers:
        classifier.fit(X_train, y_train)
        y_pred = classifier.predict(X_test)
        print(classifier.__class__.__name__, accuracy_score(y_test, y_pred))
        print(classification_report(y_test, y_pred))


In [18]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import accuracy_score, classification_report

classifers = [RandomForestClassifier(), MultinomialNB(), LogisticRegression()]
train_predict(classifers)

RandomForestClassifier 0.5692816163631829
              precision    recall  f1-score   support

     Country       0.49      0.19      0.28      1156
  Electronic       0.55      0.09      0.15       535
        Folk       0.77      0.18      0.29       204
     Hip-Hop       0.81      0.74      0.77      1606
       Indie       0.69      0.05      0.09       193
        Jazz       0.61      0.21      0.31       563
       Metal       0.58      0.34      0.43      1617
         Pop       0.46      0.31      0.37      2807
         R&B       0.75      0.10      0.17       187
        Rock       0.56      0.85      0.67      7168

    accuracy                           0.57     16036
   macro avg       0.63      0.31      0.35     16036
weighted avg       0.57      0.57      0.53     16036

MultinomialNB 0.5277500623596907
  'precision', 'predicted', average, warn_for)
              precision    recall  f1-score   support

     Country       0.63      0.01      0.03      1156
  Electron