In [1]:
import pandas as pd
import numpy as np
import string
import matplotlib.pyplot as plt

import nltk
from nltk.stem import WordNetLemmatizer, SnowballStemmer
import nltk
from nltk.corpus import stopwords
nltk.download('wordnet')
nltk.download('punkt')
stemmer = SnowballStemmer('english')
nltk.download('stopwords')

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import MinMaxScaler

from sklearn.naive_bayes import MultinomialNB
from sklearn.cluster import KMeans, DBSCAN

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.metrics.cluster import adjusted_rand_score
from sklearn.metrics.cluster import fowlkes_mallows_score

import seaborn as sns
import cufflinks as cf


from sklearn.pipeline import Pipeline 
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV, RandomizedSearchCV

cf.go_offline()
cf.set_config_file(offline=False, world_readable=True)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\lidag\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\lidag\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\lidag\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
data = pd.read_csv("Lyrics-Genre-Train.csv")

In [3]:
data2 = []
stop_words = set(stopwords.words('english'))

for i in range(0, len(data.index)):

    # Punctuation removal
    table = str.maketrans(dict.fromkeys(string.punctuation))                   
    sentences = (data.Lyrics[i].translate(table))

    # " '\n " removal
    words = sentences[:-3] 

    # Tokenization
    words = nltk.word_tokenize(words)

    # shrt words removal & lemmatization & stemming
    
    words_ = []
    for word in words:
        if word.lower() not in stop_words:   
            if not word.isnumeric():                                                  
                word1 = stemmer.stem(WordNetLemmatizer().lemmatize(word, pos='v'))          
                words_.append(word1)
    data2.append(words_)  

In [4]:
df = pd.DataFrame({'lyr':data2, 'gen':data['Genre']})   

In [5]:
train_data__, test_data__, train_labels, test_labels = train_test_split(df['lyr'], df['gen'], test_size = 0.2, random_state = 25)

In [6]:
# CountVectorizer & TermFrequencies
cvect = CountVectorizer(ngram_range=(1, 1), lowercase='true')   
tfidf_transformer = TfidfTransformer(norm= 'l2', use_idf= True)

In [7]:
# Transforming the processed data to a list (for tfidf)
data4 = train_data__.astype(str).values.tolist()

train_data1 = cvect.fit_transform(data4)
train_data = tfidf_transformer.fit_transform(train_data1)

In [8]:
# Same procedure for the test data
data5 = test_data__.astype(str).values.tolist()

test_data1 = cvect.transform(data5)
test_data = tfidf_transformer.transform(test_data1)

In [9]:
# Same procedure for the entire data set
train_data6 = df['lyr'].astype(str).values.tolist()

data1_ = cvect.fit_transform(train_data6)
data_ = tfidf_transformer.fit_transform(data1_)

## Naive Bayes

In [10]:
# Model fitting
model = MultinomialNB(alpha = 0.01)
model.fit(train_data, train_labels)

MultinomialNB(alpha=0.01)

In [11]:
# Prediction
prediction = model.predict(test_data)

In [12]:
accuracy_score(test_labels, prediction) 

0.39832568187955714

In [13]:
print(classification_report(test_labels, prediction))

              precision    recall  f1-score   support

     Country       0.42      0.46      0.44       356
  Electronic       0.27      0.06      0.10       322
        Folk       0.59      0.20      0.30       236
     Hip-Hop       0.59      0.79      0.67       429
       Indie       0.22      0.04      0.06       246
        Jazz       0.50      0.22      0.30       307
       Metal       0.62      0.68      0.65       380
         Pop       0.30      0.39      0.34       536
         R&B       0.45      0.09      0.15       229
        Rock       0.27      0.52      0.36       662

    accuracy                           0.40      3703
   macro avg       0.42      0.34      0.34      3703
weighted avg       0.41      0.40      0.37      3703



In [14]:
test_labels.value_counts().sort_values(ascending=False).iplot(kind='bar', yTitle='Number of samples', title='Genre')

In [15]:
pred = pd.DataFrame({'pred': prediction})

In [16]:
pred['pred'].value_counts().sort_values(ascending=False).iplot(kind='bar', title='Lyrics')