# IRTM Second Project

Ghadamiyan Lida, class 507

In [1]:
import pandas as pd
import numpy as np
import string
import matplotlib.pyplot as plt

import nltk
from nltk.stem import WordNetLemmatizer, SnowballStemmer
nltk.download('wordnet')
nltk.download('punkt')
stemmer = SnowballStemmer('english')

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import MinMaxScaler

from sklearn.naive_bayes import MultinomialNB
from sklearn import svm
from sklearn.cluster import KMeans, DBSCAN

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.metrics.cluster import adjusted_rand_score
from sklearn.metrics.cluster import fowlkes_mallows_score

import seaborn as sns
import cufflinks as cf


from sklearn.pipeline import Pipeline 
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV, RandomizedSearchCV

cf.go_offline()
cf.set_config_file(offline=False, world_readable=True)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\lidag\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\lidag\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Data preprocessing

In [2]:
# Read the two lyric/genre CSV files from the current working directory.
# `data` is used as the training split and `data1` as the test split by the
# cells below.
data = pd.read_csv("Lyrics-Genre-Train.csv")
data1 = pd.read_csv("Lyrics-Genre-Test-GroundTruth.csv")

In [3]:
def conv_labels(data):
    """Convert the ``Genre`` column of ``data`` into integer class labels.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``Genre`` column whose values are the genre names
        listed in ``genre_to_id`` below.

    Returns
    -------
    list of int
        One label per row, in row order (independent of the frame's index).

    Raises
    ------
    ValueError
        If any row holds a genre that is not in the mapping. Skipping such
        rows would make the returned list shorter than ``data`` and shift
        every following label onto the wrong lyric.
    """
    genre_to_id = {
        'Metal': 0,
        'Hip-Hop': 1,
        'Country': 2,
        'Jazz': 3,
        'Electronic': 4,
        'Pop': 5,
        'Folk': 6,
        'Rock': 7,
        'R&B': 8,
        'Indie': 9,
    }

    # Iterate over values, not index labels, so frames with a non-default
    # index (filtered, shuffled, concatenated) are handled as well.
    genres = data['Genre'].tolist()

    unknown = sorted({str(genre) for genre in genres if genre not in genre_to_id})
    if unknown:
        raise ValueError("Unknown genre label(s): " + ", ".join(unknown))

    return [genre_to_id[genre] for genre in genres]

In [4]:
# Integer genre labels for the training frame (`data`) and the test frame (`data1`)
numeric_labels = conv_labels(data)
numeric_labels1 = conv_labels(data1)

In [5]:
def prep(data):
    """Tokenise and normalise the ``Lyrics`` column of ``data``.

    For every row the lyric text is processed as follows:

    1. all ASCII punctuation characters (``string.punctuation``) are deleted;
    2. the last three characters of the remaining text are dropped;
    3. the text is tokenised with ``nltk.word_tokenize``;
    4. tokens of length <= 2 and purely numeric tokens are discarded;
    5. each remaining token is lemmatised as a verb and then stemmed with
       the module-level ``stemmer``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``Lyrics`` column of strings.

    Returns
    -------
    list of list of str
        One list of processed tokens per row, in row order.
    """
    # Both objects are loop-invariant, so build them once instead of once
    # per row / once per word.
    punctuation_table = str.maketrans(dict.fromkeys(string.punctuation))
    lemmatizer = WordNetLemmatizer()

    processed = []
    # Iterate over values rather than index labels so that frames with a
    # non-default index are handled as well.
    for lyric in data['Lyrics']:
        # Punctuation removal
        text = lyric.translate(punctuation_table)

        # Drop the trailing three characters.
        # NOTE(review): the original comment says this strips a trailing
        # quote/newline artefact; it is applied unconditionally, so confirm
        # every lyric really ends with that artefact.
        text = text[:-3]

        # Tokenization
        tokens = nltk.word_tokenize(text)

        # Short-word and number removal, then lemmatization + stemming
        processed.append([
            stemmer.stem(lemmatizer.lemmatize(token, pos='v'))
            for token in tokens
            if len(token) > 2 and not token.isnumeric()
        ])
    return processed

In [6]:
# Training frame: preprocessed token lists ('lyr') next to their integer genre labels ('gen')
df = pd.DataFrame({'lyr':prep(data), 'gen':numeric_labels})   

In [7]:
# Split the training frame into features (token lists) and targets (genre ids)
train_data__, train_labels = df['lyr'], df['gen']

In [8]:
# Test frame, built with the same preprocessing as the training frame
df1 = pd.DataFrame({'lyr': prep(data1), 'gen': numeric_labels1})

# Features (token lists) and targets (genre ids) of the test split
test_data__, test_labels = df1['lyr'], df1['gen']

In [9]:
# Unigram bag-of-words counts, followed by L2-normalised TF-IDF re-weighting.
# `lowercase` has to be a real boolean: the string 'true' only worked because
# any non-empty string is truthy (so would 'false'), and scikit-learn releases
# that validate constructor parameters reject it.
cvect = CountVectorizer(ngram_range=(1, 1), lowercase=True)
tfidf_transformer = TfidfTransformer(norm='l2', use_idf=True)

In [10]:
# Transforming the processed data to a list (for tfidf).
# NOTE(review): `astype(str)` turns every token list into its Python list
# repr (e.g. "['word', 'other']"); this presumably relies on
# CountVectorizer's default token pattern to drop the brackets, quotes and
# commas again - confirm.
data4 = train_data__.astype(str).values.tolist()

# Fit the vocabulary and the IDF weights on the training lyrics
train_data1 = cvect.fit_transform(data4)
train_data = tfidf_transformer.fit_transform(train_data1)

In [11]:
# Same procedure for the test data
data5 = test_data__.astype(str).values.tolist()

# `transform` only (no fitting): the test lyrics are encoded with the
# vectorizer and TF-IDF transformer already fitted on the training data.
test_data1 = cvect.transform(data5)
test_data = tfidf_transformer.transform(test_data1)

In [12]:
# Same procedure for the lyrics held in `df`.
# NOTE(review): the original comment says "entire data set", but `df` is the
# training frame only - confirm whether the test lyrics were meant to be
# included here.
from sklearn.base import clone

train_data6 = df['lyr'].astype(str).values.tolist()

# Fit independent, unfitted copies instead of re-fitting the shared `cvect` /
# `tfidf_transformer`: those objects define the feature space of
# `train_data` / `test_data`, and re-fitting them on different input would
# silently change the vocabulary used by any later `transform` call.
data1_ = clone(cvect).fit_transform(train_data6)
data_ = clone(tfidf_transformer).fit_transform(data1_)

## Bernoulli NB - best model so far (test accuracy 0.42)

In [13]:
from sklearn.naive_bayes import BernoulliNB

# Bernoulli naive Bayes with smoothing parameter alpha=0.01, fitted on the
# TF-IDF training matrix.
# NOTE(review): BernoulliNB thresholds its inputs (default `binarize=0.0`),
# so presumably only term presence, not the TF-IDF weight, is used - confirm.
model = BernoulliNB(alpha=0.01)
model.fit(train_data, train_labels)

BernoulliNB(alpha=0.01)

In [14]:
# Predict genres for the test matrix and print per-class precision/recall/F1
prediction = model.predict(test_data)
print(classification_report(test_labels, prediction))

              precision    recall  f1-score   support

           0       0.62      0.62      0.62       810
           1       0.75      0.80      0.77       960
           2       0.51      0.49      0.50       810
           3       0.28      0.48      0.35       660
           4       0.28      0.23      0.25       660
           5       0.37      0.30      0.33      1110
           6       0.51      0.29      0.37       495
           7       0.34      0.36      0.35      1410
           8       0.24      0.25      0.24       510
           9       0.21      0.17      0.19       510

    accuracy                           0.42      7935
   macro avg       0.41      0.40      0.40      7935
weighted avg       0.43      0.42      0.42      7935



## SVM (test accuracy 0.41)

In [15]:
# Model fitting: support vector classifier with all-default hyperparameters.
# Rebinds `model`, replacing the classifier fitted in the previous section.
model = svm.SVC()
model.fit(train_data, train_labels)

SVC()

In [16]:
# Prediction on the test matrix with the SVC fitted above
prediction = model.predict(test_data)

In [17]:
# Per-class precision/recall/F1 of the current `prediction` against the test labels
print(classification_report(test_labels, prediction))

              precision    recall  f1-score   support

           0       0.65      0.61      0.63       810
           1       0.82      0.74      0.78       960
           2       0.47      0.43      0.45       810
           3       0.47      0.28      0.36       660
           4       0.34      0.05      0.09       660
           5       0.31      0.40      0.35      1110
           6       0.52      0.17      0.25       495
           7       0.28      0.66      0.39      1410
           8       0.52      0.09      0.16       510
           9       0.80      0.02      0.03       510

    accuracy                           0.41      7935
   macro avg       0.52      0.35      0.35      7935
weighted avg       0.49      0.41      0.39      7935



In [None]:
# Sweep the SVC regularisation parameter C and print the test accuracy for
# each value.
# - The loop variable is now actually passed to the classifier; before, every
#   iteration trained the same default SVC.
# - C must be strictly positive, so 0 was removed from the grid, and the
#   duplicated 1e-2 was replaced by 1e-1 to give an even log-spaced grid.
# NOTE(review): this selects C on the test split; a validation split or
# cross-validation on the training data would give an unbiased estimate.
for c in [1e-2, 1e-1, 1, 10, 100]:
    model = svm.SVC(C=c)
    model.fit(train_data, train_labels)
    prediction = model.predict(test_data)
    print(str(c) + ": " + str(accuracy_score(test_labels, prediction)))

## MLP (test accuracy 0.33)

In [18]:
from sklearn.neural_network import MLPClassifier

# Multi-layer perceptron: three hidden layers of 50 units, ReLU activations,
# Adam optimiser, capped at 55 iterations with a fixed random seed.
model = MLPClassifier(
    hidden_layer_sizes=(50, 50, 50),
    activation='relu',
    solver='adam',
    alpha=1e-5,
    learning_rate_init=0.001,
    max_iter=55,
    random_state=1,
    verbose=10,
)
model.fit(train_data, train_labels)

# Test-set accuracy; left as the last expression so the notebook displays it
prediction = model.predict(test_data)
accuracy_score(test_labels, prediction)  # 0.33

Iteration 1, loss = 2.15020852
Iteration 2, loss = 1.71504962
Iteration 3, loss = 1.42725599
Iteration 4, loss = 1.13232904
Iteration 5, loss = 0.85485726
Iteration 6, loss = 0.64139288
Iteration 7, loss = 0.48763714
Iteration 8, loss = 0.37540928
Iteration 9, loss = 0.29189771
Iteration 10, loss = 0.22329319
Iteration 11, loss = 0.17196054
Iteration 12, loss = 0.13415506
Iteration 13, loss = 0.10338984
Iteration 14, loss = 0.08023991
Iteration 15, loss = 0.06170056
Iteration 16, loss = 0.04770239
Iteration 17, loss = 0.03894888
Iteration 18, loss = 0.03190975
Iteration 19, loss = 0.02797006
Iteration 20, loss = 0.02325559
Iteration 21, loss = 0.02153180
Iteration 22, loss = 0.01863835
Iteration 23, loss = 0.01692210
Iteration 24, loss = 0.01698826
Iteration 25, loss = 0.01408219
Iteration 26, loss = 0.01423866
Iteration 27, loss = 0.01403663
Iteration 28, loss = 0.01278360
Iteration 29, loss = 0.01289570
Iteration 30, loss = 0.01000575
Iteration 31, loss = 0.01290595
Iteration 32, los


Stochastic Optimizer: Maximum iterations (55) reached and the optimization hasn't converged yet.



0.332703213610586

## KNN (test accuracy 0.28)

In [19]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours classifier with k=10, fitted on the TF-IDF training
# matrix. Rebinds `model`, replacing the classifier from the previous section.
model = KNeighborsClassifier(n_neighbors=10)
model.fit(train_data, train_labels)

KNeighborsClassifier(n_neighbors=10)

In [20]:
# Prediction on the test matrix with the KNN classifier fitted above
prediction = model.predict(test_data)

In [21]:
# Per-class precision/recall/F1 of the current `prediction` against the test labels
print(classification_report(test_labels, prediction))

              precision    recall  f1-score   support

           0       0.54      0.38      0.44       810
           1       0.46      0.71      0.56       960
           2       0.23      0.30      0.26       810
           3       0.21      0.19      0.20       660
           4       0.12      0.08      0.09       660
           5       0.19      0.33      0.24      1110
           6       0.29      0.10      0.15       495
           7       0.26      0.24      0.25      1410
           8       0.20      0.13      0.15       510
           9       0.20      0.05      0.07       510

    accuracy                           0.28      7935
   macro avg       0.27      0.25      0.24      7935
weighted avg       0.28      0.28      0.27      7935

