In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import os
from pathlib import Path
from time import time

from DataPrep import preprocess

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, ConfusionMatrixDisplay

from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Hein\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Hein\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
def vectorize(X_train):
    '''Fit a bag-of-words vocabulary on the training documents.

    Returns a tuple ``(vectorizer, dt_matrix)``: the fitted ``CountVectorizer``
    (needed later to transform unseen documents with the same vocabulary) and
    the document-term count matrix of ``X_train`` converted to a dense array.
    '''
    bag_of_words = CountVectorizer()
    sparse_counts = bag_of_words.fit_transform(X_train)
    # Densify so callers receive a plain array rather than a sparse matrix
    return bag_of_words, sparse_counts.toarray()

In [3]:
def save_output(lim, genres, parameter, amount, accuracy, f1score, additional: "list[tuple] | None" = None):
    '''Append the result of one test run to a csv file.

    The file is ``./output/<lim>/<parameter>/<genres joined by "_">.csv``.
    Missing directories are created, and the header row is only written when
    the file does not exist yet, so repeated calls build up one table per
    experiment.

    Parameters
    ----------
    lim : first directory level below ``./output`` (formatted into the path)
    genres : iterable of str, joined with ``_`` to form the file name
    parameter : str, name of the varied parameter; used both as directory
        name and as the name of the first csv column
    amount : value of ``parameter`` for this run
    accuracy, f1score : scores stored in the ``accuracy`` / ``f1score`` columns
    additional : optional list of ``(column_name, value)`` pairs that are
        appended as extra columns (e.g. feature space size). ``None`` (the
        default) or an empty list adds nothing.

    Returns
    -------
    None
    '''
    # A None default instead of a shared mutable list; also accept explicit None
    extras = [] if additional is None else list(additional)

    # Create the location of the file
    location = f'./output/{lim}/{parameter}/'
    Path(location).mkdir(parents=True, exist_ok=True)
    file = location + f'{"_".join(genres)}.csv'

    # Prep csv format data: one row, fixed columns first, optional ones after
    columns = [parameter, 'accuracy', 'f1score']
    row = [amount, accuracy, f1score]
    for param, amt in extras:
        columns.append(param)
        row.append(amt)

    # Write to csv. The header is decided before writing so that only the call
    # that creates the file emits it. The DataFrame index is written as an
    # unnamed leading column; this is kept so that new rows stay aligned with
    # files produced by earlier runs.
    df = pd.DataFrame([row], columns=columns)
    header = not os.path.exists(file)
    df.to_csv(file, mode='a', header=header)

    print(f'Succesfully saved output to file {file}')

In [4]:
def run(documents, df, genres, parameter, amount, additional=None, limit=None):
    '''Run one test and save its scores.

    The documents are split 90/10 into train and test (fixed random state),
    vectorized with ``vectorize``, used to fit a classifier, and the accuracy
    and weighted F1 on the test part are appended to the results csv through
    ``save_output``.

    Parameters
    ----------
    documents : preprocessed documents, aligned with the rows of ``df``
    df : DataFrame whose ``'tag'`` column holds the class labels
    genres : list of str, forwarded to ``save_output`` for the file name
    parameter : str, name of the parameter under test. When it equals
        ``'model'``, ``amount`` is the classifier to evaluate; otherwise a
        ``MultinomialNB`` is used.
    amount : value of ``parameter`` for this run. For ``parameter == 'model'``
        either an estimator class or an estimator instance; its class name is
        what gets written to the csv.
    additional : optional list of ``(column_name, value)`` pairs forwarded to
        ``save_output``.
    limit : first directory level of the output path. When omitted, the
        notebook-level variable ``limit`` is used, which is what existing
        cells rely on.

    Raises
    ------
    NameError
        If ``limit`` is not passed and no notebook-level ``limit`` exists.
    '''
    # Resolve the output key up front so a missing value fails before training
    if limit is None:
        try:
            limit = globals()['limit']
        except KeyError:
            raise NameError("name 'limit' is not defined; pass limit=... to run()") from None
    # Always hand save_output a real list, never a shared default or None
    extras = list(additional) if additional else []

    # Prepare data
    X_train, X_test, y_train, y_test = train_test_split(documents, df['tag'], test_size=0.1, random_state=42)
    vectorizer, dt_matrix = vectorize(X_train)

    # Fit model
    if parameter != 'model':
        model = MultinomialNB()
    else:
        # Accept both an estimator class and a ready-made instance: fit() must
        # be called on an instance, and only classes carry __name__.
        model = amount() if isinstance(amount, type) else amount
        amount = type(model).__name__
    model.fit(dt_matrix, y_train)

    # Test, using the vocabulary learned on the training documents
    dt_matrix_test = vectorizer.transform(X_test)
    y_pred = model.predict(dt_matrix_test)

    # Calculate and save results
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='weighted')
    save_output(limit, genres, parameter, amount, accuracy, f1, additional=extras)

In [5]:
# NOTE(review): this whole cell is commented out and executes nothing. It holds
# an earlier setup for rap/rock/pop at limit 1000: a stem-vs-lemmatize
# comparison (double-commented) and a rare-word cutoff sweep over 1..49.
# Kept for reference only; delete it or re-enable it deliberately.
# genres = ['rap', 'rock', 'pop']
# limit = 1000
# df = pd.read_csv(f'data/song_lyrics_reduced_{"_".join(genres)}_{limit}.csv')

# # # Test stemming vs lemmatization
# # for method in ['stem', 'lemmatize']:
# #     documents = preprocess(df['lyrics'], stem_words=method=='stem', debug=False)
# #     run(documents, df, genres, 'stem', method)

# # Test effect of removing rare words
# for cutoff in range(1, 50):
#     old, total, documents = preprocess(df['lyrics'], limit=cutoff, debug=False, return_count=True)
#     run(documents, df, genres, 'cutoff', cutoff, additional=[('features', total)])

In [6]:
# Dataset selection. `limit` is part of the csv file name and is also read as
# a notebook global by run() to pick the output directory.
genres = ['rap', 'rock', 'pop']
limit = 1000
df = pd.read_csv(f'data/song_lyrics_reduced_{"_".join(genres)}_{limit}.csv')

# Sweep a window of width `gap` over the cutoff values 0..49: each run passes
# limit=cutoff+gap and other_limit=cutoff to preprocess.
# NOTE(review): presumably this removes a band of word frequencies rather than
# only the rare words -- confirm against DataPrep.preprocess.
gap = 20
for cutoff in range(0, 50):
    old, total, documents = preprocess(
        df['lyrics'],
        limit=cutoff + gap,
        other_limit=cutoff,
        debug=False,
        return_count=True,
    )
    # `total` (returned by preprocess) is stored in the extra 'features' column
    extra_columns = [('features', total)]
    run(documents, df, genres, 'cutoff-mid', cutoff, additional=extra_columns)

Succesfully saved output to file ./output/1000/cutoff-mid/rap_rock_pop.csv
Succesfully saved output to file ./output/1000/cutoff-mid/rap_rock_pop.csv
Succesfully saved output to file ./output/1000/cutoff-mid/rap_rock_pop.csv
Succesfully saved output to file ./output/1000/cutoff-mid/rap_rock_pop.csv
Succesfully saved output to file ./output/1000/cutoff-mid/rap_rock_pop.csv
Succesfully saved output to file ./output/1000/cutoff-mid/rap_rock_pop.csv
Succesfully saved output to file ./output/1000/cutoff-mid/rap_rock_pop.csv
Succesfully saved output to file ./output/1000/cutoff-mid/rap_rock_pop.csv
Succesfully saved output to file ./output/1000/cutoff-mid/rap_rock_pop.csv
Succesfully saved output to file ./output/1000/cutoff-mid/rap_rock_pop.csv
Succesfully saved output to file ./output/1000/cutoff-mid/rap_rock_pop.csv
Succesfully saved output to file ./output/1000/cutoff-mid/rap_rock_pop.csv
Succesfully saved output to file ./output/1000/cutoff-mid/rap_rock_pop.csv
Succesfully saved output 

In [None]:
# Dataset selection. `limit` is part of the csv file name and is also read as
# a notebook global by run() to pick the output directory.
genres = ['rap', 'rock', 'pop', 'rb', 'country']
limit = 100
df = pd.read_csv(f'data/song_lyrics_reduced_{"_".join(genres)}_{limit}.csv')

# Experiment 1: stemming vs lemmatization.
# stem_words is True for 'stem' and False for 'lemmatize'.
for method in ('stem', 'lemmatize'):
    use_stemming = (method == 'stem')
    documents = preprocess(df['lyrics'], stem_words=use_stemming, debug=False)
    run(documents, df, genres, 'stem', method)

# Experiment 2: effect of removing rare words, for cutoffs 21..49.
# NOTE(review): the sweep starts at 21, not 1 -- looks like a resumed run; confirm.
for cutoff in range(21, 50):
    old, total, documents = preprocess(
        df['lyrics'],
        limit=cutoff,
        debug=False,
        return_count=True,
    )
    # `total` (returned by preprocess) is stored in the extra 'features' column
    extra_columns = [('features', total)]
    run(documents, df, genres, 'cutoff', cutoff, additional=extra_columns)

Succesfully saved output to file ./output/1000/stem/rap_rock_pop_rb_country.csv
Succesfully saved output to file ./output/1000/stem/rap_rock_pop_rb_country.csv
Succesfully saved output to file ./output/1000/cutoff/rap_rock_pop_rb_country.csv
Succesfully saved output to file ./output/1000/cutoff/rap_rock_pop_rb_country.csv
Succesfully saved output to file ./output/1000/cutoff/rap_rock_pop_rb_country.csv
Succesfully saved output to file ./output/1000/cutoff/rap_rock_pop_rb_country.csv
Succesfully saved output to file ./output/1000/cutoff/rap_rock_pop_rb_country.csv
Succesfully saved output to file ./output/1000/cutoff/rap_rock_pop_rb_country.csv
Succesfully saved output to file ./output/1000/cutoff/rap_rock_pop_rb_country.csv
Succesfully saved output to file ./output/1000/cutoff/rap_rock_pop_rb_country.csv
Succesfully saved output to file ./output/1000/cutoff/rap_rock_pop_rb_country.csv
Succesfully saved output to file ./output/1000/cutoff/rap_rock_pop_rb_country.csv


KeyboardInterrupt: 

In [None]:
# Baseline: time a MultinomialNB fit on bag-of-words counts (three genres)
genres = ['rap', 'rock', 'pop']
limit = 100

# Load, preprocess and hold out 10% of the songs for testing
df = pd.read_csv(f'data/song_lyrics_reduced_{"_".join(genres)}_{limit}.csv')
documents = preprocess(df['lyrics'], debug=False)
X_train, X_test, y_train, y_test = train_test_split(
    documents, df['tag'], test_size=0.1, random_state=42
)
vectorizer, dt_matrix = vectorize(X_train)

# Fit model; the timed span covers construction and fit only.
# Note that `end` holds the elapsed seconds, not a timestamp.
start = time()
model = MultinomialNB()
model.fit(dt_matrix, y_train)
end = time() - start

# Test on the held-out documents, encoded with the training vocabulary
y_pred = model.predict(vectorizer.transform(X_test))

# Calculate and save results; fit time goes into the extra 'time' column
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')
save_output(
    limit,
    genres,
    'model',
    'MultinomialNB',
    accuracy,
    f1,
    additional=[('time', end)],
)

Succesfully saved output to file ./output/1000/model/rap_rock_pop.csv
Succesfully saved output to file ./output/1000/model/rap_rock_pop_rb_country.csv


In [None]:
# Baseline: time a MultinomialNB fit on bag-of-words counts (five genres)
genres = ['rap', 'rock', 'pop', 'rb', 'country']
limit = 100

# Load, preprocess and hold out 10% of the songs for testing
df = pd.read_csv(f'data/song_lyrics_reduced_{"_".join(genres)}_{limit}.csv')
documents = preprocess(df['lyrics'], debug=False)
X_train, X_test, y_train, y_test = train_test_split(
    documents, df['tag'], test_size=0.1, random_state=42
)
vectorizer, dt_matrix = vectorize(X_train)

# Fit model; the timed span covers construction and fit only.
# Note that `end` holds the elapsed seconds, not a timestamp.
start = time()
model = MultinomialNB()
model.fit(dt_matrix, y_train)
end = time() - start

# Test on the held-out documents, encoded with the training vocabulary
y_pred = model.predict(vectorizer.transform(X_test))

# Calculate and save results; fit time goes into the extra 'time' column
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')
save_output(
    limit,
    genres,
    'model',
    'MultinomialNB',
    accuracy,
    f1,
    additional=[('time', end)],
)


In [None]:
# Linear SVM on TF-IDF features (three genres), timed like the NB baseline
genres = ['rap', 'rock', 'pop']
limit = 100

df = pd.read_csv(f'data/song_lyrics_reduced_{"_".join(genres)}_{limit}.csv')
# Prepare data
documents = preprocess(df['lyrics'], debug=False)
X_train, X_test, y_train, y_test = train_test_split(documents, df['tag'], test_size=0.1, random_state = 42)

# Fit both the label encoder and the TF-IDF vocabulary on the training part
# only, then merely *transform* the test part. Refitting the encoder on the
# test labels would assign different integers whenever a class is missing
# from the test split, silently corrupting the scores.
tfidf_vectorizer = TfidfVectorizer()
label_encoder = LabelEncoder()
y_train_enc = label_encoder.fit_transform(y_train)
y_test_enc = label_encoder.transform(y_test)
# Separate names keep the raw split intact instead of overwriting it
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Fit model; the timed span covers construction and fit only.
# `degree` and `gamma` have no effect with kernel='linear'; they are kept so
# the recorded configuration matches earlier runs.
start = time()
model = SVC(C=1000000.0, kernel='linear', degree=3, gamma='auto')
model.fit(X_train_tfidf, y_train_enc)
end = time() - start  # elapsed seconds, not a timestamp

# Test
y_pred = model.predict(X_test_tfidf)

# Calculate and save results
accuracy = accuracy_score(y_test_enc, y_pred)
f1 = f1_score(y_test_enc, y_pred, average='weighted')
save_output(limit, genres, 'model', 'SVM', accuracy, f1, additional=[('time', end)])

Succesfully saved output to file ./output/1000/model/rap_rock_pop.csv
Succesfully saved output to file ./output/1000/model/rap_rock_pop_rb_country.csv


In [None]:
# Linear SVM on TF-IDF features (five genres), timed like the NB baseline
genres = ['rap', 'rock', 'pop', 'rb', 'country']
limit = 100

df = pd.read_csv(f'data/song_lyrics_reduced_{"_".join(genres)}_{limit}.csv')
# Prepare data
documents = preprocess(df['lyrics'], debug=False)
X_train, X_test, y_train, y_test = train_test_split(documents, df['tag'], test_size=0.1, random_state = 42)

# Fit both the label encoder and the TF-IDF vocabulary on the training part
# only, then merely *transform* the test part. Refitting the encoder on the
# test labels would assign different integers whenever a class is missing
# from the test split, silently corrupting the scores.
tfidf_vectorizer = TfidfVectorizer()
label_encoder = LabelEncoder()
y_train_enc = label_encoder.fit_transform(y_train)
y_test_enc = label_encoder.transform(y_test)
# Separate names keep the raw split intact instead of overwriting it
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Fit model; the timed span covers construction and fit only.
# `degree` and `gamma` have no effect with kernel='linear'; they are kept so
# the recorded configuration matches earlier runs.
start = time()
model = SVC(C=1000000.0, kernel='linear', degree=3, gamma='auto')
model.fit(X_train_tfidf, y_train_enc)
end = time() - start  # elapsed seconds, not a timestamp

# Test
y_pred = model.predict(X_test_tfidf)

# Calculate and save results
accuracy = accuracy_score(y_test_enc, y_pred)
f1 = f1_score(y_test_enc, y_pred, average='weighted')
save_output(limit, genres, 'model', 'SVM', accuracy, f1, additional=[('time', end)])

In [None]:
# N-gram tests: compare pure n-gram features (n = 1..4) with MultinomialNB
genres = ['rap', 'rock', 'pop']
limit = 100

# Prepare data once; every n-gram size shares the same preprocessing and split
df = pd.read_csv(f'data/song_lyrics_reduced_{"_".join(genres)}_{limit}.csv')
documents = preprocess(df['lyrics'], limit=25)
X_train, X_test, y_train, y_test = train_test_split(
    documents, df['tag'], test_size=0.1, random_state=42
)

for n in range(1, 5):
    # N_grams: ngram_range=(n, n) keeps only n-grams of exactly length n
    n_vectorizer = CountVectorizer(analyzer='word', ngram_range=(n, n))
    n_counts = n_vectorizer.fit_transform(X_train)
    n_dt_matrix = n_counts.toarray()

    # Fit model; the timed span covers construction and fit only.
    # `end` holds the elapsed seconds, not a timestamp.
    start = time()
    model = MultinomialNB()
    model.fit(n_dt_matrix, y_train)
    end = time() - start

    # Test with the n-gram vocabulary learned on the training documents
    y_pred = model.predict(n_vectorizer.transform(X_test))

    # Calculate and save results; fit time goes into the extra 'time' column
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='weighted')
    save_output(
        limit,
        genres,
        'n-gram-reduced-more',
        n,
        accuracy,
        f1,
        additional=[('time', end)],
    )

Preprocessing data
Removing notes in [brackets]
Removing punctuation
Normalizing
Tokenizing
Removing stopwords
Stemming
Removing rare words
Total words: 383130
Unique words: 22033
Most frequent:
's: 2610; n't: 2431; 'm: 2089; like: 2059; know: 1997; got: 1894; get: 1810; go: 1618; ': 1571; see: 1531
Least frequent:
heav'n: 1 - vauntingli: 1 - hirel: 1 - freemen: 1 - essex: 1 - zed: 1 - mortifi: 1 - villan: 1 - elaps: 1 - scroog: 1

Removed all words occuring 25 or less times
Reduced vocab from 22033 to 2223 words
Finished data preparation!
Succesfully saved output to file ./output/1000/n-gram-reduced-more/rap_rock_pop.csv
Succesfully saved output to file ./output/1000/n-gram-reduced-more/rap_rock_pop.csv
Succesfully saved output to file ./output/1000/n-gram-reduced-more/rap_rock_pop.csv
Succesfully saved output to file ./output/1000/n-gram-reduced-more/rap_rock_pop.csv
