In [7]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import os
from pathlib import Path
from time import time

from DataPrep import preprocess, remove_rare_words

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC

In [8]:
def vectorize(X_train):
    '''Fit a CountVectorizer on the training documents.

    Returns the fitted vectorizer together with the document-term count
    matrix of X_train, converted to a dense array.
    '''
    bow = CountVectorizer()
    counts = bow.fit_transform(X_train)
    return bow, counts.toarray()

In [9]:
def save_output(lim, genres, parameter, amount, accuracy, f1score, additional: "list[tuple] | None" = None):
    '''Save the output of the tests to a csv file

    One call appends one row to ./output/<lim>/<parameter>/<genres joined by "_">.csv.
    Missing directories are created. The header row is only written when the
    file does not exist yet, so repeated calls accumulate rows in one file.

    Parameters:
        lim: used as the first directory level below ./output/
        genres: strings that are joined with "_" to form the file name
        parameter: name of the tested parameter; used as directory name and
            as the name of the first data column
        amount: value of the tested parameter for this row
        accuracy: value stored in the 'accuracy' column
        f1score: value stored in the 'f1score' column
        additional: optional (column name, value) pairs that are appended as
            extra columns (like feature space size)

    Returns:
        None
    '''
    # None instead of a mutable [] default, so no list object is shared between calls
    extras = [] if additional is None else additional

    # Create the location of the file
    location = f'./output/{lim}/{parameter}/'
    Path(location).mkdir(parents=True, exist_ok=True)
    file = location + f'{"_".join(genres)}.csv'

    # Prep csv format data: the fixed columns first, then the optional extras
    columns = [parameter, 'accuracy', 'f1score']
    row = [amount, accuracy, f1score]
    for param, amt in extras:
        columns.append(param)
        row.append(amt)

    # Append to csv. to_csv also writes the DataFrame index (0 for this
    # single-row frame) as the first, unnamed column.
    df = pd.DataFrame([row], columns=columns)
    header = not os.path.exists(file)
    df.to_csv(file, mode='a', header=header)

    print(f'Succesfully saved output to file {file}')

In [10]:
def run(documents, df, genres, parameter, amount, limit, additional=None):
    '''Run the test and save the output

    Splits the documents 90/10 into train and test data (fixed random_state),
    fits a model on the count matrix of the training part and saves accuracy
    and weighted F1 score of the test part with save_output.

    Parameters:
        documents: the documents to classify, in the form accepted by vectorize
        df: DataFrame whose 'tag' column holds the label of each document
        genres: passed on to save_output (file name)
        parameter: name of the tested parameter. When it is 'model', `amount`
            is the estimator to test; otherwise MultinomialNB is used
        amount: value of the tested parameter. For parameter == 'model' either
            an estimator class or an estimator instance
        limit: passed on to save_output (directory name)
        additional: optional (column name, value) pairs saved as extra columns
    '''
    # Always hand a real list to save_output; None replaces the mutable [] default
    extras = list(additional) if additional else []

    # Prepare data
    X_train, X_test, y_train, y_test = train_test_split(documents, df['tag'], test_size=0.1, random_state = 42)
    vectorizer, dt_matrix = vectorize(X_train)

    # Fit model
    if parameter != 'model':
        model = MultinomialNB()
    else:
        # Accept a class as well as an instance: a class has to be instantiated
        # before fit can be called, and an instance has no __name__ attribute,
        # so the name saved to the csv is taken from the type.
        model = amount() if isinstance(amount, type) else amount
        amount = type(model).__name__
    model.fit(dt_matrix, y_train)

    # Test
    dt_matrix_test = vectorizer.transform(X_test)
    y_pred = model.predict(dt_matrix_test)

    # Calculate and save results
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='weighted')
    save_output(limit, genres, parameter, amount, accuracy, f1, additional=extras)

# TESTS

In [None]:
# Model test: Logistic Regression and Multinomial Naive Bayes on the same split

genres = ['rap', 'rock', 'rb', 'country']
limit = 100

# Prepare data
df = pd.read_csv(f'data/song_lyrics_reduced_{"_".join(genres)}_{limit}.csv')
documents = preprocess(df['lyrics'], debug=False)
X_train, X_test, y_train, y_test = train_test_split(documents, df['tag'], test_size=0.1, random_state = 42)

# The count matrices are kept sparse: both estimators accept sparse input,
# so a dense copy (.toarray()) only costs memory and fitting time.
vectorizer = CountVectorizer(analyzer='word')  # ngram_range=(n, n)
dt_matrix = vectorizer.fit_transform(X_train)
# The test matrix is the same for every model, so it is computed once
n_dt_matrix_test = vectorizer.transform(X_test)

# (name saved to the csv, estimator) pairs, evaluated in this order
models = [
    ('Logistic Regression', LogisticRegression(random_state = 42, max_iter=250)),
    ('MultinomialNB', MultinomialNB()),
]

for model_name, model in models:
    # Fit model; only the fit itself is timed
    start = time()
    model.fit(dt_matrix, y_train)
    end = time() - start  # elapsed fit time in seconds

    # Test
    y_pred = model.predict(n_dt_matrix_test)

    # Calculate and save results
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='macro')
    save_output(limit, genres, 'model', model_name, accuracy, f1, additional=[('time', end)])

KeyboardInterrupt: 

In [None]:
# Bert test
# NOTE: the DistilBERT / tensorflow imports below are not used anywhere in this
# cell yet; the code only fits Logistic Regression on the limit = 1000 data file.
from transformers import DistilBertTokenizer
from transformers import TFDistilBertForSequenceClassification
import tensorflow as tf
import pandas as pd

# Logistic Regression test

genres = ['rap', 'rock', 'rb', 'country']
limit = 1000

# Prepare data
df = pd.read_csv(f'data/song_lyrics_reduced_{"_".join(genres)}_{limit}.csv')
documents = preprocess(df['lyrics'], debug=False)
X_train, X_test, y_train, y_test = train_test_split(documents, df['tag'], test_size=0.1, random_state = 42)

# The count matrix is kept sparse: LogisticRegression accepts sparse input,
# so a dense copy (.toarray()) only costs memory and fitting time.
vectorizer = CountVectorizer(analyzer='word')  # ngram_range=(n, n)
dt_matrix = vectorizer.fit_transform(X_train)

# Fit model; only the fit itself is timed
start = time()
model = LogisticRegression(random_state = 42, max_iter=250)
model.fit(dt_matrix, y_train)
end = time() - start  # elapsed fit time in seconds

# Test
n_dt_matrix_test = vectorizer.transform(X_test)
y_pred = model.predict(n_dt_matrix_test)

# Calculate and save results
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='macro')
save_output(limit, genres, 'model', 'Logistic Regression', accuracy, f1, additional=[('time', end)])

In [None]:
# Test effect of removing rare words

genres = ['rap', 'rock', 'pop']
# genres = ['rap', 'rock', 'pop', 'rb', 'country']
limit = 100
df = pd.read_csv(f'data/song_lyrics_reduced_{"_".join(genres)}_{limit}.csv')

old, total, documents = preprocess(df['lyrics'], debug=True, return_count=True)
# remove_rare_words is fed token lists, so split the preprocessed strings
documents = [document.split() for document in documents]

for cutoff in range(1):
    # Use the cutoff under test as the rare-word threshold. `limit` only selects
    # the data file / output directory; passing it here removed words based on
    # 100 while the row was saved under `cutoff`.
    # NOTE(review): with return_count=True preprocess returns a 3-tuple; confirm
    # that remove_rare_words still returns only the documents here.
    documents = remove_rare_words(documents, limit=cutoff, debug=False, return_count=True)
    joined_docs = [' '.join(document) for document in documents]
    # NOTE(review): `total` comes from preprocess and is not recomputed per
    # cutoff, so the saved 'features' value is the same for every row - confirm.
    run(joined_docs, df, genres, 'cutoff', cutoff, limit, additional=[('features', total)])

Preprocessing data
Removing notes in [brackets]
Removing punctuation
Normalizing
Tokenizing
Removing stopwords
Stemming
Removing rare words
Total words: 38255
Unique words: 7782
Most frequent:
's: 277; n't: 247; like: 213; 'm: 209; got: 197; know: 185; get: 183; ': 173; go: 159; 'll: 152
Least frequent:
bulli: 1 - disabl: 1 - orient: 1 - h-i-m: 1 - cholo: 1 - chola: 1 - lipstick: 1 - vole: 1 - prudenc: 1 - fe: 1

Removed all words occuring 0 or less times
Reduced vocab from 7782 to 7782 words
Finished data preparation!
Succesfully saved output to file ./output/100/cutoff/rap_rock_pop.csv


In [None]:
# N-gram tests: MultinomialNB on word n-grams of length n = 1..4

genres = ['rap', 'rock', 'pop']
# genres = ['rap', 'rock', 'pop', 'rb', 'country']
limit = 100

# Prepare data
df = pd.read_csv(f'data/song_lyrics_reduced_{"_".join(genres)}_{limit}.csv')
documents = preprocess(df['lyrics'], debug=False)
X_train, X_test, y_train, y_test = train_test_split(documents, df['tag'], test_size=0.1, random_state = 42)

for n in range(1, 5):
    # N_grams: ngram_range=(n, n) keeps only n-grams of exactly length n.
    # The count matrix is kept sparse: MultinomialNB accepts sparse input,
    # so a dense copy (.toarray()) only costs memory and fitting time.
    n_vectorizer = CountVectorizer(analyzer='word', ngram_range=(n, n))
    n_dt_matrix = n_vectorizer.fit_transform(X_train)

    # Fit model; only the fit itself is timed
    start = time()
    model = MultinomialNB()
    model.fit(n_dt_matrix, y_train)
    end = time() - start  # elapsed fit time in seconds

    # Test
    n_dt_matrix_test = n_vectorizer.transform(X_test)
    y_pred = model.predict(n_dt_matrix_test)

    # Calculate and save results
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='weighted')
    save_output(limit, genres, 'n-gram', n, accuracy, f1, additional=[('time', end)])

Succesfully saved output to file ./output/100/n-gram/rap_rock_pop.csv
Succesfully saved output to file ./output/100/n-gram/rap_rock_pop.csv
Succesfully saved output to file ./output/100/n-gram/rap_rock_pop.csv
Succesfully saved output to file ./output/100/n-gram/rap_rock_pop.csv
