# Imports

In [None]:
import pandas as pd
import os
from pathlib import Path

from DataPrep import preprocess, remove_rare_words

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Hein\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Hein\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# Support Functions

In [None]:
def vectorize(X_train, n_gram=(1,1)):
    vectorizer = CountVectorizer(ngram_range=n_gram)
    dt_matrix = vectorizer.fit_transform(X_train).toarray()
    return vectorizer, dt_matrix

In [3]:
def save_output(lim, genres, parameter, amount, accuracy, f1score, additional: list[tuple] = []):
    '''Save the output of the tests to a csv file'''
    # Create the location of the file
    location = f'./output/{lim}/{parameter}/'
    Path(location).mkdir(parents=True, exist_ok=True)
    file = location + f'{"_".join(genres)}.csv'

    # Prep csv format data
    columns = [parameter, 'accuracy', 'f1score']
    data = [[amount, accuracy, f1score]]
    # Add optional additional parameters (like feature space size)
    for param, amt in additional:
        columns.append(param)
        data[0].append(amt)

    # Write to csv
    df = pd.DataFrame(data, columns=columns)
    header = not os.path.exists(file)
    df.to_csv(file, mode='a', header=header)

    print(f'Succesfully saved output to file {file}')
    return

In [4]:
def run(documents, df, genres, parameter, amount, limit, additional=[]):
    '''Run the test and save the output'''
    # Prepare data
    X_train, X_test, y_train, y_test = train_test_split(documents, df['tag'], test_size=0.1, random_state = 42)
    vectorizer, dt_matrix = vectorize(X_train)

    # Fit model
    if parameter != 'model':
        model = MultinomialNB()
    else:
        model = amount
        amount = model.__name__
    model.fit(dt_matrix, y_train)

    # Test
    dt_matrix_test = vectorizer.transform(X_test)
    y_pred = model.predict(dt_matrix_test)

    # Calculate and save results
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='weighted')
    save_output(limit, genres, parameter, amount, accuracy, f1, additional=additional)

# Tests

### Preprocessing

In [5]:
# Regular run
genres = ['rap', 'rock', 'rb', 'country']
limit = 1000

df = pd.read_csv(f'data/song_lyrics_reduced_{"_".join(genres)}_{limit}.csv')

documents = preprocess(df['lyrics'], debug=True)
run(documents, df, genres, 'Normal', 'Normal', limit)

Preprocessing data
Removing notes in [brackets]
Removing punctuation
Normalizing


KeyboardInterrupt: 

In [None]:
# 100 vs 1000 vs 10000
genres = ['rap', 'rock', 'rb', 'country']
limits = [100, 1000, 10000]

for limit in limits:
    df = pd.read_csv(f'data/song_lyrics_reduced_{"_".join(genres)}_{limit}.csv')

    documents = preprocess(df['lyrics'], debug=True)
    run(documents, df, genres, 'Limit', limit, limit)

Preprocessing data
Removing notes in [brackets]
Removing punctuation
Normalizing
Assigned 400 RID values (['PRIMARY', 'SECONDARY', 'EMOTIONS'])
Tokenizing
Removing stopwords
Stemming
Finished data preparation!
Succesfully saved output to file ./output/100/Limit/rap_rock_rb_country.csv
Preprocessing data
Removing notes in [brackets]
Removing punctuation
Normalizing
Assigned 4000 RID values (['PRIMARY', 'SECONDARY', 'EMOTIONS'])
Tokenizing
Removing stopwords
Stemming
Finished data preparation!
Succesfully saved output to file ./output/1000/Limit/rap_rock_rb_country.csv
Preprocessing data
Removing notes in [brackets]
Removing punctuation
Normalizing
Assigned 40000 RID values (['PRIMARY', 'None', 'SECONDARY', 'EMOTIONS'])
Tokenizing
Removing stopwords
Stemming
Finished data preparation!
Succesfully saved output to file ./output/10000/Limit/rap_rock_rb_country.csv


In [None]:
# Cutoff rate
genres = ['rap', 'rock', 'rb', 'country']
limit = 1000

df = pd.read_csv(f'data/song_lyrics_reduced_{"_".join(genres)}_{limit}.csv')
old, new, documents = preprocess(df['lyrics'], limit=0, debug=True, return_count=True)
joined_docs = [' '.join(document) for document in documents]
run(joined_docs, df, genres, 'Cutoff', 0, limit, additional=[('features', new)])

for cutoff in range(1, 50):
    old, new, documents = remove_rare_words(documents, limit=cutoff, debug=True, return_count=True)
    joined_docs = [' '.join(document) for document in documents]
    run(joined_docs, df, genres, 'Cutoff', cutoff, limit, additional=[('features', new)])

Preprocessing data
Removing notes in [brackets]
Removing punctuation
Normalizing
Tokenizing
Removing stopwords
Stemming
Removing rare words
Total words: 482455
Unique words: 27579
Most frequent:
's: 3468; n't: 3309; 'm: 2776; like: 2750; know: 2675; got: 2552; get: 2447; ': 2201; go: 2165; see: 2020
Least frequent:
marina: 1 - impend: 1 - tear-stopp: 1 - 80-proof: 1 - ten-thousand: 1 - mother-in-law: 1 - honky-tonkin: 1 - clackity-clack: 1 - one-night-stand: 1 - luckiest: 1

Removed all words occuring 0 or less times
Reduced vocab from 27579 to 27579 words
Finished data preparation!
Succesfully saved output to file ./output/1000/Cutoff/rap_rock_rb_country.csv
Total words: 482455
Unique words: 27579
Most frequent:
's: 3468; n't: 3309; 'm: 2776; like: 2750; know: 2675; got: 2552; get: 2447; ': 2201; go: 2165; see: 2020
Least frequent:
marina: 1 - impend: 1 - tear-stopp: 1 - 80-proof: 1 - ten-thousand: 1 - mother-in-law: 1 - honky-tonkin: 1 - clackity-clack: 1 - one-night-stand: 1 - lucki

In [None]:
# Lemmatize vs. Stem

df = pd.read_csv(f'data/song_lyrics_reduced_{"_".join(genres)}_{limit}.csv')

for stem in [True, False]:
    documents = preprocess(df['lyrics'], debug=True, stem_words=stem)
    run(documents, df, genres, 'Stem', stem, limit)

Preprocessing data
Removing notes in [brackets]
Removing punctuation
Normalizing
Tokenizing
Removing stopwords
Stemming
Finished data preparation!
Succesfully saved output to file ./output/1000/Stem/rap_rock_rb_country.csv
Preprocessing data
Removing notes in [brackets]
Removing punctuation
Normalizing
Tokenizing
Removing stopwords
Lemmatizing
Finished data preparation!
Succesfully saved output to file ./output/1000/Stem/rap_rock_rb_country.csv


In [12]:
# Updated settings
genres = ['rap', 'rock', 'rb', 'country']
limit = 1000

cutoff = 10
stem = True

df = pd.read_csv(f'data/song_lyrics_reduced_{"_".join(genres)}_{limit}.csv')
documents = preprocess(df['lyrics'], debug=True, limit=cutoff, stem_words=stem)

Preprocessing data
Removing notes in [brackets]
Removing punctuation
Normalizing
Tokenizing
Removing stopwords
Stemming
Removing rare words
Total words: 482455
Unique words: 27579
Most frequent:
's: 3468; n't: 3309; 'm: 2776; like: 2750; know: 2675; got: 2552; get: 2447; ': 2201; go: 2165; see: 2020
Least frequent:
marina: 1 - impend: 1 - tear-stopp: 1 - 80-proof: 1 - ten-thousand: 1 - mother-in-law: 1 - honky-tonkin: 1 - clackity-clack: 1 - one-night-stand: 1 - luckiest: 1

Removed all words occuring 10 or less times
Reduced vocab from 27579 to 4411 words
Finished data preparation!


In [14]:
# MultinomialNB vs Logistic Regression
models = [MultinomialNB(), LogisticRegression(random_state=42, max_iter=250)]
names = ['MultinomialNB', 'LogisticRegression']

X_train, X_test, y_train, y_test = train_test_split(documents, df['tag'], test_size=0.1, random_state = 42)
vectorizer, dt_matrix = vectorize(X_train)

for i in range(len(models)):
    model = models[i]
    name = names[i]
    model.fit(dt_matrix, y_train)

    dt_matrix_test = vectorizer.transform(X_test)
    y_pred = model.predict(dt_matrix_test)

    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='weighted')
    save_output(limit, genres, 'Model', name, accuracy, f1)

Succesfully saved output to file ./output/1000/Model/rap_rock_rb_country.csv
Succesfully saved output to file ./output/1000/Model/rap_rock_rb_country.csv


### Other Tests

In [None]:
# RID
df = pd.read_csv(f'data/song_lyrics_reduced_{"_".join(genres)}_{limit}.csv')
for cutoff in [0, 10, 20, 30, 40, 50]:
    for rid in range(20, 21):
        documents = preprocess(df['lyrics'], debug=True, limit=cutoff, stem_words=stem, use_rid=rid)
        run(documents, df, genres, 'RID', rid, limit)

Preprocessing data
Removing notes in [brackets]
Removing punctuation
Normalizing
Assigned 4000 RID values (['PRIMARY', 'SECONDARY', 'EMOTIONS'])
Tokenizing
Removing stopwords
Stemming
Finished data preparation!
Succesfully saved output to file ./output/1000/RID/rap_rock_rb_country.csv
Preprocessing data
Removing notes in [brackets]
Removing punctuation
Normalizing
Assigned 4000 RID values (['PRIMARY', 'SECONDARY', 'EMOTIONS'])
Tokenizing
Removing stopwords
Stemming
Removing rare words
Total words: 482455
Unique words: 27579
Most frequent:
's: 3468; n't: 3309; 'm: 2776; like: 2750; know: 2675; got: 2552; get: 2447; ': 2201; go: 2165; see: 2020
Least frequent:
marina: 1 - impend: 1 - tear-stopp: 1 - 80-proof: 1 - ten-thousand: 1 - mother-in-law: 1 - honky-tonkin: 1 - clackity-clack: 1 - one-night-stand: 1 - luckiest: 1

Removed all words occuring 10 or less times
Reduced vocab from 27579 to 4411 words
Finished data preparation!
Succesfully saved output to file ./output/1000/RID/rap_rock_

In [5]:
# Length
genres = ['rap', 'rock', 'rb', 'country']
limit = 1000
cutoff = 10
stem = True
rid=20
df = pd.read_csv(f'data/song_lyrics_reduced_{"_".join(genres)}_{limit}.csv')

In [6]:
for quants in range(2, 21):
    documents = preprocess(df['lyrics'], debug=True, limit=cutoff, stem_words=stem, use_rid=rid, line_quants=quants, use_length=rid)
    run(documents, df, genres, 'Line counts', quants, limit)

Preprocessing data
Removing notes in [brackets]
Counting Lines
Found quantiles: [ 64. 389.] for [0.5, 1.0] (range 4 - 389)
Removing punctuation
Normalizing
Assigned 4000 RID values (['PRIMARY', 'EMOTIONS', 'SECONDARY'])
Tokenizing
Removing stopwords
Stemming
Removing rare words
Total words: 482455
Unique words: 27579
Most frequent:
's: 3468; n't: 3309; 'm: 2776; like: 2750; know: 2675; got: 2552; get: 2447; ': 2201; go: 2165; see: 2020
Least frequent:
marina: 1 - impend: 1 - 80-proof: 1 - tear-stopp: 1 - ten-thousand: 1 - mother-in-law: 1 - honky-tonkin: 1 - clackity-clack: 1 - one-night-stand: 1 - luckiest: 1

Removed all words occuring 10 or less times
Reduced vocab from 27579 to 4411 words
Finished data preparation!
Succesfully saved output to file ./output/1000/Line counts/rap_rock_rb_country.csv
Preprocessing data
Removing notes in [brackets]
Counting Lines
Found quantiles: [ 52.  79. 389.] for [0.3333333333333333, 0.6666666666666666, 1.0] (range 4 - 389)
Removing punctuation
Norm

In [7]:
line_quants = 10

for quants in range(2, 21):
    documents = preprocess(df['lyrics'], debug=True, limit=cutoff, stem_words=stem, use_rid=rid, 
                           line_quants=line_quants, token_quants=quants, use_length=rid)
    run(documents, df, genres, 'Line counts', quants, limit)

Preprocessing data
Removing notes in [brackets]
Counting Lines
Found quantiles: [ 32.  42.  49.  56.  64.  73.  84.  95. 110. 389.] for [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0] (range 4 - 389)
Removing punctuation
Normalizing
Assigned 4000 RID values (['PRIMARY', 'EMOTIONS', 'SECONDARY'])
Tokenizing
Counting tokens
Found quantiles: [ 391.5 3269. ] for [0.5, 1.0] (range 29 - 3269)
Removing stopwords
Stemming
Removing rare words
Total words: 482455
Unique words: 27579
Most frequent:
's: 3468; n't: 3309; 'm: 2776; like: 2750; know: 2675; got: 2552; get: 2447; ': 2201; go: 2165; see: 2020
Least frequent:
marina: 1 - impend: 1 - 80-proof: 1 - tear-stopp: 1 - ten-thousand: 1 - mother-in-law: 1 - honky-tonkin: 1 - clackity-clack: 1 - one-night-stand: 1 - luckiest: 1

Removed all words occuring 10 or less times
Reduced vocab from 27579 to 4411 words
Finished data preparation!
Succesfully saved output to file ./output/1000/Line counts/rap_rock_rb_country.csv
Preprocessing data
Removin

In [8]:
token_quants = 18

for quants in range(2, 21):
    documents = preprocess(df['lyrics'], debug=True, limit=cutoff, stem_words=stem, use_rid=rid, 
                           line_quants=line_quants, token_quants=token_quants, tpl_quants=quants, use_length=rid)
    run(documents, df, genres, 'Line counts', quants, limit)

Preprocessing data
Removing notes in [brackets]
Counting Lines
Found quantiles: [ 32.  42.  49.  56.  64.  73.  84.  95. 110. 389.] for [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0] (range 4 - 389)
Removing punctuation
Normalizing
Assigned 4000 RID values (['PRIMARY', 'EMOTIONS', 'SECONDARY'])
Tokenizing
Counting tokens
Found quantiles: [ 141.16666667  179.          207.5         235.          265.
  293.          321.          351.          391.5         443.
  490.          543.          593.          653.          724.
  805.66666667  912.83333333 3269.        ] for [0.05555555555555555, 0.1111111111111111, 0.16666666666666666, 0.2222222222222222, 0.2777777777777778, 0.3333333333333333, 0.3888888888888889, 0.4444444444444444, 0.5, 0.5555555555555556, 0.6111111111111112, 0.6666666666666666, 0.7222222222222222, 0.7777777777777778, 0.8333333333333334, 0.8888888888888888, 0.9444444444444444, 1.0] (range 29 - 3269)
Counting tokens per line
Found quantiles: [ 6. 19.] for [0.5, 1.0] (

In [9]:
tpl_quants = 2

for use_length in range(21):
    documents = preprocess(df['lyrics'], debug=True, limit=cutoff, stem_words=stem, use_rid=rid, 
                           line_quants=line_quants, token_quants=token_quants, tpl_quants=tpl_quants, use_length=use_length)
    run(documents, df, genres, 'Line counts', quants, limit)


Preprocessing data
Removing notes in [brackets]
Counting Lines
Found quantiles: [ 32.  42.  49.  56.  64.  73.  84.  95. 110. 389.] for [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0] (range 4 - 389)
Removing punctuation
Normalizing
Assigned 4000 RID values (['PRIMARY', 'EMOTIONS', 'SECONDARY'])
Tokenizing
Counting tokens
Found quantiles: [ 141.16666667  179.          207.5         235.          265.
  293.          321.          351.          391.5         443.
  490.          543.          593.          653.          724.
  805.66666667  912.83333333 3269.        ] for [0.05555555555555555, 0.1111111111111111, 0.16666666666666666, 0.2222222222222222, 0.2777777777777778, 0.3333333333333333, 0.3888888888888889, 0.4444444444444444, 0.5, 0.5555555555555556, 0.6111111111111112, 0.6666666666666666, 0.7222222222222222, 0.7777777777777778, 0.8333333333333334, 0.8888888888888888, 0.9444444444444444, 1.0] (range 29 - 3269)
Counting tokens per line
Found quantiles: [ 6. 19.] for [0.5, 1.0] (

In [10]:
# N-grams
genres = ['rap', 'rock', 'rb', 'country']
limit = 1000
cutoff = 10
stem = True
rid=0
line_quants = 10
token_quants = 18
tpl_quants = 2
use_length = 16

df = pd.read_csv(f'data/song_lyrics_reduced_{"_".join(genres)}_{limit}.csv')
documents = preprocess(df['lyrics'], debug=True, limit=cutoff, stem_words=stem, use_rid=rid,
                           line_quants=line_quants, token_quants=token_quants, tpl_quants=tpl_quants, use_length=use_length)

X_train, X_test, y_train, y_test = train_test_split(documents, df['tag'], test_size=0.1, random_state = 42)

for ngram in [(1, 1), (2, 2), (1, 2), (3, 3), (2, 3), (1, 3)]:
    vectorizer, dt_matrix = vectorize(X_train, n_gram=ngram)
    model = MultinomialNB()
    model.fit(dt_matrix, y_train)

    dt_matrix_test = vectorizer.transform(X_test)
    y_pred = model.predict(dt_matrix_test)

    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='weighted')
    save_output(limit, genres, 'N-gram', f'{ngram[0]}-{ngram[1]}', accuracy, f1, additional=[('rid', rid)])

Preprocessing data
Removing notes in [brackets]
Counting Lines
Found quantiles: [ 32.  42.  49.  56.  64.  73.  84.  95. 110. 389.] for [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0] (range 4 - 389)
Removing punctuation
Normalizing


KeyboardInterrupt: 