In [1]:
%cd /content/drive/My Drive/Colab Notebooks/regaetton_songs_nlp

/content/drive/My Drive/Colab Notebooks/regaetton_songs_nlp


In [2]:
normalized_eval_path = '/content/drive/MyDrive/Colab Notebooks/regaetton_songs_nlp/data/normalized_eval_lyrics.csv'
split_train_path = '/content/drive/MyDrive/Colab Notebooks/regaetton_songs_nlp/data/normalized_train_split.csv'
scores_path = '/content/drive/MyDrive/Colab Notebooks/regaetton_songs_nlp/data/scores.csv'
cbow_path = '/content/drive/MyDrive/Colab Notebooks/regaetton_songs_nlp/embeddings/cbow_model.bin'
skip_path = '/content/drive/MyDrive/Colab Notebooks/regaetton_songs_nlp/embeddings/skip_model.bin'

In [19]:
import numpy as np
import pandas as pd
import keras

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras import layers
from keras.models import Model, Sequential
from keras.initializers import Constant
from keras.metrics import Accuracy, Recall
from gensim.models import Word2Vec
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score

def add_score(df, model, parameters, accuracy, recall):
    row = {'model': model, 'parameters': parameters,
           'accuracy': accuracy, 'recall': recall}
    df = df.append(row, ignore_index=True)

    return df

In [4]:
train = pd.read_csv(split_train_path)
eval = pd.read_csv(normalized_eval_path)
scores = pd.read_csv(scores_path)
cbow_model = Word2Vec.load(cbow_path)
skip_model = Word2Vec.load(skip_path)

In [5]:
X_train, X_valid, y_train, y_valid = train_test_split(train.lyrics.values, train.sexual_content.values, 
                                                      stratify=train.sexual_content.values, random_state=10,
                                                      test_size=0.15, shuffle=True)

In [14]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)
train_sequences = tokenizer.texts_to_sequences(X_train)
valid_sequences = tokenizer.texts_to_sequences(X_valid)
eval_sequences = tokenizer.texts_to_sequences(eval.lyrics.values)
word_index = tokenizer.word_index
print(f'{len(word_index)} unique tokens.')

41013 unique tokens.


In [34]:
max_len_sequence = 1000

train_data = pad_sequences(train_sequences, maxlen=max_len_sequence)
valid_data = pad_sequences(valid_sequences, maxlen=max_len_sequence)
eval_data = pad_sequences(eval_sequences, maxlen=max_len_sequence)
train_labels = to_categorical(np.asarray(y_train))
valid_labels = to_categorical(np.asarray(y_valid))
eval_labels = to_categorical(np.asarray(eval.sexual_content.values))


In [35]:
emb_dim = 300
num_words = len(word_index) + 1 # plus 1 for unknown words

embedding_matrix = np.zeros((num_words, emb_dim))
for word, i in word_index.items():
    embedding_vector = cbow_model.wv[word]
    embedding_matrix[i] = embedding_vector
    
# load these pre-trained word embeddings into an Embedding layer
embedding_layer = layers.Embedding(num_words,
                            emb_dim,
                            embeddings_initializer=Constant(embedding_matrix),
                            input_length=max_len_sequence,
                            trainable=False)

# Train CNN model

In [36]:
early_stopping = keras.callbacks.EarlyStopping(
    monitor="val_loss",
    patience=20,
    restore_best_weights=True,
)


In [37]:
model = Sequential([
    embedding_layer,
    layers.Conv1D(128, 5, activation='relu'),
    layers.MaxPooling1D(5),
    layers.Conv1D(128, 5, activation='relu'),
    layers.MaxPooling1D(5),
    layers.Conv1D(128, 5, activation='relu'),
    layers.GlobalAveragePooling1D(),
    layers.Dense(128, activation='relu'),
    layers.Dropout(0.3),
    layers.Dense(64, activation='relu'),
    layers.Dropout(0.3),
    layers.Dense(2, activation='softmax')
])

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy', Recall()])
model.fit(train_data, train_labels,
          batch_size=256,
          epochs=100, validation_data=(valid_data, valid_labels), 
          callbacks=[early_stopping])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100


<tensorflow.python.keras.callbacks.History at 0x7f9a9f33acc0>

In [38]:
# Predict on hand-labeled data
cnn_loss, cnn_accuracy, cnn_recall = model.evaluate(eval_data, eval_labels)

print(cnn_accuracy)
print(cnn_recall)

0.7483333349227905
0.7483333349227905


In [43]:
scores = add_score(scores, 'CNN on lyrics embeddings (word2vec - cbow)', 
                   {'optimizer': 'adam', 'Conv + pooling': 3, 'filters-size': (128, 5), 'batch_size': 256}, cnn_accuracy, cnn_recall)
scores.to_csv(scores_path, index=False)

# Train an LSTM model

In [41]:
lstm_early_stopping = keras.callbacks.EarlyStopping(
    monitor="val_loss",
    patience=3,
    restore_best_weights=True,
)


In [42]:
lstm = Sequential([
    embedding_layer,
    layers.SpatialDropout1D(0.3),
    layers.LSTM(100, dropout=0.2, recurrent_dropout=0.2),
    layers.Dense(1024, activation='relu'),
    layers.Dropout(0.8),
    layers.Dense(1024, activation='relu'),
    layers.Dropout(0.8),
    layers.Dense(2, activation='softmax'),
])

lstm.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy', Recall()])

lstm.fit(train_data, train_labels,
          batch_size=32,
          epochs=100,
          validation_data=(valid_data, valid_labels), 
          callbacks=[lstm_early_stopping])


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100


<tensorflow.python.keras.callbacks.History at 0x7f98cc1e9940>

In [44]:
lstm_loss, lstm_accuracy, lstm_recall = lstm.evaluate(eval_data, eval_labels)
print(lstm_accuracy)
print(lstm_recall)

0.7400000095367432
0.7400000095367432


In [45]:
scores = add_score(scores, 'LSTM on lyrics embeddings (word2vec - cbow)', 
                   {'optimizer': 'adam', 'LSTM layers - units': (1, 100), 'Dense layers - units': (2, 1024),
                    'Dropout rate after dense layers': 0.8, 'batch_size': 32}, lstm_accuracy, lstm_recall)
scores.to_csv(scores_path, index=False)

# GRU

In [46]:
gru = Sequential([
    embedding_layer,
    layers.SpatialDropout1D(0.3),
    layers.GRU(300, dropout=0.3, recurrent_dropout=0.3, return_sequences=True),
    layers.GRU(300, dropout=0.3, recurrent_dropout=0.3),
    layers.Dense(1024, activation='relu'),
    layers.Dropout(0.8),
    layers.Dense(1024, activation='relu'),
    layers.Dropout(0.8),
    layers.Dense(2, activation='softmax'),
])

gru.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy', Recall()])

gru.fit(train_data, train_labels,
          batch_size=128,
          epochs=100,
          validation_data=(valid_data, valid_labels), 
          callbacks=[lstm_early_stopping])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100


<tensorflow.python.keras.callbacks.History at 0x7f98c32b01d0>

In [47]:
lstm_loss, gru_accuracy, gru_recall = gru.evaluate(eval_data, eval_labels)
print(lstm_accuracy)
print(lstm_recall)

0.7400000095367432
0.7400000095367432


In [48]:
scores = add_score(scores, 'GRU on lyrics embeddings (word2vec - cbow)', 
                   {'optimizer': 'adam', 'GRU layers - units': (2, 300), 'Dense layers - units': (2, 1024),
                    'Dropout rate after dense layers': 0.8, 'batch_size': 128}, gru_accuracy, gru_recall)
scores.to_csv(scores_path, index=False)

In [49]:
scores.head(50)

Unnamed: 0,model,parameters,accuracy,recall
0,logistic regression - BoW,{'C': 1.0},0.668333,0.685714
1,naive bayes - BoW,,0.756667,0.743243
2,logistic regression - Bag of n-grams,{'C': 1.0},0.696667,0.721311
3,naive bayes - Bag of ngrams,,0.695,0.645777
4,logistic regression - Bag of n-grams - tfidf,{'C': 1.0},0.743333,0.748175
5,naive bayes - Bag of ngrams,,0.771667,0.744409
6,naive bayes - Bag of ngrams - tfidf,,0.771667,0.744409
7,svm,{'C': 1.0},0.738333,0.758755
8,Gradient Boosting on Tfidf,"{'n_estimators': 10, 'max_features': 'sqrt', '...",0.645,0.666667
9,Gradient Boosting on count vectorizer,"{'n_estimators': 10, 'max_features': 'sqrt', '...",0.683333,0.681159


# To do's

- try limiting the amount of words to the most frequent to reduce noisy misspelled words.