In [1]:
%cd /content/drive/My Drive/Colab Notebooks/regaetton_songs_nlp

/content/drive/My Drive/Colab Notebooks/regaetton_songs_nlp


In [2]:
normalized_train_path = '/content/drive/MyDrive/Colab Notebooks/regaetton_songs_nlp/data/normalized_train_lyrics.csv'
normalized_eval_path = '/content/drive/MyDrive/Colab Notebooks/regaetton_songs_nlp/data/normalized_eval_lyrics.csv'
split_train_path = '/content/drive/MyDrive/Colab Notebooks/regaetton_songs_nlp/data/normalized_train_split.csv'
scores_path = '/content/drive/MyDrive/Colab Notebooks/regaetton_songs_nlp/data/scores.csv'

In [3]:
import nltk
import pandas as pd

from nltk import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, recall_score

nltk.download('punkt')

def add_score(df, model, parameters, accuracy, recall):
    row = {'model': model, 'parameters': parameters,
           'accuracy': accuracy, 'recall': recall}
    df = df.append(row, ignore_index=True)

    return df

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [4]:
# train = pd.read_csv(normalized_train_path)
eval = pd.read_csv(normalized_eval_path)
train = pd.read_csv(split_train_path)
scores = pd.read_csv(scores_path)


# handle imbalance in training data

In [5]:
# train.sexual_content.value_counts()
# train.sort_values(by='sexual_content', inplace=True, ascending=False, ignore_index=True)
# train.head()
# split = 2777 + 2800
# train_split = train[:split].copy()
# print(train_split.sexual_content.value_counts())
# train_split.tail()


# # Save for reproducibility
# train_split.to_csv(split_train_path, index=False)

In [6]:
train.sexual_content.value_counts()

0    2800
1    2777
Name: sexual_content, dtype: int64

# Create model

In [7]:
X_train, X_valid, y_train, y_valid = train_test_split(train.lyrics.values, train.sexual_content.values, 
                                                      stratify=train.sexual_content.values, random_state=10,
                                                      test_size=0.15, shuffle=True)
    

In [8]:
count_vect = CountVectorizer(tokenizer=lambda x: word_tokenize(x, language='spanish'), encoding='utf-8')
X_train_ctv = count_vect.fit_transform(X_train)
X_valid_ctv = count_vect.transform(X_valid)

In [11]:
# Fitting a simple Logistic Regression on Counts
lgr_parameters = {'C': 1.0}
clf = LogisticRegression(C=1.0)
clf.fit(X_train_ctv, y_train)
predictions = clf.predict(X_valid_ctv)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [12]:
print(f'Loggistic regression accuracy {accuracy_score(predictions, y_valid)}')
print(f'Loggistic regression recall {recall_score(predictions, y_valid)}')

Loggistic regression accuracy 0.8040621266427718
Loggistic regression recall 0.8268733850129198


# Performance in the eval dataset (hand-labeled)

In [13]:
X_eval_ctv = count_vect.transform(eval.lyrics.values)
eval_preds = clf.predict(X_eval_ctv)
lgr_accuracy = accuracy_score(eval_preds, eval.sexual_content.values)
lgr_recall = recall_score(eval_preds, eval.sexual_content.values)
print(f'Loggistic regression accuracy in hand-labeled lyrics {lgr_accuracy}')
print(f'Loggistic regression recall in hand-labeled lyrics {lgr_recall}')

Loggistic regression accuracy in hand-labeled lyrics 0.6683333333333333
Loggistic regression recall in hand-labeled lyrics 0.6857142857142857


In [14]:
scores = add_score(scores, 'logistic regression', lgr_parameters, lgr_accuracy, lgr_recall)

If we go back to past scripts, using snorkel and labeling functions heuristics we achieve 77% against our eval dataset. So, we can still improve.

In [15]:
# Fitting a simple Naive Bayes
clf = MultinomialNB()
clf.fit(X_train_ctv, y_train)
predictions = clf.predict(X_valid_ctv)
print(f'Loggistic regression accuracy {accuracy_score(predictions, y_valid)}')
print(f'Loggistic regression recall {recall_score(predictions, y_valid)}')
X_eval_ctv = count_vect.transform(eval.lyrics.values)
eval_preds = clf.predict(X_eval_ctv)
naive_accuracy = accuracy_score(eval_preds, eval.sexual_content.values)
naive_recall = recall_score(eval_preds, eval.sexual_content.values)
print(f'Loggistic regression accuracy in hand-labeled lyrics {naive_accuracy}')
print(f'Loggistic regression recall in hand-labeled lyrics {naive_recall}')

Loggistic regression accuracy 0.7562724014336918
Loggistic regression recall 0.7232704402515723
Loggistic regression accuracy in hand-labeled lyrics 0.7566666666666667
Loggistic regression recall in hand-labeled lyrics 0.7432432432432432


We almost achieve our benchmark (77%) using naive bayes.

In [16]:
scores = add_score(scores, 'naive bayes', '', naive_accuracy, naive_recall)
scores.to_csv(scores_path)