In [1]:
%cd /content/drive/My Drive/Colab Notebooks/regaetton_songs_nlp

/content/drive/My Drive/Colab Notebooks/regaetton_songs_nlp


In [2]:
normalized_eval_path = '/content/drive/MyDrive/Colab Notebooks/regaetton_songs_nlp/data/normalized_eval_lyrics.csv'
split_train_path = '/content/drive/MyDrive/Colab Notebooks/regaetton_songs_nlp/data/normalized_train_split.csv'
scores_path = '/content/drive/MyDrive/Colab Notebooks/regaetton_songs_nlp/data/scores.csv'

In [3]:
import nltk
import pandas as pd

from nltk import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, recall_score

nltk.download('punkt')

def add_score(df, model, parameters, accuracy, recall):
    row = {'model': model, 'parameters': parameters,
           'accuracy': accuracy, 'recall': recall}
    df = df.append(row, ignore_index=True)

    return df

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [8]:
eval = pd.read_csv(normalized_eval_path)
train = pd.read_csv(split_train_path)
scores = pd.read_csv(scores_path)

In [9]:
scores.head()

Unnamed: 0,model,parameters,accuracy,recall
0,logistic regression - BoW,{'C': 1.0},0.668333,0.685714
1,naive bayes - BoW,,0.756667,0.743243


# Create model


In [6]:
X_train, X_valid, y_train, y_valid = train_test_split(train.lyrics.values, train.sexual_content.values, 
                                                      stratify=train.sexual_content.values, random_state=10,
                                                      test_size=0.15, shuffle=True)

In [10]:
count_vect = CountVectorizer(tokenizer=lambda x: word_tokenize(x, language='spanish'), encoding='utf-8', ngram_range=(1, 3))
X_train_ctv = count_vect.fit_transform(X_train)
X_valid_ctv = count_vect.transform(X_valid)

In [11]:
# Fitting a simple Logistic Regression on Counts
lgr_parameters = {'C': 1.0}
clf = LogisticRegression(C=1.0)
clf.fit(X_train_ctv, y_train)
predictions = clf.predict(X_valid_ctv)
print(f'Loggistic regression accuracy {accuracy_score(predictions, y_valid)}')
print(f'Loggistic regression recall {recall_score(predictions, y_valid)}')

Loggistic regression accuracy 0.8315412186379928
Loggistic regression recall 0.859375


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [12]:
X_eval_ctv = count_vect.transform(eval.lyrics.values)
eval_preds = clf.predict(X_eval_ctv)
lgr_accuracy = accuracy_score(eval_preds, eval.sexual_content.values)
lgr_recall = recall_score(eval_preds, eval.sexual_content.values)
print(f'Loggistic regression accuracy in hand-labeled lyrics {lgr_accuracy}')
print(f'Loggistic regression recall in hand-labeled lyrics {lgr_recall}')

Loggistic regression accuracy in hand-labeled lyrics 0.6966666666666667
Loggistic regression recall in hand-labeled lyrics 0.7213114754098361


In [13]:
scores = add_score(scores, 'logistic regression - Bag of n-grams', lgr_parameters, lgr_accuracy, lgr_recall)

In [14]:
# Fitting a simple Naive Bayes
clf = MultinomialNB()
clf.fit(X_train_ctv, y_train)
predictions = clf.predict(X_valid_ctv)
print(f'Loggistic regression accuracy {accuracy_score(predictions, y_valid)}')
print(f'Loggistic regression recall {recall_score(predictions, y_valid)}')
X_eval_ctv = count_vect.transform(eval.lyrics.values)
eval_preds = clf.predict(X_eval_ctv)
naive_accuracy = accuracy_score(eval_preds, eval.sexual_content.values)
naive_recall = recall_score(eval_preds, eval.sexual_content.values)
print(f'Loggistic regression accuracy in hand-labeled lyrics {naive_accuracy}')
print(f'Loggistic regression recall in hand-labeled lyrics {naive_recall}')

Loggistic regression accuracy 0.7299880525686977
Loggistic regression recall 0.6666666666666666
Loggistic regression accuracy in hand-labeled lyrics 0.695
Loggistic regression recall in hand-labeled lyrics 0.6457765667574932


In [24]:
scores = add_score(scores, 'naive bayes - Bag of ngrams', '', naive_accuracy, naive_recall)
scores.to_csv(scores_path, index=False)

# Let's use Tfidf instead of count vectorizer

In [16]:
tfv = TfidfVectorizer(tokenizer=lambda x: word_tokenize(x, language='spanish'),min_df=3,  max_features=None, 
                      ngram_range=(1, 3), use_idf=1,smooth_idf=1,sublinear_tf=1)

In [17]:
X_train_tfv = tfv.fit_transform(X_train)
X_valid_tfv = tfv.transform(X_valid)

In [19]:
# Fitting a simple Logistic Regression on tfidf
lgr_parameters = {'C': 1.0}
clf = LogisticRegression(C=1.0)
clf.fit(X_train_tfv, y_train)
predictions = clf.predict(X_valid_tfv)
print(f'Loggistic regression accuracy {accuracy_score(predictions, y_valid)}')
print(f'Loggistic regression recall {recall_score(predictions, y_valid)}')

Loggistic regression accuracy 0.8052568697729988
Loggistic regression recall 0.7967289719626168


In [21]:
X_eval_tfv = tfv.transform(eval.lyrics.values)
eval_preds = clf.predict(X_eval_tfv)
lgr_accuracy = accuracy_score(eval_preds, eval.sexual_content.values)
lgr_recall = recall_score(eval_preds, eval.sexual_content.values)
print(f'Loggistic regression accuracy in hand-labeled lyrics {lgr_accuracy}')
print(f'Loggistic regression recall in hand-labeled lyrics {lgr_recall}')

Loggistic regression accuracy in hand-labeled lyrics 0.7433333333333333
Loggistic regression recall in hand-labeled lyrics 0.7481751824817519


In [22]:
scores = add_score(scores, 'logistic regression - Bag of n-grams - tfidf', lgr_parameters, lgr_accuracy, lgr_recall)

In [23]:
# Fitting a simple Naive Bayes
clf = MultinomialNB()
clf.fit(X_train_tfv, y_train)
predictions = clf.predict(X_valid_tfv)
print(f'Loggistic regression accuracy {accuracy_score(predictions, y_valid)}')
print(f'Loggistic regression recall {recall_score(predictions, y_valid)}')
eval_preds = clf.predict(X_eval_tfv)
naive_accuracy = accuracy_score(eval_preds, eval.sexual_content.values)
naive_recall = recall_score(eval_preds, eval.sexual_content.values)
print(f'Loggistic regression accuracy in hand-labeled lyrics {naive_accuracy}')
print(f'Loggistic regression recall in hand-labeled lyrics {naive_recall}')

Loggistic regression accuracy 0.7574671445639187
Loggistic regression recall 0.717479674796748
Loggistic regression accuracy in hand-labeled lyrics 0.7716666666666666
Loggistic regression recall in hand-labeled lyrics 0.744408945686901


In [25]:
scores = add_score(scores, 'naive bayes - Bag of ngrams - tfidf', '', naive_accuracy, naive_recall)
scores.to_csv(scores_path, index=False)

In [27]:
scores.head()

Unnamed: 0,model,parameters,accuracy,recall
0,logistic regression - BoW,{'C': 1.0},0.668333,0.685714
1,naive bayes - BoW,,0.756667,0.743243
2,logistic regression - Bag of n-grams,{'C': 1.0},0.696667,0.721311
3,naive bayes - Bag of ngrams,,0.695,0.645777
4,logistic regression - Bag of n-grams - tfidf,{'C': 1.0},0.743333,0.748175


# Trying SVM

In [31]:
from sklearn.svm import SVC
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler


In [32]:
n_components = 120
svd = TruncatedSVD(n_components=n_components)
svd.fit(X_train_tfv)
X_train_svd = svd.transform(X_train_tfv)
X_valid_svd = svd.transform(X_valid_tfv)

# Scale the data obtained from SVD. Renaming variable to reuse without scaling.
scl = StandardScaler()
scl.fit(X_train_svd)
X_train_svd_scaled = scl.transform(X_train_svd)
X_valid_svd_scaled = scl.transform(X_valid_svd)

In [34]:
# Fitting a simple SVM
svm_parameters = {'C':1.0}
clf = SVC(C=1.0)
clf.fit(X_train_svd_scaled, y_train)
predictions = clf.predict(X_valid_svd_scaled)

In [35]:
print(f'SVM accuracy {accuracy_score(predictions, y_valid)}')
print(f'SVM recall {recall_score(predictions, y_valid)}')

SVM accuracy 0.7945041816009558
SVM recall 0.8054862842892768


In [37]:
X_eval_svd = svd.transform(X_eval_tfv)
X_eval_svd_scaled = scl.transform(X_eval_svd)

predictions = clf.predict(X_eval_svd_scaled)
svm_accuracy = accuracy_score(predictions, eval.sexual_content.values)
svm_recall = recall_score(predictions, eval.sexual_content.values)

In [38]:
print(f'SVM accuracy in hand-labeled lyrics {naive_accuracy}')
print(f'SVM regression recall in hand-labeled lyrics {naive_recall}')

SVM accuracy in hand-labeled lyrics 0.7716666666666666
SVM regression recall in hand-labeled lyrics 0.744408945686901


In [39]:
scores = add_score(scores, 'svm', svm_parameters, svm_accuracy, svm_recall)
scores.to_csv(scores_path, index=False)

In [40]:
scores.head(10)

Unnamed: 0,model,parameters,accuracy,recall
0,logistic regression - BoW,{'C': 1.0},0.668333,0.685714
1,naive bayes - BoW,,0.756667,0.743243
2,logistic regression - Bag of n-grams,{'C': 1.0},0.696667,0.721311
3,naive bayes - Bag of ngrams,,0.695,0.645777
4,logistic regression - Bag of n-grams - tfidf,{'C': 1.0},0.743333,0.748175
5,naive bayes - Bag of ngrams,,0.771667,0.744409
6,naive bayes - Bag of ngrams - tfidf,,0.771667,0.744409
7,svm,{'C': 1.0},0.738333,0.758755


# Stronger model

Now that we have an idea of how well can we do in our eval dataset with models that are consider simple, let's try a stronger model now using gridsearch as well to explore more possibilities.

In [57]:
import numpy as np

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV


In [58]:
params = {'learning_rate': [0.0001, 0.001, 0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2], 
          'n_estimators': [10],
          'max_features': ["log2","sqrt"], 
          'max_depth':[3,5,8, 10, 15],
          'criterion':['friedman_mse', 'mae'],
          }

clf = RandomizedSearchCV(GradientBoostingClassifier(), params, cv=5, n_jobs=-1)

# Train using the Tfidf features

In [59]:
clf.fit(X_train_tfv, y_train)

RandomizedSearchCV(cv=5, error_score=nan,
                   estimator=GradientBoostingClassifier(ccp_alpha=0.0,
                                                        criterion='friedman_mse',
                                                        init=None,
                                                        learning_rate=0.1,
                                                        loss='deviance',
                                                        max_depth=3,
                                                        max_features=None,
                                                        max_leaf_nodes=None,
                                                        min_impurity_decrease=0.0,
                                                        min_impurity_split=None,
                                                        min_samples_leaf=1,
                                                        min_samples_split=2,
                                                    

In [60]:
print(f'Best params: \n{clf.best_params_}')
best_params = clf.best_params_
predictions = clf.predict(X_valid_tfv)
print(f'Gradient Boosting accuracy {accuracy_score(predictions, y_valid)}')
print(f'Gradient Boosting recall {recall_score(predictions, y_valid)}')

Best params: 
{'n_estimators': 10, 'max_features': 'sqrt', 'max_depth': 15, 'learning_rate': 0.15, 'criterion': 'mae'}
Gradient Boosting accuracy 0.7228195937873357
Gradient Boosting recall 0.731829573934837


In [61]:
predictions = clf.predict(X_eval_tfv)
gb_accuracy = accuracy_score(predictions, eval.sexual_content.values)
gb_recall = recall_score(predictions, eval.sexual_content.values)

scores = add_score(scores, 'Gradient Boosting on Tfidf', best_params, gb_accuracy, gb_recall)
scores.to_csv(scores_path, index=False)

In [62]:
print(f'Gradient Boosting accuracy in hand-labeled lyrics {gb_accuracy}')
print(f'Gradient Boosting recall in hand-labeled lyrics {gb_recall}')

Gradient Boosting accuracy in hand-labeled lyrics 0.645
Gradient Boosting recall in hand-labeled lyrics 0.6666666666666666


# Train using the count vectorizer features

In [63]:
clf.fit(X_train_ctv, y_train)
print(f'Best params: \n{clf.best_params_}')
best_params = clf.best_params_
predictions = clf.predict(X_valid_ctv)
print(f'Gradient Boosting accuracy {accuracy_score(predictions, y_valid)}')
print(f'Gradient Boosting recall {recall_score(predictions, y_valid)}')

Best params: 
{'n_estimators': 10, 'max_features': 'sqrt', 'max_depth': 15, 'learning_rate': 0.075, 'criterion': 'mae'}
Gradient Boosting accuracy 0.7144563918757467
Gradient Boosting recall 0.7181372549019608


In [64]:
predictions = clf.predict(X_eval_ctv)
gb_accuracy = accuracy_score(predictions, eval.sexual_content.values)
gb_recall = recall_score(predictions, eval.sexual_content.values)

scores = add_score(scores, 'Gradient Boosting on count vectorizer', best_params, gb_accuracy, gb_recall)
scores.to_csv(scores_path, index=False)

In [65]:
print(f'Gradient Boosting accuracy in hand-labeled lyrics {gb_accuracy}')
print(f'Gradient Boosting recall in hand-labeled lyrics {gb_recall}')

Gradient Boosting accuracy in hand-labeled lyrics 0.6833333333333333
Gradient Boosting recall in hand-labeled lyrics 0.6811594202898551


# Train using the Dimensionality reduction using truncated SVD

In [66]:
clf.fit(X_train_svd, y_train)
print(f'Best params: \n{clf.best_params_}')
best_params = clf.best_params_
predictions = clf.predict(X_valid_svd)
print(f'Gradient Boosting accuracy {accuracy_score(predictions, y_valid)}')
print(f'Gradient Boosting recall {recall_score(predictions, y_valid)}')

Best params: 
{'n_estimators': 10, 'max_features': 'sqrt', 'max_depth': 15, 'learning_rate': 0.2, 'criterion': 'mae'}
Gradient Boosting accuracy 0.7467144563918757
Gradient Boosting recall 0.7293064876957495


In [67]:
predictions = clf.predict(X_eval_svd)
gb_accuracy = accuracy_score(predictions, eval.sexual_content.values)
gb_recall = recall_score(predictions, eval.sexual_content.values)

scores = add_score(scores, 'Gradient Boosting on 120 svd', best_params, gb_accuracy, gb_recall)
scores.to_csv(scores_path, index=False)

In [68]:
print(f'Gradient Boosting accuracy in hand-labeled lyrics {gb_accuracy}')
print(f'Gradient Boosting recall in hand-labeled lyrics {gb_recall}')

Gradient Boosting accuracy in hand-labeled lyrics 0.7
Gradient Boosting recall in hand-labeled lyrics 0.6883561643835616


# Now using the scaled version

In [69]:
clf.fit(X_train_svd_scaled, y_train)
print(f'Best params: \n{clf.best_params_}')
best_params = clf.best_params_
predictions = clf.predict(X_valid_svd_scaled)
print(f'Gradient Boosting accuracy {accuracy_score(predictions, y_valid)}')
print(f'Gradient Boosting recall {recall_score(predictions, y_valid)}')

Best params: 
{'n_estimators': 10, 'max_features': 'log2', 'max_depth': 15, 'learning_rate': 0.15, 'criterion': 'mae'}
Gradient Boosting accuracy 0.7455197132616488
Gradient Boosting recall 0.7339449541284404


In [70]:
predictions = clf.predict(X_eval_svd_scaled)
gb_accuracy = accuracy_score(predictions, eval.sexual_content.values)
gb_recall = recall_score(predictions, eval.sexual_content.values)

scores = add_score(scores, 'Gradient Boosting on 120 svd scaled', best_params, gb_accuracy, gb_recall)
scores.to_csv(scores_path, index=False)

In [71]:
print(f'Gradient Boosting accuracy in hand-labeled lyrics {gb_accuracy}')
print(f'Gradient Boosting recall in hand-labeled lyrics {gb_recall}')

Gradient Boosting accuracy in hand-labeled lyrics 0.7216666666666667
Gradient Boosting recall in hand-labeled lyrics 0.7320754716981132


as we can see, a more powerful model does not mean always a better performance. Although, we could increase the grid search.

In [74]:
scores.head(20)

Unnamed: 0,model,parameters,accuracy,recall
0,logistic regression - BoW,{'C': 1.0},0.668333,0.685714
1,naive bayes - BoW,,0.756667,0.743243
2,logistic regression - Bag of n-grams,{'C': 1.0},0.696667,0.721311
3,naive bayes - Bag of ngrams,,0.695,0.645777
4,logistic regression - Bag of n-grams - tfidf,{'C': 1.0},0.743333,0.748175
5,naive bayes - Bag of ngrams,,0.771667,0.744409
6,naive bayes - Bag of ngrams - tfidf,,0.771667,0.744409
7,svm,{'C': 1.0},0.738333,0.758755
8,Gradient Boosting on Tfidf,"{'n_estimators': 10, 'max_features': 'sqrt', '...",0.645,0.666667
9,Gradient Boosting on count vectorizer,"{'n_estimators': 10, 'max_features': 'sqrt', '...",0.683333,0.681159
