In [None]:
# Switch the working directory to the project folder under /content/drive.
# NOTE(review): this path spells the Drive root "My Drive" while the path
# constants defined in the next cell use "MyDrive" — confirm both resolve to
# the same folder.
%cd /content/drive/My Drive/Colab Notebooks/regaetton_songs_nlp

/content/drive/My Drive/Colab Notebooks/regaetton_songs_nlp


In [None]:
# Every artifact path is derived from a single project root so the prefix is
# written only once.
PROJECT_DIR = '/content/drive/MyDrive/Colab Notebooks/regaetton_songs_nlp'
DATA_DIR = f'{PROJECT_DIR}/data'
EMBEDDINGS_DIR = f'{PROJECT_DIR}/embeddings'

# CSV inputs/outputs
normalized_eval_path = f'{DATA_DIR}/normalized_eval_lyrics.csv'
split_train_path = f'{DATA_DIR}/normalized_train_split.csv'
scores_path = f'{DATA_DIR}/scores.csv'

# Saved Word2Vec models
cbow_path = f'{EMBEDDINGS_DIR}/cbow_model.bin'
skip_path = f'{EMBEDDINGS_DIR}/skip_model.bin'

In [None]:
import nltk
import pandas as pd
import numpy as np

from nltk import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, recall_score
from sklearn.svm import SVC
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from gensim.models import Word2Vec, KeyedVectors
from sklearn.metrics import classification_report


# Fetch the NLTK 'punkt' package into the local nltk_data directory
# (reported as already up-to-date when it is present).
nltk.download('punkt')

def add_score(df, model, parameters, accuracy, recall):
    """Return a copy of the scores table with one extra result row.

    Parameters
    ----------
    df : pandas.DataFrame
        Existing scores table; it is not modified.
    model : str
        Label describing the model that produced the metrics.
    parameters : object
        Hyper-parameters used (stored as-is in a single cell, e.g. a dict).
    accuracy, recall : float
        Metric values to record.

    Returns
    -------
    pandas.DataFrame
        New frame containing the rows of ``df`` followed by the new row,
        re-indexed from 0.
    """
    row = {'model': model, 'parameters': parameters,
           'accuracy': accuracy, 'recall': recall}
    # DataFrame.append was removed in pandas 2.0; concatenating with a
    # one-row frame gives the same result on old and new versions.
    new_row = pd.DataFrame([row])

    return pd.concat([df, new_row], ignore_index=True)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# Load the training split, the evaluation set and the running scores table.
train = pd.read_csv(split_train_path)
# NOTE(review): `eval` shadows the Python builtin of the same name for the
# rest of the notebook.
eval = pd.read_csv(normalized_eval_path)
scores = pd.read_csv(scores_path)
# Load the two saved Word2Vec models (presumably the CBOW and skip-gram
# variants, judging by the variable names — confirm against the training notebook).
cbow_model = Word2Vec.load(cbow_path)
skip_model = Word2Vec.load(skip_path)

# Create the embedding features

In [None]:
# Build one feature vector per song: sum the embeddings of its known words,
# then scale the sum to unit L2 norm.
def embedding_features(lyrics, model):
    """Turn a lyrics string into a single unit-length embedding vector.

    The lyrics are split on whitespace, tokens missing from the model's
    vocabulary are skipped, the remaining word vectors are summed, and the
    sum is divided by its L2 norm.

    Parameters
    ----------
    lyrics : str
        Whitespace-separated text. Non-string values (e.g. NaN read from an
        empty CSV cell) are treated as having no tokens.
    model : object
        Anything exposing ``model.wv`` with ``in`` / ``[]`` lookup and a
        ``vector_size`` attribute (e.g. a gensim Word2Vec model).

    Returns
    -------
    numpy.ndarray
        Vector of length ``model.wv.vector_size``. An all-zero vector is
        returned when no token is in the vocabulary or the summed vector has
        zero norm, so every row keeps the same shape and contains no NaN.
    """
    keyed_vectors = model.wv
    tokens = lyrics.split() if isinstance(lyrics, str) else []
    vectors = [keyed_vectors[word] for word in tokens if word in keyed_vectors]

    # Without this guard the sum collapses to the scalar 0.0 and the
    # normalisation below yields NaN, which breaks the downstream classifiers.
    if not vectors:
        return np.zeros(keyed_vectors.vector_size, dtype=np.float32)

    summed = np.sum(vectors, axis=0)
    norm = np.sqrt((summed ** 2).sum())
    # A zero-norm sum cannot be normalised; return it unchanged (all zeros).
    if norm == 0:
        return summed

    return summed / norm





In [None]:
# Add one embedding-vector column per Word2Vec model, computed from the lyrics
train['lyrics_cbow'] = train.lyrics.apply(embedding_features, args=(cbow_model,))
train['lyrics_skip'] = train.lyrics.apply(embedding_features, args=(skip_model,))

In [None]:
# Same feature extraction for the evaluation set. Each column must use its
# own model: the skip-gram column was previously built with cbow_model, so the
# classifier trained on skip-gram features was evaluated on CBOW features.
eval['lyrics_cbow'] = eval.lyrics.apply(lambda x: embedding_features(x, cbow_model))
eval['lyrics_skip'] = eval.lyrics.apply(lambda x: embedding_features(x, skip_model))

In [None]:
# Preview the first rows, including the new lyrics_cbow / lyrics_skip columns.
train.head()

Unnamed: 0,song_name,artist,lyrics,sexual_content,women_denigration,drugs,lyrics_cbow,lyrics_skip
0,,Vico C,despierte maestra vamos dar tutoria ando abusa...,1,-1,-1,"[-0.092151076, 0.032867495, 0.031520315, -0.04...","[-0.059569262, 0.021621076, 0.014864316, 0.015..."
1,Mi alma se muere,Fuego,alma muere remix lyrics voy tratar hacer omega...,1,-1,-1,"[-0.09055093, 0.023767037, 0.020039957, -0.058...","[-0.0664002, 0.019473454, 0.001348675, 0.00170..."
2,,Gente De Zona,amigo escuchame bientienes ayudarme amo mujer ...,1,-1,-1,"[-0.0924042, 0.0022625045, 0.012445057, -0.028...","[-0.06930634, 0.042856492, -0.00765857, 0.0157..."
3,,Nicky Jam,yale quiero tocar quiero provocar quiero senti...,1,-1,-1,"[-0.09963684, 0.014951053, 0.013473895, -0.059...","[-0.06008176, -0.0014560907, 0.035712104, -0.0..."
4,Enchulao,Izaak,hablando claro cité vinieras aquí mismo decirt...,1,-1,-1,"[-0.09749914, 0.018143926, 0.023233136, -0.041...","[-0.06568646, 0.029540772, 0.022370828, -0.007..."


In [None]:
# Hold out 15% of the training songs for validation. Both embedding variants
# are split with the same labels, seed and stratification settings.
sexual_content_labels = train.sexual_content.values
split_options = dict(stratify=sexual_content_labels, random_state=10,
                     test_size=0.15, shuffle=True)

X_train_cbow, X_valid_cbow, y_train_cbow, y_valid_cbow = train_test_split(
    train.lyrics_cbow.values, sexual_content_labels, **split_options)

X_train_skip, X_valid_skip, y_train_skip, y_valid_skip = train_test_split(
    train.lyrics_skip.values, sexual_content_labels, **split_options)

# Gradient Boosting on cbow






In [None]:
# Hyper-parameter grid for gradient boosting; n_estimators is fixed at 10.
# NOTE(review): the 'mae' criterion was deprecated and later removed in newer
# scikit-learn releases — confirm it is accepted by the installed version.
params = {'learning_rate': [0.0001, 0.001, 0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2], 
          'n_estimators': [10],
          'max_features': ["log2","sqrt"], 
          'max_depth':[3,5,8, 10, 15],
          'criterion':['friedman_mse', 'mae'],
          }

# max_features subsamples candidate features at every split, so the estimator
# is stochastic; fix its seed (same value as the train/validation split) so
# the search result is reproducible across runs.
clf = GridSearchCV(GradientBoostingClassifier(random_state=10), params, cv=5, n_jobs=-1)

In [None]:
# Run the grid search on the CBOW training split; list() turns the array of
# per-song vectors into a sequence of rows for scikit-learn.
clf.fit(list(X_train_cbow), y_train_cbow)

GridSearchCV(cv=5, error_score=nan,
             estimator=GradientBoostingClassifier(ccp_alpha=0.0,
                                                  criterion='friedman_mse',
                                                  init=None, learning_rate=0.1,
                                                  loss='deviance', max_depth=3,
                                                  max_features=None,
                                                  max_leaf_nodes=None,
                                                  min_impurity_decrease=0.0,
                                                  min_impurity_split=None,
                                                  min_samples_leaf=1,
                                                  min_samples_split=2,
                                                  min_weight_fraction_leaf=0.0,
                                                  n_estimators=100,
                                                  n_iter_no_c...
                 

In [None]:
print(f'Best params: \n{clf.best_params_}')
best_params = clf.best_params_
# Score the selected model on the held-out validation split.
predictions = clf.predict(list(X_valid_cbow))
# scikit-learn metrics take (y_true, y_pred). With the arguments swapped,
# recall_score reports precision instead of recall.
print(f'Gradient Boosting accuracy {accuracy_score(y_valid_cbow, predictions)}')
print(f'Gradient Boosting recall {recall_score(y_valid_cbow, predictions)}')

Best params: 
{'criterion': 'friedman_mse', 'learning_rate': 0.025, 'max_depth': 10, 'max_features': 'log2', 'n_estimators': 10}
Gradient Boosting accuracy 0.6845878136200717
Gradient Boosting recall 0.6783216783216783


In [None]:
# Evaluate the selected gradient boosting model on the evaluation set.
predictions = clf.predict(list(eval.lyrics_cbow.values))
# scikit-learn metrics take (y_true, y_pred). With the arguments swapped,
# recall_score reports precision, and that wrong value was being saved.
gb_accuracy = accuracy_score(eval.sexual_content.values, predictions)
gb_recall = recall_score(eval.sexual_content.values, predictions)

print(gb_accuracy)
print(gb_recall)

# Append the result to the scores table and write it back to disk.
# Re-running this cell adds another row for the same model.
scores = add_score(scores, 'Gradient Boosting on cbow', best_params, gb_accuracy, gb_recall)
scores.to_csv(scores_path, index=False)

0.7633333333333333
0.7624113475177305


# Logistic regression on cbow

In [None]:
# Regularisation grid: C from 1e-3 to 1e3 in decades, with L1 and L2 penalties.
params = {"C":np.logspace(-3,3,7), "penalty":["l1","l2"]}

# liblinear supports both penalties in the grid. The default lbfgs solver
# rejects 'l1', so half of the candidates would fail to fit and be scored NaN.
clf = LogisticRegression(solver='liblinear')
clf = GridSearchCV(clf, params, cv=10)


In [None]:
# Run the 10-fold logistic regression grid search on the CBOW training split.
clf.fit(list(X_train_cbow), y_train_cbow)

In [None]:
print(f'Best params: \n{clf.best_params_}')
best_params = clf.best_params_
# Score the selected model on the held-out CBOW validation split.
predictions = clf.predict(list(X_valid_cbow))
# scikit-learn metrics take (y_true, y_pred). With the arguments swapped,
# recall_score reports precision instead of recall.
print(f'Logistic regression accuracy {accuracy_score(y_valid_cbow, predictions)}')
print(f'Logistic regression recall {recall_score(y_valid_cbow, predictions)}')

Best params: 
{'C': 100.0, 'penalty': 'l2'}
Logistic regression accuracy 0.6726403823178017
Logistic regression recall 0.664367816091954


In [None]:
# Evaluate the selected logistic regression model on the CBOW evaluation features.
predictions = clf.predict(list(eval.lyrics_cbow.values))
# The gb_* names are carried over from the gradient boosting cell; here they
# hold logistic regression metrics.
# scikit-learn metrics take (y_true, y_pred). With the arguments swapped,
# recall_score reports precision, and that wrong value was being saved.
gb_accuracy = accuracy_score(eval.sexual_content.values, predictions)
gb_recall = recall_score(eval.sexual_content.values, predictions)

print(f'Logistic regression accuracy: {gb_accuracy}')
print(f'Logistic regression recall: {gb_recall}')

# Append the result to the scores table and write it back to disk.
# Re-running this cell adds another row for the same model.
scores = add_score(scores, 'Logistic regression on cbow', best_params, gb_accuracy, gb_recall)
scores.to_csv(scores_path, index=False)

Logistic regression accuracy: 0.7716666666666666
Logistic regression recall: 0.7610921501706485


# Logistic regression on skipgram

In [None]:
# Reuses the `params` grid from the logistic-regression-on-cbow cell, which
# contains both 'l1' and 'l2'. liblinear supports both; the default lbfgs
# solver rejects 'l1', so those candidates would fail to fit.
clf = LogisticRegression(solver='liblinear')
clf = GridSearchCV(clf, params, cv=10)

In [None]:
# Run the 10-fold logistic regression grid search on the skip-gram training split.
clf.fit(list(X_train_skip), y_train_skip)

In [None]:
print(f'Best params: \n{clf.best_params_}')
best_params = clf.best_params_
# Score the selected model on the held-out skip-gram validation split.
predictions = clf.predict(list(X_valid_skip))
# scikit-learn metrics take (y_true, y_pred). With the arguments swapped,
# recall_score reports precision instead of recall.
print(f'Logistic regression accuracy {accuracy_score(y_valid_skip, predictions)}')
print(f'Logistic regression recall {recall_score(y_valid_skip, predictions)}')

Best params: 
{'C': 10.0, 'penalty': 'l2'}
Logistic regression accuracy 0.7287933094384708
Logistic regression recall 0.7251184834123223


In [None]:
# Evaluate the selected logistic regression model on the skip-gram evaluation features.
predictions = clf.predict(list(eval.lyrics_skip.values))
# The gb_* names are carried over from the gradient boosting cell; here they
# hold logistic regression metrics.
# scikit-learn metrics take (y_true, y_pred). With the arguments swapped,
# recall_score reports precision, and that wrong value was being saved.
gb_accuracy = accuracy_score(eval.sexual_content.values, predictions)
gb_recall = recall_score(eval.sexual_content.values, predictions)

print(f'Logistic regression accuracy: {gb_accuracy}')
print(f'Logistic regression recall: {gb_recall}')

# Append the result to the scores table and write it back to disk.
# Re-running this cell adds another row for the same model.
scores = add_score(scores, 'Logistic regression on skipgram', best_params, gb_accuracy, gb_recall)
scores.to_csv(scores_path, index=False)

Logistic regression accuracy: 0.71
Logistic regression recall: 0.6907894736842105
