<a href="https://colab.research.google.com/github/Nacho2904/orga_de_datos/blob/main/tp3_red_neuronal.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import top_k_accuracy_score
from google.colab import drive 
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
import functools
drive.mount('/content/gdrive')
path_a_training_set = 'gdrive/MyDrive/TP3 dataset music/train.parquet'
path_a_test_set = 'gdrive/MyDrive/TP3 dataset music/test.parquet'


df_music_train = pd.read_parquet(path_a_training_set).fillna("")
df_music_test = pd.read_parquet(path_a_test_set).fillna("")

Mounted at /content/gdrive


## Preprocessing

In [2]:
nltk.download('vader_lexicon')
def apply_sentiment_analysis_to_lyrics(df_music: pd.DataFrame) -> pd.DataFrame:
  sia = SentimentIntensityAnalyzer()
  negative, neutral, positive, compound = 0, 1, 2, 3
  sentimentAnalysisOfLyrics = df_music["lyric"].map(lambda lyric: list(sia.polarity_scores(lyric).values()))
  negativeScoreOfLyrics = sentimentAnalysisOfLyrics.map(lambda row: row[negative])
  positiveScoreOfLyrics = sentimentAnalysisOfLyrics.map(lambda row: row[positive])
  neutralScoreOfLyrics = sentimentAnalysisOfLyrics.map(lambda row: row[neutral])
  compoundScoreOfLyrics = sentimentAnalysisOfLyrics.map(lambda row: row[compound])
  return pd.DataFrame(pd.concat([negativeScoreOfLyrics, positiveScoreOfLyrics,neutralScoreOfLyrics,compoundScoreOfLyrics], axis = 1))

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


In [3]:
nltk.download('stopwords')
nltk.download('punkt')

genres = list(df_music_train["genre"].unique())
def get_vectorizers_by_genre(df_music: pd.DataFrame) -> dict:
  df_music_lyric_tokenized = df_music.copy().fillna("")
  df_music_lyric_tokenized["lyric"] = df_music_lyric_tokenized["lyric"].map(lambda lyric: set(nltk.word_tokenize(lyric)))
  df_music_grouped_by_genre = df_music_lyric_tokenized[["genre", "lyric"]].groupby('genre').agg(lambda x: functools.reduce(set.union, x)).reset_index()
  vocabs = dict(zip(df_music_grouped_by_genre.genre.to_list(), df_music_grouped_by_genre.lyric.to_list()))
  stopwords = set(nltk.corpus.stopwords.words("english")).union(set(nltk.corpus.stopwords.words("spanish"))).union(set(nltk.corpus.stopwords.words("french")))
  vectorizers = {genre: TfidfVectorizer(input = "content", stop_words = stopwords, vocabulary = vocabs[genre]) for genre in genres}
  for genre in genres:
    vectorizers[genre].fit(df_music[df_music["genre"] == genre]["genre"])
  return vectorizers

vectorizers = get_vectorizers_by_genre(df_music_train)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
  "Upper case characters found in"


In [4]:
def get_sum_tfidf_from_lyrics(df_music: pd.DataFrame) -> pd.DataFrame:
  df_music = df_music.fillna("")
  column_names = ["sum_tfidf_for_" + genre.lower() for genre in genres]
  for i in range(0, len(genres)):
    df_music[column_names[i]] = np.sum(vectorizers[genres[i]].transform(df_music["lyric"]), axis = 1)
  return df_music[column_names]

In [5]:
def eliminate_genres_without_enough_observations(df_music: pd.DataFrame) -> pd.DataFrame:
  df_music_recuento_filas_por_genero = df_music.groupby("genre").count().reset_index()[["genre", "track_name"]].rename(
    columns = {"track_name": "rowCount"}).sort_values("rowCount")
  problematic_genres = list(df_music_recuento_filas_por_genero[df_music_recuento_filas_por_genero["rowCount"] < 50].genre)[1:]
  return df_music[~df_music["genre"].isin(problematic_genres)]

In [6]:
nltk.download('punkt')
def get_length_transforms_for_text(df_music: pd.DataFrame) -> pd.DataFrame:
  df_music["number_of_lines"] = df_music["lyric"].map(lambda lyric: len(lyric.split("\n")))
  df_music["number_of_tokens"] = df_music["lyric"].map(lambda lyric: len(nltk.word_tokenize(lyric)))
  df_music["length_lyrics"] = df_music["lyric"].map(lambda lyric: len(lyric))
  df_music["length_of_track_name"] = df_music["track_name"].map(lambda track_name: len(track_name))
  return df_music[["length_lyrics", "length_of_track_name", "number_of_lines", "number_of_tokens"]]

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [7]:
nltk.download('averaged_perceptron_tagger')

def get_POS_vector_from_lyric_POS(lyric_POS):
  useful_pos_tags = ["FW", "JJR", "NN", "NNS", "NNP", "PDT", "PRP", "RB", "RBR",
                   "UH", "VB", "VBG", "VBD", "VBN", "VBP", "VBZ"]
  POS_dictionary = {pos_tag:0 for pos_tag in useful_pos_tags}
  POS_dictionary.update({"other":0})

  for token in lyric_POS:
    if token[1] in useful_pos_tags:
      POS_dictionary[token[1]] += 1
    else:
      POS_dictionary["other"] += 1

  return [POS_dictionary[pos_tag] for pos_tag in POS_dictionary]
  

def get_POS_chunk_taggin_counts_for_text(df_music: pd.DataFrame) -> pd.DataFrame:
  df_music["lyric_POS"] = df_music["lyric"].map(lambda lyric: nltk.pos_tag(nltk.word_tokenize(lyric)))
  df_music["lyric_POS"] = df_music["lyric_POS"].map(lambda lyric_pos: get_POS_vector_from_lyric_POS(lyric_pos))
  return pd.DataFrame(df_music["lyric_POS"].to_list())


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


In [8]:
df_music_filtered_genres = eliminate_genres_without_enough_observations(df_music_train)
artists = list(df_music_filtered_genres["artist"].unique())
train_artists = set(artists[:int(0.9*len(artists))])
validation_artists = set(artists[int(0.9*len(artists)):])
train_set = df_music_filtered_genres[df_music_filtered_genres["artist"].isin(train_artists)]
validation_set = df_music_filtered_genres[df_music_filtered_genres["artist"].isin(validation_artists)]

In [9]:
def mean_hot_encoder(df_music: pd.DataFrame, df_training: pd.DataFrame) -> pd.DataFrame:
  df_training_grouped_by_lang = df_training.groupby("language").mean().reset_index()[["language", "popularity", "a_popularity", "loudness"]]
  df_new_columns = df_music.merge(df_training_grouped_by_lang, on = "language", how = "left")
  return df_new_columns[["popularity_y", "a_popularity_y", "loudness_y"]].fillna(0)

mean_hot_encoder_using_training_set = lambda df_to_encode: mean_hot_encoder(df_to_encode, df_music_train) 

In [10]:
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer

text_features = ["track_name", "lyric", "artist"]

numerical_features = ["a_songs", "a_popularity", "popularity", "acousticness", "danceability", "duration_ms",
                   "energy", "instrumentalness", "liveness", "loudness", "speechiness", "tempo", "valence"]

ordinal_features = ["key", "time_signature"]

one_hot_features = ["mode"]

mean_enc_features = ["language"]

artist_genres = ["a_genres", "genre"]

label = ["genre"]

identity_transformer = preprocessing.FunctionTransformer(None)

full_processor = ColumnTransformer(transformers=[
    ('text_sentiment_analysis', preprocessing.FunctionTransformer(apply_sentiment_analysis_to_lyrics), text_features),
    ('text_tf_idf', preprocessing.FunctionTransformer(get_sum_tfidf_from_lyrics), text_features),
    ('text_simple_transforms', preprocessing.FunctionTransformer(get_length_transforms_for_text), text_features),
    ('text_POS_count', preprocessing.FunctionTransformer(get_POS_chunk_taggin_counts_for_text), text_features),
    ('mean_encoding', preprocessing.FunctionTransformer(mean_hot_encoder_using_training_set), list(df_music_train.columns)),
    ('one_hot_encoding', preprocessing.OneHotEncoder(), one_hot_features),
    ('numerical', identity_transformer, numerical_features),
    ('ordinal', preprocessing.OrdinalEncoder(categories = [['C', 'C#', 'D', 'D#', 'E', 'F', 'F#', 'G', 'G#', 'A', 'A#', 'B'],
                                                            ['1/4', '3/4', '4/4', '5/4']]), ordinal_features)])


## Red Neuronal

In [11]:
X_train = full_processor.fit_transform(train_set)

In [12]:
label_encoder = preprocessing.LabelEncoder()
label_encoder.fit(train_set.genre)
y_train = label_encoder.transform(train_set.genre)
X_validation = full_processor.transform(validation_set)
y_validation = label_encoder.transform(validation_set.genre)

In [13]:
def reset_weights(model):
    for l in model.layers:
        if hasattr(l,"kernel_initializer"):
            l.kernel.assign(l.kernel_initializer(tf.shape(l.kernel)))
        if hasattr(l,"bias_initializer"):
            l.bias.assign(l.bias_initializer(tf.shape(l.bias)))
        if hasattr(l,"recurrent_initializer"):
            l.recurrent_kernel.assign(l.recurrent_initializer(tf.shape(l.recurrent_kernel)))


In [15]:
num_columns = X_train.shape[1]
num_classes = len(label_encoder.classes_)
width = 30
depth = 1
activation = "ReLU"

input = tf.keras.layers.Input(shape = (num_columns))
normalize = tf.keras.layers.Normalization()(input)

hidden_layers = [tf.keras.layers.Dense(width- int(0.3*i), activation = activation, kernel_initializer = tf.keras.initializers.HeNormal(),
                                       kernel_constraint=tf.keras.constraints.MaxNorm(5))
                  for i in range(0,depth)]

for i in range(0, depth):
  if i==0:
    hidden_layers[i] = hidden_layers[i](input)
  else:
    hidden_layers[i] = hidden_layers[i](hidden_layers[i-1])


output = tf.keras.layers.Dense(units = num_classes, activation = "softmax",
                               kernel_regularizer=tf.keras.regularizers.L1(0.001),
                               bias_regularizer=tf.keras.regularizers.L1(0.001))(hidden_layers[-1])
model_NN = tf.keras.models.Model(inputs = input, outputs = output)
model_NN.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, 71)]              0         
                                                                 
 dense_2 (Dense)             (None, 30)                2160      
                                                                 
 dense_3 (Dense)             (None, 21)                651       
                                                                 
Total params: 2,811
Trainable params: 2,811
Non-trainable params: 0
_________________________________________________________________


In [24]:
reset_weights(model_NN)
es = tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', patience=250, restore_best_weights = True)
model_NN.compile(optimizer=tf.keras.optimizers.Nadam(learning_rate = 0.000025),loss=tf.keras.losses.SparseCategoricalCrossentropy(),
                 metrics = ["accuracy"])

hist = model_NN.fit(x=X_train, y=y_train, batch_size = 64, epochs=2500, callbacks = [es],
                 validation_data=(X_validation, y_validation))

Epoch 1/2500
Epoch 2/2500
Epoch 3/2500
Epoch 4/2500
Epoch 5/2500
Epoch 6/2500
Epoch 7/2500
Epoch 8/2500
Epoch 9/2500
Epoch 10/2500
Epoch 11/2500
Epoch 12/2500
Epoch 13/2500
Epoch 14/2500
Epoch 15/2500
Epoch 16/2500
Epoch 17/2500
Epoch 18/2500
Epoch 19/2500
Epoch 20/2500
Epoch 21/2500
Epoch 22/2500
Epoch 23/2500
Epoch 24/2500
Epoch 25/2500
Epoch 26/2500
Epoch 27/2500
Epoch 28/2500
Epoch 29/2500
Epoch 30/2500
Epoch 31/2500
Epoch 32/2500
Epoch 33/2500
Epoch 34/2500
Epoch 35/2500
Epoch 36/2500
Epoch 37/2500
Epoch 38/2500
Epoch 39/2500
Epoch 40/2500
Epoch 41/2500
Epoch 42/2500
Epoch 43/2500
Epoch 44/2500
Epoch 45/2500
Epoch 46/2500
Epoch 47/2500
Epoch 48/2500
Epoch 49/2500
Epoch 50/2500
Epoch 51/2500
Epoch 52/2500
Epoch 53/2500
Epoch 54/2500
Epoch 55/2500
Epoch 56/2500
Epoch 57/2500
Epoch 58/2500
Epoch 59/2500
Epoch 60/2500
Epoch 61/2500
Epoch 62/2500
Epoch 63/2500
Epoch 64/2500
Epoch 65/2500
Epoch 66/2500
Epoch 67/2500
Epoch 68/2500
Epoch 69/2500
Epoch 70/2500
Epoch 71/2500
Epoch 72/2500
E

In [25]:
from sklearn.metrics import accuracy_score
accuracy_score(y_validation, np.argmax(model_NN.predict(X_validation), axis = 1))



0.27324312527280664

In [26]:
top_k_accuracy_score(y_validation, model_NN.predict(X_validation), labels = np.arange(0,21,1))



0.3897861195984286

In [31]:
X_test = full_processor.transform(eliminate_genres_without_enough_observations(df_music_test))

In [30]:
y_test = label_encoder.transform(eliminate_genres_without_enough_observations(df_music_test).genre)

In [32]:
top_k_accuracy_score(y_test, model_NN.predict(X_test), labels = np.arange(0,21,1))



0.3484796720191322