<a href="https://colab.research.google.com/github/Nacho2904/orga_de_datos/blob/main/TP3_MultiLabel.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import top_k_accuracy_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from google.colab import drive 
import nltk
import functools

drive.mount('/content/gdrive')
path_a_training_set = 'gdrive/MyDrive/TP3 dataset music/train.parquet'
path_a_test_set = 'gdrive/MyDrive/TP3 dataset music/test.parquet'


df_music_train = pd.read_parquet(path_a_training_set).fillna("")
df_music_test = pd.read_parquet(path_a_test_set).fillna("")

Mounted at /content/gdrive


## Preprocessing

In [2]:
nltk.download('stopwords')
nltk.download('punkt')

genres = list(df_music_train["genre"].unique())
def get_vectorizers_by_genre(df_music: pd.DataFrame) -> dict:
  df_music_lyric_tokenized = df_music.copy().fillna("")
  df_music_lyric_tokenized["lyric"] = df_music_lyric_tokenized["lyric"].map(lambda lyric: set(nltk.word_tokenize(lyric)))
  df_music_grouped_by_genre = df_music_lyric_tokenized[["genre", "lyric"]].groupby('genre').agg(lambda x: functools.reduce(set.union, x)).reset_index()
  vocabs = dict(zip(df_music_grouped_by_genre.genre.to_list(), df_music_grouped_by_genre.lyric.to_list()))
  stopwords = set(nltk.corpus.stopwords.words("english")).union(set(nltk.corpus.stopwords.words("spanish"))).union(set(nltk.corpus.stopwords.words("french")))
  vectorizers = {genre: TfidfVectorizer(input = "content", stop_words = stopwords, vocabulary = vocabs[genre]) for genre in genres}
  for genre in genres:
    vectorizers[genre].fit(df_music[df_music["genre"] == genre]["genre"])
  return vectorizers

vectorizers = get_vectorizers_by_genre(df_music_train)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
  "Upper case characters found in"


In [3]:
def get_sum_tfidf_from_lyrics(df_music: pd.DataFrame) -> pd.DataFrame:
  df_music = df_music.fillna("")
  column_names = ["sum_tfidf_for_" + genre.lower() for genre in genres]
  for i in range(0, len(genres)):
    df_music[column_names[i]] = np.sum(vectorizers[genres[i]].transform(df_music["lyric"]), axis = 1)
  return df_music[column_names]

get_sum_tfidf_from_lyrics(df_music_train).columns

Index(['sum_tfidf_for_dance', 'sum_tfidf_for_pop', 'sum_tfidf_for_r&b',
       'sum_tfidf_for_hip-hop', 'sum_tfidf_for_rap', 'sum_tfidf_for_soul',
       'sum_tfidf_for_rock', 'sum_tfidf_for_blues', 'sum_tfidf_for_folk',
       'sum_tfidf_for_jazz', 'sum_tfidf_for_country',
       'sum_tfidf_for_alternative', 'sum_tfidf_for_children’s music',
       'sum_tfidf_for_indie', 'sum_tfidf_for_electronic',
       'sum_tfidf_for_classical', 'sum_tfidf_for_opera', 'sum_tfidf_for_world',
       'sum_tfidf_for_ska', 'sum_tfidf_for_comedy', 'sum_tfidf_for_reggae',
       'sum_tfidf_for_reggaeton', 'sum_tfidf_for_children's music',
       'sum_tfidf_for_soundtrack', 'sum_tfidf_for_anime',
       'sum_tfidf_for_movie'],
      dtype='object')

In [4]:
def eliminate_genres_without_enough_observations(df_music: pd.DataFrame) -> pd.DataFrame:
  df_music_recuento_filas_por_genero = df_music.groupby("genre").count().reset_index()[["genre", "track_name"]].rename(
    columns = {"track_name": "rowCount"}).sort_values("rowCount")
  problematic_genres = list(df_music_recuento_filas_por_genero[df_music_recuento_filas_por_genero["rowCount"] < 50].genre)[1:]
  return df_music[~df_music["genre"].isin(problematic_genres)]

In [5]:
nltk.download('punkt')
def get_length_transforms_for_text(df_music: pd.DataFrame) -> pd.DataFrame:
  df_music["number_of_lines"] = df_music["lyric"].map(lambda lyric: len(lyric.split("\n")))
  df_music["number_of_tokens"] = df_music["lyric"].map(lambda lyric: len(nltk.word_tokenize(lyric)))
  df_music["length_lyrics"] = df_music["lyric"].map(lambda lyric: len(lyric))
  return df_music[["length_lyrics", "number_of_lines", "number_of_tokens"]]

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [6]:
df_music_filtered_genres = eliminate_genres_without_enough_observations(df_music_train)
artists = list(df_music_filtered_genres["artist"].unique())
train_artists = set(artists[:int(0.9*len(artists))])
validation_artists = set(artists[int(0.9*len(artists)):])
train_set = df_music_filtered_genres[df_music_filtered_genres["artist"].isin(train_artists)]
validation_set = df_music_filtered_genres[df_music_filtered_genres["artist"].isin(validation_artists)]

In [7]:
def mean_hot_encoder(df_music: pd.DataFrame, df_training: pd.DataFrame) -> pd.DataFrame:
  df_training_grouped_by_lang = df_training.groupby("language").mean().reset_index()[["language", "popularity"]]
  df_new_columns = df_music.merge(df_training_grouped_by_lang, on = "language", how = "left")
  return df_new_columns[["popularity_y"]].fillna(0)

mean_hot_encoder_using_training_set = lambda df_to_encode: mean_hot_encoder(df_to_encode, df_music_train) 

In [8]:
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer

text_features = ["track_name", "lyric", "artist"]

numerical_features = ["a_songs", "a_popularity", "popularity", "acousticness", "danceability", "duration_ms",
                   "energy", "instrumentalness", "liveness", "loudness", "speechiness", "tempo", "valence"]

ordinal_features = ["key"]

one_hot_features = ["mode"]

mean_enc_features = ["language"]

artist_genres = ["a_genres", "genre"]

label = ["genre"]

identity_transformer = preprocessing.FunctionTransformer(None)

label_encoder = preprocessing.LabelEncoder()
label_encoder.fit(df_music_train["genre"])

full_processor = ColumnTransformer(transformers=[
    ('text_tf_idf', preprocessing.FunctionTransformer(get_sum_tfidf_from_lyrics), text_features),
    ('text_simple_transforms', preprocessing.FunctionTransformer(get_length_transforms_for_text), text_features),
    ('mean_encoding', preprocessing.FunctionTransformer(mean_hot_encoder_using_training_set), list(df_music_train.columns)),
    ('one_hot_encoding', preprocessing.OneHotEncoder(), one_hot_features),
    ('numerical', identity_transformer, numerical_features),
    ('ordinal', preprocessing.OrdinalEncoder(categories = [['C', 'C#', 'D', 'D#', 'E', 'F', 'F#', 'G', 'G#', 'A', 'A#', 'B']]), ordinal_features)])



In [9]:
X_train = full_processor.fit_transform(train_set)
X_validation = full_processor.transform(validation_set)

In [55]:
def create_one_hot_vector(genres_of_song:list) -> list:
  genres = sorted(list(df_music_train["genre"].unique()))
  dict_genres = {genre: genres.index(genre) for genre in genres}  
  one_hot_vector = [0 for genre in genres]
  for genre in genres_of_song:
    one_hot_vector[dict_genres[genre]] = 1
  return one_hot_vector

def obtain_dataset_with_encoded_target(df_data: pd.DataFrame) -> np.array:
  df_data_songs_and_genres = df_data[["track_name", "genre"]].groupby("track_name").agg(list)
  df_data_songs_and_genres["one_hotted_target"] = df_data_songs_and_genres["genre"].map(create_one_hot_vector)
  df_data_with_one_hot_vector = df_data_songs_and_genres.merge(df_data, on = "track_name", how = "left")
  return df_data_with_one_hot_vector


In [56]:
obtain_dataset_with_encoded_target(train_set)

Unnamed: 0,track_name,genre_x,one_hotted_target,lyric,genre_y,language,popularity,artist,a_genres,a_songs,...,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence
0,#1,"[Dance, R&B, Hip-Hop]","[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, ...",Uh uh uh\nI just gotta bring it to they attent...,Dance,en,49,Nelly,Rap; Black Music; Hip Hop,171.0,...,0.600,0.000000,G,0.419,-6.157,Major,0.2280,89.985,4/4,0.463
1,#1,"[Dance, R&B, Hip-Hop]","[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, ...",Uh uh uh\nI just gotta bring it to they attent...,R&B,en,50,Nelly,Rap; Black Music; Hip Hop,171.0,...,0.600,0.000000,G,0.419,-6.157,Major,0.2280,89.985,4/4,0.463
2,#1,"[Dance, R&B, Hip-Hop]","[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, ...",Uh uh uh\nI just gotta bring it to they attent...,Hip-Hop,en,50,Nelly,Rap; Black Music; Hip Hop,171.0,...,0.600,0.000000,G,0.419,-6.157,Major,0.2280,89.985,4/4,0.463
3,#1 Crush,"[Alternative, Dance, Children’s Music, Rock]","[1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ...",I would die for you\nI would die for you\nI've...,Alternative,en,47,Garbage,Pop/Rock; Rock,144.0,...,0.630,0.001030,D,0.349,-7.112,Minor,0.0234,94.195,4/4,0.424
4,#1 Crush,"[Alternative, Dance, Children’s Music, Rock]","[1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ...",I would die for you\nI would die for you\nI've...,Dance,en,50,Garbage,Pop/Rock; Rock,144.0,...,0.630,0.001030,D,0.349,-7.112,Minor,0.0234,94.195,4/4,0.424
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28949,untitled 05 | 09.21.2014.,"[Hip-Hop, Rap]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ...",Somebody said you bumped your head and bled th...,Rap,en,55,Kendrick Lamar,Rap; Hip Hop,125.0,...,0.630,0.000000,C,0.163,-9.839,Major,0.5080,154.152,4/4,0.413
28950,untitled 06 | 06.30.2014.,"[Hip-Hop, Rap]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ...",These metamorphic supernatural forces dominate...,Hip-Hop,en,57,Kendrick Lamar,Rap; Hip Hop,125.0,...,0.558,0.000007,E,0.225,-11.341,Minor,0.0433,89.179,4/4,0.647
28951,untitled 06 | 06.30.2014.,"[Hip-Hop, Rap]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ...",These metamorphic supernatural forces dominate...,Rap,en,57,Kendrick Lamar,Rap; Hip Hop,125.0,...,0.558,0.000007,E,0.225,-11.341,Minor,0.0433,89.179,4/4,0.647
28952,untitled 08 | 09.06.2014.,"[Hip-Hop, Rap]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ...",Why so sad?\nWalking around with them blue fac...,Hip-Hop,en,59,Kendrick Lamar,Rap; Hip Hop,125.0,...,0.527,0.000002,B,0.137,-9.661,Minor,0.2110,119.987,4/4,0.667
