# Solucion con Entrenamiento

## Instalación de librerias

In [None]:
!pip install pandas
!pip install torch
!pip install transformers
!pip install imbalanced-learn
!pip install delayed
!pip install datasets

Collecting delayed
  Downloading delayed-0.11.0b1-py2.py3-none-any.whl (19 kB)
Collecting hiredis (from delayed)
  Downloading hiredis-2.2.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (165 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m165.9/165.9 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting redis (from delayed)
  Downloading redis-5.0.1-py3-none-any.whl (250 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m250.3/250.3 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: redis, hiredis, delayed
Successfully installed delayed-0.11.0b1 hiredis-2.2.3 redis-5.0.1
Collecting datasets
  Downloading datasets-2.15.0-py3-none-any.whl (521 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow-hotfix (from datasets)
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)
Collecting dill<0.3.8,>=0.3.0 (from d

## Dataset y EDA

In [None]:
# Importar las bibliotecas necesarias
import pandas as pd
import re
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import train_test_split
from datasets import load_dataset


dataset = load_dataset("manoh2f2/tsterbak-lyrics-dataset-with-emotions")

# Cargar el archivo Excel en un DataFrame
split_name = 'train'
df = dataset[split_name].to_pandas()
df = df.drop("__index_level_0__", axis=1)

#Preprocesar la columna 'seq', que contiene los lyrics de las canciones
df["seq"] = df["seq"].astype(str) #Convertir la columna 'seq' a tipo de dato string
df["seq"] = df["seq"].apply(lambda x: re.sub(r"[^a-zA-Z\s]", "", x)) #Eliminar caracteres no alfabéticos de la columna

#df = df.drop(df.index[-40000:], inplace=False)

# Imprimir la estructura de los datos mostrando la primera fila
print(df.head(1))


Downloading readme:   0%|          | 0.00/467 [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/21.8M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/36897 [00:00<?, ? examples/s]

   Unnamed: 0        artist  \
0           0  Elijah Blake   

                                                 seq      song  emotions  
0  No noxD\nI aint ever trapped out the bandoxD\n...  Everyday  ['fear']  


In [None]:
# Calcular y mostrar la cuenta de valores únicos en la columna 'emotions'
# normalize=True devuelve las proporciones en lugar de las frecuencias absolutas
# dropna=False incluye los valores NaN en el conteo
df['emotions'].value_counts(normalize=True, dropna=False)

['fear']        0.296447
['sadness']     0.292680
['neutral']     0.154999
['anger']       0.109033
['joy']         0.078082
['surprise']    0.046237
['disgust']     0.022522
Name: emotions, dtype: float64

In [None]:
# Dividir el DataFrame en características (X) y variable objetivo (y)
X = df.drop('emotions', axis=1)# X contiene todas las columnas excepto 'emotions'
y = df['emotions']# y contiene solo la columna 'emotions', que es la variable objetivo

# Dividir los datos en conjuntos de entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Se divide el conjunto de datos en conjuntos de entrenamiento y prueba usando train_test_split.
# X_train y y_train son las características y la variable objetivo del conjunto de entrenamiento, respectivamente.
# X_test e y_test son las características y la variable objetivo del conjunto de prueba, respectivamente.

# Aplicar Random UnderSampler al conjunto de entrenamiento resampleado
under_sampler = RandomUnderSampler(random_state=42)
X_resampled, y_resampled = under_sampler.fit_resample(X_train, y_train)
# Se aplica Random UnderSampler al conjunto de entrenamiento para equilibrar las clases.
# Esto reduce aleatoriamente la cantidad de muestras de la clase mayoritaria para igualar la cantidad de la clase minoritaria.


# Crear un nuevo DataFrame con los datos resampleados
df_resampled = pd.concat([pd.DataFrame(X_resampled, columns=X.columns), pd.Series(y_resampled, name='emotions')], axis=1)
# Se crea un nuevo DataFrame (`df_resampled`) utilizando las características y la variable objetivo resampleadas.

In [None]:
# Calcular y mostrar la cuenta de valores únicos en la columna 'emotions' del DataFrame resampleado
# normalize=True devuelve las proporciones en lugar de las frecuencias absolutas
# dropna=False incluye los valores NaN en el conteo
df_resampled['emotions'].value_counts(normalize=True, dropna=False)

['anger']       0.142857
['disgust']     0.142857
['fear']        0.142857
['joy']         0.142857
['neutral']     0.142857
['sadness']     0.142857
['surprise']    0.142857
Name: emotions, dtype: float64

In [None]:
# Supongamos que df es tu DataFrame y tiene una columna 'emociones' que contiene listas de emociones
# Primero, filtramos para mantener solo las filas con una emoción
df_filtrado = df_resampled[df_resampled['emotions'].apply(lambda x: len(x) == 1)]

# Convertimos la columna de listas en una columna de valores únicos
df_filtrado['emotions'] = df_filtrado['emotions'].apply(lambda x: x[0])

# Ahora, contamos las entradas para cada categoría de emoción
conteo_emociones = df_filtrado['emotions'].value_counts()

# Encontramos el número mínimo de entradas que una categoría de emoción tiene
min_conteo = conteo_emociones.min()

# Creamos un nuevo DataFrame vacío para las entradas balanceadas
df_balanceado = pd.DataFrame()

# Para cada emoción, tomamos una muestra del tamaño de la categoría más pequeña
for emocion in conteo_emociones.index:
    df_muestra = df_filtrado[df_filtrado['emotions'] == emocion].sample(min_conteo, replace=False)
    df_balanceado = pd.concat([df_balanceado, df_muestra])

# Restablecemos el índice del DataFrame resultante
df_balanceado.reset_index(drop=True, inplace=True)

# Ahora df_balanceado tiene un número igual de entradas para cada emoción


## Entrenamiento

In [None]:
import torch
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset, random_split
from tqdm import tqdm  # Import tqdm



# Tokenize the lyrics
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
encoded_data = tokenizer(df_resampled['seq'].tolist(), padding=True, truncation=True, return_tensors='pt', max_length=256)
#Utiliza DistilBERT tokenizer para convertir las secuencias de texto 'seq' en representaciones numéricas

# Prepare input tensors and labels
input_ids = encoded_data['input_ids']
attention_mask = encoded_data['attention_mask']
labels = torch.tensor(df_resampled['emotions'].astype('category').cat.codes.tolist())  # Assuming 'emotions' is a categorical variable

# Create a PyTorch dataset, con las representaciones numéricas y etiquetas de emociones
dataset = TensorDataset(input_ids, attention_mask, labels)

# Split the data into training and validation sets
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

# Create data loaders para los conjuntos de entrenamiento y validación, que se usarán en el entrenamiento
batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

# Model, preentrenado para clasificaión de secuencias
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=len(df['emotions'].unique()))

# Optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)

# Training loop, itera sobre le conjunto de entrenamiento durante 6 épocas, realiza el entrenamiento y muestra la pérdida en una barra de progreso
num_epochs = 6
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

for epoch in range(num_epochs):
    model.train()
    train_loader_tqdm = tqdm(train_loader, desc=f'Epoch {epoch + 1}')

    for batch in train_loader_tqdm:
        batch = tuple(t.to(device) for t in batch)
        inputs = {'input_ids': batch[0], 'attention_mask': batch[1], 'labels': batch[2]}
        outputs = model(**inputs)
        loss = outputs.loss
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        train_loader_tqdm.set_postfix({'Loss': loss.item()})  # Display loss in the progress bar



# Save the trained model
model.save_pretrained('emotion_model')


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'pre_classifier.weight', 'classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1: 100%|██████████| 118/118 [01:48<00:00,  1.08it/s, Loss=1.23]
Epoch 2: 100%|██████████| 118/118 [01:46<00:00,  1.11it/s, Loss=0.687]
Epoch 3: 100%|██████████| 118/118 [01:51<00:00,  1.06it/s, Loss=0.4]
Epoch 4: 100%|██████████| 118/118 [01:54<00:00,  1.03it/s, Loss=0.48]
Epoch 5: 100%|██████████| 118/118 [01:55<00:00,  1.02it/s, Loss=0.0681]
Epoch 6: 100%|██████████| 118/118 [01:52<00:00,  1.04it/s, Loss=0.403]


## Carga a HugginFace

In [None]:
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Token: 
Add token as git credential? (Y/n) n
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [None]:
import locale
def getpreferredencoding(do_setlocale = True):
    return "UTF-8"
locale.getpreferredencoding = getpreferredencoding

In [None]:
from huggingface_hub import HfApi
api = HfApi()

# Modificalo para que sea tu repositorio
api.upload_folder(
    folder_path="emotion_model", # Modelo a guardar
    repo_id="manoh2f2/recommend_songs", # Repositorio
    repo_type="model", # Tipo de repositorio, en nuestro caso modelo
)

'https://huggingface.co/manoh2f2/recommend_songs/tree/main/'

In [None]:
from transformers import DistilBertForSequenceClassification, DistilBertTokenizer

tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

In [None]:
tokenizer.push_to_hub('manoh2f2/recommend_songs')

CommitInfo(commit_url='https://huggingface.co/manoh2f2/recommend_songs/commit/47f95505c53a22eb81cb9b8d9d10e9b5bb4da36c', commit_message='Upload tokenizer', commit_description='', oid='47f95505c53a22eb81cb9b8d9d10e9b5bb4da36c', pr_url=None, pr_revision=None, pr_num=None)

## Prueba del Modelo

In [None]:
# ! pip install transformers

In [None]:
from transformers import DistilBertForSequenceClassification, DistilBertTokenizer
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import numpy as np

tokenizer = AutoTokenizer.from_pretrained("manoh2f2/recommend_songs")
model = AutoModelForSequenceClassification.from_pretrained("manoh2f2/recommend_songs")

# Define a prompt
prompt = "I am sad"

# Tokenize the prompt
encoded_prompt = tokenizer(prompt, return_tensors='pt', max_length=256)

# Make a prediction using the trained model
with torch.no_grad():
    model_output = model(**encoded_prompt)

# Get the predicted emotion index
predicted_emotion_index = torch.argmax(model_output.logits).item()

# Map the index back to the emotion label using the DataFrame
predicted_emotion_label = df_resampled['emotions'].unique()[predicted_emotion_index]

# Get a song associated with the predicted emotion from the DaraFrame
result = df_resampled[df_resampled['emotions'] == predicted_emotion_label]

# Get the number of rows in the DataFrame
num_rows = result.shape[0]
#Generate a random index to select a random song from the DataFrame
random_index = np.random.randint(0, num_rows)

#Get the recommended song and artist
recommended_song = result['song'].iloc[random_index]
recommended_artist = result['artist'].iloc[random_index]

#Print the results
print(f"Prompt: {prompt}")
print(f"Predicted Emotion: {predicted_emotion_label}")
print(f"Recommended Song: {recommended_song} - {recommended_artist}")


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Prompt: I am sad
Predicted Emotion: ['sadness']
Recommended Song: Love Vigilantes - New Order
