In [1]:
!pip install -U huggingface_hub
!pip install -U datasets
!pip install transformers[torch]
!pip install evaluate
!pip install scikit-learn

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m14.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (

## **Clasificacion con Transformers**

In [2]:
import pandas as pd
from datasets import Dataset

df = pd.read_csv('canciones.csv')

# Renombrar la columna 'tag' a 'labels'
df = df.rename(columns={'tag': 'labels'})

# Crear un mapeo de etiquetas a índices
label2id = {label: idx for idx, label in enumerate(df['labels'].unique())}
id2label = {idx: label for label, idx in label2id.items()}

# Mapear etiquetas a índices numéricos
df['labels'] = df['labels'].map(label2id)

# Seleccionar columnas relevantes
df = df[['lyrics', 'labels', 'split']]

# Dividir en conjuntos de entrenamiento, validación y prueba
train_data = df[df['split'] == 'train']
val_data = df[df['split'] == 'validation']
test_data = df[df['split'] == 'test']

# Convertir a objetos Dataset de Huggingface
train_dataset = Dataset.from_pandas(train_data)
val_dataset = Dataset.from_pandas(val_data)
test_dataset = Dataset.from_pandas(test_data)


In [3]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Seleccionar modelo preentrenado
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Cargar modelo con el número de etiquetas según géneros únicos
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id
)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
def tokenize_function(examples):
    tokens = tokenizer(examples['lyrics'], padding="max_length", truncation=True)
    tokens['labels'] = examples['labels']  # Asegurar que las etiquetas se copien correctamente como números
    return tokens

# Tokenizar los datasets
train_dataset = train_dataset.map(tokenize_function, batched=True)
val_dataset = val_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

# Formatear los datasets para entrenamiento
train_dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])
val_dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])
test_dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])


Map:   0%|          | 0/21000 [00:00<?, ? examples/s]

Map:   0%|          | 0/4500 [00:00<?, ? examples/s]

Map:   0%|          | 0/4500 [00:00<?, ? examples/s]

In [5]:
from transformers import TrainingArguments, Trainer
from sklearn.metrics import accuracy_score, f1_score

from sklearn.metrics import precision_recall_fscore_support, accuracy_score

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)

    # Cálculo de las métricas
    f1 = f1_score(labels, preds, average="weighted")
    acc = accuracy_score(labels, preds)
    precision, recall, _, support = precision_recall_fscore_support(labels, preds, average=None)

    # Retorna todas las métricas en el diccionario
    return {
        "accuracy": acc,
        "f1": f1,
        "precision": precision.tolist(),
        "recall": recall.tolist(),
        "support": support.tolist()
    }



training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=500,
    save_total_limit=2,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,  # Incluye el tokenizer
    compute_metrics=compute_metrics,
)


  trainer = Trainer(


In [6]:
print(train_dataset[0]["labels"])  # Verifica que las etiquetas sean enteros


tensor(0)


CLAVE: d6f16458f7d148cb6dbc35d20b8c5521c64a98e5

In [7]:
# Entrenar el modelo
trainer.train()

# Evaluar en el conjunto de prueba
results = trainer.evaluate(test_dataset)
print(results)


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,Support
1,0.8256,0.775606,0.668,0.666414,"[0.4692387904066736, 0.8960603520536463, 0.5542521994134897, 0.7428861788617886]","[0.4147465437788018, 0.9082412914188616, 0.675, 0.6538461538461539]","[1085, 1177, 1120, 1118]"
2,0.7003,0.769726,0.670667,0.672635,"[0.43845534995977475, 0.9008403361344538, 0.6214285714285714, 0.7286108555657773]","[0.5023041474654378, 0.9107901444350043, 0.54375, 0.7084078711985689]","[1085, 1177, 1120, 1118]"
3,0.5346,0.844629,0.679333,0.677918,"[0.4759660697455231, 0.8964076858813701, 0.614115490375802, 0.7028670721112077]","[0.46543778801843316, 0.9116397621070518, 0.5982142857142857, 0.723613595706619]","[1085, 1177, 1120, 1118]"


Trainer is attempting to log a value of "[0.4692387904066736, 0.8960603520536463, 0.5542521994134897, 0.7428861788617886]" of type <class 'list'> for key "eval/precision" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "[0.4147465437788018, 0.9082412914188616, 0.675, 0.6538461538461539]" of type <class 'list'> for key "eval/recall" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "[1085, 1177, 1120, 1118]" of type <class 'list'> for key "eval/support" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "[0.43845534995977475, 0.9008403361344538, 0.6214285714285714, 0.7286108555657773]" of type <class 'list'> for key "eval/precision" as a scalar. This invocation of Tensorboard's writer.add_scala

Trainer is attempting to log a value of "[0.48148148148148145, 0.8818565400843882, 0.5597426470588235, 0.7163543441226575]" of type <class 'list'> for key "eval/precision" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "[0.44551845342706503, 0.9024179620034543, 0.5665116279069767, 0.7449069973427812]" of type <class 'list'> for key "eval/recall" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "[1138, 1158, 1075, 1129]" of type <class 'list'> for key "eval/support" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.


{'eval_loss': 0.8552080988883972, 'eval_accuracy': 0.6671111111111111, 'eval_f1': 0.6643408310178515, 'eval_precision': [0.48148148148148145, 0.8818565400843882, 0.5597426470588235, 0.7163543441226575], 'eval_recall': [0.44551845342706503, 0.9024179620034543, 0.5665116279069767, 0.7449069973427812], 'eval_support': [1138, 1158, 1075, 1129], 'eval_runtime': 67.3802, 'eval_samples_per_second': 66.785, 'eval_steps_per_second': 8.356, 'epoch': 3.0}


## **Generacion de texto con GPT-2**

In [None]:
# Sección 4: Generación de texto con GPT-2
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import pandas as pd

# Configurar el modelo y tokenizador
device = "cuda" if torch.cuda.is_available() else "cpu"
model_name = "gpt2-xl"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name).to(device)
tokenizer.pad_token = tokenizer.eos_token


In [None]:
# Texto de entrada
input_txt = "Genre: pop Lyrics: A romantic song about love and heartbreak."
input_tokens = tokenizer(input_txt, return_tensors="pt", padding=True, truncation=True)
input_ids = input_tokens["input_ids"].to(device)
attention_mask = input_tokens["attention_mask"].to(device)


In [None]:
# Función para generación de texto
def generate_text(input_ids, attention_mask, method="greedy", **kwargs):
    if method == "greedy":
        output = model.generate(
            input_ids,
            attention_mask=attention_mask,
            max_new_tokens=50,
            do_sample=False,
            no_repeat_ngram_size=2
        )
    elif method == "beam":
        output = model.generate(
            input_ids,
            attention_mask=attention_mask,
            max_new_tokens=50,
            num_beams=5,
            no_repeat_ngram_size=2,
        )
    elif method == "sampling":
        output = model.generate(
            input_ids,
            attention_mask=attention_mask,
            max_new_tokens=50,
            do_sample=True,
            temperature=0.7,
            top_k=50,
            top_p=0.9,
            no_repeat_ngram_size=2
        )
    else:
        raise ValueError("Método desconocido: elija 'greedy', 'beam' o 'sampling'")

    return tokenizer.decode(output[0], skip_special_tokens=True)

# Generaciones con diferentes métodos
print("\nGreedy Search:")
print(generate_text(input_ids, attention_mask, method="greedy"))

print("\nBeam Search:")
print(generate_text(input_ids, attention_mask, method="beam"))

print("\nSampling:")
print(generate_text(input_ids, attention_mask, method="sampling"))

In [None]:
# Configuración del tokenizador y modelo
tokenizer.pad_token = tokenizer.eos_token
input_tokens = tokenizer(input_txt, return_tensors="pt", padding=True, truncation=True)
input_ids = input_tokens["input_ids"].to(device)
attention_mask = input_tokens["attention_mask"].to(device)

# Función para calcular probabilidad logarítmica
def log_probs_from_logits(logits, labels):
    logp = torch.nn.functional.log_softmax(logits, dim=-1)
    return torch.gather(logp, 2, labels.unsqueeze(2)).squeeze(-1)

def sequence_logprob(model, labels, input_len=0):
    with torch.no_grad():
        output = model(labels)
        log_probs = log_probs_from_logits(output.logits[:, :-1, :], labels[:, 1:])
        seq_log_prob = torch.sum(log_probs[:, input_len:])
        return seq_log_prob.cpu().numpy()

# Generación con Greedy Search
output_greedy = model.generate(
    input_ids,
    attention_mask=attention_mask,
    max_new_tokens=50,
    do_sample=False
)

# Calcular Log-prob para evaluación
logp_greedy = sequence_logprob(model, output_greedy, input_len=len(input_ids[0]))
print(f"\nLog-prob (Greedy): {logp_greedy:.2f}")
