### 1. Modelo pre entrenado

In [125]:
from datasets import load_dataset
dataset = load_dataset('csv', data_files='data_small_train.csv')

In [89]:
dataset

DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'TARGET', 'FECHA_STRING', 'COMENTARIO', 'FECHA', 'HORA', 'COMENTARIO_LIMPIO', 'PALABRAS', 'TOKENS', 'NUM_TOKENS', 'sentimiento', 'comentario_tarea'],
        num_rows: 30000
    })
})

In [90]:
from datasets import DatasetDict, Dataset

def train_test_split(dataset_dict: DatasetDict, test_size: float = 0.2) -> DatasetDict:
    """Split a dataset dictionary into train and test based on test size.

    Args:
        dataset_dict (DatasetDict): Input dataset dictionary.
        test_size (float, optional): Fraction of data to include in the test set. Defaults to 0.2.

    Returns:
        DatasetDict: Result dataset dictionary with the desired splitting strategy.
    """
    train_test = dataset_dict["train"].train_test_split(test_size=test_size)

    return DatasetDict({
        "train": train_test["train"],
        "test": train_test["test"]
    })


In [91]:
# Uso de la función para dividir un DatasetDict en train y test
divided_data = train_test_split(dataset, test_size=0.2)

In [92]:
dataset_train_dataset = divided_data["train"].shuffle(seed=42).select([i for i in list(range(10000))])
dataset_test_dataset = divided_data["test"].shuffle(seed=42).select([i for i in list(range(2000))])

In [93]:
train_ds = Dataset.from_dict({ 
    "text": dataset_train_dataset["COMENTARIO_LIMPIO"], # Campo de texto 
    "label": dataset_train_dataset["TARGET"] # Campo de sentimiento
})

test_ds = Dataset.from_dict({ 
    "text": dataset_test_dataset["COMENTARIO_LIMPIO"], # Campo de texto 
    "label": dataset_test_dataset["TARGET"] # Campo de sentimiento
})

In [94]:
from transformers import T5Tokenizer, AutoTokenizer

tokenizer = T5Tokenizer.from_pretrained('google/flan-t5-small')

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [95]:
def preprocess_function(examples):
   return tokenizer(examples["text"], truncation=True, padding="max_length", max_length=64) #padding=True
 
tokenized_train = train_ds.map(preprocess_function, batched=True)
tokenized_test = test_ds.map(preprocess_function, batched=True)

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

In [96]:
texts_with_labels = []

for i in range(len(tokenized_train)):
    text = tokenized_train[i]["text"]
    label = tokenized_train[i]["label"]
    texts_with_labels.append({"text": text, "sentiment": label})

In [126]:
from sklearn.metrics import f1_score
from transformers import AutoTokenizer, AutoModelWithLMHead,AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained("google/flan-t5-small", num_labels=2)

Some weights of T5ForSequenceClassification were not initialized from the model checkpoint at google/flan-t5-small and are newly initialized: ['classification_head.out_proj.weight', 'classification_head.out_proj.bias', 'classification_head.dense.weight', 'classification_head.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [98]:
import torch

f1_metric = load_metric("f1")

#def get_sentiment_list(texts_with_labels):
predictions = []  # Lista para almacenar las etiquetas de sentimiento generadas por el modelo
labels = []  # Lista para almacenar las etiquetas de sentimiento reales

for item in texts_with_labels:
    text = item["text"]
    true_sentiment = item["sentiment"]

    inputs = tokenizer(text, return_tensors="pt")
    outputs = model(**inputs)
    print(logits)
    logits = outputs.logits
    predicted_class = torch.argmax(logits, dim=1).item()
    
    predictions.append(predicted_class)
    labels.append(true_sentiment)

# Calcular el F1-score
f1_score = f1_metric.compute(predictions=predictions, references=labels)["f1"]

# Mostrar el resultado
print(f"F1-score con modelo preentrenado: {f1_score}")

tensor([[-0.0024, -0.0455]], grad_fn=<AddmmBackward0>)
tensor([[-0.4509,  0.1684]], grad_fn=<AddmmBackward0>)
tensor([[-0.2468,  0.1485]], grad_fn=<AddmmBackward0>)
tensor([[-0.2551,  0.0971]], grad_fn=<AddmmBackward0>)
tensor([[-0.2662,  0.1953]], grad_fn=<AddmmBackward0>)
tensor([[-0.4207,  0.1219]], grad_fn=<AddmmBackward0>)
tensor([[-0.2556,  0.1601]], grad_fn=<AddmmBackward0>)
tensor([[-0.1357,  0.3012]], grad_fn=<AddmmBackward0>)
tensor([[-0.3349,  0.1916]], grad_fn=<AddmmBackward0>)
tensor([[-0.2810,  0.0896]], grad_fn=<AddmmBackward0>)
tensor([[-0.4469,  0.1593]], grad_fn=<AddmmBackward0>)
tensor([[-0.1929,  0.3326]], grad_fn=<AddmmBackward0>)
tensor([[-0.3553,  0.3172]], grad_fn=<AddmmBackward0>)
tensor([[-0.3512,  0.3059]], grad_fn=<AddmmBackward0>)
tensor([[-0.3973,  0.1914]], grad_fn=<AddmmBackward0>)
tensor([[-0.3516,  0.1133]], grad_fn=<AddmmBackward0>)
tensor([[-0.3597,  0.1674]], grad_fn=<AddmmBackward0>)
tensor([[-0.2341,  0.2190]], grad_fn=<AddmmBackward0>)
tensor([[-

In [99]:
### Aquí termina la primera parte

### 2. Fine tuning al modelo pre entrenado

In [127]:
from datasets import load_dataset
dataset = load_dataset('csv', data_files='data_small_train.csv')

#dataset = load_dataset(train_dc_2)

In [128]:
from datasets import DatasetDict, Dataset

def train_test_split(dataset_dict: DatasetDict, test_size: float = 0.2) -> DatasetDict:
    """Split a dataset dictionary into train and test based on test size.

    Args:
        dataset_dict (DatasetDict): Input dataset dictionary.
        test_size (float, optional): Fraction of data to include in the test set. Defaults to 0.2.

    Returns:
        DatasetDict: Result dataset dictionary with the desired splitting strategy.
    """
    train_test = dataset_dict["train"].train_test_split(test_size=test_size)

    return DatasetDict({
        "train": train_test["train"],
        "test": train_test["test"]
    })

In [129]:
# Uso de la función para dividir un DatasetDict en train y test
divided_data = train_test_split(dataset, test_size=0.2)

In [130]:
dataset_train_dataset = divided_data["train"].shuffle(seed=42).select([i for i in list(range(100))])
dataset_test_dataset = divided_data["test"].shuffle(seed=42).select([i for i in list(range(20))])

In [131]:
train_ds = Dataset.from_dict({ 
    "text": dataset_train_dataset["COMENTARIO_LIMPIO"], # Campo de texto 
    "label": dataset_train_dataset["TARGET"] # Campo de sentimiento
})

test_ds = Dataset.from_dict({ 
    "text": dataset_test_dataset["COMENTARIO_LIMPIO"], # Campo de texto 
    "label": dataset_test_dataset["TARGET"] # Campo de sentimiento
})

In [132]:
from transformers import T5Tokenizer, AutoTokenizer

#tokenizer = T5Tokenizer.from_pretrained('t5-small')
tokenizer = T5Tokenizer.from_pretrained('google/flan-t5-small')
#tokenizer = AutoTokenizer.from_pretrained('google/flan-t5-small')

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [133]:
def preprocess_function(examples):
   return tokenizer(examples["text"], truncation=True, padding="max_length", max_length=64) #padding=True
 
tokenized_train = train_ds.map(preprocess_function, batched=True)
tokenized_test = test_ds.map(preprocess_function, batched=True)

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

In [141]:
max_length = max(len(ids) for ids in tokenized_train['input_ids'])
for i in range(len(tokenized_train['input_ids'])):
    while len(tokenized_train['input_ids'][i]) < max_length:
        tokenized_train['input_ids'][i].append(0)
        tokenized_train['attention_mask'][i].append(0)

In [142]:
max_length = max(len(ids) for ids in tokenized_test['input_ids'])
for i in range(len(tokenized_test['input_ids'])):
    while len(tokenized_test['input_ids'][i]) < max_length:
        tokenized_test['input_ids'][i].append(0)
        tokenized_test['attention_mask'][i].append(0)

In [143]:
from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [144]:
from transformers import AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained("google/flan-t5-small", num_labels=2)

Some weights of T5ForSequenceClassification were not initialized from the model checkpoint at google/flan-t5-small and are newly initialized: ['classification_head.out_proj.weight', 'classification_head.out_proj.bias', 'classification_head.dense.weight', 'classification_head.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [145]:
import numpy as np
from datasets import load_metric

def compute_metrics(eval_pred):
    load_f1 = load_metric("f1")

    logits, labels = eval_pred
    
    print(logits[0])
    print(labels)
    predictions = np.argmax(logits[0], axis=-1)
    f1 = load_f1.compute(predictions=predictions, references=labels)["f1"]
    return {"f1": f1}

In [146]:
from transformers import TrainingArguments, Trainer
 
repo_name = "finetuning-sentiment-model-3000-samples"
 
training_args = TrainingArguments(
   output_dir=repo_name,
   learning_rate=2e-5,
   per_device_train_batch_size=16,
   per_device_eval_batch_size=16,
   num_train_epochs=2,
   weight_decay=0.01,
   save_strategy="epoch",
)
 
trainer = Trainer(
   model=model,
   args=training_args,
   train_dataset=tokenized_train,#tokenized_train
   eval_dataset=tokenized_test,#tokenized_test
   tokenizer=tokenizer,
   data_collator=data_collator,
   compute_metrics=compute_metrics,
)

In [147]:
trainer.train()

Step,Training Loss


TrainOutput(global_step=14, training_loss=0.7067760058811733, metrics={'train_runtime': 40.414, 'train_samples_per_second': 4.949, 'train_steps_per_second': 0.346, 'total_flos': 3404189030400.0, 'train_loss': 0.7067760058811733, 'epoch': 2.0})

In [148]:
trainer.evaluate()

[[-0.10638631  0.22536479]
 [-0.22859465  0.24805708]
 [-0.27184883  0.17590147]
 [-0.35358506  0.17367108]
 [-0.28597197  0.10445598]
 [-0.3101298   0.27659613]
 [-0.32473022  0.26890147]
 [-0.32954744  0.16683611]
 [-0.35949075  0.04483829]
 [-0.2231046   0.15217641]
 [-0.30502173  0.43814382]
 [-0.3275834   0.28472427]
 [-0.2443533   0.41378346]
 [-0.2175438   0.24988937]
 [-0.40651688  0.13518102]
 [-0.18662673  0.3108778 ]
 [-0.3777661   0.07121117]
 [-0.2262328   0.23238806]
 [-0.16351596  0.26119712]
 [-0.21896645  0.19074056]]
[1 0 1 1 1 1 1 1 0 1 0 0 1 0 1 1 1 0 0 0]


{'eval_loss': 0.6771400570869446,
 'eval_f1': 0.7499999999999999,
 'eval_runtime': 2.1976,
 'eval_samples_per_second': 9.101,
 'eval_steps_per_second': 0.91,
 'epoch': 2.0}