<a href="https://colab.research.google.com/github/KrzRac/UGP/blob/main/encoder_decoder.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m13.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (

In [8]:
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [25]:
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, T5ForConditionalGeneration, Seq2SeqTrainer, Seq2SeqTrainingArguments
from sklearn.metrics import accuracy_score, classification_report
import numpy as np
import json

In [10]:
from google.colab import drive
import os

drive.mount('/content/drive')

drive_results_dir = "/content/drive/MyDrive/t5-small_results"
os.makedirs(drive_results_dir, exist_ok=True)

Mounted at /content/drive


In [11]:
dataset = load_dataset("rotten_tomatoes")

In [12]:
tokenizer = AutoTokenizer.from_pretrained("t5-small")
tokenizer.pad_token = tokenizer.eos_token

In [13]:
def preprocess_function(examples):
    inputs = ["classify sentiment: " + text for text in examples["text"]]  # Add task prefix
    targets = examples["label"]  # Target is the "label" field

    # Convert numeric labels to strings, required for T5
    label_map = {0: "negative", 1: "positive"}
    targets = [label_map[label] for label in targets]

    # Tokenize inputs and targets
    model_inputs = tokenizer(inputs, max_length=128, truncation=True, padding="max_length")

    # Tokenize targets and add as "labels"
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=10, truncation=True, padding="max_length")

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [14]:
tokenized_datasets = dataset.map(preprocess_function, batched=True, remove_columns=dataset["train"].column_names)

Map:   0%|          | 0/1066 [00:00<?, ? examples/s]



In [15]:
train_dataset = tokenized_datasets["train"]
valid_dataset = tokenized_datasets["validation"]
test_dataset = tokenized_datasets["test"]

In [16]:
model = T5ForConditionalGeneration.from_pretrained("t5-small")
model.config.pad_token_id = tokenizer.pad_token_id

In [17]:
print(model)

T5ForConditionalGeneration(
  (shared): Embedding(32128, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=512, bias=False)
              (k): Linear(in_features=512, out_features=512, bias=False)
              (v): Linear(in_features=512, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 8)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=512, out_features=2048, bias=False)
              (wo): Linear(in_features=2048, out_features=512, bias=False)
              (dropout): Drop

In [18]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    # Replace -100 in labels with pad_token_id for decoding
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Compute accuracy
    acc = accuracy_score(decoded_labels, decoded_preds)
    return {"accuracy": acc}

In [19]:
training_args = Seq2SeqTrainingArguments(
    output_dir="./results_t5",          
    evaluation_strategy="epoch",       
    save_strategy="epoch",             
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs_t5",           
    logging_steps=10,
    load_best_model_at_end=True,
    predict_with_generate=True,        
    push_to_hub=False,
    report_to="none"                   
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    data_collator=None,  
    compute_metrics=compute_metrics
)



In [20]:
print("Training T5ForConditionalGeneration...")
trainer.train()

Training T5ForConditionalGeneration...


Epoch,Training Loss,Validation Loss,Accuracy
1,0.0411,0.034457,0.859287
2,0.0352,0.033352,0.863039
3,0.0382,0.033239,0.866792


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Tr

TrainOutput(global_step=1602, training_loss=0.10295054988096121, metrics={'train_runtime': 322.1352, 'train_samples_per_second': 79.439, 'train_steps_per_second': 4.973, 'total_flos': 865849174917120.0, 'train_loss': 0.10295054988096121, 'epoch': 3.0})

In [21]:
history = trainer.state.log_history

history_file = os.path.join(drive_results_dir, "training_history.json")
with open(history_file, "w") as f:
    json.dump(history, f, indent=4)
print(f"Training history has been saved to {history_file}.")

Training history has been saved to /content/drive/MyDrive/t5-small_results/training_history.json.


In [22]:
eval_results = trainer.evaluate(test_dataset)
print("Evaluation results:", eval_results)

Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Tr

Evaluation results: {'eval_loss': 0.036146294325590134, 'eval_accuracy': 0.8686679174484052, 'eval_runtime': 6.2993, 'eval_samples_per_second': 169.226, 'eval_steps_per_second': 10.636, 'epoch': 3.0}


In [23]:
eval_results_file = os.path.join(drive_results_dir, "eval_results.txt")
with open(eval_results_file, "w") as f:
    f.write(str(eval_results))

print(f"Evaluation results saved at: {eval_results_file}")

print("Evaluation results:", eval_results)

Evaluation results saved at: /content/drive/MyDrive/t5-small_results/eval_results.txt
Evaluation results: {'eval_loss': 0.036146294325590134, 'eval_accuracy': 0.8686679174484052, 'eval_runtime': 6.2993, 'eval_samples_per_second': 169.226, 'eval_steps_per_second': 10.636, 'epoch': 3.0}


In [30]:
logits, labels = trainer.predict(test_dataset)[:2]

decoded_preds = tokenizer.batch_decode(logits, skip_special_tokens=True)
decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

label_map = {"negative": 0, "positive": 1}
numeric_preds = [label_map[pred.strip()] for pred in decoded_preds]
numeric_labels = [label_map[label.strip()] for label in decoded_labels]

report = classification_report(numeric_labels, numeric_preds, target_names=["negative", "positive"], digits=4)

classification_report_file = os.path.join(drive_results_dir, "classification_report.txt")
with open(classification_report_file, "w") as f:
    f.write(report)

print(f"Classification report saved at: {classification_report_file}")
print("Classification Report:\n", report)

Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Tr

Classification report saved at: /content/drive/MyDrive/t5-small_results/classification_report.txt
Classification Report:
               precision    recall  f1-score   support

    negative     0.8429    0.9062    0.8734       533
    positive     0.8986    0.8311    0.8635       533

    accuracy                         0.8687      1066
   macro avg     0.8708    0.8687    0.8685      1066
weighted avg     0.8708    0.8687    0.8685      1066



In [31]:
model.save_pretrained("./t5_finetuned")
tokenizer.save_pretrained("./t5_finetuned")

('./t5_finetuned/tokenizer_config.json',
 './t5_finetuned/special_tokens_map.json',
 './t5_finetuned/spiece.model',
 './t5_finetuned/added_tokens.json',
 './t5_finetuned/tokenizer.json')

In [33]:
examples = dataset['test']['text'][:3]

In [34]:
inputs = ["classify sentiment: " + example for example in examples]
inputs = tokenizer(inputs, return_tensors="pt", max_length=128, truncation=True, padding="max_length")

In [35]:

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
inputs = {key: val.to(device) for key, val in inputs.items()}

outputs = model.generate(**inputs)
predictions = tokenizer.batch_decode(outputs, skip_special_tokens=True)

print("Predictions:")
for example, prediction in zip(examples, predictions):
    print(f"Review: {example}")
    print(f"Predicted sentiment: {prediction}")


Predictions:
Review: lovingly photographed in the manner of a golden book sprung to life , stuart little 2 manages sweetness largely without stickiness .
Predicted sentiment: positive
Review: consistently clever and suspenseful .
Predicted sentiment: positive
Review: it's like a " big chill " reunion of the baader-meinhof gang , only these guys are more harmless pranksters than political activists .
Predicted sentiment: negative
