In [47]:
!pip install -qU transformers

In [48]:
from datasets import load_dataset, load_metric
from transformers import (T5Tokenizer, T5ForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer,  DataCollatorForSeq2Seq)
import torch
from evaluate import load
import numpy as np
import matplotlib

Get Data

In [49]:
data_files = {
    'train': 't5_datasets/train.jsonl',
    'test': 't5_datasets/test.jsonl',
    'validation': 't5_datasets/validation.jsonl'
}

dataset = load_dataset('json', data_files=data_files)
train_dataset = dataset['train']
test_dataset = dataset['test']
validation_dataset = dataset['validation']

In [50]:
dataset

DatasetDict({
    train: Dataset({
        features: ['input', 'output'],
        num_rows: 3584
    })
    test: Dataset({
        features: ['input', 'output'],
        num_rows: 768
    })
    validation: Dataset({
        features: ['input', 'output'],
        num_rows: 768
    })
})

Get the token and the T5 model

In [51]:
model_name = 'google-t5/t5-base'
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


preprocess the data

In [52]:
def preprocess_function(data_p):
    prefix = "complete: "
    max_length = 512
    inputs = [prefix + d for d in data_p['input']]
    targets = [d for d in data_p['output']]
    model_input = tokenizer(inputs, max_length=max_length, padding="max_length", truncation=True)
    
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=max_length, padding="max_length", truncation=True)
        
    model_input['labels'] = labels['input_ids']  
    return model_input

In [53]:
trained_data = train_dataset.map(preprocess_function, batched=True)
validation_data = validation_dataset.map(preprocess_function, batched=True)
test_data = test_dataset.map(preprocess_function, batched=True)

In [54]:
type(validation_data)

datasets.arrow_dataset.Dataset

training arguments

In [68]:
batch_size = 16
epochs = 5
max_length = 512
output_dir = 't5_data/results'
logs_dir = 't5_data/logs'



args = Seq2SeqTrainingArguments(
    output_dir = output_dir,
    evaluation_strategy='steps',
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    learning_rate=3e-5,
    num_train_epochs=epochs,
    logging_dir=logs_dir,
    eval_steps=1,
    logging_steps=200,
    save_steps=200,
    save_strategy="steps",
    weight_decay=0.01,
    load_best_model_at_end=True,
    fp16=True,
    predict_with_generate=True,
    warmup_steps=500
)

computing the metrics

In [103]:
perplexity = load("perplexity", module_type="metric")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    
    return perplexity.compute(predictions=decoded_preds, model_id='t5-base')
    

In [112]:
import numpy as np
from scipy.special import softmax
from sklearn.metrics import log_loss

def compute_metrics2(eval_preds):
    logits, labels = eval_preds
    
    # For T5, we need to handle the shifted labels
    shifted_logits = logits[..., :-1, :]
    labels = labels[..., 1:]
    
    # Flatten the tensors
    shifted_logits = shifted_logits.reshape(-1, shifted_logits.shape[-1])
    labels = labels.reshape(-1)
    
    # Compute softmax probabilities
    probabilities = softmax(shifted_logits, axis=-1)
    
    # Compute log loss (cross-entropy)
    loss = log_loss(labels, probabilities, labels=range(shifted_logits.shape[-1]))
    
    # Compute perplexity
    perplexity = np.exp(loss)
    
    return {"perplexity": perplexity}

In [108]:
validation_data.select([0,1])

Dataset({
    features: ['input', 'output', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 2
})

The trainer

In [113]:
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    train_dataset= trained_data.select([0,1]),
    eval_dataset= validation_data.select([0,1]),
    #compute_metrics=compute_metrics
)

GPU

In [110]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)

T5ForConditionalGeneration(
  (shared): Embedding(32128, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=768, out_features=3072, bias=False)
              (wo): Linear(in_features=3072, out_features=768, bias=False)
              (dropout): Dro

Train the model

In [114]:
trainer.train()

Step,Training Loss,Validation Loss


ValueError: Found input variables with inconsistent numbers of samples: [1, 1022]

save model  and tokenizer

In [14]:
model_path = 't5_data/model'
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)

('t5_data/model\\tokenizer_config.json',
 't5_data/model\\special_tokens_map.json',
 't5_data/model\\spiece.model',
 't5_data/model\\added_tokens.json')

inferencing on test data