In [14]:
from datasets import load_dataset, load_metric
from transformers import (T5Tokenizer, T5ForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer,  DataCollatorForSeq2Seq)
import torch
import numpy as np
import matplotlib
import accelerate

Get Data

In [15]:
data_files = {
    'train': 't5_datasets_class1/t5_train_valid_test/train.jsonl',
    'test': 't5_datasets_class1/t5_train_valid_test/test.jsonl',
    'validation': 't5_datasets_class1/t5_train_valid_test/validation.jsonl'
}

dataset = load_dataset('json', data_files=data_files)
train_dataset = dataset['train']
test_dataset = dataset['test']
validation_dataset = dataset['validation']

In [16]:
dataset

DatasetDict({
    train: Dataset({
        features: ['input', 'output'],
        num_rows: 3584
    })
    test: Dataset({
        features: ['input', 'output'],
        num_rows: 768
    })
    validation: Dataset({
        features: ['input', 'output'],
        num_rows: 768
    })
})

Get the token and the T5 model

In [17]:
model_name = 'google-t5/t5-base'
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


preprocess the data

In [18]:
def preprocess_function(data_p):
    prefix = "complete: "
    max_length = 512
    inputs = [prefix + d for d in data_p['input']]
    targets = [d for d in data_p['output']]
    model_input = tokenizer(inputs, max_length=max_length, padding="max_length", truncation=True)
    
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=max_length, padding="max_length", truncation=True)
        
    model_input['labels'] = labels['input_ids']  
    return model_input

In [19]:
trained_data = train_dataset.map(preprocess_function, batched=True)
validation_data = validation_dataset.map(preprocess_function, batched=True)
test_data = test_dataset.map(preprocess_function, batched=True)

In [20]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

In [21]:
model.to(device)

T5ForConditionalGeneration(
  (shared): Embedding(32128, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=768, out_features=3072, bias=False)
              (wo): Linear(in_features=3072, out_features=768, bias=False)
              (dropout): Dro

In [22]:
type(validation_data)

datasets.arrow_dataset.Dataset

training arguments

In [23]:
batch_size = 5
epochs = 5
max_length = 512
output_dir = 't5_results/results'
logs_dir = 't5_results/logs'



args = Seq2SeqTrainingArguments(
    output_dir = output_dir,
    evaluation_strategy='steps',
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    learning_rate=3e-5,
    num_train_epochs=epochs,
    logging_dir=logs_dir,
    eval_steps=200,
    logging_steps=200,
    save_steps=200,
    save_strategy="steps",
    weight_decay=0.01,
    load_best_model_at_end=True,
    fp16=True,
    #predict_with_generate=True,
    warmup_steps=500
)



The trainer

In [24]:
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    train_dataset= trained_data,
    eval_dataset= validation_data,
)

GPU

Train the model

In [25]:
trainer.train()

Step,Training Loss,Validation Loss
200,6.7728,0.558686
400,0.5179,0.370332
600,0.3918,0.320692
800,0.3372,0.297587
1000,0.3227,0.279326
1200,0.3078,0.262518
1400,0.2932,0.251552
1600,0.2745,0.238591
1800,0.2661,0.233667
2000,0.2648,0.225823


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


TrainOutput(global_step=3585, training_loss=0.6508417999395266, metrics={'train_runtime': 3614.9656, 'train_samples_per_second': 4.957, 'train_steps_per_second': 0.992, 'total_flos': 1.09125253988352e+16, 'train_loss': 0.6508417999395266, 'epoch': 5.0})

save model  and tokenizer

In [26]:
model_path = 't5_data/model'
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)

('t5_data/model/tokenizer_config.json',
 't5_data/model/special_tokens_map.json',
 't5_data/model/spiece.model',
 't5_data/model/added_tokens.json')

Inferencing

In [2]:
import shutil

In [3]:
shutil.make_archive("datasets_train_valid_test", 'zip', 't5_datasets_class1/t5_train_valid_test')


'/home/ubuntu/verb-workspace/first/datasets_train_valid_test.zip'

In [29]:
test_dataset = dataset['test']

In [30]:
test_dataset[0]['input']

'<xmi:XMI xmi:version="2.1" xmlns:xmi="http://schema.omg.org/spec/XMI/2.1" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:uml="http://www.eclipse.org/uml2/5.0.0/UML">\n  <uml:Model xmi:id="_2ACcIJROEeqqGZh46IEtXQ" name="model">\n    <xmi:Extension extender="http://www.eclipse.org/emf/2002/Ecore">\n      <eAnnotations xmi:id="_2ACcIZROEeqqGZh46IEtXQ" source="genmymodel">\n        <details xmi:id="_2ACcIpROEeqqGZh46IEtXQ" key="uuid" value="75b4ca00-a639-4435-b2b2-3499c00617b4"/>\n      </eAnnotations>\n    </xmi:Extension>\n    <ownedComment xmi:id="_2ACcI5ROEeqqGZh46IEtXQ" body="Flights&#xA;-We want to model a system for management of flights and pilots. An airline operates flights. Each airline has an ID.&#xA;-Each flight has an ID a departure airport and an arrival airport: an airport as a unique identifier.&#xA;-Each flight has a pilot and a co-pilot, and it uses an aircraft of a certain type; a flight has also a departure time and an arrival time.&#xA;-An airline owns a

In [32]:
with open("test_out.xmi", "w") as file:
    file.write(test_dataset[0]['output'])

In [35]:
model.to(device)

T5ForConditionalGeneration(
  (shared): Embedding(32128, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=768, out_features=3072, bias=False)
              (wo): Linear(in_features=3072, out_features=768, bias=False)
              (dropout): Dro

In [36]:
input_text = ["complete: " + test_dataset[0]['input']]
inputs = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True, max_length=512)
output = model.generate(**inputs, do_sample=True, min_length=10, max_length=512)
decoded_output = tokenizer.batch_decode(output, skip_special_tokens=True)[0]
decoded_output

RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu! (when checking argument for argument index in method wrapper_CUDA__index_select)