In [10]:
#good
import xml.etree.ElementTree as ET
from benchmark_reader import Benchmark
from benchmark_reader import select_files
from sklearn.model_selection import train_test_split
from transformers import T5Tokenizer, T5ForConditionalGeneration
import torch
from torch.nn.utils.rnn import pad_sequence

tokenizer = T5Tokenizer.from_pretrained('t5-base')
model = T5ForConditionalGeneration.from_pretrained('t5-base')

b = Benchmark()
files = select_files('en/train')
b.fill_benchmark(files)

triples_texts = []  

for i in range(b.entry_count()):
    entry = b.entries[i]
    category = entry.category
    data = entry.list_triples()
    texts = entry.lexs
    triple = []
    text = []

    for RDF in range(len(entry.list_triples())):
        input_string = data[RDF]

        strings = input_string.split(' | ')
        string1 = strings[0]
        string2 = strings[1]
        string3 = strings[2]

        if '_' in string1:
            string1 = string1.replace('_', ' ')

        if '_' in string2:
            string2 = string2.replace('_', ' ')

        if '"' in string3:
            string3 = string3.replace('"', '')

        MR = "__subject__ " + string1 + " __predicate__ " + string2 + " __object__ " + string3
        triple.append(MR)

    for lex in range(len(texts)):
        text.append(texts[lex].lex)

    triples_texts.append((triple, text))


train_data, test_data = train_test_split(triples_texts, test_size=0.2, random_state=42)
train_data, val_data = train_test_split(train_data, test_size=0.1, random_state=42)



def preprocess_data(dataset, max_length_diff=1):
    tokenized_inputs = []
    tokenized_outputs = []

    max_length = 0

    for item in dataset:
        input_tokens = tokenizer.encode_plus(
            item[0],  # Triple
            padding='longest',
            truncation=True,
            max_length=512,
            return_tensors='pt'
        )
        output_tokens = tokenizer.encode_plus(
            item[1],  # Text
            padding='longest',
            truncation=True,
            max_length=512,
            return_tensors='pt'
        )
        input_length = input_tokens['input_ids'].size(1)
        output_length = output_tokens['input_ids'].size(1)
        if (
            abs(input_length - output_length) > max_length_diff
            or input_length != output_length
        ):
            continue
        tokenized_inputs.append(input_tokens['input_ids'].squeeze())
        tokenized_outputs.append(output_tokens['input_ids'].squeeze())

        max_length = max(max_length, input_length)

    input_ids = pad_sequence(tokenized_inputs, batch_first=True, padding_value=tokenizer.pad_token_id)
    attention_mask = (input_ids != tokenizer.pad_token_id).float()
    output_ids = pad_sequence(tokenized_outputs, batch_first=True, padding_value=tokenizer.pad_token_id)
    output_attention_mask = (output_ids != tokenizer.pad_token_id).float()

    return input_ids, attention_mask, output_ids, output_attention_mask

with open('output.txt', 'w', encoding="utf-8") as f:
    for triple, text in triples_texts:
        f.write("Triple: " + str(triple) + "\n")
        f.write("Text: " + str(text) + "\n\n")



train_input_ids, train_attention_mask, train_output_ids, train_output_attention_mask = preprocess_data(train_data)
val_input_ids, val_attention_mask, val_output_ids, val_output_attention_mask = preprocess_data(val_data)
test_input_ids, test_attention_mask, test_output_ids, test_output_attention_mask = preprocess_data(test_data)


print("Training set size:", len(train_input_ids))
print("Validation set size:", len(val_input_ids))
print("Test set size:", len(test_input_ids))


model_config = model.config


print(model_config)


Training set size: 2165
Validation set size: 226
Test set size: 597
T5Config {
  "_name_or_path": "t5-base",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "d_ff": 3072,
  "d_kv": 64,
  "d_model": 768,
  "decoder_start_token_id": 0,
  "dense_act_fn": "relu",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": false,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 12,
  "num_heads": 12,
  "num_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max_length": 200,
      "min_length": 30,
      "no_repeat_ngram_size": 3,
      "num_beams": 4,
      "prefix": "summarize: "
    },
    "translation_en_to_de": {
      "early_stoppi

In [27]:
#no use to add in the next cell
import torch
import torch.optim as optim
from torch.nn.utils import clip_grad_norm_
from torch.utils.data import DataLoader, TensorDataset

batch_size = 8
num_epochs = 10

train_dataset = TensorDataset(train_input_ids, train_attention_mask, train_output_ids, train_output_attention_mask)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

model.train()

optimizer = optim.AdamW(model.parameters(), lr=1e-4)

for epoch in range(num_epochs):
    total_loss = 0.0

    for batch in train_dataloader:
        input_ids, attention_mask, output_ids, output_attention_mask = batch
        input_ids = input_ids.squeeze(1)
        attention_mask = attention_mask.squeeze(1)
        output_ids = output_ids.squeeze(1)
        output_attention_mask = output_attention_mask.squeeze(1)

        optimizer.zero_grad()

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            decoder_input_ids=output_ids[:, :-1].contiguous(),
            decoder_attention_mask=output_attention_mask[:, :-1].contiguous(),
            labels=output_ids[:, 1:].contiguous(),
        )

        loss = outputs.loss
        loss.backward()
        clip_grad_norm_(model.parameters(), max_norm=1.0)

        optimizer.step()

        total_loss += loss.item()

    avg_loss = total_loss / len(train_dataloader)
    print(f"Epoch {epoch+1}/{num_epochs}, Average Loss: {avg_loss:.4f}")

model.save_pretrained('trained_model')


Epoch 1/10, Average Loss: 0.7316
Epoch 2/10, Average Loss: 0.2022
Epoch 3/10, Average Loss: 0.1675
Epoch 4/10, Average Loss: 0.1360
Epoch 5/10, Average Loss: 0.1156
Epoch 6/10, Average Loss: 0.0999
Epoch 7/10, Average Loss: 0.0803
Epoch 8/10, Average Loss: 0.0707
Epoch 9/10, Average Loss: 0.0610
Epoch 10/10, Average Loss: 0.0600


In [11]:
import torch
import torch.optim as optim
from torch.nn.utils import clip_grad_norm_
from torch.utils.data import DataLoader, TensorDataset
from transformers import AutoModelForSequenceClassification, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("Elron/bleurt-base-128")
model = AutoModelForSequenceClassification.from_pretrained("Elron/bleurt-base-128")

batch_size = 8
num_epochs = 10

train_dataset = TensorDataset(train_input_ids, train_attention_mask, train_output_ids, train_output_attention_mask)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataset = TensorDataset(val_input_ids, val_attention_mask, val_output_ids, val_output_attention_mask)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

model.train()

optimizer = optim.AdamW(model.parameters(), lr=1e-4)


for epoch in range(num_epochs):
    total_loss = 0.0

    for batch in train_dataloader:
        input_ids, attention_mask, output_ids, output_attention_mask = batch
        input_ids = input_ids.squeeze(1)
        attention_mask = attention_mask.squeeze(1)
        output_ids = output_ids.squeeze(1)
        output_attention_mask = output_attention_mask.squeeze(1)

        optimizer.zero_grad()

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=output_ids[:, 1:].contiguous(),
        )

        loss = outputs.loss
        loss.backward()
        clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        total_loss += loss.item()

    avg_loss = total_loss / len(train_dataloader)
    print(f"Epoch {epoch+1}/{num_epochs}, Average Loss: {avg_loss:.4f}")

    model.eval()  

    with torch.no_grad():
        val_loss = 0.0
        bleurt_scores = []

        for val_batch in val_dataloader:
            val_input_ids, val_attention_mask, val_output_ids, val_output_attention_mask = val_batch
            val_input_ids = val_input_ids.squeeze(1)
            val_attention_mask = val_attention_mask.squeeze(1)
            val_output_ids = val_output_ids.squeeze(1)
            val_output_attention_mask = val_output_attention_mask.squeeze(1)

            optimizer.zero_grad()

            val_outputs = model(
                input_ids=val_input_ids,
                attention_mask=val_attention_mask,
                labels=val_output_ids[:, 1:].contiguous(),
            )

            val_loss += val_outputs.loss.item()

        avg_loss = total_loss / len(train_dataloader)
        avg_val_loss = val_loss / len(val_dataloader)

        print(f"Epoch {epoch+1}/{num_epochs}, Average Loss: {avg_loss:.4f}, Validation Loss: {avg_val_loss:.4f}")

        if epoch == 0 or avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            torch.save(model.state_dict(), 'best_model.pth')

        generated_ids = model.generate(
            input_ids=val_input_ids,
            attention_mask=val_attention_mask,
            max_length=512,
            num_beams=5,
            early_stopping=True,
        )

        generated_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
        generated_inputs = tokenizer(generated_text, padding=True, truncation=True, return_tensors="pt")

        generated_scores = model(**generated_inputs).logits

        bleurt_scores = generated_scores.squeeze().tolist()

        avg_bleurt_score = sum(bleurt_scores) / len(bleurt_scores)
        print(f"Epoch {epoch+1}/{num_epochs}, Average BLEURT Score: {avg_bleurt_score:.4f}")

best_model = T5ForConditionalGeneration.from_pretrained('t5-base')
best_model.load_state_dict(torch.load('best_model.pth'))

best_model.eval()  

with torch.no_grad():
    test_loss = 0.0
    test_bleurt_scores = []

    for test_batch in test_dataloader:
        test_input_ids, test_attention_mask, test_output_ids, test_output_attention_mask = test_batch
        test_input_ids = test_input_ids.squeeze(1)
        test_attention_mask = test_attention_mask.squeeze(1)
        test_output_ids = test_output_ids.squeeze(1)
        test_output_attention_mask = test_output_attention_mask.squeeze(1)

        test_outputs = best_model(
            input_ids=test_input_ids,
            attention_mask=test_attention_mask,
            decoder_input_ids=test_output_ids[:, :-1].contiguous(),
            decoder_attention_mask=test_output_attention_mask[:, :-1].contiguous(),
            labels=test_output_ids[:, 1:].contiguous(),
        )

        test_loss += test_outputs.loss.item()

        generated_ids = best_model.generate(
            input_ids=test_input_ids,
            attention_mask=test_attention_mask,
            max_length=512,
            num_beams=5,
            early_stopping=True,
        )

        generated_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)

        generated_inputs = tokenizer(generated_text, padding=True, truncation=True, return_tensors="pt")
        generated_scores = best_model(**generated_inputs).logits
        test_bleurt_scores = generated_scores.squeeze().tolist()

    avg_test_loss = test_loss / len(test_dataloader)
    avg_test_bleurt_score = sum(test_bleurt_scores) / len(test_bleurt_scores)

    print(f"Test Loss: {avg_test_loss:.4f}, Average BLEURT Score: {avg_test_bleurt_score:.4f}")



RuntimeError: The size of tensor a (8) must match the size of tensor b (5) at non-singleton dimension 1