In [None]:
from transformers import AutoTokenizer, T5ForConditionalGeneration
import torch
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
import os
import matplotlib.pyplot as plt
import numpy as np
from sentence_transformers import SentenceTransformer
from scipy.spatial.distance import cosine

In [None]:
# Colab-only step: mount Google Drive at /content/drive. Later cells read the
# CSV splits from, and write the checkpoint and plot to, paths under this mount.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# One checkpoint identifier is passed to both from_pretrained() calls so the
# tokenizer and the model are loaded from the same checkpoint.
model_name = "google/Byt5-small"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

print(f"Successfully loaded tokenizer and model: {model_name}")

In [4]:
# Use the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
class SummaryDataset(Dataset):
    """Map-style dataset that tokenizes one (source, target) text pair per row.

    Args:
        dataframe: pandas DataFrame holding the examples.
        tokenizer: callable tokenizer exposing ``pad_token_id``. It is called
            with ``max_length``, ``padding='max_length'``, ``truncation=True``
            and ``return_tensors='pt'`` and must return ``input_ids`` and
            ``attention_mask`` tensors.
        source_column: name of the column holding the input text.
        target_column: name of the column holding the text to predict.
        max_len_input: length the tokenized source is padded/truncated to.
        max_len_target: length the tokenized target is padded/truncated to.
    """

    def __init__(self, dataframe, tokenizer, source_column, target_column, max_len_input, max_len_target):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.source_column = source_column
        self.target_column = target_column
        self.max_len_input = max_len_input
        self.max_len_target = max_len_target

    def __len__(self):
        """Return the number of rows in the underlying frame."""
        return len(self.data)

    def __getitem__(self, index):
        """Return the tokenized example at *position* ``index``.

        Returns:
            dict with 1-D tensors ``input_ids``, ``attention_mask`` and
            ``labels``; padded label positions are replaced by -100.
        """
        # Positional lookup: samplers hand out positions 0..len-1, which only
        # coincide with .loc labels when the frame has a default RangeIndex
        # (it does not after filtering, shuffling or a train/dev split).
        source_text = str(self.data[self.source_column].iloc[index])
        target_text = str(self.data[self.target_column].iloc[index])

        # Tokenize source text
        input_encoding = self.tokenizer(
            source_text,
            max_length=self.max_len_input,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        # Tokenize target text (labels)
        target_encoding = self.tokenizer(
            target_text,
            max_length=self.max_len_target,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        labels = target_encoding['input_ids']
        # Replace padding token id with -100 for T5 loss calculation
        labels[labels == self.tokenizer.pad_token_id] = -100

        # The tokenizer returns a leading batch axis of size 1; flatten drops it.
        return {
            'input_ids': input_encoding['input_ids'].flatten(),
            'attention_mask': input_encoding['attention_mask'].flatten(),
            'labels': labels.flatten()
        }

In [None]:
EPOCHS = 5
LEARNING_RATE = 5e-5
SEED = 42

# Seed torch's random number generators before any shuffling or sampled
# generation happens, so a Restart & Run All draws the same random numbers.
# NOTE(review): some GPU kernels may still be nondeterministic; confirm if
# bit-exact reruns are required.
torch.manual_seed(SEED)

model.to(device)
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)

print(f"Training parameters set: EPOCHS={EPOCHS}, LEARNING_RATE={LEARNING_RATE}")
print(f"Model moved to device: {device}")
print("Optimizer (AdamW) initialized successfully.")

In [None]:
# Read the training and development splits from the mounted Drive.
train_df = pd.read_csv('/content/drive/MyDrive/train.csv')
dev_df = pd.read_csv('/content/drive/MyDrive/dev.csv')

# Column names, fixed token lengths and batch size shared by both splits.
SOURCE_COLUMN = 'Description'
TARGET_COLUMN = 'Word'
MAX_LEN_INPUT = 64
MAX_LEN_TARGET = 16
BATCH_SIZE = 8

# Everything except the frame itself is identical for the two datasets.
dataset_kwargs = dict(
    tokenizer=tokenizer,
    source_column=SOURCE_COLUMN,
    target_column=TARGET_COLUMN,
    max_len_input=MAX_LEN_INPUT,
    max_len_target=MAX_LEN_TARGET,
)
train_dataset = SummaryDataset(dataframe=train_df, **dataset_kwargs)
dev_dataset = SummaryDataset(dataframe=dev_df, **dataset_kwargs)

# Only the training split is shuffled; the dev split keeps file order.
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
dev_dataloader = DataLoader(dev_dataset, batch_size=BATCH_SIZE, shuffle=False)

print(
    f"Train dataset size: {len(train_dataset)}",
    f"Dev dataset size: {len(dev_dataset)}",
    f"Train dataloader batches: {len(train_dataloader)}",
    f"Dev dataloader batches: {len(dev_dataloader)}",
    sep="\n",
)

In [None]:
model.to(device)
output_dir = '/content/drive/MyDrive/t5-model'
os.makedirs(output_dir, exist_ok=True)  # loop-invariant, so create it once up front

# Number of dev examples decoded and printed per epoch as a qualitative check.
NUM_PREVIEW_SAMPLES = 3

# Per-epoch average losses, consumed by the plotting cell.
train_losses = []
eval_losses = []

for epoch in range(EPOCHS):
    print(f"\n--- Epoch {epoch+1}/{EPOCHS} ---")

    # ---- Training pass ----
    model.train()
    total_train_loss = 0

    for batch_idx, batch in enumerate(train_dataloader):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        total_train_loss += loss.item()

        if (batch_idx + 1) % 100 == 0:
            print(f"  Batch {batch_idx+1}/{len(train_dataloader)}, Loss: {loss.item():.4f}")

    avg_train_loss = total_train_loss / len(train_dataloader)
    train_losses.append(avg_train_loss)
    print(f"\nAverage training loss for Epoch {epoch+1}: {avg_train_loss:.4f}")

    # Checkpoint after every epoch into the same directory.
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)

    print(f"Model and tokenizer saved to {output_dir}")

    # ---- Evaluation pass ----
    model.eval()
    total_eval_loss = 0
    eval_samples_printed = 0
    print("\n--- Evaluating on Development Set ---")
    with torch.no_grad():
        for batch in dev_dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            # Validation loss is accumulated over EVERY dev batch. Stopping
            # once the previews are printed would sum only a few batches while
            # still dividing by len(dev_dataloader), under-reporting the loss.
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            total_eval_loss += outputs.loss.item()

            # Qualitative preview: the first example of each of the first
            # NUM_PREVIEW_SAMPLES batches. Only that row is generated, since
            # it is the only one printed.
            if eval_samples_printed < NUM_PREVIEW_SAMPLES:
                generated_ids = model.generate(
                    input_ids=input_ids[:1],
                    attention_mask=attention_mask[:1],
                    max_length=MAX_LEN_TARGET,  # Max length for generated summary
                    do_sample=True,             # Enable sampling
                    top_k=10,                   # Sample from the top 10 most likely next tokens
                    top_p=0.95,                 # Sample from tokens forming 95% of the probability mass
                    temperature=0.7,            # Lower temperature makes output more deterministic, higher more random
                    early_stopping=True
                )

                actual_summary_tokens = labels[0].cpu().numpy()
                # Replace -100 with pad_token_id for decoding actual summaries
                actual_summary_tokens[actual_summary_tokens == -100] = tokenizer.pad_token_id

                actual_summary = tokenizer.decode(
                    actual_summary_tokens,
                    skip_special_tokens=True,
                    clean_up_tokenization_spaces=True
                )
                generated_summary = tokenizer.decode(
                    generated_ids[0],
                    skip_special_tokens=True,
                    clean_up_tokenization_spaces=True
                )
                original_text = tokenizer.decode(
                    input_ids[0],
                    skip_special_tokens=True,
                    clean_up_tokenization_spaces=True
                )

                print(f"\n  Original Text: {original_text}")
                print(f"  Actual Summary: {actual_summary}")
                print(f"  Generated Summary: {generated_summary}")
                eval_samples_printed += 1

    avg_eval_loss = total_eval_loss / len(dev_dataloader)
    eval_losses.append(avg_eval_loss)
    print(f"\nAverage development loss for Epoch {epoch+1}: {avg_eval_loss:.4f}")

print("\nTraining complete!")

In [None]:
def plot_training_history(train_loss, eval_loss, save_path='/content/drive/MyDrive/training_history.png'):
    """Visualize the training history.

    Draws the training and development loss curves side by side in one figure,
    saves that figure to ``save_path`` and then shows it. Using a single figure
    means the saved image contains both curves and both panels get a legend
    and a grid.

    Args:
        train_loss: sequence of per-epoch training losses.
        eval_loss: sequence of per-epoch development losses.
        save_path: where the PNG is written; defaults to the original
            hard-coded location.
    """
    fig, (train_ax, eval_ax) = plt.subplots(1, 2, figsize=(12, 4))

    train_ax.plot(range(len(train_loss)), train_loss, label='Training Loss', color='blue', linewidth=2)
    train_ax.set_title('Train Loss')

    eval_ax.plot(range(len(eval_loss)), eval_loss, label='Development Loss', color='cyan', linestyle='-.', linewidth=2)
    eval_ax.set_title('Eval Loss')

    # Shared cosmetics; the axis label text is kept exactly as in the original.
    for ax in (train_ax, eval_ax):
        ax.set_xlabel('Epoha')
        ax.set_ylabel('Loss')
        ax.grid(True, alpha=0.3)
        ax.legend()

    fig.tight_layout()
    fig.savefig(save_path, dpi=300, bbox_inches='tight')
    plt.show()

# Draw the loss curves gathered by the training loop.
plot_training_history(train_loss=train_losses, eval_loss=eval_losses)

In [None]:
def generate_word(text, model, tokenizer, device, max_len=MAX_LEN_TARGET, max_len_input=MAX_LEN_INPUT):
    """Generate one output string for ``text`` by sampling from ``model``.

    Args:
        text: input description; coerced with ``str()`` (as the training
            dataset does) so non-string cells such as NaN do not crash the
            tokenizer.
        model: model exposing ``eval()`` and ``generate()``.
        tokenizer: tokenizer used to encode ``text`` and decode the output.
        device: device the input tensors are moved to; the model is expected
            to already be on it.
        max_len: ``max_length`` passed to ``generate``.
        max_len_input: length the input is padded/truncated to. Previously the
            global ``MAX_LEN_INPUT`` was read inside the body; it is now an
            explicit parameter with that same default.

    Returns:
        The decoded first generated sequence with special tokens skipped.
        Sampling is enabled, so repeated calls may return different strings.
    """
    model.eval()
    inputs = tokenizer(
        str(text),
        max_length=max_len_input,
        padding='max_length',
        truncation=True,
        return_tensors='pt'
    )
    input_ids = inputs['input_ids'].to(device)
    attention_mask = inputs['attention_mask'].to(device)

    with torch.no_grad():
        generated_ids = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_length=max_len,
            do_sample=True,
            top_k=10,
            top_p=0.95,
            temperature=0.7,
            early_stopping=True
        )

    # A single input was encoded, so there is exactly one sequence to decode.
    generated_word = tokenizer.decode(generated_ids[0], skip_special_tokens=True, clean_up_tokenization_spaces=True)
    return generated_word

In [None]:
def get_char_ngrams(text, n=3):
    """Extract lower-cased character n-grams from ``text``.

    Args:
        text: input string.
        n: n-gram length.

    Returns:
        A list of the overlapping length-``n`` substrings of ``text.lower()``.
        A non-empty text shorter than ``n`` yields the whole text as its only
        element. An empty text yields an empty list: it has no n-grams, and
        returning ``[""]`` would make empty inputs look non-empty to callers
        that test the result for emptiness.
    """
    text = text.lower()
    if not text:
        return []
    if len(text) < n:
        return [text]
    return [text[i:i+n] for i in range(len(text) - n + 1)]

def calculate_ngram_f1(actual, generated, n=3):
    """Return the F1 overlap between the character n-gram sets of two strings.

    Precision is measured against the generated n-grams and recall against the
    actual ones. Two empty n-gram sets score 1.0; exactly one empty set, or no
    shared n-gram at all, scores 0.0.
    """
    reference = set(get_char_ngrams(actual, n))
    candidate = set(get_char_ngrams(generated, n))

    # Guard clauses for empty sets: both empty is a match, one empty is not.
    if not reference or not candidate:
        return 1.0 if not reference and not candidate else 0.0

    shared = len(reference & candidate)
    if shared == 0:
        # Precision and recall are both zero; avoid dividing by their sum.
        return 0.0

    precision = shared / len(candidate)
    recall = shared / len(reference)
    return 2 * (precision * recall) / (precision + recall)

In [None]:
# Load the sentence-embedding model used to score semantic similarity.
# It is stored in `semantic_model` and handed to
# calculate_semantic_similarity() by the test-set evaluation cell.
semantic_model = SentenceTransformer('sentence-transformers/distiluse-base-multilingual-cased-v2')
print("Loaded semantic similarity model: sentence-transformers/distiluse-base-multilingual-cased-v2")

def calculate_semantic_similarity(actual, generated, model):
    """
    Return the cosine similarity between the embeddings of two strings.

    ``model`` must expose ``encode(list_of_texts)`` returning one vector per
    text. Two falsy inputs score 1.0 and exactly one falsy input scores 0.0;
    in both cases the model is not called.
    """
    has_actual, has_generated = bool(actual), bool(generated)
    if not (has_actual or has_generated):
        return 1.0  # nothing on either side counts as a perfect match
    if not (has_actual and has_generated):
        return 0.0  # only one side present

    actual_vec, generated_vec = model.encode([actual, generated])
    # scipy's `cosine` is a distance, so similarity is its complement.
    return 1 - cosine(actual_vec, generated_vec)

In [None]:

def calculate_target_cross_entropy(model, tokenizer, description, actual_word, device, max_len_input=64, max_len_target=16):
    """
    Score the reference word under the model, conditioned on the description.

    The description and the word are each tokenized to a fixed length, padded
    target positions are replaced by -100 (the same convention the training
    dataset uses), and the model's ``loss`` for that pair is returned as a
    Python float. The model is put in eval mode and no gradients are tracked.
    """
    model.eval()

    def encode(text, limit):
        # Fixed-length, truncated tensors moved to the requested device.
        return tokenizer(
            text,
            max_length=limit,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        ).to(device)

    source = encode(description, max_len_input)
    target_ids = encode(actual_word, max_len_target)['input_ids']
    labels = target_ids.masked_fill(target_ids == tokenizer.pad_token_id, -100)

    with torch.no_grad():
        result = model(
            input_ids=source['input_ids'],
            attention_mask=source['attention_mask'],
            labels=labels,
        )

    return result.loss.item()

In [None]:
test_df = pd.read_csv('/content/drive/MyDrive/test.csv')

print("\n--- Testing generate_word function with test.csv ---")
num_test_samples = len(test_df)

# One score per test row, in file order.
all_ngram_f1_scores, all_cross_entropy_scores, all_semantic_similarity_scores = [], [], []

model.to(device)

# read_csv without index_col yields a default 0..n-1 index, so walking the two
# columns in parallel visits the same rows, in the same order, as looking
# them up by label 0..n-1 would.
for description, actual_word in zip(test_df['Description'], test_df['Word']):
    generated_word = generate_word(description, model, tokenizer, device)

    # Surface overlap, likelihood of the reference, and embedding similarity.
    ngram_f1 = calculate_ngram_f1(actual_word, generated_word, n=3)
    cross_entropy_loss = calculate_target_cross_entropy(model, tokenizer, description, actual_word, device)
    semantic_sim = calculate_semantic_similarity(actual_word, generated_word, semantic_model)

    all_ngram_f1_scores.append(ngram_f1)
    all_cross_entropy_scores.append(cross_entropy_loss)
    all_semantic_similarity_scores.append(semantic_sim)

    print(
        f"Description: '{description}'",
        f"  Actual Word: '{actual_word}'",
        f"  Generated Word: '{generated_word}'",
        f"  3-gram F1 Score: {ngram_f1:.4f}",
        f"  Target Cross-Entropy Loss: {cross_entropy_loss:.4f}",
        f"  Semantic Similarity: {semantic_sim:.4f}",
        "--------------------------------------------------",
        sep="\n",
    )

if not all_ngram_f1_scores:
    print("No test samples processed.")
else:
    avg_ngram_f1 = np.mean(all_ngram_f1_scores)
    avg_cross_entropy = np.mean(all_cross_entropy_scores)
    avg_semantic_sim = np.mean(all_semantic_similarity_scores)

    print(
        f"\nAverage 3-gram F1 Score over {len(all_ngram_f1_scores)} samples: {avg_ngram_f1:.4f}",
        f"Average Target Cross-Entropy Loss over {len(all_cross_entropy_scores)} samples: {avg_cross_entropy:.4f}",
        f"Average Semantic Similarity over {len(all_semantic_similarity_scores)} samples: {avg_semantic_sim:.4f}",
        sep="\n",
    )