## MT5 small - seq2seq - ver1 (without metric reports)

In [None]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import MT5Tokenizer, MT5ForConditionalGeneration, AdamW
from transformers import get_linear_schedule_with_warmup
from tqdm import tqdm
import numpy as np

# Read the training and validation splits from CSV files in the working directory.
train_df, val_df = (
    pd.read_csv(csv_path)
    for csv_path in ('train_samples.csv', 'validation_samples.csv')
)

class Seq2SeqDataset(Dataset):
    """Map-style dataset that tokenizes (source, target) text pairs from a DataFrame.

    Each item is a dict of 1-D tensors: ``input_ids`` and ``attention_mask``
    for the source text, and ``labels`` for the target text. Label positions
    equal to the tokenizer's pad id are replaced with -100.
    """

    def __init__(self, df, tokenizer, source_col, target_col, max_length=75):
        self.df, self.tokenizer = df, tokenizer
        self.source_col, self.target_col = source_col, target_col
        self.max_length = max_length

    def __len__(self):
        return len(self.df)

    def _encode(self, text):
        """Tokenize *text*, padded/truncated to ``max_length``, as PyTorch tensors."""
        return self.tokenizer(
            text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )

    def __getitem__(self, index):
        row = self.df.iloc[index]
        source_text, target_text = row[self.source_col], row[self.target_col]

        source_encoding = self._encode(source_text)
        target_encoding = self._encode(target_text)

        # -100 is the ignore index of the Hugging Face seq2seq loss, so
        # padded target positions do not contribute to training.
        target_ids = target_encoding['input_ids']
        is_padding = target_ids == self.tokenizer.pad_token_id
        labels = target_ids.masked_fill(is_padding, -100)

        # The tokenizer returns a leading batch axis; drop it so the
        # DataLoader can stack items itself.
        return {
            'input_ids': source_encoding['input_ids'].reshape(-1),
            'attention_mask': source_encoding['attention_mask'].reshape(-1),
            'labels': labels.reshape(-1),
        }

# Initialize the tokenizer and model from the same checkpoint.
# Both loads read `model_name`, so swapping the checkpoint is a one-line
# change and the tokenizer and model cannot drift apart.
model_name = "google/mt5-small"
tokenizer = MT5Tokenizer.from_pretrained(model_name)
model = MT5ForConditionalGeneration.from_pretrained(model_name)

# Hyperparameters
batch_size, epochs = 24, 10
learning_rate = 3e-4
max_length = 75

# Create datasets: both splits read the same columns with the same tokenizer
# settings.
train_dataset, val_dataset = (
    Seq2SeqDataset(split_df, tokenizer, 'poem_text', 'metre', max_length=max_length)
    for split_df in (train_df, val_df)
)

# Create dataloaders: only the training loader shuffles.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)
val_loader = DataLoader(val_dataset, batch_size=batch_size)

# Training setup: use the GPU when PyTorch can see one, otherwise the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model = model.to(device)

optimizer = AdamW(model.parameters(), lr=learning_rate)

# The scheduler is stepped once per batch, so its horizon is
# batches-per-epoch times the planned number of epochs; no warmup is used.
total_steps = epochs * len(train_loader)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

# Early stopping state
patience = 3
early_stop_counter = 0
best_val_loss = float('inf')

# Training and evaluation loop
def _batch_loss(batch):
    """Move one collated batch to ``device`` and return the model's loss tensor."""
    outputs = model(
        input_ids=batch['input_ids'].to(device),
        attention_mask=batch['attention_mask'].to(device),
        labels=batch['labels'].to(device),
    )
    return outputs.loss


for epoch in range(epochs):
    # ---- Training pass ----
    model.train()
    train_loss = 0
    for batch in tqdm(train_loader, desc=f"Training Epoch {epoch + 1}/{epochs}"):
        loss = _batch_loss(batch)

        loss.backward()
        optimizer.step()
        scheduler.step()
        # Gradients are cleared after the update so the next batch starts clean.
        optimizer.zero_grad()

        train_loss += loss.item()

    avg_train_loss = train_loss / len(train_loader)
    print(f"Epoch {epoch + 1}, Train Loss: {avg_train_loss}")

    # ---- Validation pass (no gradients) ----
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for batch in tqdm(val_loader, desc=f"Validating Epoch {epoch + 1}/{epochs}"):
            val_loss += _batch_loss(batch).item()

    avg_val_loss = val_loss / len(val_loader)
    print(f"Epoch {epoch + 1}, Validation Loss: {avg_val_loss}")

    # ---- Early stopping ----
    # An epoch without a strict improvement counts against `patience`;
    # training stops once `patience` such epochs occur in a row.
    if not (avg_val_loss < best_val_loss):
        early_stop_counter += 1
        if early_stop_counter >= patience:
            print("Early stopping triggered")
            break
        continue

    # New best validation loss: reset the counter and checkpoint the weights.
    best_val_loss = avg_val_loss
    early_stop_counter = 0
    torch.save(model.state_dict(), 'best_mt5-small_model.pth')
    print("Saved new best model")

# Load the best model for inference.
# map_location remaps the stored tensors onto the active device, so a
# checkpoint written during a CUDA run also loads on a CPU-only machine.
model.load_state_dict(torch.load('best_mt5-small_model.pth', map_location=device))

# Function for inference
def generate_sequence(input_text, max_length=75, num_beams=3):
    """Generate the model's output text for a single input string.

    Args:
        input_text: Source text to encode.
        max_length: Limit used both for truncating the tokenized input and
            as ``max_length`` for generation.
        num_beams: Beam width passed to ``model.generate``.

    Returns:
        The first generated sequence decoded to a string, with special
        tokens skipped.
    """
    model.eval()

    encoded = tokenizer(
        input_text,
        return_tensors='pt',
        max_length=max_length,
        truncation=True,
    )
    input_ids = encoded.input_ids.to(device)

    generated_ids = model.generate(
        input_ids,
        max_length=max_length,
        num_beams=num_beams,
        early_stopping=True,
    )
    first_sequence = generated_ids[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=True)

# Example of generating sequence: run the first training poem through the model
# and show the input next to the generated output.
sample_input = train_df['poem_text'].iloc[0]
output = generate_sequence(sample_input)
print(f"Input: {sample_input}", f"Output: {output}", sep="\n")


  from .autonotebook import tqdm as notebook_tqdm
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Training Epoch 1/10: 100%|██████████| 31216/31216 [3:13:30<00:00,  2.69it/s]  


Epoch 1, Train Loss: 0.19514358406312438


Validating Epoch 1/10: 100%|██████████| 1773/1773 [03:06<00:00,  9.53it/s]


Epoch 1, Validation Loss: 0.02966804455497798
Saved new best model


Training Epoch 2/10: 100%|██████████| 31216/31216 [3:12:54<00:00,  2.70it/s]  


Epoch 2, Train Loss: 0.020055641195351578


Validating Epoch 2/10: 100%|██████████| 1773/1773 [03:05<00:00,  9.54it/s]


Epoch 2, Validation Loss: 0.014464819536981137
Saved new best model


Training Epoch 3/10: 100%|██████████| 31216/31216 [3:13:14<00:00,  2.69it/s]  


Epoch 3, Train Loss: 0.011905253752391928


Validating Epoch 3/10: 100%|██████████| 1773/1773 [03:06<00:00,  9.53it/s]


Epoch 3, Validation Loss: 0.010780521335361176
Saved new best model


Training Epoch 4/10: 100%|██████████| 31216/31216 [3:12:30<00:00,  2.70it/s]  


Epoch 4, Train Loss: 0.008701114875967766


Validating Epoch 4/10: 100%|██████████| 1773/1773 [03:06<00:00,  9.53it/s]


Epoch 4, Validation Loss: 0.008824764127284094
Saved new best model


Training Epoch 5/10: 100%|██████████| 31216/31216 [3:12:51<00:00,  2.70it/s]  


Epoch 5, Train Loss: 0.006900750159308146


Validating Epoch 5/10: 100%|██████████| 1773/1773 [03:03<00:00,  9.64it/s]


Epoch 5, Validation Loss: 0.007890467587838628
Saved new best model


Training Epoch 6/10:  23%|██▎       | 7310/31216 [45:18<2:28:09,  2.69it/s]


KeyboardInterrupt: 

In [None]:
# Read the test poems.
test_df = pd.read_csv('/content/test_samples.csv')

# Run generate_sequence on every 'poem_text' entry, one poem at a time, and
# store the generated text in the 'metre' column.
poems = test_df['poem_text']
test_df['metre'] = poems.map(generate_sequence)

# Write the poems together with their predictions to a new CSV file
# (the row index is not written).
test_df.to_csv('test_with_metre_predictions.csv', index=False)


## MT5 large - seq2seq - ver2 (full version)

In [None]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import MT5Tokenizer, MT5ForConditionalGeneration, AdamW, get_scheduler
from sklearn.metrics import precision_score, recall_score, f1_score
from tqdm.auto import tqdm
import numpy as np

# Load the train / validation / test splits from the working directory.
train_df, val_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('train_samples.csv', 'validation_samples.csv', 'test_samples.csv')
)

# Prepare Tokenizer
tokenizer = MT5Tokenizer.from_pretrained('google/mt5-large')

class PoemDataset(Dataset):
    """Map-style dataset over a DataFrame with 'poem_text' and 'metre' columns.

    Each item is a dict of 1-D tensors: ``input_ids`` and ``attention_mask``
    for the poem, and ``labels`` holding the token ids of the metre string.
    Both texts are padded/truncated to ``max_len``. Label ids are returned
    exactly as the tokenizer produced them (padding ids included).
    """

    def __init__(self, dataframe, tokenizer, max_len=128):
        self.dataframe = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.dataframe)

    def _tokenize(self, text):
        """Tokenize *text* to fixed-length PyTorch tensors."""
        return self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_len,
            return_tensors='pt',
        )

    def __getitem__(self, idx):
        record = self.dataframe.iloc[idx]
        # str() guards against non-string cells (e.g. numbers) in either column.
        poem_text, metre = str(record['poem_text']), str(record['metre'])

        inputs = self._tokenize(poem_text)
        targets = self._tokenize(metre)

        item = {}
        item['input_ids'] = inputs.input_ids.flatten()
        item['attention_mask'] = inputs.attention_mask.flatten()
        item['labels'] = targets.input_ids.flatten()
        return item

# Parameters
batch_size = 16
max_len = 128
epochs = 3
learning_rate = 5e-5

# Datasets: one per split, all sharing the tokenizer and sequence length.
train_dataset, val_dataset, test_dataset = (
    PoemDataset(split_df, tokenizer, max_len)
    for split_df in (train_df, val_df, test_df)
)

# Dataloaders: only the training split is shuffled.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)
val_loader, test_loader = (
    DataLoader(split_dataset, batch_size=batch_size)
    for split_dataset in (val_dataset, test_dataset)
)

from torch.cuda.amp import autocast, GradScaler

# Model Setup: load the pretrained checkpoint and move it to the GPU.
model = MT5ForConditionalGeneration.from_pretrained(
    'google/mt5-large',
    ignore_mismatched_sizes=True,
)
model.to('cuda')

# Optimizer and Scheduler: linear schedule without warmup, sized for one
# scheduler step per training batch over all epochs.
optimizer = AdamW(model.parameters(), lr=learning_rate)
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=epochs * len(train_loader),
)

# Gradient scaler used together with autocast for mixed-precision training.
scaler = GradScaler()

# Scheduled Teacher Forcing Parameters
# Starting ratio is 0.8 (not full teacher forcing).
# NOTE(review): no visible code reads this value when computing the loss -
# confirm whether it is meant to have an effect.
teacher_forcing_ratio = 0.8

# Lowest validation loss seen so far; used to decide when to checkpoint.
best_val_loss = float('inf')

# Training loop with per-epoch validation, metric reporting and checkpointing.
for epoch in range(epochs):
    # Training
    model.train()
    train_loss = 0
    for batch in tqdm(train_loader, desc=f"Training Epoch {epoch+1}/{epochs}"):
        input_ids = batch['input_ids'].to('cuda')
        attention_mask = batch['attention_mask'].to('cuda')
        labels = batch['labels'].to('cuda')
        # Padding must not contribute to the loss: -100 is the ignore index
        # of the Hugging Face seq2seq loss. Masking a copy keeps `labels`
        # intact and is a no-op if the dataset already masked them.
        loss_labels = labels.masked_fill(labels == tokenizer.pad_token_id, -100)

        optimizer.zero_grad()

        with autocast():
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=loss_labels)
            loss = outputs.loss

        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        lr_scheduler.step()

        # Accumulate the batch loss so the epoch average printed below is
        # the real training loss rather than a constant 0.
        train_loss += loss.item()

    # NOTE(review): this ratio is updated every epoch, but no visible code
    # reads it when computing the loss - confirm whether it should be used.
    teacher_forcing_ratio = max(0.5, teacher_forcing_ratio * 0.9)

    # Validation
    model.eval()
    val_loss = 0
    y_true = []
    y_pred = []
    with torch.no_grad():
        for batch in tqdm(val_loader, desc=f"Validating Epoch {epoch+1}/{epochs}"):
            input_ids = batch['input_ids'].to('cuda')
            attention_mask = batch['attention_mask'].to('cuda')
            labels = batch['labels'].to('cuda')
            loss_labels = labels.masked_fill(labels == tokenizer.pad_token_id, -100)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=loss_labels)
            val_loss += outputs.loss.item()

            # Get predictions for evaluation. The batch is padded, so the
            # attention mask is passed to keep generation from attending to
            # padding positions.
            preds = model.generate(
                input_ids,
                attention_mask=attention_mask,
                max_length=max_len,
                num_beams=3,
            )
            decoded_preds = [tokenizer.decode(p, skip_special_tokens=True) for p in preds]

            # -100 is not a valid token id; map it back to the pad id before
            # decoding in case the labels arrive already masked.
            decodable_labels = labels.masked_fill(labels == -100, tokenizer.pad_token_id)
            decoded_labels = [
                tokenizer.decode(label_ids, skip_special_tokens=True)
                for label_ids in decodable_labels
            ]

            y_true.extend(decoded_labels)
            y_pred.extend(decoded_preds)

    # Calculate evaluation metrics: each distinct decoded string is treated
    # as a class, and scores are macro-averaged over those classes.
    precision = precision_score(y_true, y_pred, average='macro', zero_division=0)
    recall = recall_score(y_true, y_pred, average='macro', zero_division=0)
    f1 = f1_score(y_true, y_pred, average='macro', zero_division=0)

    print(f"Epoch {epoch+1}, Train Loss: {train_loss/len(train_loader)}, Validation Loss: {val_loss/len(val_loader)}")
    print(f"Validation Precision: {precision}, Recall: {recall}, F1 Score: {f1}")

    # Save the best model. The summed loss is compared; the number of
    # validation batches is the same every epoch, so this orders epochs the
    # same way as the average would.
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        torch.save(model.state_dict(), 'best_mt5_model.pth')
        print("Saved new best model")


In [7]:
class PoemDataset(Dataset):
    """Map-style dataset over poems, with an inference-only mode.

    Items always contain 1-D ``input_ids`` and ``attention_mask`` tensors for
    the 'poem_text' column. Unless ``is_test`` is true, they also contain
    ``labels``: the token ids of the 'metre' column, exactly as the tokenizer
    produced them. In test mode the 'metre' column is never read.
    """

    def __init__(self, dataframe, tokenizer, max_len=128, is_test=False):
        self.dataframe = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len
        # When true, __getitem__ returns inputs only (no labels).
        self.is_test = is_test

    def __len__(self):
        return len(self.dataframe)

    def _tokenize(self, text):
        """Tokenize *text* to fixed-length PyTorch tensors."""
        return self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_len,
            return_tensors='pt',
        )

    def __getitem__(self, idx):
        record = self.dataframe.iloc[idx]
        inputs = self._tokenize(str(record['poem_text']))

        item = {
            'input_ids': inputs.input_ids.flatten(),
            'attention_mask': inputs.attention_mask.flatten(),
        }
        if not self.is_test:
            # Training/validation items additionally carry the target ids.
            targets = self._tokenize(str(record['metre']))
            item['labels'] = targets.input_ids.flatten()
        return item


In [None]:

# new Dataset for the test data (inputs only, no labels)
test_dataset = PoemDataset(test_df, tokenizer, max_len, is_test=True)

test_loader = DataLoader(test_dataset, batch_size=batch_size)

# Apply the model on the test set and save predictions
model.eval()
test_predictions = []

with torch.no_grad():
    for batch in tqdm(test_loader, desc="Generating Test Predictions"):
        input_ids = batch['input_ids'].to('cuda')
        attention_mask = batch['attention_mask'].to('cuda')

        # Generate predictions using the model. Inputs are padded to a fixed
        # length, so the attention mask is passed to keep generation from
        # attending to padding positions.
        preds = model.generate(
            input_ids,
            attention_mask=attention_mask,
            max_length=max_len,
            num_beams=3,
        )
        test_predictions.extend(
            tokenizer.decode(p, skip_special_tokens=True) for p in preds
        )

# Save the predictions in a new column of the test DataFrame. The loader is
# not shuffled, so predictions are in the same order as the rows.
test_df['predicted_metre'] = test_predictions
test_df.to_csv('mt5-large-test_predictions.csv', index=False)

print("Test predictions saved to 'mt5-large-test_predictions.csv'.")


Generating Test Predictions: 100%|██████████| 2660/2660 [17:40<00:00,  2.51it/s]


Test predictions saved to 'mt5-large-test_predictions.csv'.
