In [1]:
# Mount Google Drive (Colab only); every dataset and output path below lives under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

In [2]:
import zipfile
import os

# Unpack the dataset archive into the same Drive folder that holds the zip file.
zip_path = '/content/drive/MyDrive/data/dataset_nlg.zip'
extract_to = '/content/drive/MyDrive/data'
os.makedirs(extract_to, exist_ok=True)  # tolerate re-runs: the folder may already exist
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_to)
print("Unzipping completed!")

In [None]:
import pandas as pd

# Preview the training targets; the first CSV column (the sample id) is used as the index here.
data = pd.read_csv('/content/drive/MyDrive/data/dataset_nlg_v1/train.csv', index_col=0)
data.head(5)

Unnamed: 0,text,title
871923758931292416,This statistic presents the global revenue of ...,Omnicom Group 's revenue from 2006 to 2019 ( i...
12713542298181105208,This statistic shows the number of hotel and s...,Number of hotel and similar accommodation esta...
5796511258704617257,"In 2019 , just 2.5 percent of all private wage...",Unemployment rate in the U.S. broadcasting ind...
14629703118053421010,This statistic displays the benefits of using ...,If a â€œconnected deviceâ€ ? had the following...
14801098692472737046,The statistic shows global gross domestic prod...,Global gross domestic product ( GDP ) at curre...


In [None]:
# Peek at one source table: each sample's table is stored as its own CSV in the data/ folder.
sample = pd.read_csv('/content/drive/MyDrive/data/dataset_nlg_v1/data/223685202385643.csv', index_col=0)
sample

Unnamed: 0,Year,Smoking prevalence
0,2019,29.03%
1,2018,32.2%
2,2017,29.25%
3,2016,28.97%
4,2015,30.08%


In [None]:
# Preview the submission template: ids are given, the `text` and `title` columns are empty.
submission = pd.read_csv('/content/drive/MyDrive/data/dataset_nlg_v1/submission.csv', index_col=0)
submission.head(5)

Unnamed: 0,text,title
223685202396506,,
223685202396505,,
223685202396504,,
223685202396503,,
223685202396502,,


- (**5 pts**) Propose and implement at least 2 variants of the input data preprocessing from tables to string data.
- (**5 pts**) Fine-tune [T5](https://huggingface.co/docs/transformers/model_doc/t5) as a baseline using `t5-base` checkpoint ([paper](https://arxiv.org/pdf/1910.10683.pdf)). In order to handle 2 types of output, test usage of prefixes for T5 model.
- (**5 pts**) Propose and implement at least 2 variants of data augmentation, retune T5 and compare performance.
- (**5 pts**) Add domain adaptation via an additional masked language modeling (MLM, [paper, section 3.1, Task #1](https://arxiv.org/pdf/1810.04805.pdf)) loss term for the encoder, provide a hyperparameter search for the regularization parameter $\lambda$, use BERTScore as the objective, and compare performance:
$$L(x, y) = -LogLikelihood(x, y) + \lambda L_{MLM}(x_{masked}, x)$$


The following metrics should be reported:
- [SacreBLEU](https://github.com/mjpost/sacrebleu)
- [ROUGEL](https://github.com/google-research/google-research/tree/master/rouge)
- [METEOR](https://www.nltk.org/_modules/nltk/translate/meteor_score.html)
- [BERTScore](https://github.com/Tiiiger/bert_score) using `bert-base-uncased` checkpoint and 9th layer output

Using the best checkpoint from above prepare submission file `submission.csv`, where index is a table caption from the `data` folder, and report the link on your finetuned checkpoint.

In [3]:
!pip install sacrebleu
!pip install rouge-score
!pip install nltk
!pip install bert-score

In [None]:
from google.colab import drive
import zipfile
import os
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from transformers import T5Tokenizer, T5ForConditionalGeneration, AdamW
import torch.nn as nn
import numpy as np
from nltk.translate.bleu_score import corpus_bleu
from rouge_score import rouge_scorer
import sacrebleu
from bert_score import BERTScorer

# **Preprocessing**

In [4]:
def preprocess_table_to_string(table_df):
    """Flatten a table into a single ``Metadata: ... Data: ...`` string.

    The metadata part lists every column as ``name [dtype]``. The data part
    lists the rows in order, with cells joined by ``, `` and rows by `` | ``.
    """
    column_specs = []
    for name, kind in zip(table_df.columns, table_df.dtypes):
        column_specs.append(f"{name} [{kind}]")
    metadata = ', '.join(column_specs)

    row_strings = []
    for _, record in table_df.iterrows():
        row_strings.append(', '.join(str(cell) for cell in record))
    data_rows = ' | '.join(row_strings)

    return f"Metadata: {metadata}. Data: {data_rows}."

def preprocess_to_key_value_pairs(table_df):
    """Serialise a table as ``col: value`` pairs.

    Pairs within a row are joined by ``, `` and rows are joined by `` | ``.
    An empty table yields an empty string.
    """
    column_names = table_df.columns
    return ' | '.join(
        ', '.join(f"{name}: {record[name]}" for name in column_names)
        for _, record in table_df.iterrows()
    )

# **Augmentation**

In [None]:
def add_random_noise(dataframe, noise_level=0.01):
    """Return a copy of ``dataframe`` with Gaussian noise added to its numeric columns.

    Args:
        dataframe: Source table. It is left untouched; the noise is applied to a copy.
        noise_level: Standard deviation of the zero-mean normal noise.

    Returns:
        A new DataFrame. Numeric columns become float columns holding the
        perturbed values; non-numeric columns are copied unchanged.
    """
    # Work on a copy so repeated augmentation never accumulates noise in the caller's frame.
    noisy = dataframe.copy()
    numeric_cols = noisy.select_dtypes(include=[np.number]).columns.tolist()
    for col in numeric_cols:
        noise = np.random.normal(0, noise_level, size=noisy[col].shape)
        # Cast explicitly: the perturbed values are floats even when the column held integers.
        noisy[col] = noisy[col].astype(float) + noise
    return noisy

def swap_rows(dataframe, swap_fraction=0.1):
    """Return a copy of ``dataframe`` in which a fraction of row pairs has been swapped.

    Args:
        dataframe: Source table. It is left untouched.
        swap_fraction: ``int(len(dataframe) * swap_fraction)`` random pair swaps are performed.

    Returns:
        A new DataFrame with the same index labels and column dtypes whose row
        contents have been permuted by the swaps.
    """
    n_rows = len(dataframe)
    num_swaps = int(n_rows * swap_fraction)
    # Swap positions in an index array and apply the permutation once, instead of
    # rewriting whole rows of the caller's frame in place.
    order = np.arange(n_rows)
    for _ in range(num_swaps):
        i, j = np.random.randint(0, n_rows, size=2)
        order[i], order[j] = order[j], order[i]
    shuffled = dataframe.iloc[order].copy()
    # Only the row contents move; the index labels stay in their original order.
    shuffled.index = dataframe.index
    return shuffled


def preprocess_table_to_string(table_df, augment=True):
    """Convert the table dataframe to a string format suitable for T5 input.

    This definition re-uses the name of the plain converter defined earlier in
    the notebook, so once this cell has run every caller gets this version.

    Args:
        table_df: Table to serialise.
        augment: When True (the default, matching the previous behaviour) random
            noise and row swaps are applied before serialising. Pass False for
            validation and inference, where the input must be deterministic.

    Returns:
        ``"Metadata: <col [dtype], ...>. Data: <row | row | ...>."``
    """
    if augment:
        table_df = add_random_noise(table_df)
        table_df = swap_rows(table_df)
    metadata = ', '.join([f"{col} [{dtype}]" for col, dtype in zip(table_df.columns, table_df.dtypes)])
    data_rows = ' | '.join([', '.join(map(str, row)) for index, row in table_df.iterrows()])
    return f"Metadata: {metadata}. Data: {data_rows}."

# Tokenizer of the `t5-base` checkpoint; shared by the datasets, evaluation and inference below.
tokenizer = T5Tokenizer.from_pretrained('t5-base')

# **Custom dataset**

In [None]:
class TableDataset(Dataset):
    """Pairs each row of a targets frame with its table CSV and tokenizes the table.

    The frame is expected to hold the sample id in column 0, the reference text
    in column 1 and the reference title in column 2. The table of a sample is
    read from ``<data_folder>/<id>.csv``.

    Args:
        dataframe: Frame with id / text / title columns (in that order).
        data_folder: Folder that contains one ``<id>.csv`` per sample.
        tokenizer: Callable tokenizer returning ``input_ids`` and ``attention_mask``.
        preprocess_fn: Function that turns a table DataFrame into a string.
    """

    def __init__(self, dataframe, data_folder, tokenizer, preprocess_fn):
        self.dataframe = dataframe
        self.data_folder = data_folder
        self.tokenizer = tokenizer
        self.preprocess_fn = preprocess_fn

    def __len__(self):
        return len(self.dataframe)

    @staticmethod
    def _as_text(value):
        """Map a missing cell (NaN/None) to an empty string; stringify anything else."""
        return "" if pd.isna(value) else str(value)

    def __getitem__(self, idx):
        """Return ``(input_ids, attention_mask, text, title)`` for sample ``idx``.

        ``input_ids`` / ``attention_mask`` encode the serialised table only.
        ``text`` and ``title`` are the reference strings (empty when missing,
        as in the submission template).
        """
        data_id = str(int(self.dataframe.iloc[idx, 0]))
        text = self._as_text(self.dataframe.iloc[idx, 1])
        title = self._as_text(self.dataframe.iloc[idx, 2])
        table_path = os.path.join(self.data_folder, f"{data_id}.csv")
        table_df = pd.read_csv(table_path)
        # The table files carry an unnamed row-index column; it holds no content
        # and would only spend part of the 512-token budget.
        content_cols = [col for col in table_df.columns if not str(col).startswith("Unnamed:")]
        table_df = table_df[content_cols]
        # Encode the table alone. Appending the reference text/title here would
        # leak the answer into the encoder input, and would feed the literal
        # string "nan" at inference time, when the references are missing.
        source_text = self.preprocess_fn(table_df)
        inputs = self.tokenizer(source_text, return_tensors="pt", padding="max_length", truncation=True, max_length=512)
        # Drop only the leading batch dimension added by return_tensors="pt".
        input_ids = inputs['input_ids'].squeeze(0)
        attention_mask = inputs['attention_mask'].squeeze(0)
        return input_ids, attention_mask, text, title

data_path = '/content/drive/MyDrive/data/dataset_nlg_v1/train.csv'
data_folder = '/content/drive/MyDrive/data/dataset_nlg_v1/data'
# Read WITHOUT index_col so the sample id stays in column 0, where
# TableDataset.__getitem__ looks it up via iloc[idx, 0].
data = pd.read_csv(data_path)

# **Data loader**

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the samples for validation; the fixed seed keeps the split reproducible.
train_data, val_data = train_test_split(data, test_size=0.2, random_state=42)

train_dataset = TableDataset(train_data, data_folder, tokenizer, preprocess_table_to_string)
train_loader = DataLoader(train_dataset, batch_size=1, shuffle=True)

# NOTE(review): the validation set uses the same preprocess function as training. After the
# augmentation cell has run, that name refers to the version that adds noise and swaps rows,
# so validation inputs are randomly perturbed as well — confirm whether this is intended.
val_dataset = TableDataset(val_data, data_folder, tokenizer, preprocess_table_to_string)
val_loader = DataLoader(val_dataset, batch_size=1, shuffle=False)

# **T5-base without MLM**

In [None]:
import torch.nn as nn

class T5Model(nn.Module):
    """Thin ``nn.Module`` wrapper that delegates every call to a wrapped T5 model."""

    def __init__(self, t5_model):
        super().__init__()
        self.t5_model = t5_model

    def forward(self, input_ids, attention_mask=None, labels=None):
        """Run the wrapped model and hand back its output object unchanged."""
        call_kwargs = dict(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        return self.t5_model(**call_kwargs)

    def generate(self, *args, **kwargs):
        """Forward all arguments to the wrapped model's ``generate``."""
        delegate = self.t5_model.generate
        return delegate(*args, **kwargs)


# **T5-base with MLM**

In [None]:
class T5WithMLMHead(nn.Module):
    """T5 wrapper that adds a masked-language-modelling loss on the encoder.

    Training loss: ``seq2seq_loss + mlm_weight * mlm_loss``. For the MLM term a
    random subset of the (non-padding) input tokens is replaced by a mask token,
    the corrupted sequence is run through the T5 encoder, and ``mlm_head``
    predicts the original tokens at the masked positions.

    Args:
        t5_model: Wrapped T5 model; must expose ``config.d_model``,
            ``config.vocab_size``, ``get_encoder()`` and ``generate``.
        mlm_weight: Regularisation weight (lambda) of the MLM term.
        mask_prob: Probability with which each input token is masked.
        mask_token_id: Token id written at masked positions. The default is the
            id of ``<extra_id_0>`` in the stock T5 vocabulary.
            NOTE(review): confirm this id against the tokenizer in use.
    """

    def __init__(self, t5_model, mlm_weight=0.4, mask_prob=0.15, mask_token_id=32099):
        super().__init__()
        self.t5_model = t5_model
        self.mlm_head = nn.Linear(t5_model.config.d_model, t5_model.config.vocab_size)
        self.mlm_weight = mlm_weight
        self.mask_prob = mask_prob
        self.mask_token_id = mask_token_id

    def forward(self, input_ids, attention_mask=None, labels=None):
        """Return ``{"loss": total}`` when ``labels`` are given, else the decoder logits."""
        outputs = self.t5_model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)

        if labels is None:
            return outputs.logits

        mlm_loss = self._encoder_mlm_loss(input_ids, attention_mask)
        total_loss = outputs.loss + self.mlm_weight * mlm_loss
        return {"loss": total_loss}

    def generate(self, *args, **kwargs):
        """Forward all arguments to the wrapped model's ``generate``."""
        return self.t5_model.generate(*args, **kwargs)

    def _encoder_mlm_loss(self, input_ids, attention_mask=None):
        """Mask random input tokens and score the encoder's reconstruction of them."""
        if attention_mask is None:
            maskable = torch.ones_like(input_ids, dtype=torch.bool)
        else:
            maskable = attention_mask.bool()  # never mask padding positions
        chosen = (torch.rand(input_ids.shape, device=input_ids.device) < self.mask_prob) & maskable
        if not chosen.any():
            # Nothing was masked: cross-entropy over zero targets would be NaN.
            return self.mlm_head.weight.new_zeros(())

        corrupted = input_ids.masked_fill(chosen, self.mask_token_id)
        targets = input_ids.masked_fill(~chosen, -100)  # -100 = ignored by the loss
        hidden = self.t5_model.get_encoder()(input_ids=corrupted, attention_mask=attention_mask).last_hidden_state
        return self.compute_mlm_loss(self.mlm_head(hidden), targets)

    def compute_mlm_loss(self, logits, labels):
        """Token-level cross-entropy; positions labelled -100 are ignored."""
        loss_fct = nn.CrossEntropyLoss()
        mlm_loss = loss_fct(logits.view(-1, logits.size(-1)), labels.view(-1))
        return mlm_loss

In [5]:
# Use the GPU when one is available; fall back to CPU otherwise.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Baseline: the plain T5 wrapper (no MLM term) initialised from the public `t5-base` weights.
model = T5Model(T5ForConditionalGeneration.from_pretrained('t5-base').to(device))
optimizer = AdamW(model.parameters(), lr=0.0001)

In [6]:
import nltk
from bert_score import score
import sacrebleu
from rouge_score import rouge_scorer
from nltk.tokenize import word_tokenize
from nltk.translate import meteor_score
# NLTK resources fetched once per runtime; presumably 'punkt' backs word_tokenize and
# 'wordnet' backs METEOR's synonym matching — verify against the installed NLTK version.
nltk.download('punkt')
nltk.download('wordnet')

In [None]:
# Drive folder that receives the checkpoint and the metrics report.
new_folder_path = '/content/drive/MyDrive/model_weights_metrics'
os.makedirs(new_folder_path, exist_ok=True)

# Training and Evaluation function

In [None]:
def train_model(model, train_loader, optimizer, device, tokenizer=None, save_path=None, max_target_length=128):
    """Run one pass over ``train_loader`` and save the resulting weights.

    Each batch is ``(input_ids, attention_mask, texts, titles)``. The decoder
    target of a sample is ``"<text> <title>"``, the same string
    ``evaluate_model`` uses as reference, with padding positions set to -100
    so they do not contribute to the loss.

    Args:
        model: Module whose forward accepts ``input_ids``, ``attention_mask``
            and ``labels`` and returns an object indexable by ``'loss'``.
        train_loader: Iterable of batches as described above.
        optimizer: Optimizer over ``model``'s parameters.
        device: Device the tensors are moved to.
        tokenizer: Tokenizer for the targets; defaults to the notebook-level
            ``tokenizer``.
        save_path: Checkpoint path; defaults to ``<new_folder_path>/model.pth``.
        max_target_length: Targets longer than this many tokens are truncated.

    Raises:
        KeyError: If ``tokenizer`` is omitted and no notebook-level ``tokenizer`` exists.
    """
    if tokenizer is None:
        tokenizer = globals()["tokenizer"]
    if save_path is None:
        save_path = f'{new_folder_path}/model.pth'

    model.train()
    total_loss = 0.0
    batch_count = 0

    for input_ids, attention_mask, texts, titles in train_loader:
        input_ids, attention_mask = input_ids.to(device), attention_mask.to(device)

        # Supervise with the reference strings. Using the encoder input as labels
        # only teaches the model to copy its input.
        targets = [f"{text} {title}" for text, title in zip(texts, titles)]
        labels = tokenizer(targets, return_tensors="pt", padding=True, truncation=True, max_length=max_target_length)["input_ids"]
        labels = labels.masked_fill(labels == tokenizer.pad_token_id, -100).to(device)

        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs['loss']
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        batch_count += 1

    torch.save(model.state_dict(), save_path)
    print(f"Trained on {batch_count} batches.")
    if batch_count:  # an empty loader has no average to report
        print(f"Average training loss: {total_loss / batch_count}")

def evaluate_model(model, val_loader, tokenizer, device):
    """Generate predictions for ``val_loader`` and report the four task metrics.

    The reference of a sample is ``"<text> <title>"``. Reported metrics:
    SacreBLEU (corpus level, 0-100 scale), mean ROUGE-L F-measure, mean METEOR
    and mean BERTScore F1 computed with ``bert-base-uncased`` at layer 9.
    The metrics are printed and written to
    ``<new_folder_path>/evaluation_metrics.txt``.

    Args:
        model: Object exposing ``eval()`` and ``generate(...)``.
        val_loader: Iterable of ``(input_ids, attention_mask, texts, titles)`` batches.
        tokenizer: Tokenizer used to decode the generated ids.
        device: Device the input tensors are moved to.

    Returns:
        The mean BERTScore F1 as a float, so callers can use it as a selection objective.

    Raises:
        ValueError: If ``val_loader`` yields no samples.
    """
    model.eval()
    rouge = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
    bert_scorer = BERTScorer(model_type='bert-base-uncased', batch_size=8, num_layers=9)

    predictions, references = [], []
    batch_count = 0
    with torch.no_grad():
        for input_ids, attention_mask, texts, titles in val_loader:
            input_ids, attention_mask = input_ids.to(device), attention_mask.to(device)
            generated_ids = model.generate(input_ids, attention_mask=attention_mask, max_length=50, num_beams=5, early_stopping=True)
            predictions.extend(tokenizer.batch_decode(generated_ids, skip_special_tokens=True))
            references.extend(text + " " + title for text, title in zip(texts, titles))
            batch_count += 1

    if not predictions:
        raise ValueError("val_loader yielded no samples; nothing to evaluate")

    # SacreBLEU applies its own tokenization, so it receives the raw strings;
    # the single reference stream is aligned with the predictions.
    bleu_score = sacrebleu.corpus_bleu(predictions, [references]).score
    rouge_scores = [rouge.score(ref, pred)['rougeL'].fmeasure for ref, pred in zip(references, predictions)]
    # METEOR works on token lists.
    meteor_scores = [
        meteor_score.single_meteor_score(word_tokenize(ref), word_tokenize(pred))
        for ref, pred in zip(references, predictions)
    ]
    # Score with the scorer configured above (bert-base-uncased, layer 9).
    _, _, bert_f1 = bert_scorer.score(predictions, references)

    avg_rouge = sum(rouge_scores) / len(rouge_scores)
    avg_meteor = sum(meteor_scores) / len(meteor_scores)
    avg_bert_score = bert_f1.mean().item()

    report_lines = [
        f"SacreBLEU Score: {bleu_score}",
        f"Average ROUGE-L Score: {avg_rouge}",
        f"METEOR Score: {avg_meteor}",
        f"BERTScore: {avg_bert_score}",
    ]
    with open(f'{new_folder_path}/evaluation_metrics.txt', 'w') as f:
        f.write("\n".join(report_lines) + "\n")

    print(f"Evaluated on {batch_count} batches.")
    for line in report_lines:
        print(line)

    return avg_bert_score


# **Main execution**

In [7]:
# One pass over the training loader, then score the result on the held-out split.
train_model(model, train_loader, optimizer, device)
evaluate_model(model, val_loader,tokenizer, device)

# **Inference**

In [8]:
import pandas as pd
from torch.utils.data import DataLoader
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Restore the weights written by train_model.
# NOTE(review): only the weights are reloaded; `model` itself must already exist in the kernel.
model_path = '/content/drive/MyDrive/model_weights_metrics/model.pth'
model.load_state_dict(torch.load(model_path, map_location=device))
model.eval()
tokenizer = T5Tokenizer.from_pretrained('t5-base')

def inference(submission_path, data_folder, output_folder):
    """Generate predictions for every row of the submission CSV and save them.

    Reads ``submission_path``, wraps it in a ``TableDataset`` over
    ``data_folder``, runs beam search with the notebook-level ``model``,
    ``tokenizer`` and ``device``, fills the ``text`` and ``title`` columns and
    writes ``<output_folder>/submission.csv``.

    Args:
        submission_path: CSV whose first column holds the sample ids.
        data_folder: Folder with one ``<id>.csv`` table per sample.
        output_folder: Existing folder that receives ``submission.csv``.
    """
    submission_df = pd.read_csv(submission_path, dtype={0: 'int'})
    dataset = TableDataset(submission_df, data_folder, tokenizer, preprocess_table_to_string)
    loader = DataLoader(dataset, batch_size=1, shuffle=False)

    texts = []
    titles = []

    with torch.no_grad():
        # The reference text/title returned by the dataset are ignored here.
        for input_ids, attention_mask, _, _ in loader:
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            generated_ids = model.generate(input_ids, attention_mask=attention_mask, max_length=50, num_beams=5, early_stopping=True)
            # batch_size is 1, so the first sequence is the only one.
            generated_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
            # NOTE(review): special tokens were already removed by skip_special_tokens=True,
            # so this separator presumably never occurs and `title` always falls back to
            # "Title missing" — confirm, and pick a separator that survives decoding.
            parts = generated_text.split(" </s> ")
            text = parts[0] if len(parts) > 0 else "Text missing"
            title = parts[1] if len(parts) > 1 else "Title missing"
            texts.append(text)
            titles.append(title)

    submission_df['text'] = texts
    submission_df['title'] = titles

    output_csv_path = f"{output_folder}/submission.csv"
    # index=False: the ids are a regular column of submission_df, not its index.
    submission_df.to_csv(output_csv_path, index=False)
    print(f"Inference completed and file saved to {output_csv_path}")

# Template with the ids to predict, the folder holding their tables, and the output location.
submission_csv_path = '/content/drive/MyDrive/data/dataset_nlg_v1/submission.csv'
data_folder_path = '/content/drive/MyDrive/data/dataset_nlg_v1/data'
output_folder = '/content/drive/MyDrive/model_weights_metrics'

inference(submission_csv_path, data_folder_path, output_folder)


## Reranking approach (15 pts)

Using maximum likelihood, an ideal model will assign all probability mass to the reference summary. During inference, the model must also generate the output based on possibly erroneous previous steps. This can affect the performance of the model, a phenomenon often called exposure bias. One way to solve this problem is to require our model to be able to accurately predict the ranking order of a set of most likely candidates via an additional contrastive loss term

$$L(x, y) = -LogLikelihood(x, y) + L_{contrastive}(x, y)$$

where

$$
L_{contrastive}(x, y) = \sum_i\sum_{j < i}\max(0, f(s_i(x)) - f(s_j(x)) + \alpha_{ij})
$$

where $\alpha_{ij} = \alpha \cdot (i - j)$ is a margin, $s_i$ and $s_j$ are different candidates (generated by [beam search](https://huggingface.co/blog/how-to-generate)) such that for selected ranking function $r$ $r(s_j, y) > r(s_i, y)$, and
$f(s)$ is a length-normalised estimated log-probability:

$$
f(s) = \frac{\sum_{t} LogProb(s_t| s_{<t}, x)}{|x|},
$$

where $|x|$ is the length of $x$.

Your task is to fine-tune the model with reranking-aware loss using BERTScore as the ranking function $r$, provide hyperparameter search for the margin scaling factor $\alpha$ using BERTScore as objective, report metrics for the best case (SacreBLEU, ROUGEL, METEOR, BERTScore), and prepare the submission file `submission_reranking.csv` and report the link on your finetuned checkpoint.

In [None]:
import torch.nn.functional as F

def contrastive_loss(scores, alpha):
    """Pairwise margin ranking loss over candidates ordered best-first.

    Computes ``sum_i sum_{j<i} max(0, scores[i] - scores[j] + alpha * (i - j))``,
    i.e. a candidate at a later (worse) position ``i`` is penalised whenever its
    score is not at least ``alpha * (i - j)`` below that of every earlier
    candidate ``j``.

    Args:
        scores: Tensor whose first dimension indexes the candidates.
        alpha: Margin scaling factor.

    Returns:
        A tensor (a scalar for 1-D ``scores``). With fewer than two candidates
        the result is a zero tensor rather than a Python int, so ``.item()``
        and ``.backward()`` keep working.
    """
    num_candidates = scores.size(0)
    # All index pairs (i, j) with j < i, i.e. the strict lower triangle.
    worse, better = torch.tril_indices(num_candidates, num_candidates, offset=-1, device=scores.device)
    margins = alpha * (worse - better).to(scores.dtype)
    # Let the per-pair margin broadcast over any trailing dimensions of `scores`.
    margins = margins.reshape(-1, *([1] * (scores.dim() - 1)))
    return F.relu(scores[worse] - scores[better] + margins).sum(dim=0)


In [None]:
def _ranked_candidate_scores(model, tokenizer, ranker, src_ids, src_mask, reference, num_candidates):
    """Return f(s) for the beam candidates of one source, ordered best-to-worst by BERTScore F1."""
    was_training = model.training
    model.eval()  # candidates are produced without dropout and without gradients
    with torch.no_grad():
        cand_ids = model.generate(
            src_ids,
            attention_mask=src_mask,
            max_length=50,
            num_beams=num_candidates,
            num_return_sequences=num_candidates,
            early_stopping=True,
        )
    model.train(was_training)

    cand_texts = tokenizer.batch_decode(cand_ids, skip_special_tokens=True)
    _, _, f1 = ranker.score(cand_texts, [reference] * len(cand_texts))
    order = torch.argsort(f1, descending=True).to(cand_ids.device)

    # generate() returns sequences that begin with the decoder start token; drop it
    # so the remainder can be used as labels (the model shifts them right itself).
    cand_labels = cand_ids[order][:, 1:]
    cand_labels = cand_labels.masked_fill(cand_labels == tokenizer.pad_token_id, -100)

    n = cand_labels.size(0)
    logits = model(src_ids.expand(n, -1), attention_mask=src_mask.expand(n, -1), labels=cand_labels).logits
    token_logp = F.log_softmax(logits, dim=-1).gather(-1, cand_labels.clamp(min=0).unsqueeze(-1)).squeeze(-1)
    keep = cand_labels != -100
    # NOTE(review): normalised by the candidate's token count; the task text writes |x| —
    # confirm which length is intended.
    return (token_logp * keep).sum(dim=1) / keep.sum(dim=1).clamp(min=1)


def train_model_with_alpha(model, optimizer, train_loader, val_loader, tokenizer, device, alpha, epochs=1, save_path='model.pth', num_candidates=4, max_target_length=128):
    """Fine-tune with ``loss = NLL + contrastive ranking loss`` and save after every epoch.

    For every training sample the current model proposes ``num_candidates`` beam
    candidates, which are ranked by BERTScore F1 against the reference
    ``"<text> <title>"``. Their length-normalised log-probabilities, ordered
    best-first, are passed to ``contrastive_loss`` with margin scale ``alpha``.
    The mean ranking loss over the batch is added to the usual NLL.

    Args:
        model: Module with ``forward(input_ids, attention_mask=, labels=)`` returning
            an object with ``.loss`` / ``.logits``, and a ``generate`` method.
        optimizer: Optimizer over ``model``'s parameters.
        train_loader: Iterable of ``(input_ids, attention_mask, texts, titles)`` batches.
        val_loader: Unused; kept so existing calls remain valid.
        tokenizer: Tokenizer used to encode targets and decode candidates.
        device: Device the tensors are moved to.
        alpha: Margin scaling factor of the contrastive term.
        epochs: Number of passes over ``train_loader``.
        save_path: Checkpoint path, overwritten after every epoch.
        num_candidates: Beam width and number of candidates ranked per sample.
        max_target_length: Targets longer than this many tokens are truncated.
    """
    ranker = BERTScorer(model_type='bert-base-uncased', num_layers=9)

    for epoch in range(epochs):
        model.train()
        total_loss = 0.0
        for batch_idx, batch in enumerate(train_loader):
            input_ids, attention_mask, texts, titles = batch
            input_ids, attention_mask = input_ids.to(device), attention_mask.to(device)

            references = [f"{text} {title}" for text, title in zip(texts, titles)]
            labels = tokenizer(references, return_tensors="pt", padding=True, truncation=True, max_length=max_target_length)["input_ids"]
            labels = labels.masked_fill(labels == tokenizer.pad_token_id, -100).to(device)

            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss

            ranking_loss = loss.new_zeros(())
            for row, reference in enumerate(references):
                cand_scores = _ranked_candidate_scores(
                    model, tokenizer, ranker,
                    input_ids[row:row + 1], attention_mask[row:row + 1],
                    reference, num_candidates,
                )
                ranking_loss = ranking_loss + contrastive_loss(cand_scores, alpha)

            # Back-propagate the combined objective, not the NLL alone.
            loss = loss + ranking_loss / len(references)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

            if batch_idx % 100 == 0:
                print(f'Epoch {epoch+1}/{epochs}, Batch {batch_idx}/{len(train_loader)}, Loss: {total_loss / (batch_idx+1)}')

        torch.save(model.state_dict(), save_path)
        print(f'Model saved after epoch {epoch+1}')

    print('Training finished!')


In [None]:
alpha_values = [0.01, 0.1, 0.5, 1.0]
best_alpha = 0
best_bert_score = float("-inf")
num_batches = 50  # number of validation samples used to score each alpha

# Every alpha is scored on the same fixed validation subset to keep the search affordable.
search_subset = torch.utils.data.Subset(val_dataset, range(min(num_batches, len(val_dataset))))
search_loader = DataLoader(search_subset, batch_size=1, shuffle=False)

# Snapshot the current weights so each alpha starts from the same model.
baseline_state = {name: tensor.detach().cpu().clone() for name, tensor in model.state_dict().items()}

for alpha in alpha_values:
    model.load_state_dict(baseline_state)
    # Fresh optimizer per trial so no optimizer state carries over between alphas.
    trial_optimizer = AdamW(model.parameters(), lr=0.0001)
    train_model_with_alpha(
        model, trial_optimizer, train_loader, search_loader, tokenizer, device,
        alpha=alpha, epochs=1, save_path=f'{new_folder_path}/model_alpha_{alpha}.pth',
    )
    # evaluate_model is expected to return the mean BERTScore, the search objective.
    current_bert_score = evaluate_model(model, search_loader, tokenizer, device)
    print(f"alpha={alpha}: BERTScore={current_bert_score}")
    if current_bert_score > best_bert_score:
        best_bert_score = current_bert_score
        best_alpha = alpha

# Leave `model` at the baseline weights so later cells do not start from the last trial.
model.load_state_dict(baseline_state)
print(f"Best alpha: {best_alpha}")


In [None]:
# Final reranking-aware fine-tuning with the margin scale selected by the alpha search,
# rather than a hard-coded value.
train_model_with_alpha(model, optimizer, train_loader, val_loader, tokenizer, device, alpha=best_alpha, epochs=5, save_path='model.pth')
evaluate_model(model, val_loader, tokenizer, device)