In [None]:
from google.colab import drive
drive.mount('/content/drive')
%cd /content/drive/MyDrive/NLP-Project/

Mounted at /content/drive
/content/drive/MyDrive/NLP-Project


In [None]:
import os
import pandas as pd
from dataclasses import dataclass
from typing import Dict, List, Any
import torch
import numpy as np
from datasets import Dataset
from transformers import (
    T5Tokenizer,
    T5ForConditionalGeneration,
    BartTokenizer,
    BartForConditionalGeneration,
    DataCollatorForLanguageModeling,
    AutoModelForSeq2SeqLM,
    Trainer,
    TrainingArguments,
    EarlyStoppingCallback,
    pipeline
)

In [None]:
import transformers
print(transformers.__version__)

4.52.4


In [None]:
# 1) point to train/val/test CSVs:
import pandas as pd
from datasets import Dataset

# 1) Paths to your CSV splits
train_csv = "hf_image_prompt_captioned_fullset/train_hf_image_prompt_captioned_fullset.csv"
val_csv   = "hf_image_prompt_captioned_fullset/val_hf_image_prompt_captioned_fullset.csv"
test_csv  = "hf_image_prompt_captioned_fullset/test_hf_image_prompt_captioned_fullset.csv"

# 2) Load with pandas
df_train = pd.read_csv(train_csv)
df_val   = pd.read_csv(val_csv)
df_test  = pd.read_csv(test_csv)

# 3) Wrap into HF Datasets
ds_train = Dataset.from_pandas(df_train)
ds_val   = Dataset.from_pandas(df_val)
ds_test  = Dataset.from_pandas(df_test)

# **T5**

In [None]:
# 4) Load T5 tokenizer & model
model_checkpoint = "t5-base"   # or "t5-small", "t5-large", etc.
tokenizer       = T5Tokenizer.from_pretrained(model_checkpoint)
model           = T5ForConditionalGeneration.from_pretrained(model_checkpoint)

max_len = 128

In [None]:
# 4) Tokenize *only* the prompt text
def tokenize_fn(examples):
    # returns input_ids & attention_mask
    return tokenizer(
        examples["prompt"],
        truncation=True,
        padding="max_length",
        max_length=max_len
    )

ds_train = ds_train.map(tokenize_fn, batched=True, remove_columns=ds_train.column_names)
ds_val   = ds_val.map(tokenize_fn, batched=True, remove_columns=ds_val.column_names)
ds_test  = ds_test.map(tokenize_fn, batched=True, remove_columns=ds_test.column_names)

Map:   0%|          | 0/9470 [00:00<?, ? examples/s]

Map:   0%|          | 0/1184 [00:00<?, ? examples/s]

Map:   0%|          | 0/1184 [00:00<?, ? examples/s]

In [None]:
# =================================================================================
# 4. CUSTOM DATA COLLATOR for T5 Span Corruption (The Core Change)
# =================================================================================
@dataclass
class T5SpanCorruptionDataCollator:
    """
    Data collator that implements T5 span-corruption denoising.
    """
    tokenizer: T5Tokenizer
    noise_density: float = 0.15
    mean_noise_span_length: float = 3.0

    def __call__(self, examples: List[Dict[str, Any]]) -> Dict[str, Any]:
        # Convert list of dicts to dict of lists
        batch = {k: [dic[k] for dic in examples] for k in examples[0]}

        input_ids = batch["input_ids"]
        batch_size = len(input_ids)

        # Generate the corrupted inputs and the corresponding labels
        corrupted_input_ids, labels = self.t5_corrupt(input_ids)

        # Pad the corrupted inputs and labels
        padded_input_ids = self.tokenizer.pad(
            [{"input_ids": ids} for ids in corrupted_input_ids],
            padding="longest",
            return_tensors="pt"
        ).input_ids

        padded_labels = self.tokenizer.pad(
            [{"input_ids": ids} for ids in labels],
            padding="longest",
            return_tensors="pt"
        ).input_ids

        # In labels, replace padding token id with -100 so it's ignored in loss
        padded_labels[padded_labels == self.tokenizer.pad_token_id] = -100

        # The attention mask corresponds to the padded inputs
        attention_mask = (padded_input_ids != self.tokenizer.pad_token_id).long()

        return {
            "input_ids": padded_input_ids,
            "attention_mask": attention_mask,
            "labels": padded_labels
        }

    def t5_corrupt(self, input_ids: List[List[int]]):
        """Applies T5's span corruption to a batch of tokenized inputs."""
        corrupted_inputs = []
        label_sequences = []

        # Get sentinel tokens
        sentinel_tokens = [f"<extra_id_{i}>" for i in range(100)]
        sentinel_token_ids = self.tokenizer.convert_tokens_to_ids(sentinel_tokens)

        for example_ids in input_ids:
            # Don't corrupt special tokens
            # T5 tokenizer adds an EOS token at the end.
            actual_ids = [tok for tok in example_ids if tok >= 0 and tok not in self.tokenizer.all_special_ids]
            if not actual_ids:
                corrupted_inputs.append(example_ids)
                label_sequences.append([])
                continue

            # Decide which tokens to mask
            num_tokens_to_mask = int(round(len(actual_ids) * self.noise_density))

            # Avoid masking everything
            if num_tokens_to_mask == 0 and len(actual_ids) > 0:
                 num_tokens_to_mask = 1
            if num_tokens_to_mask >= len(actual_ids):
                num_tokens_to_mask = len(actual_ids) -1


            # Create a boolean mask for the tokens
            mask = np.array([False] * len(actual_ids))

            # Determine span lengths
            num_spans = int(round(num_tokens_to_mask / self.mean_noise_span_length))
            if num_spans == 0 and num_tokens_to_mask > 0:
                num_spans = 1

            span_lengths = np.random.poisson(self.mean_noise_span_length, num_spans).tolist()

            # Distribute the total number of masked tokens among the spans
            total_masked = sum(span_lengths)
            while total_masked > num_tokens_to_mask:
                idx = np.argmax(span_lengths)
                span_lengths[idx] -= 1
                total_masked -= 1
            while total_masked < num_tokens_to_mask:
                idx = np.random.randint(0, len(span_lengths))
                span_lengths[idx] += 1
                total_masked += 1

            # Select tokens to mask
            indices_to_mask = np.random.choice(len(actual_ids), num_tokens_to_mask, replace=False)
            mask[indices_to_mask] = True

            # Create the corrupted input and the label sequence
            new_input_ids = []
            current_label_sequence = []
            sentinel_idx = 0

            i = 0
            while i < len(actual_ids):
                if not mask[i]:
                    new_input_ids.append(actual_ids[i])
                    i += 1
                else:
                    start = i
                    while i < len(actual_ids) and mask[i]:
                        i += 1

                    # Add sentinel token to input
                    new_input_ids.append(sentinel_token_ids[sentinel_idx])

                    # Add sentinel and masked sequence to labels
                    current_label_sequence.append(sentinel_token_ids[sentinel_idx])
                    current_label_sequence.extend(actual_ids[start:i])
                    sentinel_idx += 1

            # Add final sentinel to label sequence if it was used
            if current_label_sequence:
                 current_label_sequence.append(sentinel_token_ids[sentinel_idx])

            corrupted_inputs.append(new_input_ids + [self.tokenizer.eos_token_id])
            label_sequences.append(current_label_sequence)

        return corrupted_inputs, label_sequences

# Instantiate the custom collator
data_collator = T5SpanCorruptionDataCollator(tokenizer=tokenizer)


In [None]:
# =================================================================================
# 5. Set Up Trainer
# =================================================================================
training_args = TrainingArguments(
    output_dir="t5-denoising-pretrain",
    overwrite_output_dir=True,
    num_train_epochs=20,  # Reduced for a quick example
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=3e-4,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    save_total_limit=2,
    fp16=True,
    logging_steps=100,
    report_to=["none"],
)

# Trainer with early stopping on val_loss
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=ds_train,
    eval_dataset=ds_val,
    tokenizer=tokenizer,
    data_collator=data_collator,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=4)]
)

# =================================================================================
# 6. Train the Model
# =================================================================================
print("Starting T5 denoising pre-training...")
trainer.train()

print("Training finished.")

  trainer = Trainer(


Starting T5 denoising pre-training...


Epoch,Training Loss,Validation Loss
1,0.5591,0.79699
2,0.5442,0.758857
3,0.6016,0.72159
4,0.6831,0.723168
5,2.1669,1.342998
6,2.6917,1.483887
7,2.613,1.481747


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


Training finished.


In [None]:
from transformers import pipeline
import os

model_folder = "/content/drive/MyDrive/NLP-Project/t5-unsup-pretrain"
os.makedirs(model_folder, exist_ok=True)

trainer.save_model(model_folder)
tokenizer.save_pretrained(model_folder)
generator = pipeline(
    "text2text-generation",
    model=model_folder,
    tokenizer=model_folder,
    device=0
)

You set `add_prefix_space`. The tokenizer needs to be converted from the slow tokenizers
Device set to use cuda:0


In [None]:
output = generator(
    "digital art trending on artstation a soviet astronaut playing soccer on mars mars landscape cinematic relaxing",
    max_new_tokens=256,
    do_sample=True,
    top_p=0.9,
    temperature=0.8
)[0]["generated_text"]

print(output)

a portrait of a man highly detailed concept art cinematic beautiful atmosphere beautiful vibrant colors atmosphere artstation  a atmosphere atmosphere  artstation concept art lighting a atmosphere cinematic lighting atmosphere atmosphere cinematic atmosphere cinematic relaxing relaxing relaxing relaxing relaxing relaxing relaxing relaxing relaxing relaxing relaxing relaxing relaxing relaxing relaxing relaxing relaxing relaxing relaxing relaxing relaxing relaxing relaxing relaxing relaxing atmosphere.


# **Test Evaluation with BLEU, BERT_F1**

In [None]:
pip install sacrebleu

Collecting sacrebleu
  Downloading sacrebleu-2.5.1-py3-none-any.whl.metadata (51 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/51.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting portalocker (from sacrebleu)
  Downloading portalocker-3.1.1-py3-none-any.whl.metadata (8.6 kB)
Collecting colorama (from sacrebleu)
  Downloading colorama-0.4.6-py2.py3-none-any.whl.metadata (17 kB)
Downloading sacrebleu-2.5.1-py3-none-any.whl (104 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Downloading portalocker-3.1.1-py3-none-any.whl (19 kB)
Installing collected packages: portalocker, colorama, sacrebleu
Successfully installed colorama-0.4.6 portalocker-3.1.1 sacrebleu-2.5.1


In [None]:
# Add this line to see the available columns
print("Columns in ds_test:", ds_test.column_names)

Columns in ds_test: ['input_ids', 'attention_mask']


In [None]:
from transformers import pipeline
import evaluate
import pandas as pd
from datasets import Dataset

# 1) Load your test DataFrame & wrap in a Dataset
df_test = pd.read_csv("hf_image_prompt_captioned_fullset/test_hf_image_prompt_captioned_fullset.csv")
ds_test = Dataset.from_pandas(df_test)

# 2) Point a text2text pipeline at your saved model folder
model_folder = "/content/drive/MyDrive/NLP-Project/t5-unsup-pretrain"
generator = pipeline(
    "text2text-generation",
    model=model_folder,
    tokenizer=model_folder,
    device=0,
    max_length=128,
    do_sample=False,
    batch_size=32,
)

# 3) Add a “pred” column with batched map
def gen_batch(batch):
    outs = generator(batch["prompt"])
    batch["pred"] = [o["generated_text"] for o in outs]
    return batch

# Only drop the auto‐added index column; keep “prompt”
to_remove = [c for c in ds_test.column_names if c not in ("prompt",)]
ds_pred = ds_test.map(
    gen_batch,
    batched=True,
    batch_size=32,
    remove_columns=to_remove
)

# 4) Compute BLEU and BERT‐F1 in one go
bleu      = evaluate.load("bleu")
bertscore = evaluate.load("bertscore", device=0)

preds = ds_pred["pred"]
refs  = ds_pred["prompt"]

bleu_res = bleu.compute(
    predictions=preds,
    references=[[r] for r in refs]    # list-of-lists form
)
bert_res = bertscore.compute(
    predictions=preds,
    references=refs,                  # simple list form
    lang="en",
    model_type="bert-base-uncased",
    rescale_with_baseline=True
)

print(f"→ BLEU:         {bleu_res['bleu']*100:.2f}")
print(f"→ BERTScore F1: {100 * sum(bert_res['f1'])/len(bert_res['f1']):.2f}")


Device set to use cuda:0


Map:   0%|          | 0/1184 [00:00<?, ? examples/s]

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

→ BLEU:         6.83
→ BERTScore F1: 29.48


# **BART**

In [None]:
# 3) Load BART tokenizer & model
checkpoint = "facebook/bart-base"
tokenizer  = BartTokenizer.from_pretrained(checkpoint)
model      = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

# 4) Tokenize **only** the prompt column
max_len = 128
def tokenize_fn(examples):
    return tokenizer(
        examples["prompt"],
        truncation=True,
        padding="max_length",
        max_length=max_len
    )

for split in ("train", "validation", "test"):
    ds = {"train": ds_train, "validation": ds_val, "test": ds_test}[split]
    to_drop = [c for c in ds.column_names if c != "prompt"]
    tokenized = ds.map(
        tokenize_fn,
        batched=True,
        remove_columns=to_drop
    )
    if split == "train":     ds_train = tokenized
    if split == "validation": ds_val   = tokenized
    if split == "test":      ds_test  = tokenized

# 5) Set up MLM collator for BART
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

# 6) TrainingArguments
training_args = TrainingArguments(
    output_dir="bart-unsup-pretrain-123",
    overwrite_output_dir=True,

    num_train_epochs=20,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=5e-5,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    save_total_limit=2,
    fp16=True,
    logging_steps=200,
    report_to=["none"],
)

# 7) Trainer with early stopping on val_loss
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=ds_train,
    eval_dataset=ds_val,
    tokenizer=tokenizer,
    data_collator=data_collator,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=4)]
)

# 8) Train!
print("Starting BART pre-training...")
trainer.train()
print("Done.")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.72k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/558M [00:00<?, ?B/s]

Map:   0%|          | 0/9470 [00:00<?, ? examples/s]

Map:   0%|          | 0/1184 [00:00<?, ? examples/s]

Map:   0%|          | 0/1184 [00:00<?, ? examples/s]

  trainer = Trainer(


Starting BART MLM pre-training...


Epoch,Training Loss,Validation Loss
1,0.0159,0.004478
2,0.0115,0.001131
3,0.0099,0.001833
4,0.0092,0.001236
5,0.0051,0.001003
6,0.0051,0.001111
7,0.0047,0.001051
8,0.0051,0.004152
9,0.0035,0.000756
10,0.0031,0.000919


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].


Done.


In [None]:
from transformers import pipeline
import os

model_folder = "/content/drive/MyDrive/NLP-Project/bart-unsup-pretrain"
os.makedirs(model_folder, exist_ok=True)

trainer.save_model(model_folder)
tokenizer.save_pretrained(model_folder)
generator = pipeline(
    "text2text-generation",
    model=model_folder,
    tokenizer=model_folder,
    device=0
)

Device set to use cuda:0


In [None]:
output = generator(
    "digital art trending on artstation a soviet astronaut playing soccer on mars mars landscape cinematic relaxing",
    max_new_tokens=50,
    do_sample=True,
    top_p=0.9,
    temperature=0.8
)[0]["generated_text"]

print(output)

digital art trending on artstation a soviet astronaut playing soccer on mars mars landscape cinematic relaxing


# **Test Evaluation with BLEU, BERT_F1**

In [None]:
from transformers import pipeline
import evaluate
import pandas as pd
from datasets import Dataset

# 1) Load your test DataFrame & wrap in a Dataset
df_test = pd.read_csv("hf_image_prompt_captioned_fullset/test_hf_image_prompt_captioned_fullset.csv")
ds_test = Dataset.from_pandas(df_test)

# 2) Point a text2text pipeline at your saved model folder
model_folder = "/content/drive/MyDrive/NLP-Project/bart-unsup-pretrain"
generator = pipeline(
    "text2text-generation",
    model=model_folder,
    tokenizer=model_folder,
    device=0,
    max_length=128,
    do_sample=False,
    batch_size=32,       # pipeline will batch inputs internally
)

# 3) Add a “pred” column with batched map
def gen_batch(batch):
    # batch["prompt"] is a list of strings
    outs = generator(batch["prompt"])
    batch["pred"] = [o["generated_text"] for o in outs]
    return batch

# Only drop the auto‐added index column; keep “prompt”
to_remove = [c for c in ds_test.column_names if c not in ("prompt",)]
ds_pred = ds_test.map(
    gen_batch,
    batched=True,
    batch_size=32,
    remove_columns=to_remove
)

# 4) Compute BLEU and BERT‐F1 in one go
bleu      = evaluate.load("bleu")
bertscore = evaluate.load("bertscore", device=0)

preds = ds_pred["pred"]
refs  = ds_pred["prompt"]

bleu_res = bleu.compute(
    predictions=preds,
    references=[[r] for r in refs]
)
bert_res = bertscore.compute(
    predictions=preds,
    references=refs,
    lang="en",
    model_type="bert-base-uncased",
    rescale_with_baseline=True
)

print(f"→ BLEU:         {bleu_res['bleu']*100:.2f}")
print(f"→ BERTScore F1: {100 * sum(bert_res['f1'])/len(bert_res['f1']):.2f}")
