In [None]:
# Mount Google Drive and make the project folder the working directory.
# Later cells load the CSV splits through paths relative to this folder,
# so this cell has to run before them.
from google.colab import drive
drive.mount('/content/drive')
%cd /content/drive/MyDrive/NLP-Project/

Mounted at /content/drive
/content/drive/MyDrive/NLP-Project


In [None]:
import datasets
import pandas as pd
from transformers import (
    T5Tokenizer,
    T5ForConditionalGeneration,
    BartForConditionalGeneration,
    BartTokenizer,
    DataCollatorForSeq2Seq,
    Trainer,
    TrainingArguments,
    EarlyStoppingCallback,
    pipeline
)
from datasets import Features, Value
import os
from datasets import Dataset, DatasetDict

# **Data Preprocessing**

In [None]:
import pandas as pd

# Directory on Drive that holds the cleaned train/val/test caption CSVs.
src_dir = "/content/drive/MyDrive/NLP-Project/civitai_image_prompt_captioned_cleaned/"
filenames = [
    "train_civitai_image_prompt_captioned_cleaned.csv",
    "val_civitai_image_prompt_captioned_cleaned.csv",
    "test_civitai_image_prompt_captioned_cleaned.csv"
]

# Matches only the leading "The image depicts" / "The image features" verb
# phrase (case-insensitive). Everything after it -- the space, any article,
# the rest of the sentence -- is left untouched, e.g.
#   "The image features two cars"  -> "Generate an image of two cars"
#   "The image depicts an old man" -> "Generate an image of an old man"
# A pattern that also swallows the trailing space and an optional "a" glues
# the words together ("Generate an image oftwo ...", "... ofn old man") and
# drops the article, so the match deliberately stops at the verb.
CAPTION_PREFIX_PATTERN = r"(?i)^The image (?:depicts|features)\b"

for fname in filenames:
    full_path = src_dir + fname
    df = pd.read_csv(full_path)

    # Rewrite the descriptive opening into an instruction-style opening.
    df['caption'] = df['caption'].str.replace(
        CAPTION_PREFIX_PATTERN,
        "Generate an image of",
        regex=True
    )

    # Save next to the source file with a "_rewritten2" suffix.
    # NOTE(review): the loading cells further down read "*_rewritten.csv",
    # not "*_rewritten2.csv" -- confirm which file set is the intended one.
    new_fname = fname.replace(".csv", "_rewritten2.csv")
    df.to_csv(src_dir + new_fname, index=False)
    print(f"Processed and saved: {src_dir + new_fname}")

Processed and saved: /content/drive/MyDrive/NLP-Project/civitai_image_prompt_captioned_cleaned/train_civitai_image_prompt_captioned_cleaned_rewritten2.csv
Processed and saved: /content/drive/MyDrive/NLP-Project/civitai_image_prompt_captioned_cleaned/val_civitai_image_prompt_captioned_cleaned_rewritten2.csv
Processed and saved: /content/drive/MyDrive/NLP-Project/civitai_image_prompt_captioned_cleaned/test_civitai_image_prompt_captioned_cleaned_rewritten2.csv


In [None]:
# Peek at the first rows of the rewritten training CSV to sanity-check the captions.
train_preview_path = (
    "/content/drive/MyDrive/NLP-Project/civitai_image_prompt_captioned_cleaned/"
    "train_civitai_image_prompt_captioned_cleaned_rewritten.csv"
)
df = pd.read_csv(train_preview_path)
df.head()

Unnamed: 0,id,prompt,image_filename,caption
0,70251977,"slums beautiful landscape, evening, woman, ve...",70251977.jpg,Generate an image of a serene landscape at dus...
1,29649516,"extremely detailed landscape, (futuristic mono...",29649516.jpg,"Generate an image of a towering, dark rock mon..."
2,24627882,"a picture of a of a small fat fluffy creature,...",24627882.jpg,"Generate an image of cute, fluffy white kitten..."
3,32809724,"score_9, score_8_up, score_8, BREAK, zzMajor...",32809724.jpg,"Generate an image of vibrant, stylized charact..."
4,20975912,Creates an ultra detailed and realistic image ...,20975912.jpg,Generate an image oftwo sleek black muscle car...


# **Dataset Loading & Input/Target Formatting**

In [None]:
# ---
# 1) Load the datasets
# ---
# CSV path of every split, relative to the current working directory.
filenames = {
    "train": "civitai_image_prompt_captioned_cleaned/train_civitai_image_prompt_captioned_cleaned_rewritten.csv",
    "validation": "civitai_image_prompt_captioned_cleaned/val_civitai_image_prompt_captioned_cleaned_rewritten.csv",
    "test": "civitai_image_prompt_captioned_cleaned/test_civitai_image_prompt_captioned_cleaned_rewritten.csv"
}

# Read each split into its own DataFrame (df_test is reused for evaluation later).
df_train, df_val, df_test = (
    pd.read_csv(filenames[split]) for split in ("train", "validation", "test")
)

# Wrap the three frames in a single Hugging Face DatasetDict.
raw_datasets = DatasetDict(
    {
        split: Dataset.from_pandas(frame)
        for split, frame in zip(
            ("train", "validation", "test"),
            (df_train, df_val, df_test),
        )
    }
)

In [None]:
# ---
# 2) Prepare "input" and "target"
# ---
def make_input_target(example):
    """Map one raw row onto the source/target pair used for fine-tuning.

    The caption (formatted as a string) becomes "input_text" and the prompt
    is passed through unchanged as "target_text".
    """
    return {
        "input_text": format(example["caption"]),
        "target_text": example["prompt"],
    }

# Convert every split to input_text/target_text pairs and drop the raw CSV columns.
original_columns = raw_datasets["train"].column_names
processed_datasets = raw_datasets.map(make_input_target, remove_columns=original_columns)

Map:   0%|          | 0/1058 [00:00<?, ? examples/s]

Map:   0%|          | 0/132 [00:00<?, ? examples/s]

Map:   0%|          | 0/133 [00:00<?, ? examples/s]

# **T5**

In [None]:
pip install --upgrade datasets

Collecting datasets
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.6.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.5/491.5 kB[0m [31m20.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2025.3.0-py3-none-any.whl (193 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m15.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fsspec, datasets
  Attempting uninstall: fsspec
    Found existing installation: fsspec 2025.3.2
    Uninstalling fsspec-2025.3.2:
      Successfully uninstalled fsspec-2025.3.2
  Attempting uninstall: datasets
    Found existing installation: datasets 2.14.4
    Uninstalling datasets-2.14.4:
      Successfully uninstalled datasets-2.14.4
[31mERROR: pip's dependency r

In [None]:
# ---
# 3) Load T5 tokenizer
# ---
model_folder = "/content/drive/MyDrive/NLP-Project/t5-unsup-pretrain"
tokenizer = T5Tokenizer.from_pretrained(model_folder)

# Only fall back to EOS when the checkpoint defines no pad token at all.
# T5 tokenizers normally ship their own "<pad>" token; aliasing pad to EOS
# makes any label masking keyed on `pad_token_id` also mask the genuine
# end-of-sequence label, so the model is never trained to stop generating.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# ---
# 4) Tokenize both inputs and targets
# ---
# Inputs and targets are truncated/padded to this many tokens.
max_length = 128

def tokenize_fn(examples):
    """Tokenize a batch of source/target pairs for seq2seq fine-tuning.

    Args:
        examples: batch dict with "input_text" and "target_text" lists.

    Returns:
        dict with "input_ids" and "attention_mask" for the inputs and
        "labels" for the targets, where padded label positions are -100.
    """
    # Tokenize the "input_text"
    inputs = tokenizer(
        examples["input_text"],
        truncation=True,
        padding="max_length",
        max_length=max_length,
    )
    # Tokenize the "target_text". `text_target=` replaces the deprecated
    # `as_target_tokenizer()` context manager.
    targets = tokenizer(
        text_target=examples["target_text"],
        truncation=True,
        padding="max_length",
        max_length=max_length,
    )

    # Mask padding via the attention mask rather than by comparing token ids:
    # when the pad token is the same id as EOS, an id comparison would also
    # blank out the real end-of-sequence label.
    labels = [
        [tok if keep else -100 for tok, keep in zip(ids, mask)]
        for ids, mask in zip(targets["input_ids"], targets["attention_mask"])
    ]

    return {
        "input_ids": inputs["input_ids"],
        "attention_mask": inputs["attention_mask"],
        "labels": labels,
    }

# Encode every split in batches; the raw text columns are dropped once encoded.
tokenized_datasets = processed_datasets.map(
    tokenize_fn,
    remove_columns=["input_text", "target_text"],
    batched=True,
)


# ---
# 5) Load T5 model and resize embeddings
# ---
model = T5ForConditionalGeneration.from_pretrained(model_folder)
# Make the embedding matrix match the tokenizer's vocabulary size.
model.resize_token_embeddings(len(tokenizer))


# ---
# 6) Data collator for Seq2Seq + Trainer setup
# ---
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model, label_pad_token_id=-100)

# ---
# 7) Configure Training Arguments for Early Stopping
# ---
training_args = TrainingArguments(
    output_dir="t5-kw-controlled-finetuned",
    # --- optimisation ---
    num_train_epochs=20,
    learning_rate=5e-5,
    warmup_steps=100,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=8,  # 4 x 8 = 32 sequences per optimizer step per device
    fp16=True,
    # --- logging / evaluation / checkpointing, all step-based ---
    logging_strategy="steps",
    logging_steps=100,
    eval_strategy="steps",
    save_strategy="steps",
    save_steps=100,
    save_total_limit=2,  # keep at most two checkpoints on disk
    # --- model selection ---
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    report_to=[],
)


# ---
# 8) Set up Trainer with datasets and early stopping
# ---
early_stopping = EarlyStoppingCallback(early_stopping_patience=4)
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],  # validation split drives early stopping
    callbacks=[early_stopping],
)


# ---
# 9) Fine‐tune the model
# ---
trainer.train()

Map:   0%|          | 0/1058 [00:00<?, ? examples/s]



Map:   0%|          | 0/132 [00:00<?, ? examples/s]

Map:   0%|          | 0/133 [00:00<?, ? examples/s]

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss,Validation Loss
100,4.8253,3.474996
200,3.3613,3.10107
300,3.0922,2.982335
400,2.9292,2.925564
500,2.8545,2.893809
600,2.8054,2.877954


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


TrainOutput(global_step=680, training_loss=3.245566020292394, metrics={'train_runtime': 772.6024, 'train_samples_per_second': 27.388, 'train_steps_per_second': 0.88, 'total_flos': 3221387241062400.0, 'train_loss': 3.245566020292394, 'epoch': 20.0})

In [None]:
import os
from transformers import pipeline

# Folder on Drive that receives the fine-tuned T5 weights and tokenizer files.
model_folder_2 = "/content/drive/MyDrive/NLP-Project/t5-finetuned"
os.makedirs(model_folder_2, exist_ok=True)

# Persist the model first, then the tokenizer, into the same folder.
trainer.save_model(model_folder_2)
tokenizer.save_pretrained(model_folder_2)

# Reload both from disk into a text2text pipeline on device 0.
generator_2 = pipeline("text2text-generation", model=model_folder_2, tokenizer=model_folder_2, device=0)

In [None]:
# Generate a prompt for one example description with the fine-tuned T5 pipeline.
# Only `max_new_tokens` is passed: when `max_length` is supplied as well,
# transformers warns that `max_new_tokens` takes precedence, so the extra
# argument had no effect besides the warning.
output = generator_2(
    "A young woman in a forest, dressed in a flowing, natural-looking garment adorned with feathers and jewelry, as she walks gracefully through lush greenery.",
    max_new_tokens=100,
    do_sample=True,
)[0]["generated_text"]

# Show the sampled text.
print(output)

Both `max_new_tokens` (=100) and `max_length`(=128) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


a woman in a forest, wearing a flowing, natural-looking garment adorned with feathers and jewelry, walking through lush greenery. The woman is wearing a flowing, natural-looking garment adorned with feathers and jewelry, and is wearing a flowing, natural-looking garment. She is wearing a flowing, natural-looking garment adorned with feathers and jewelry. She is wearing a flowing, natural-looking garment adorned with feathers and jewelry


# **Test T5 with BLEU and BERTScore F1**

In [None]:
!pip install evaluate bert_score

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


In [None]:
from transformers import pipeline
import evaluate
from datasets import Dataset

class Text2TextEvaluator:
    """Generate predictions with a fine-tuned text2text model and score them.

    The model is fed the text in ``input_column`` and its generations are
    compared against the text in ``reference_column`` with BLEU and
    BERTScore F1. Feeding the reference column itself to the model would only
    measure how well the model copies its input, so the input and reference
    default to different columns (caption -> prompt).
    """

    def __init__(
        self,
        model_path: str,
        df_test,
        device: int = 0,
        max_new_tokens: int = 100,
        do_sample: bool = True,
        batch_size: int = 32,
        input_column: str = "caption",
        reference_column: str = "prompt",
        prediction_prefix: str = "",
    ):
        """
        model_path: path or HF ID for finetuned text2text model
        df_test: pandas.DataFrame holding `input_column` and `reference_column`
        device: device index handed to the generation pipeline
        max_new_tokens: generation budget per example
        do_sample: sample instead of decoding greedily; with sampling the
            scores vary from run to run
        batch_size: number of examples generated per pipeline call
        input_column: column whose text is given to the model
        reference_column: column holding the gold text to score against
        prediction_prefix: optional string prepended to every generation
            before scoring (empty by default)
        """
        self.model_path = model_path
        self.df_test = df_test
        self.device = device
        self.max_new_tokens = max_new_tokens
        self.do_sample = do_sample
        self.batch_size = batch_size
        self.input_column = input_column
        self.reference_column = reference_column
        self.prediction_prefix = prediction_prefix

        # metrics
        self.bleu = evaluate.load("bleu")
        self.bertscore = evaluate.load("bertscore", device=device)

        # prepare dataset & pipeline
        self._build_dataset()
        self._build_pipeline()

    def _build_dataset(self):
        """Build `self.ds_test` from the input and reference columns only."""
        # dict.fromkeys de-duplicates while keeping order, in case both
        # settings name the same column.
        columns = list(dict.fromkeys([self.input_column, self.reference_column]))
        missing = [c for c in columns if c not in self.df_test.columns]
        if missing:
            raise ValueError(f"df_test is missing required column(s): {missing}")

        # Rows without text cannot be generated from or scored, so drop them.
        frame = self.df_test[columns].dropna().astype(str).reset_index(drop=True)
        if frame.empty:
            raise ValueError("df_test has no rows with both input and reference text")
        self.ds_test = Dataset.from_pandas(frame)

    def _build_pipeline(self):
        """Create the text2text generation pipeline used for predictions."""
        self.generator = pipeline(
            "text2text-generation",
            model=self.model_path,
            tokenizer=self.model_path,
            device=self.device,
            max_new_tokens=self.max_new_tokens,
            do_sample=self.do_sample,
            batch_size=self.batch_size,
        )

    def _gen_batch(self, batch):
        """Generate one batch and store the cleaned outputs under "pred"."""
        sources = batch[self.input_column]
        outs = self.generator(sources, batch_size=len(sources))
        batch["pred"] = [
            self.prediction_prefix + o["generated_text"].strip() for o in outs
        ]
        return batch

    def evaluate(self):
        """Run generation over the test set and return BLEU / BERTScore F1.

        Returns:
            dict with "BLEU" and "BERTScore_F1", both scaled by 100.
        """
        # 1) generate
        ds_pred = self.ds_test.map(
            self._gen_batch,
            batched=True,
            batch_size=self.batch_size,
        )

        # Predictions and references are read from the same dataset so they
        # stay aligned row by row.
        preds = list(ds_pred["pred"])
        refs = list(ds_pred[self.reference_column])

        # 2) BLEU (one reference per prediction)
        bleu_res = self.bleu.compute(
            predictions=preds,
            references=[[r] for r in refs]
        )

        # 3) BERTScore
        bert_res = self.bertscore.compute(
            predictions=preds,
            references=refs,
            lang="en",
            model_type="bert-base-uncased",
            rescale_with_baseline=True
        )

        # 4) return a summary dict
        f1_scores = bert_res["f1"]
        return {
            "BLEU": bleu_res["bleu"] * 100,
            "BERTScore_F1": 100 * sum(f1_scores) / len(f1_scores)
        }

In [None]:
# Score the fine-tuned T5 checkpoint on the test split DataFrame (df_test).
evaluator = Text2TextEvaluator(
    "/content/drive/MyDrive/NLP-Project/t5-finetuned",
    df_test,
    device=0,
)

results = evaluator.evaluate()
bleu_score = results["BLEU"]
bert_f1 = results["BERTScore_F1"]
print(f"→ BLEU:         {bleu_score:.2f}")
print(f"→ BERTScore F1: {bert_f1:.2f}")

Device set to use cuda:0


Map:   0%|          | 0/133 [00:00<?, ? examples/s]

→ BLEU:         7.35
→ BERTScore F1: 25.64


# **BART**

In [None]:
# ---
# 3) Load BART tokenizer
# ---
# Inputs and targets are truncated/padded to this many tokens.
max_length = 128

# ---
# 4) Tokenize input/target
# ---
# The tokenizer is read from the BART checkpoint folder on Drive.
model_folder = "/content/drive/MyDrive/NLP-Project/bart-unsup-pretrain"
tokenizer = BartTokenizer.from_pretrained(model_folder)

def tokenize_fn(examples):
    """Tokenize a batch of source/target pairs for seq2seq fine-tuning.

    Args:
        examples: batch dict with "input_text" and "target_text" lists.

    Returns:
        The tokenized inputs with an added "labels" entry holding the target
        token ids, where padded positions are replaced by -100.
    """
    model_inputs = tokenizer(
        examples["input_text"],
        truncation=True,
        padding="max_length",
        max_length=max_length,
    )
    # `text_target=` replaces the deprecated `as_target_tokenizer()` context
    # manager.
    targets = tokenizer(
        text_target=examples["target_text"],
        truncation=True,
        padding="max_length",
        max_length=max_length,
    )

    # Mask padding via the attention mask so that only real padding positions
    # are excluded, independent of which id the pad token uses.
    model_inputs["labels"] = [
        [tok if keep else -100 for tok, keep in zip(ids, mask)]
        for ids, mask in zip(targets["input_ids"], targets["attention_mask"])
    ]
    return model_inputs

# Encode every split in batches; the raw text columns are dropped once encoded.
tokenized_datasets = processed_datasets.map(
    tokenize_fn,
    remove_columns=["input_text", "target_text"],
    batched=True,
)

# ---
# 5) Load BART model
# ---
model = BartForConditionalGeneration.from_pretrained(model_folder)
# Make the embedding matrix match the tokenizer's vocabulary size.
model.resize_token_embeddings(len(tokenizer))

# ---
# 6) Data collator
# ---
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model, label_pad_token_id=-100)

# ---
# 7) Training arguments
# ---
training_args = TrainingArguments(
    output_dir="bart-kw-controlled-finetuned",
    # --- optimisation ---
    num_train_epochs=20,
    learning_rate=5e-5,
    warmup_steps=100,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=8,  # 4 x 8 = 32 sequences per optimizer step per device
    fp16=True,
    # --- logging / evaluation / checkpointing, all step-based ---
    logging_strategy="steps",
    logging_steps=100,
    eval_strategy="steps",
    save_strategy="steps",
    save_steps=100,
    save_total_limit=2,  # keep at most two checkpoints on disk
    # --- model selection ---
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    report_to=[],
)

# ---
# 8) Trainer setup
# ---
early_stopping = EarlyStoppingCallback(early_stopping_patience=3)
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],  # validation split drives early stopping
    tokenizer=tokenizer,
    callbacks=[early_stopping],
)

# ---
# 9) Train
# ---
trainer.train()

Map:   0%|          | 0/1058 [00:00<?, ? examples/s]



Map:   0%|          | 0/132 [00:00<?, ? examples/s]

Map:   0%|          | 0/133 [00:00<?, ? examples/s]

  trainer = Trainer(


Step,Training Loss,Validation Loss
100,6.5826,3.568853
200,3.1896,3.093141
300,2.6539,3.026177
400,2.3378,2.993963
500,2.151,3.018849
600,2.0202,3.022699


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].


TrainOutput(global_step=680, training_loss=3.0122655083151426, metrics={'train_runtime': 333.5006, 'train_samples_per_second': 63.448, 'train_steps_per_second': 2.039, 'total_flos': 1612752932044800.0, 'train_loss': 3.0122655083151426, 'epoch': 20.0})

In [None]:
# 6) Save model + tokenizer
import os
from transformers import pipeline

# Folder on Drive that receives the fine-tuned BART weights and tokenizer files.
model_folder_3 = "/content/drive/MyDrive/NLP-Project/bart-finetuned"
os.makedirs(model_folder_3, exist_ok=True)

# Persist the model first, then the tokenizer, into the same folder.
trainer.save_model(model_folder_3)
tokenizer.save_pretrained(model_folder_3)

# Reload both from disk into a text2text pipeline on device 0.
generator_3 = pipeline("text2text-generation", model=model_folder_3, tokenizer=model_folder_3, device=0)

In [None]:
# Generate a prompt for one example caption with the fine-tuned BART pipeline.
# Only `max_new_tokens` is passed: when `max_length` is supplied as well,
# transformers warns that `max_new_tokens` takes precedence, so the extra
# argument had no effect besides the warning.
output = generator_3(
    "Generate an image of a beautifully crafted, translucent dragon sculpture with iridescent purple and blue hues, posed majestically on a base of crystalline amethyst surrounded by shimmering light.",
    max_new_tokens=256,
    do_sample=True,
    top_p=0.9,
    temperature=0.8,
)[0]["generated_text"]

print(output)

Both `max_new_tokens` (=256) and `max_length`(=100) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


masterpiece, masterpiece, best quality, very aesthetic, absurdres, newest, 8K, depth of field, focused subject, 
dragon scales, scales, crystalline amethyst,


# **Test BART with BLEU and BERTScore F1**

In [None]:
# Score the fine-tuned BART checkpoint on the test split DataFrame (df_test).
evaluator = Text2TextEvaluator(
    "/content/drive/MyDrive/NLP-Project/bart-finetuned",
    df_test,
    device=0,
)

results = evaluator.evaluate()
bleu_score = results["BLEU"]
bert_f1 = results["BERTScore_F1"]
print(f"→ BLEU:         {bleu_score:.2f}")
print(f"→ BERTScore F1: {bert_f1:.2f}")

Map:   0%|          | 0/200 [00:00<?, ? examples/s]