In [24]:
import os
import gc
import json
import random
import numpy as np
import pandas as pd
import nltk
import torch
import optuna
from datasets import Dataset
from sklearn.model_selection import train_test_split
from transformers import (
    AutoTokenizer, BartForConditionalGeneration,
    Seq2SeqTrainer, Seq2SeqTrainingArguments,
    DataCollatorForSeq2Seq,
    EarlyStoppingCallback
)
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction
from nltk.translate.meteor_score import meteor_score
from rouge_score import rouge_scorer
from bert_score import score
import re
import shutil
import logging
from optuna.samplers import TPESampler
from optuna.pruners import MedianPruner

In [2]:
# Setup logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Seed for reproducibility
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)
optuna.trial.FixedTrial.seed = SEED
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [3]:
# Check CUDA availability
print("CUDA available:", torch.cuda.is_available())
print("CUDA version:", torch.version.cuda)
print("CUDNN version:", torch.backends.cudnn.version())

CUDA available: True
CUDA version: 12.8
CUDNN version: 90701


In [4]:
# NLTK downloads
try:
    nltk.download('wordnet', quiet=True)
    nltk.download('omw-1.4', quiet=True)
    nltk.download('punkt', quiet=True)
    nltk.download('punkt_tab', quiet=True)
except Exception as e:
    logger.error(f"NLTK download failed: {e}")
    raise

In [5]:
# Define directories
PROJECT_ROOT = r"D:\A_CSE499"
DATA_DIR = os.path.join(PROJECT_ROOT, "data")
OUTPUT_DIR = os.path.join(PROJECT_ROOT, "output_bloom")
os.makedirs(DATA_DIR, exist_ok=True)
os.makedirs(OUTPUT_DIR, exist_ok=True)

In [6]:
# Define Bloom levels and corresponding special tokens
BLOOM_LEVELS = {
    'remembering': '<REM>',
    'understanding': '<UND>',
    'applying': '<APP>',
    'analyzing': '<ANA>'
}

In [8]:
# Convert JSON to CSV
def convert_json_to_csv(json_file, csv_file, required_fields):
    if not os.path.exists(json_file):
        logger.error(f"JSON file not found: {json_file}")
        raise FileNotFoundError(f"JSON file not found: {json_file}")
    try:
        with open(json_file, 'r', encoding='utf-8') as f:
            data = json.load(f)
        if not isinstance(data, list):
            logger.error(f"Data in {json_file} must be a list of records")
            raise ValueError(f"Data in {json_file} must be a list of records")
        for record in data:
            if not all(field in record for field in required_fields):
                logger.error(f"Missing required fields in {json_file}: {record}")
                raise ValueError(f"Missing required fields in {json_file}")
        df = pd.DataFrame(data)
        df.to_csv(csv_file, index=False, encoding='utf-8')
        logger.info(f"Converted {json_file} to {csv_file}")
        return df
    except Exception as e:
        logger.error(f"Error converting {json_file} to CSV: {e}")
        raise

# Convert JSON datasets to CSV
required_fields = ['context', 'question', 'level']
eduqg_json = os.path.join(DATA_DIR, "eduqg_903.json")
merged_json = os.path.join(DATA_DIR, "merged_600_per_level.json")
eduqg_csv = os.path.join(DATA_DIR, "eduqg_903.csv")
merged_csv = os.path.join(DATA_DIR, "merged_600_per_level.csv")
eduqg_df = convert_json_to_csv(eduqg_json, eduqg_csv, required_fields)
merged_df = convert_json_to_csv(merged_json, merged_csv, required_fields)

INFO:__main__:Converted D:\A_CSE499\data\eduqg_903.json to D:\A_CSE499\data\eduqg_903.csv
INFO:__main__:Converted D:\A_CSE499\data\merged_600_per_level.json to D:\A_CSE499\data\merged_600_per_level.csv


In [9]:
# Punctuation cleaning function
def fix_punctuation_spacing(text):
    if not isinstance(text, str):
        return text
    text = text.replace(r'\newline', ' ').replace('\n', ' ')
    text = text.replace(r'\"', '"')
    text = re.sub(r'\s+([,.;:!?])', r'\1', text)
    text = re.sub(r'([,.;:!?])([^\s\W])', r'\1 \2', text)
    text = re.sub(r'([.;:!?])(")', r'\2\1', text)
    text = re.sub(r'\(\s+', '(', text)
    text = re.sub(r'\s+\)', ')', text)
    text = re.sub(r'"\s+', '"', text)
    text = re.sub(r'\s+"', '"', text)
    text = re.sub(r"'\s+", "'", text)
    text = re.sub(r"\s+'", "'", text)
    text = re.sub(r'\s*[-–—]+\s*', ' — ', text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

In [10]:
# Load tokenizer and add special tokens
tokenizer = AutoTokenizer.from_pretrained("D:\\A_CSE499\\outputLarge_B_phase2\\final_model\\reload_model")
special_tokens = list(BLOOM_LEVELS.values())
tokenizer.add_special_tokens({'additional_special_tokens': special_tokens})
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [11]:
# Preprocessing function
def preprocess_function(batch):
    inputs = []
    targets = []
    for c, q, level in zip(batch['context'], batch['question'], batch['level']):
        c_clean = fix_punctuation_spacing(str(c)) if c else ""
        q_clean = fix_punctuation_spacing(str(q)) if q else ""
        if q_clean.startswith("What is the"):
            q_clean = q_clean.replace("What is the", "What can you tell about")
        bloom_token = BLOOM_LEVELS.get(level.lower(), '<REM>')
        inputs.append(f"{bloom_token} {c_clean}")
        targets.append(f"{bloom_token} {q_clean}")
    model_inputs = tokenizer(inputs, truncation=True, max_length=512)
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, truncation=True, max_length=64)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [12]:
# Utility to validate CSV
def validate_csv(file_path, required_columns):
    if not os.path.exists(file_path):
        logger.error(f"CSV file not found: {file_path}")
        raise FileNotFoundError(f"CSV file not found: {file_path}")
    df = pd.read_csv(file_path)
    for col in required_columns:
        if col not in df.columns:
            logger.error(f"Column '{col}' missing in {file_path}")
            raise ValueError(f"Column '{col}' missing in {file_path}")
    return df

In [13]:
# Load and merge datasets
eduqg_df = validate_csv(os.path.join(DATA_DIR, "eduqg_903.csv"), ['context', 'question', 'level'])
merged_df = validate_csv(os.path.join(DATA_DIR, "merged_600_per_level.csv"), ['context', 'question', 'level'])
df = pd.concat([eduqg_df, merged_df], ignore_index=True)

# Normalize level and filter
df['level'] = df['level'].str.lower()
df = df[df['level'].isin(BLOOM_LEVELS.keys())]

# Print Bloom level distribution
bloom_counts = df['level'].value_counts()
print("Bloom level distribution in full dataset:")
print(bloom_counts)

Bloom level distribution in full dataset:
level
applying         729
understanding    716
analyzing        634
remembering      623
Name: count, dtype: int64


In [14]:
# Split into train (80%), temp (20%) with stratification
train_df, temp_df = train_test_split(df, test_size=0.2, random_state=SEED, stratify=df['level'])

# Split temp into validation (10%) and test (10%)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=SEED, stratify=temp_df['level'])

print(f"Train: {len(train_df)}, Validation: {len(val_df)}, Test: {len(test_df)}")
print("Bloom level distribution in train set:")
print(train_df['level'].value_counts())

Train: 2161, Validation: 270, Test: 271
Bloom level distribution in train set:
level
applying         583
understanding    573
analyzing        507
remembering      498
Name: count, dtype: int64


In [15]:
# Display sample data
sample = df.sample(5)
for i, row in sample.iterrows():
    print(f"Context: {row['context'][:300]}")
    print(f"Question: {row['question']}")
    print(f"Level: {row['level']}")
    print("="*80)

Context: The demographics of the British Isles today are characterised by a generally high density of population in England, which accounts for almost 80% of the total population of the islands. In elsewhere on Great Britain and on Ireland, high density of population is limited to areas around, or close to, 
Question: What percentage of the population does England have compared to the other areas in the British Isles?
Level: analyzing
Context: Many years ago my student asked me the question, "Mrs. Kindred, why do you teach?" Without taking time to reflect, I answered, "Because someday I might say something that might make a difference in someone's life." Even though I was sincere, that wasn't a very good answer and my student didn't let i
Question: Why did the student continue to ask the question about the writer's being a teacher?
Level: applying
Context: " yes , " said the king 's daughter , " indeed i will be careful , and not touch the least thing , just as you have told me . " but

In [16]:
# Convert to HuggingFace Datasets
dataset_train = Dataset.from_pandas(train_df)
dataset_val = Dataset.from_pandas(val_df)
dataset_test = Dataset.from_pandas(test_df)

# Tokenize datasets
processed_train_dataset = dataset_train.map(preprocess_function, batched=True, batch_size=50)
processed_val_dataset = dataset_val.map(preprocess_function, batched=True, batch_size=50)
processed_test_dataset = dataset_test.map(preprocess_function, batched=True, batch_size=50)

Map:   0%|          | 0/2161 [00:00<?, ? examples/s]



Map:   0%|          | 0/270 [00:00<?, ? examples/s]

Map:   0%|          | 0/271 [00:00<?, ? examples/s]

In [17]:
# Evaluation Metrics
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    predictions = np.clip(predictions, 0, tokenizer.vocab_size - 1)
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=False)
    labels[labels == -100] = tokenizer.pad_token_id
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=False)

    def remove_bloom_token(text):
        for token in BLOOM_LEVELS.values():
            text = text.replace(token, '').strip()
        return text

    decoded_preds_clean = [remove_bloom_token(pred) for pred in decoded_preds]
    decoded_labels_clean = [remove_bloom_token(ref) for ref in decoded_labels]

    ref_tokens = [[nltk.word_tokenize(ref)] for ref in decoded_labels_clean]
    pred_tokens = [nltk.word_tokenize(pred) for pred in decoded_preds_clean]

    smoothie = SmoothingFunction().method4
    bleu1 = corpus_bleu(ref_tokens, pred_tokens, weights=(1, 0, 0, 0), smoothing_function=smoothie)
    bleu4 = corpus_bleu(ref_tokens, pred_tokens, weights=(0.25, 0.25, 0.25, 0.25), smoothing_function=smoothie)

    scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
    rouge_l = sum(
        scorer.score(ref, pred)['rougeL'].fmeasure
        for ref, pred in zip(decoded_labels_clean, decoded_preds_clean)
    ) / len(decoded_labels_clean)

    meteor = sum(
        meteor_score([nltk.word_tokenize(ref)], nltk.word_tokenize(pred))
        for ref, pred in zip(decoded_labels_clean, decoded_preds_clean)
    ) / len(decoded_labels_clean)

    try:
        P, R, F1 = score(decoded_preds_clean, decoded_labels_clean, lang="en", verbose=False)
        bertscore = F1.mean().item()
    except Exception:
        bertscore = 0.0

    return {
        "bleu-1": bleu1, "bleu-4": bleu4,
        "rouge-l": rouge_l, "meteor": meteor, "bertscore": bertscore
    }

In [18]:
# Early stopping callback
class CustomEarlyStoppingCallback(EarlyStoppingCallback):
    def __init__(self, early_stopping_patience, min_delta=0.005):
        super().__init__(early_stopping_patience=early_stopping_patience)
        self.min_delta = min_delta
        self.best_metric = float('inf')
        self.early_stopping_patience_counter = 0

    def on_evaluate(self, args, state, control, metrics, **kwargs):
        eval_loss = metrics.get('eval_loss', float('inf'))
        if self.best_metric == float('inf') or eval_loss < self.best_metric - self.min_delta:
            self.best_metric = eval_loss
            self.early_stopping_patience_counter = 0
        else:
            self.early_stopping_patience_counter += 1
        if self.early_stopping_patience_counter >= self.early_stopping_patience:
            logger.info(f"Early stopping triggered after {self.early_stopping_patience} evaluations with eval_loss={eval_loss}")
            control.should_training_stop = True

In [19]:
# Save trial results helper
def save_trial_results(study, output_dir):
    trials_data = []
    for trial in study.trials:
        trial_data = {
            'trial_number': trial.number,
            'eval_loss': trial.value if trial.value is not None else float('inf'),
            'state': str(trial.state),
            **trial.params
        }
        trials_data.append(trial_data)
    best_trial_data = {
        'trial_number': study.best_trial.number,
        'eval_loss': study.best_trial.value,
        'state': 'BEST',
        **study.best_params
    }
    trials_data.append(best_trial_data)
    output_path = os.path.join(output_dir, 'optuna_trials.csv')
    trials_df = pd.DataFrame(trials_data)
    mode = 'a' if os.path.exists(output_path) else 'w'
    trials_df.to_csv(output_path, index=False, mode=mode, header=not os.path.exists(output_path))
    logger.info("Saved trial results to optuna_trials.csv")

In [20]:
# Objective function for Optuna tuning
def objective(trial):
    torch.cuda.empty_cache()
    gc.collect()
    model = BartForConditionalGeneration.from_pretrained("D:\\A_CSE499\\outputLarge_B_phase2\\final_model\\reload_model", trust_remote_code=True, use_safetensors=True)
    model.resize_token_embeddings(len(tokenizer))
    model.to(device)
    model.gradient_checkpointing_enable()
    model.generation_config.no_repeat_ngram_size = 3
    model.generation_config.min_length = 5

    learning_rate = trial.suggest_float("learning_rate", 1e-5, 5e-5, log=True)
    weight_decay = trial.suggest_float("weight_decay", 0.01, 0.1, log=True)
    warmup_steps = trial.suggest_int("warmup_steps", 100, 500, step=50)
    lr_scheduler_type = trial.suggest_categorical("lr_scheduler_type", ["linear", "cosine", "cosine_with_restarts"])

    trial_output_dir = os.path.join(OUTPUT_DIR, f"trial_{trial.number}")
    os.makedirs(trial_output_dir, exist_ok=True)

    training_args = Seq2SeqTrainingArguments(
        output_dir=trial_output_dir,
        num_train_epochs=8,
        per_device_train_batch_size=2,
        per_device_eval_batch_size=2,
        gradient_accumulation_steps=16,
        dataloader_num_workers=0,
        dataloader_pin_memory=torch.cuda.is_available(),
        lr_scheduler_type=lr_scheduler_type,
        learning_rate=learning_rate,
        warmup_steps=warmup_steps,
        remove_unused_columns=True,
        report_to=[],
        eval_strategy="epoch",
        save_strategy="epoch",
        save_total_limit=2,
        weight_decay=weight_decay,
        fp16=True,
        logging_strategy="epoch",
        predict_with_generate=True,
        generation_max_length=64,
        generation_num_beams=3,
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
        greater_is_better=False,
        group_by_length=True
    )

    trainer = Seq2SeqTrainer(
        model=model,
        args=training_args,
        train_dataset=processed_train_dataset,
        eval_dataset=processed_val_dataset,
        compute_metrics=None,
        data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model, padding=True),
        tokenizer=tokenizer,
        callbacks=[CustomEarlyStoppingCallback(early_stopping_patience=2)]
    )

    checkpoint = None
    try:
        for checkpoint_dir in sorted(os.listdir(trial_output_dir), reverse=True):
            if checkpoint_dir.startswith("checkpoint"):
                checkpoint = os.path.join(trial_output_dir, checkpoint_dir)
                break
    except FileNotFoundError:
        logger.info(f"No checkpoints found in {trial_output_dir}")

    if checkpoint:
        try:
            logger.info(f"Resuming training from {checkpoint}")
            trainer.train(resume_from_checkpoint=checkpoint)
        except Exception as e:
            logger.warning(f"Failed to resume from checkpoint {checkpoint}: {e}. Starting new training.")
            trainer.train()
    else:
        logger.info(f"Starting new training for trial {trial.number}")
        trainer.train()

    eval_results = trainer.evaluate()
    torch.cuda.empty_cache()

    for log in trainer.state.log_history:
        if 'eval_loss' in log:
            step = log.get("epoch", 0)
            trial.report(log['eval_loss'], step=int(step))

    return eval_results["eval_loss"]

In [25]:
# Run Optuna hyperparameter search
study_name = "bart_bloom_question_generation"
storage = f"sqlite:///{os.path.join(OUTPUT_DIR, 'optuna_study.db')}"
try:
    study = optuna.create_study(
        study_name=study_name,
        storage=storage,
        direction="minimize",
        sampler=TPESampler(seed=42),  # Bayesian optimization
        pruner=MedianPruner(n_warmup_steps=2),
        load_if_exists=True
    )
    study.optimize(objective, n_trials=13)
    save_trial_results(study, OUTPUT_DIR)
    best_params = study.best_params
    with open(os.path.join(OUTPUT_DIR, "best_params.json"), "w") as f:
        json.dump(best_params, f, indent=4)
    print("Best hyperparameters:", best_params)
    print("Best objective value (eval_loss):", study.best_value)
except Exception as e:
    logger.error(f"Optuna optimization or file saving failed: {e}")
    raise

[I 2025-07-25 20:03:43,798] A new study created in RDB with name: bart_bloom_question_generation
The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`
  trainer = Seq2SeqTrainer(
INFO:__main__:Starting new training for trial 0


Epoch,Training Loss,Validation Loss
1,3.0889,2.417607
2,2.3169,2.183061
3,2.143,2.158609
4,2.0123,2.145115
5,1.8695,2.148754
6,1.7031,2.150242


INFO:__main__:Early stopping triggered after 2 evaluations with eval_loss=2.1502420902252197
There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].


INFO:__main__:Early stopping triggered after 2 evaluations with eval_loss=2.1502861976623535
[I 2025-07-25 20:44:51,753] Trial 0 finished with value: 2.1502861976623535 and parameters: {'learning_rate': 1.827226177606625e-05, 'weight_decay': 0.08927180304353628, 'warmup_steps': 400, 'lr_scheduler_type': 'linear'}. Best is trial 0 with value: 2.1502861976623535.
  trainer = Seq2SeqTrainer(
INFO:__main__:Starting new training for trial 1


Epoch,Training Loss,Validation Loss
1,3.1883,2.543861
2,2.3802,2.198973
3,2.189,2.165307
4,2.0776,2.145302
5,1.9604,2.1472
6,1.8324,2.149708


INFO:__main__:Early stopping triggered after 2 evaluations with eval_loss=2.149707555770874
There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].


INFO:__main__:Early stopping triggered after 2 evaluations with eval_loss=2.1504056453704834
[I 2025-07-25 21:22:35,744] Trial 1 finished with value: 2.1504056453704834 and parameters: {'learning_rate': 1.097990803659665e-05, 'weight_decay': 0.07348118405270448, 'warmup_steps': 350, 'lr_scheduler_type': 'cosine_with_restarts'}. Best is trial 0 with value: 2.1502861976623535.
  trainer = Seq2SeqTrainer(
INFO:__main__:Starting new training for trial 2


Epoch,Training Loss,Validation Loss
1,2.6998,2.2125
2,2.175,2.187758
3,1.8677,2.251009
4,1.3982,1.818248
5,0.9852,1.852001
6,0.7989,1.956798


INFO:__main__:Early stopping triggered after 2 evaluations with eval_loss=1.9567984342575073
There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].


INFO:__main__:Early stopping triggered after 2 evaluations with eval_loss=1.8243459463119507
[I 2025-07-25 21:54:24,480] Trial 2 finished with value: 1.8243459463119507 and parameters: {'learning_rate': 3.818145165896868e-05, 'weight_decay': 0.016305687346221478, 'warmup_steps': 150, 'lr_scheduler_type': 'cosine_with_restarts'}. Best is trial 2 with value: 1.8243459463119507.
  trainer = Seq2SeqTrainer(
INFO:__main__:Starting new training for trial 3


Epoch,Training Loss,Validation Loss
1,3.0268,2.339916
2,2.2905,2.177272
3,2.1146,2.15831
4,1.9692,2.152182
5,1.805,2.146192
6,1.5885,2.107433
7,1.3239,2.00095
8,1.1416,1.985947


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].


[I 2025-07-25 22:37:03,172] Trial 3 finished with value: 1.9899410009384155 and parameters: {'learning_rate': 2.0040871876541563e-05, 'weight_decay': 0.019553708662745254, 'warmup_steps': 350, 'lr_scheduler_type': 'cosine_with_restarts'}. Best is trial 2 with value: 1.8243459463119507.
  trainer = Seq2SeqTrainer(
INFO:__main__:Starting new training for trial 4


Epoch,Training Loss,Validation Loss
1,2.8061,2.231433
2,2.2125,2.16154
3,1.9836,2.186859
4,1.7307,2.131996
5,1.4892,2.071246
6,1.2538,2.071425
7,1.1181,1.968015
8,1.0605,1.95744


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].


[I 2025-07-25 23:26:56,327] Trial 4 finished with value: 1.9620593786239624 and parameters: {'learning_rate': 2.083431561152948e-05, 'weight_decay': 0.06097839109531514, 'warmup_steps': 150, 'lr_scheduler_type': 'cosine'}. Best is trial 2 with value: 1.8243459463119507.
  trainer = Seq2SeqTrainer(
INFO:__main__:Starting new training for trial 5


Epoch,Training Loss,Validation Loss
1,2.7062,2.220086
2,2.1667,2.167386
3,1.8459,2.211313
4,1.5154,2.221535


INFO:__main__:Early stopping triggered after 2 evaluations with eval_loss=2.2215352058410645
There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].


INFO:__main__:Early stopping triggered after 2 evaluations with eval_loss=2.1644623279571533
[I 2025-07-25 23:48:15,482] Trial 5 finished with value: 2.1644623279571533 and parameters: {'learning_rate': 2.658616083788978e-05, 'weight_decay': 0.014808945119975192, 'warmup_steps': 100, 'lr_scheduler_type': 'cosine'}. Best is trial 2 with value: 1.8243459463119507.
  trainer = Seq2SeqTrainer(
INFO:__main__:Starting new training for trial 6


Epoch,Training Loss,Validation Loss
1,3.1192,2.455489
2,2.3331,2.186971
3,2.1564,2.15896
4,2.0328,2.144551
5,1.8984,2.142253
6,1.7476,2.152521


INFO:__main__:Early stopping triggered after 2 evaluations with eval_loss=2.1525213718414307
There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].


INFO:__main__:Early stopping triggered after 2 evaluations with eval_loss=2.148099660873413
[I 2025-07-26 00:19:47,053] Trial 6 finished with value: 2.148099660873413 and parameters: {'learning_rate': 1.632735695468795e-05, 'weight_decay': 0.012521954287060391, 'warmup_steps': 400, 'lr_scheduler_type': 'cosine_with_restarts'}. Best is trial 2 with value: 1.8243459463119507.
  trainer = Seq2SeqTrainer(
INFO:__main__:Starting new training for trial 7


Epoch,Training Loss,Validation Loss
1,3.0486,2.362625
2,2.2989,2.177549
3,2.123,2.15967
4,1.9831,2.144153
5,1.8464,2.151132
6,1.7439,2.163075


INFO:__main__:Early stopping triggered after 2 evaluations with eval_loss=2.1630754470825195
There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].


INFO:__main__:Early stopping triggered after 2 evaluations with eval_loss=2.14947247505188
[I 2025-07-26 00:52:09,353] Trial 7 finished with value: 2.14947247505188 and parameters: {'learning_rate': 1.0569064414047021e-05, 'weight_decay': 0.08115595675970502, 'warmup_steps': 200, 'lr_scheduler_type': 'linear'}. Best is trial 2 with value: 1.8243459463119507.
  trainer = Seq2SeqTrainer(
INFO:__main__:Starting new training for trial 8


Epoch,Training Loss,Validation Loss
1,3.074,2.397157
2,2.3101,2.182395
3,2.1356,2.158985
4,2.0008,2.146745
5,1.8542,2.154955
6,1.679,2.151036


INFO:__main__:Early stopping triggered after 2 evaluations with eval_loss=2.151036024093628
There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].


INFO:__main__:Early stopping triggered after 2 evaluations with eval_loss=2.1514883041381836
[I 2025-07-26 01:24:58,053] Trial 8 finished with value: 2.1514883041381836 and parameters: {'learning_rate': 2.4106495902171624e-05, 'weight_decay': 0.015305744365500184, 'warmup_steps': 500, 'lr_scheduler_type': 'cosine'}. Best is trial 2 with value: 1.8243459463119507.
  trainer = Seq2SeqTrainer(
INFO:__main__:Starting new training for trial 9


Epoch,Training Loss,Validation Loss
1,2.6952,2.218149
2,2.1705,2.16616
3,1.8561,2.151522
4,1.5564,2.144971
5,1.2311,1.863189
6,0.9998,1.909464
7,0.8855,1.89612


INFO:__main__:Early stopping triggered after 2 evaluations with eval_loss=1.8961198329925537
There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].


INFO:__main__:Early stopping triggered after 2 evaluations with eval_loss=1.87299382686615
[I 2025-07-26 02:03:00,218] Trial 9 finished with value: 1.87299382686615 and parameters: {'learning_rate': 2.6176655097040075e-05, 'weight_decay': 0.0835361075531176, 'warmup_steps': 100, 'lr_scheduler_type': 'cosine_with_restarts'}. Best is trial 2 with value: 1.8243459463119507.
  trainer = Seq2SeqTrainer(
INFO:__main__:Starting new training for trial 10


Epoch,Training Loss,Validation Loss
1,2.7452,2.207074
2,2.1894,2.17161
3,1.9355,2.217925
4,1.5624,1.917864
5,1.0096,1.802283
6,0.8072,1.93212
7,0.6544,1.965615


INFO:__main__:Early stopping triggered after 2 evaluations with eval_loss=1.9656147956848145
There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].


INFO:__main__:Early stopping triggered after 2 evaluations with eval_loss=1.8131524324417114
[I 2025-07-26 02:39:52,443] Trial 10 finished with value: 1.8131524324417114 and parameters: {'learning_rate': 4.8260289205439004e-05, 'weight_decay': 0.02973906577570737, 'warmup_steps': 250, 'lr_scheduler_type': 'cosine_with_restarts'}. Best is trial 10 with value: 1.8131524324417114.
  trainer = Seq2SeqTrainer(
INFO:__main__:Starting new training for trial 11


Epoch,Training Loss,Validation Loss
1,2.7436,2.205958
2,2.1909,2.19937
3,1.9322,2.175875
4,1.5818,1.884568
5,1.0368,1.78413
6,0.8163,1.95426
7,0.6587,1.992813


INFO:__main__:Early stopping triggered after 2 evaluations with eval_loss=1.992812991142273
There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].


INFO:__main__:Early stopping triggered after 2 evaluations with eval_loss=1.7933887243270874
[I 2025-07-26 03:17:52,363] Trial 11 finished with value: 1.7933887243270874 and parameters: {'learning_rate': 4.8625815893389155e-05, 'weight_decay': 0.030098158477190287, 'warmup_steps': 250, 'lr_scheduler_type': 'cosine_with_restarts'}. Best is trial 11 with value: 1.7933887243270874.
  trainer = Seq2SeqTrainer(
INFO:__main__:Starting new training for trial 12


Epoch,Training Loss,Validation Loss
1,2.7449,2.207433
2,2.1918,2.173019
3,1.9316,2.183483
4,1.5719,1.897065
5,1.0427,1.972192
6,0.8246,1.989672


INFO:__main__:Early stopping triggered after 2 evaluations with eval_loss=1.9896717071533203
There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].


INFO:__main__:Early stopping triggered after 2 evaluations with eval_loss=1.9050681591033936
[I 2025-07-26 03:49:56,934] Trial 12 finished with value: 1.9050681591033936 and parameters: {'learning_rate': 4.765821424574219e-05, 'weight_decay': 0.033664587127253655, 'warmup_steps': 250, 'lr_scheduler_type': 'cosine_with_restarts'}. Best is trial 11 with value: 1.7933887243270874.
INFO:__main__:Saved trial results to optuna_trials.csv


Best hyperparameters: {'learning_rate': 4.8625815893389155e-05, 'weight_decay': 0.030098158477190287, 'warmup_steps': 250, 'lr_scheduler_type': 'cosine_with_restarts'}
Best objective value (eval_loss): 1.7933887243270874


In [26]:
# Clean up non-best trials
best_trial_dir = os.path.join(OUTPUT_DIR, f"trial_{study.best_trial.number}")
for trial_dir in os.listdir(OUTPUT_DIR):
    if trial_dir.startswith("trial_") and trial_dir != os.path.basename(best_trial_dir):
        shutil.rmtree(os.path.join(OUTPUT_DIR, trial_dir))

In [27]:
# Final Training with best params
best_model_dir = os.path.join(OUTPUT_DIR, "best_model")
os.makedirs(best_model_dir, exist_ok=True)
model = BartForConditionalGeneration.from_pretrained("D:\\A_CSE499\\outputLarge_B_phase2\\final_model\\reload_model", trust_remote_code=True, use_safetensors=True)
model.resize_token_embeddings(len(tokenizer))
model.to(device)
model.gradient_checkpointing_enable()
model.generation_config.no_repeat_ngram_size = 3
model.generation_config.min_length = 5

training_args = Seq2SeqTrainingArguments(
    output_dir=best_model_dir,
    num_train_epochs=10,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=16,
    dataloader_num_workers=0,
    lr_scheduler_type=best_params["lr_scheduler_type"],
    learning_rate=best_params["learning_rate"],
    warmup_steps=best_params["warmup_steps"],
    remove_unused_columns=True,
    report_to=[],
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    weight_decay=best_params["weight_decay"],
    fp16=True,
    logging_strategy="epoch",
    predict_with_generate=True,
    generation_max_length=64,
    generation_num_beams=3,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    group_by_length=True,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=processed_train_dataset,
    eval_dataset=processed_val_dataset,
    compute_metrics=compute_metrics,
    data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model, padding=True),
    tokenizer=tokenizer,
    callbacks=[CustomEarlyStoppingCallback(early_stopping_patience=3)]
)

checkpoint = None
try:
    for checkpoint_dir in sorted(os.listdir(best_model_dir), reverse=True):
        if checkpoint_dir.startswith("checkpoint"):
            checkpoint = os.path.join(best_model_dir, checkpoint_dir)
            break
except FileNotFoundError:
    logger.info(f"No checkpoints found in {best_model_dir}")

if checkpoint:
    try:
        logger.info(f"Resuming final training from {checkpoint}")
        trainer.train(resume_from_checkpoint=checkpoint)
    except Exception as e:
        logger.warning(f"Failed to resume from checkpoint {checkpoint}: {e}. Starting new training.")
        trainer.train()
else:
    logger.info("Starting new final training")
    trainer.train()

  trainer = Seq2SeqTrainer(
INFO:__main__:Starting new final training


Epoch,Training Loss,Validation Loss,Bleu-1,Bleu-4,Rouge-l,Meteor,Bertscore
1,2.7434,2.207138,0.61356,0.197788,0.141655,0.580153,0.855882
2,2.1755,2.182361,0.613555,0.184598,0.13278,0.582881,0.856541
3,1.9206,2.149946,0.613435,0.18802,0.135964,0.57955,0.856892
4,1.5414,1.990693,0.454158,0.16425,0.159382,0.447152,0.860341
5,1.0371,1.971221,0.424009,0.162269,0.164989,0.429214,0.864315
6,0.8293,2.172569,0.577612,0.236932,0.181978,0.536924,0.865397
7,0.6941,1.966817,0.601289,0.219109,0.157713,0.598908,0.8586
8,0.5776,2.132075,0.603524,0.21771,0.152872,0.600486,0.856471


INFO:absl:Using default tokenizer.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
INFO:absl:Using default tokenizer.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
INFO:absl:Using default tokenizer.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
INFO:absl:Using default tokenizer.
Some weights of RobertaModel wer

In [28]:
# Save final model and tokenizer
reload_model_dir = os.path.join(best_model_dir, "reload_model")
os.makedirs(reload_model_dir, exist_ok=True)
try:
    model.save_pretrained(reload_model_dir, safe_serialization=True)
    tokenizer.save_pretrained(reload_model_dir)
    necessary_files = [
        "model.safetensors",
        "config.json",
        "tokenizer_config.json",
        "vocab.json",
        "merges.txt",
        "special_tokens_map.json"
    ]
    for file in os.listdir(reload_model_dir):
        if file not in necessary_files:
            os.remove(os.path.join(reload_model_dir, file))
    logger.info(f"Saved model and tokenizer to {reload_model_dir}")
except Exception as e:
    logger.error(f"Failed to save model/tokenizer to {reload_model_dir}: {e}")
    raise

INFO:__main__:Saved model and tokenizer to D:\A_CSE499\output_bloom\best_model\reload_model


In [29]:
# Evaluate on test set and save results
model.eval()
torch.cuda.empty_cache()
test_results = trainer.predict(processed_test_dataset)
test_metrics = test_results.metrics
test_results_df = pd.DataFrame([test_metrics])
test_results_path = os.path.join(OUTPUT_DIR, "test_results.csv")
try:
    test_results_df.to_csv(test_results_path, index=False)
    logger.info(f"Saved test results to {test_results_path}")
except Exception as e:
    logger.error(f"Failed to save test results: {e}")
    raise

test_preds = test_results.predictions[0] if isinstance(test_results.predictions, tuple) else test_results.predictions
test_preds = np.clip(test_preds, 0, tokenizer.vocab_size - 1)
decoded_preds = tokenizer.batch_decode(test_preds, skip_special_tokens=False)
label_ids = np.clip(test_results.label_ids, 0, tokenizer.vocab_size - 1)
decoded_refs = tokenizer.batch_decode(label_ids, skip_special_tokens=False)
pred_ref_df = pd.DataFrame({
    "context": processed_test_dataset["context"],
    "level": processed_test_dataset["level"],
    "predicted_question": decoded_preds,
    "reference_question": decoded_refs
})
pred_ref_path = os.path.join(OUTPUT_DIR, "test_predictions.csv")
try:
    pred_ref_df.to_csv(pred_ref_path, index=False)
    logger.info(f"Saved test predictions to {pred_ref_path}")
except Exception as e:
    logger.error(f"Failed to save test predictions: {e}")
    raise

INFO:absl:Using default tokenizer.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
INFO:__main__:Saved test results to D:\A_CSE499\output_bloom\test_results.csv
INFO:__main__:Saved test predictions to D:\A_CSE499\output_bloom\test_predictions.csv


In [30]:
# Compute metrics per Bloom level
metrics_by_level = {}
for level, group in pred_ref_df.groupby('level'):
    preds = [p.replace(BLOOM_LEVELS.get(level.lower(), '<REM>'), '').strip() for p in group['predicted_question']]
    refs = [r.replace(BLOOM_LEVELS.get(level.lower(), '<REM>'), '').strip() for r in group['reference_question']]
    ref_tokens = [[nltk.word_tokenize(ref)] for ref in refs]
    pred_tokens = [nltk.word_tokenize(pred) for pred in preds]
    smoothie = SmoothingFunction().method4

    bleu1 = corpus_bleu(ref_tokens, pred_tokens, weights=(1, 0, 0, 0), smoothing_function=smoothie)
    bleu4 = corpus_bleu(ref_tokens, pred_tokens, weights=(0.25, 0.25, 0.25, 0.25), smoothing_function=smoothie)
    scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
    rouge_l = sum(scorer.score(ref, pred)['rougeL'].fmeasure for ref, pred in zip(refs, preds)) / len(refs)
    meteor = sum(meteor_score([nltk.word_tokenize(ref)], nltk.word_tokenize(pred)) for ref, pred in zip(refs, preds)) / len(refs)
    try:
        P, R, F1 = score(preds, refs, lang="en", verbose=False)
        bertscore = F1.mean().item()
    except Exception:
        bertscore = 0.0

    metrics_by_level[level] = {
        'bleu-1': bleu1,
        'bleu-4': bleu4,
        'rouge-l': rouge_l,
        'meteor': meteor,
        'bertscore': bertscore,
        'num_examples': len(group)
    }

metrics_df = pd.DataFrame(metrics_by_level).T
metrics_path = os.path.join(OUTPUT_DIR, "metrics_by_level.csv")
metrics_df.to_csv(metrics_path)
logger.info(f"Saved per-level metrics to {metrics_path}")
print("Metrics by Bloom level:")
print(metrics_df)


INFO:absl:Using default tokenizer.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
INFO:absl:Using default tokenizer.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
INFO:absl:Using default tokenizer.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
INFO:absl:Using default tokenizer.
Some weights of RobertaModel wer

Metrics by Bloom level:
                 bleu-1    bleu-4   rouge-l    meteor  bertscore  num_examples
analyzing      0.554460  0.267330  0.233436  0.587353   0.884223          64.0
applying       0.544616  0.267717  0.244837  0.583565   0.884355          73.0
remembering    0.538796  0.277847  0.245975  0.576238   0.888526          62.0
understanding  0.541083  0.284006  0.256050  0.585837   0.887610          72.0


In [31]:
# Display example predictions
try:
    num_examples = min(5, len(decoded_preds), len(decoded_refs), len(processed_test_dataset['context']))
    logger.info("\nExample Predictions and References:")
    for i in range(num_examples):
        logger.info(f"\nExample {i+1}:")
        logger.info(f"Context: {processed_test_dataset['context'][i]}")
        logger.info(f"Level: {processed_test_dataset['level'][i]}")
        logger.info(f"Prediction: {decoded_preds[i]}")
        logger.info(f"Reference: {decoded_refs[i]}")
except Exception as e:
    logger.error(f"Failed to print example predictions: {e}")

INFO:__main__:
Example Predictions and References:
INFO:__main__:
Example 1:
INFO:__main__:Context: The Puritans   get a bad reputation in America--especially when it comes to alcohol.
Mayflower, the first ship that came over from England to Massachusetts Bay, actually carried more beer than water.In fact the Founding Fathers of America liked a drink--Samuel Adams was a partner in his father's brewery, and Thomas Jefferson was famous for importing European wines.
Early Americans took a healthful small drink for breakfast, whiskey was a typical lunchtime drink, ale   accompanied supper and the day ended with another drink called nightcap.Most Americans in 1790 consumed an average of 5.8 gallons of pure alcohol a year.In 1830, consumption reached 7.1 gallons a year and alcoholism was starting to have a serious influence on communities.Women and children might be in physical danger if the man of the house began drinking.If he became ill or lost his job through drinking, there was no socia