In [1]:
!pip install scikit-learn



In [2]:
import os
import gc
import json
import random
import numpy as np
import pandas as pd
import nltk
import torch
from datasets import Dataset
from transformers import (
    AutoTokenizer, BartForConditionalGeneration,
    Seq2SeqTrainer, Seq2SeqTrainingArguments,
    DataCollatorForSeq2Seq, EarlyStoppingCallback
)
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction
from nltk.translate.meteor_score import meteor_score
from rouge_score import rouge_scorer
from bert_score import score
from sklearn.model_selection import train_test_split
import re
import sqlite3
import optuna
import logging
import shutil
from optuna.samplers import TPESampler
from optuna.pruners import MedianPruner




In [3]:
# Setup logging
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)

In [22]:
# Seed for reproducibility
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# Check CUDA
logger.info(f"CUDA available: {torch.cuda.is_available()}")
logger.info(f"CUDA version: {torch.version.cuda}")
logger.info(f"CUDNN version: {torch.backends.cudnn.version()}")

# NLTK downloads
try:
    nltk.download('wordnet', quiet=True)
    nltk.download('omw-1.4', quiet=True)
    nltk.download('punkt', quiet=True)
    nltk.download('punkt_tab', quiet=True)
except Exception as e:
    logger.error(f"NLTK download failed: {e}")
    raise

# Directories
PROJECT_ROOT = r"D:\A_CSE499"
DATA_DIR = os.path.join(PROJECT_ROOT, "data_phase2")
OUTPUT_DIR = os.path.join(PROJECT_ROOT, "outputLarge_B_phase2")
MODEL_PATH = os.path.join(PROJECT_ROOT, "outputLarge_B", "final_model")
os.makedirs(DATA_DIR, exist_ok=True)
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Validate model path
if not os.path.exists(MODEL_PATH):
    logger.error(f"Model path does not exist: {MODEL_PATH}")
    raise FileNotFoundError(f"Model path does not exist: {MODEL_PATH}. Please ensure the model is saved or update MODEL_PATH.")

    
if os.path.exists(MODEL_PATH):
    print(f"✅ Model path exists: {MODEL_PATH}")
else:
    print(f"❌ Model path does NOT exist: {MODEL_PATH}")

2025-07-24 03:15:32,381 - INFO - CUDA available: True
2025-07-24 03:15:32,382 - INFO - CUDA version: 12.8
2025-07-24 03:15:32,383 - INFO - CUDNN version: 90701


✅ Model path exists: D:\A_CSE499\outputLarge_B\final_model


In [5]:
# Load tokenizer and set device
tokenizer = AutoTokenizer.from_pretrained(os.path.join(PROJECT_ROOT, "outputLarge_B", "final_model"))
device = torch.device("cuda:0")
if not torch.cuda.is_available():
    raise RuntimeError("CUDA is not available. Please check your PyTorch installation and NVIDIA drivers.")

In [6]:
# Punctuation cleaning function
def fix_punctuation_spacing(text):
    if not isinstance(text, str):
        return text
    text = text.replace(r'\newline', ' ')
    text = re.sub(r'\s+([,.;:!?])', r'\1', text)
    text = re.sub(r'([,.;:!?])([^\s\W])', r'\1 \2', text)
    text = re.sub(r'\(\s+', '(', text)
    text = re.sub(r'\s+\)', ')', text)
    text = re.sub(r'"\s+', '"', text)
    text = re.sub(r'\s+"', '"', text)
    text = re.sub(r"'\s+", "'", text)
    text = re.sub(r"\s+'", "'", text)
    text = re.sub(r'\s*[-–—]+\s*', ' — ', text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

In [7]:
# Preprocess function
def preprocess_function(batch):
    inputs = []
    targets = []
    for c, q in zip(batch['context'], batch['question']):
        c_clean = fix_punctuation_spacing(str(c)) if c else ""
        q_clean = fix_punctuation_spacing(str(q)) if q else ""
        if q_clean.startswith("What is the"):
            q_clean = q_clean.replace("What is the", "What can you tell about")
        if c_clean and q_clean:
            inputs.append(c_clean)
            targets.append(q_clean)

    model_inputs = tokenizer(
        inputs,
        max_length=64,
        truncation=True,
        padding="max_length",
        return_tensors="pt"
    )
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            targets,
            max_length=64,
            truncation=True,
            padding="max_length",
            return_tensors="pt"
        )
    model_inputs["labels"] = labels["input_ids"]
    model_inputs["labels"][model_inputs["labels"] == tokenizer.pad_token_id] = -100
    return model_inputs

In [8]:
# Load and validate dataset
def validate_csv(file_path, required_columns):
    if not os.path.exists(file_path):
        logger.error(f"CSV file not found: {file_path}")
        raise FileNotFoundError(f"CSV file not found: {file_path}")
    df = pd.read_csv(file_path)
    for col in required_columns:
        if col not in df.columns:
            logger.error(f"Column '{col}' missing in {file_path}")
            raise ValueError(f"Column '{col}' missing in {file_path}")
    return df

In [9]:
# Load combined 100k dataset
df = validate_csv(os.path.join(DATA_DIR, "final_100k_dataset.csv"), ['context', 'question'])

# Shuffle and reset index
df = df.sample(frac=1, random_state=SEED).reset_index(drop=True)

# Split into train (80%), validation (10%), test (10%)
train_df, temp_df = train_test_split(df, test_size=0.2, random_state=SEED)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=SEED)

# Log dataset sizes
logger.info(f"Train dataset size: {len(train_df)}")
logger.info(f"Validation dataset size: {len(val_df)}")
logger.info(f"Test dataset size: {len(test_df)}")

2025-07-22 16:31:57,758 - INFO - Train dataset size: 80000
2025-07-22 16:31:57,759 - INFO - Validation dataset size: 10000
2025-07-22 16:31:57,759 - INFO - Test dataset size: 10000


In [10]:
sample = df.sample(5)
for i, row in sample.iterrows():
    print(f"Context: {row['context'][:300]}")
    print(f"Question: {row['question']}")
    # print(f"Answer: {row['answer']}")
    print("="*80)

Context: The big problem that you have in looking at polls of people who watched conventions is something called sorting . Republicans watch Republican conventions and Democrats watch Democratic conventions . The problem is probably even worse than that because it is usually the more hardcore in those groups
Question: Why do the republicans and democrats watch the conventions ?
Context: Peter Holmström: Peter Holmström is an American rock musician.  He is in the bands The Dandy Warhols, Pete International Airport, Radis Noir and Rebel Drones.  Holmström's first guitar was a Gibson SG.  Holmström is an amateur photographer.  In the past he has also composed music for his sister's th
Question: "Plan A" is a song by American rock band The Dandy Warhols, an American alternative rock band, formed in Portland, Oregon in which year, by singer-guitarist Courtney Taylor-Taylor and guitarist Peter Holmström?
Context: Strangers (Halsey song): "Strangers" is a song recorded by American singer and 

In [11]:
# After splitting your dataframes:
train_df = train_df
val_df = val_df
test_df = test_df

In [12]:
# Convert to HuggingFace Datasets
dataset_train = Dataset.from_pandas(train_df)
dataset_val = Dataset.from_pandas(val_df)
dataset_test = Dataset.from_pandas(test_df)

In [40]:
# Tokenize datasets
tokenized_dir = os.path.join(OUTPUT_DIR, "tokenized_datasets")
os.makedirs(tokenized_dir, exist_ok=True)

def save_tokenized_datasets(train_dataset, val_dataset, test_dataset):
    if not os.path.exists(os.path.join(tokenized_dir, "train")):
        train_dataset.save_to_disk(os.path.join(tokenized_dir, "train"))
        logger.info("Saved tokenized training dataset")
    if not os.path.exists(os.path.join(tokenized_dir, "val")):
        val_dataset.save_to_disk(os.path.join(tokenized_dir, "val"))
        logger.info("Saved tokenized validation dataset")
    if not os.path.exists(os.path.join(tokenized_dir, "test")):
        test_dataset.save_to_disk(os.path.join(tokenized_dir, "test"))
        logger.info("Saved tokenized test dataset")

def load_tokenized_datasets():
    global processed_train_dataset, processed_val_dataset, processed_test_dataset
    if os.path.exists(os.path.join(tokenized_dir, "train")):
        train_dataset = Dataset.load_from_disk(os.path.join(tokenized_dir, "train"))
        temp = train_dataset.filter(lambda x: all(k in x for k in ["input_ids", "attention_mask", "labels"]))
        if len(temp) == len(train_dataset):
            logger.info("Loaded tokenized training dataset")
        else:
            logger.warning("Invalid train dataset format, re-tokenizing...")
            train_dataset = dataset_train.map(preprocess_function, batched=True, batch_size=50, remove_columns=['context', 'question'])
            train_dataset.save_to_disk(os.path.join(tokenized_dir, "train"))
            logger.info("Tokenized and saved training dataset")
    else:
        train_dataset = dataset_train.map(preprocess_function, batched=True, batch_size=50, remove_columns=['context', 'question'])
        train_dataset.save_to_disk(os.path.join(tokenized_dir, "train"))
        logger.info("Tokenized and saved training dataset")

    if os.path.exists(os.path.join(tokenized_dir, "val")):
        val_dataset = Dataset.load_from_disk(os.path.join(tokenized_dir, "val"))
        temp = val_dataset.filter(lambda x: all(k in x for k in ["input_ids", "attention_mask", "labels"]))
        if len(temp) == len(val_dataset):
            logger.info("Loaded tokenized validation dataset")
        else:
            logger.warning("Invalid val dataset format, re-tokenizing...")
            val_dataset = dataset_val.map(preprocess_function, batched=True, batch_size=50, remove_columns=['context', 'question'])
            val_dataset.save_to_disk(os.path.join(tokenized_dir, "val"))
            logger.info("Tokenized and saved validation dataset")
    else:
        val_dataset = dataset_val.map(preprocess_function, batched=True, batch_size=50, remove_columns=['context', 'question'])
        val_dataset.save_to_disk(os.path.join(tokenized_dir, "val"))
        logger.info("Tokenized and saved validation dataset")

    if os.path.exists(os.path.join(tokenized_dir, "test")):
        test_dataset = Dataset.load_from_disk(os.path.join(tokenized_dir, "test"))
        temp = test_dataset.filter(lambda x: all(k in x for k in ["input_ids", "attention_mask", "labels"]))
        if len(temp) == len(test_dataset):
            logger.info("Loaded tokenized test dataset")
        else:
            logger.warning("Invalid test dataset format, re-tokenizing...")
            test_dataset = dataset_test.map(preprocess_function, batched=True, batch_size=50, remove_columns=['context', 'question'])
            test_dataset.save_to_disk(os.path.join(tokenized_dir, "test"))
            logger.info("Tokenized and saved test dataset")
    else:
        test_dataset = dataset_test.map(preprocess_function, batched=True, batch_size=50, remove_columns=['context', 'question'])
        test_dataset.save_to_disk(os.path.join(tokenized_dir, "test"))
        logger.info("Tokenized and saved test dataset")

    return train_dataset, val_dataset, test_dataset

# Load and assign tokenized datasets globally
processed_train_dataset, processed_val_dataset, processed_test_dataset = load_tokenized_datasets()

2025-07-24 14:56:52,002 - INFO - Loaded tokenized training dataset
2025-07-24 14:56:52,009 - INFO - Loaded tokenized validation dataset


Filter:   0%|          | 0/10000 [00:00<?, ? examples/s]

2025-07-24 14:56:53,156 - INFO - Loaded tokenized test dataset


In [14]:
# Log dataset sizes
logger.info(f"Train dataset size: {len(processed_train_dataset)}")
logger.info(f"Validation dataset size: {len(processed_val_dataset)}")
logger.info(f"Test dataset size: {len(processed_test_dataset)}")

2025-07-22 16:32:54,490 - INFO - Train dataset size: 80000
2025-07-22 16:32:54,490 - INFO - Validation dataset size: 10000
2025-07-22 16:32:54,490 - INFO - Test dataset size: 10000


In [15]:
# Clean datasets
for ds_name in ["processed_train_dataset", "processed_val_dataset", "processed_test_dataset"]:
    if ds_name in globals():
        ds = globals()[ds_name]
        cleaned = ds.remove_columns([c for c in ds.column_names if c not in ["input_ids", "attention_mask", "labels"]])
        globals()[ds_name] = cleaned


In [16]:
# Evaluation Metrics
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    predictions = np.clip(predictions, 0, tokenizer.vocab_size - 1)
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    labels[labels == -100] = tokenizer.pad_token_id
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    ref_tokens = [[nltk.word_tokenize(ref)] for ref in decoded_labels]
    pred_tokens = [nltk.word_tokenize(pred) for pred in decoded_preds]
    smoothie = SmoothingFunction().method4
    bleu1 = corpus_bleu(ref_tokens, pred_tokens, weights=(1, 0, 0, 0), smoothing_function=smoothie)
    bleu2 = corpus_bleu(ref_tokens, pred_tokens, weights=(0.5, 0.5, 0, 0), smoothing_function=smoothie)
    bleu3 = corpus_bleu(ref_tokens, pred_tokens, weights=(0.33, 0.33, 0.33, 0), smoothing_function=smoothie)
    bleu4 = corpus_bleu(ref_tokens, pred_tokens, weights=(0.25, 0.25, 0.25, 0.25), smoothing_function=smoothie)
    scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
    rouge_l = sum(
        scorer.score(ref, pred)['rougeL'].fmeasure
        for ref, pred in zip(decoded_labels, decoded_preds)
    ) / len(decoded_labels)
    meteor = sum(
        meteor_score([nltk.word_tokenize(ref)], nltk.word_tokenize(pred))
        for ref, pred in zip(decoded_labels, decoded_preds)
    ) / len(decoded_labels)
    try:
        P, R, F1 = score(decoded_preds, decoded_labels, lang="en", verbose=False)
        bertscore = F1.mean().item()
    except Exception:
        bertscore = 0.0
    return {
        "bleu-1": bleu1, "bleu-2": bleu2, "bleu-3": bleu3, "bleu-4": bleu4,
        "rouge-l": rouge_l, "meteor": meteor, "bertscore": bertscore
    }

In [18]:
# Custom Early Stopping Callback
class CustomEarlyStoppingCallback(EarlyStoppingCallback):
    def __init__(self, early_stopping_patience, min_delta=0.01):
        super().__init__(early_stopping_patience=early_stopping_patience)
        self.min_delta = min_delta
        self.best_metric = float('inf')
        self.early_stopping_patience_counter = 0

    def on_train_begin(self, args, state, control, **kwargs):
        if args.load_best_model_at_end:
            assert args.metric_for_best_model is not None, (
                "EarlyStoppingCallback requires metric_for_best_model to be defined when load_best_model_at_end=True"
            )
        assert args.eval_strategy != "no", (
            "EarlyStoppingCallback requires eval_strategy to be 'steps' or 'epoch'"
        )
        logger.info("Initialized CustomEarlyStoppingCallback")

    def on_evaluate(self, args, state, control, metrics, **kwargs):
        eval_loss = metrics.get('eval_loss', float('inf'))
        if self.best_metric == float('inf') or eval_loss < self.best_metric - self.min_delta:
            self.best_metric = eval_loss
            self.early_stopping_patience_counter = 0
        else:
            self.early_stopping_patience_counter += 1
        if self.early_stopping_patience_counter >= self.early_stopping_patience:
            logger.info(f"Early stopping triggered after {self.early_stopping_patience} evaluations with eval_loss={eval_loss}")
            control.should_training_stop = True




In [19]:
# Save trial results helper
def save_trial_results(study, output_dir):
    trials_data = []
    for trial in study.trials:
        trial_data = {
            'trial_number': trial.number,
            'eval_loss': trial.value if trial.value is not None else float('inf'),
            'state': str(trial.state),
            **trial.params
        }
        trials_data.append(trial_data)
    best_trial_data = {
        'trial_number': study.best_trial.number,
        'eval_loss': study.best_trial.value,
        'state': 'BEST',
        **study.best_params
    }
    trials_data.append(best_trial_data)
    output_path = os.path.join(output_dir, 'optuna_trials.csv')
    trials_df = pd.DataFrame(trials_data)
    mode = 'a' if os.path.exists(output_path) else 'w'
    trials_df.to_csv(output_path, index=False, mode=mode, header=not os.path.exists(output_path))
    logger.info("Saved trial results to optuna_trials.csv")

In [20]:
# Objective function for Optuna tuning
def objective(trial):
    torch.cuda.empty_cache()
    gc.collect()
    
    # Normalize and validate model path
    model_path = os.path.abspath(os.path.normpath(MODEL_PATH))
    if not os.path.exists(model_path):
        logger.error(f"Model path does not exist: {model_path}")
        raise FileNotFoundError(f"Model path does not exist: {model_path}")

    # Load model
    try:
        model = BartForConditionalGeneration.from_pretrained(
            model_path,
            local_files_only=True,
            use_safetensors=True,
            trust_remote_code=True
        )
    except Exception as e:
        logger.error(f"Failed to load model from {model_path}: {e}")
        raise

    model.to(device)
    model.gradient_checkpointing_enable()
    logger.info(f"Gradient checkpointing enabled: {model.is_gradient_checkpointing}")
    generation_config = model.generation_config
    generation_config.no_repeat_ngram_size = 3
    generation_config.min_length = 5
    generation_config.max_length = 64
    generation_config.num_beams = 3

    learning_rate = trial.suggest_float("learning_rate", 1e-5, 5e-5, log=True)
    weight_decay = trial.suggest_float("weight_decay", 0.01, 0.1, log=True)
    warmup_steps = trial.suggest_int("warmup_steps", 100, 500, step=50)
    lr_scheduler_type = trial.suggest_categorical("lr_scheduler_type", ["linear", "cosine"])

    logger.info(f"Trial {trial.number} parameters: learning_rate={learning_rate}, "
                f"weight_decay={weight_decay}, warmup_steps={warmup_steps}, "
                f"lr_scheduler_type={lr_scheduler_type}")

    # Log VRAM usage before training
    if torch.cuda.is_available():
        vram_used = torch.cuda.memory_allocated(device) / 1024**3
        vram_total = torch.cuda.get_device_properties(device).total_memory / 1024**3
        logger.info(f"VRAM usage before training (trial {trial.number}): {vram_used:.2f}GB / {vram_total:.2f}GB")

    trial_output_dir = os.path.join(OUTPUT_DIR, f"trial_{trial.number}")
    os.makedirs(trial_output_dir, exist_ok=True)

    training_args = Seq2SeqTrainingArguments(
        output_dir=trial_output_dir,
        num_train_epochs=5,
        per_device_train_batch_size=2,
        per_device_eval_batch_size=2,
        gradient_accumulation_steps=16,
        dataloader_num_workers=0,
        dataloader_pin_memory=torch.cuda.is_available(),
        lr_scheduler_type=lr_scheduler_type,
        learning_rate=learning_rate,
        warmup_steps=warmup_steps,
        remove_unused_columns=False,
        report_to=[],
        eval_strategy="steps",
        eval_steps=200,
        save_strategy="no",
        weight_decay=weight_decay,
        fp16=torch.cuda.is_available(),
        logging_strategy="steps",
        logging_steps=50,
        predict_with_generate=True,
        generation_max_length=64,
        generation_num_beams=3,
        load_best_model_at_end=False,
        group_by_length=True,
        skip_memory_metrics=True,
        disable_tqdm=True,
    )

    trainer = Seq2SeqTrainer(
        model=model,
        args=training_args,
        train_dataset=processed_train_dataset,
        eval_dataset=processed_val_dataset,
        compute_metrics=None,
        data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model, padding=True),
        callbacks=[CustomEarlyStoppingCallback(early_stopping_patience=3)]
    )

    # Train the model with mixed precision
    logger.info(f"Starting new training for trial {trial.number}")
    try:
        with torch.cuda.amp.autocast():
            trainer.train()
    except Exception as e:
        logger.error(f"Training failed for trial {trial.number}: {e}")
        raise
    
    # Evaluate and report
    eval_results = trainer.evaluate()
    torch.cuda.empty_cache()
    gc.collect()
    
    # Log VRAM after training
    if torch.cuda.is_available():
        vram_used = torch.cuda.memory_allocated(device) / 1024**3
        logger.info(f"VRAM usage after training: {vram_used:.2f}GB / {vram_total:.2f}GB")
    
    for log in trainer.state.log_history:
        if 'eval_loss' in log:
            step = log.get("step", 0)
            trial.report(log['eval_loss'], step=step)
            if trial.should_prune():
                raise optuna.exceptions.TrialPruned()
    
    return eval_results["eval_loss"]

In [21]:
# Run Optuna hyperparameter search with Bayesian optimization
study_name = "bart_question_generation"
storage_url = f"sqlite:///{os.path.join(OUTPUT_DIR, 'optuna_study.db')}"
db_path = os.path.join(OUTPUT_DIR, "optuna_study.db")
if os.path.exists(db_path):
    try:
        conn = sqlite3.connect(db_path)
        conn.close()
        os.remove(db_path)
        logger.info("Deleted existing Optuna database to start fresh.")
    except (PermissionError, sqlite3.OperationalError) as e:
        logger.warning(f"Could not delete existing database {db_path}: {e}. Reusing existing database.")

try:
    study = optuna.create_study(
        study_name=study_name,
        storage=storage_url,
        direction="minimize",
        sampler=TPESampler(seed=42),  # Bayesian optimization
        pruner=MedianPruner(n_warmup_steps=2),
        load_if_exists=True
    )
    study.optimize(objective, n_trials=15)
    save_trial_results(study, OUTPUT_DIR)
    best_params = study.best_params
    with open(os.path.join(OUTPUT_DIR, "best_params.json"), "w") as f:
        json.dump(best_params, f, indent=4)
    logger.info(f"Best hyperparameters: {best_params}")
    logger.info(f"Best objective value (eval_loss): {study.best_value}")

    # Clean up non-best trials
    best_trial_dir = os.path.join(OUTPUT_DIR, f"trial_{study.best_trial.number}")
    for trial_dir in os.listdir(OUTPUT_DIR):
        if trial_dir.startswith("trial_") and trial_dir != os.path.basename(best_trial_dir):
            try:
                shutil.rmtree(os.path.join(OUTPUT_DIR, trial_dir))
                logger.info(f"Deleted non-best trial directory: {trial_dir}")
            except Exception as e:
                logger.warning(f"Failed to delete trial directory {trial_dir}: {e}")
except Exception as e:
    logger.error(f"Optuna optimization or file saving failed: {e}")
    raise
finally:
    if 'storage' in locals():
        del storage
    logger.info("Closed Optuna storage connections.")

[I 2025-07-22 16:36:30,004] A new study created in RDB with name: bart_question_generation
2025-07-22 16:36:36,274 - INFO - Gradient checkpointing enabled: True
2025-07-22 16:36:36,304 - INFO - Trial 0 parameters: learning_rate=1.827226177606625e-05, weight_decay=0.08927180304353628, warmup_steps=400, lr_scheduler_type=linear
2025-07-22 16:36:36,306 - INFO - VRAM usage before training (trial 0): 1.51GB / 7.96GB
2025-07-22 16:36:36,431 - INFO - Starting new training for trial 0
  with torch.cuda.amp.autocast():
2025-07-22 16:36:46,750 - INFO - Initialized CustomEarlyStoppingCallback


{'loss': 2.7963, 'grad_norm': 8.041173934936523, 'learning_rate': 2.0556294498074535e-06, 'epoch': 0.02}
{'loss': 2.6685, 'grad_norm': 6.349306106567383, 'learning_rate': 4.3396621718157345e-06, 'epoch': 0.04}
{'loss': 2.5531, 'grad_norm': 5.658001899719238, 'learning_rate': 6.623694893824016e-06, 'epoch': 0.06}
{'loss': 2.5554, 'grad_norm': 5.723142623901367, 'learning_rate': 8.862046961392131e-06, 'epoch': 0.08}
{'eval_loss': 2.371518850326538, 'eval_runtime': 109.2292, 'eval_samples_per_second': 91.551, 'eval_steps_per_second': 45.775, 'epoch': 0.08}
{'loss': 2.5198, 'grad_norm': 10.649038314819336, 'learning_rate': 1.1146079683400413e-05, 'epoch': 0.1}
{'loss': 2.46, 'grad_norm': 9.487222671508789, 'learning_rate': 1.3430112405408695e-05, 'epoch': 0.12}
{'loss': 2.4502, 'grad_norm': 11.642400741577148, 'learning_rate': 1.5714145127416976e-05, 'epoch': 0.14}
{'loss': 2.4304, 'grad_norm': 6.292455196380615, 'learning_rate': 1.7998177849425257e-05, 'epoch': 0.16}
{'eval_loss': 2.28006

2025-07-22 18:54:21,414 - INFO - Early stopping triggered after 3 evaluations with eval_loss=2.091510534286499


{'eval_loss': 2.091510534286499, 'eval_runtime': 109.8784, 'eval_samples_per_second': 91.01, 'eval_steps_per_second': 45.505, 'epoch': 1.3599999999999999}
{'train_runtime': 8254.6836, 'train_samples_per_second': 48.457, 'train_steps_per_second': 1.514, 'train_loss': 2.2362316086713006, 'epoch': 1.3599999999999999}


2025-07-22 18:56:35,343 - INFO - Early stopping triggered after 3 evaluations with eval_loss=2.091402530670166


{'eval_loss': 2.091402530670166, 'eval_runtime': 132.6596, 'eval_samples_per_second': 75.381, 'eval_steps_per_second': 37.69, 'epoch': 1.3599999999999999}


2025-07-22 18:56:35,673 - INFO - VRAM usage after training: 4.43GB / 7.96GB
[I 2025-07-22 18:56:36,103] Trial 0 finished with value: 2.091402530670166 and parameters: {'learning_rate': 1.827226177606625e-05, 'weight_decay': 0.08927180304353628, 'warmup_steps': 400, 'lr_scheduler_type': 'linear'}. Best is trial 0 with value: 2.091402530670166.
2025-07-22 18:56:37,293 - INFO - Gradient checkpointing enabled: True
2025-07-22 18:56:37,323 - INFO - Trial 1 parameters: learning_rate=1.2853916978930139e-05, weight_decay=0.011430983876313222, warmup_steps=450, lr_scheduler_type=cosine
2025-07-22 18:56:37,323 - INFO - VRAM usage before training (trial 1): 1.53GB / 7.96GB
2025-07-22 18:56:37,373 - INFO - Starting new training for trial 1
2025-07-22 18:56:47,883 - INFO - Initialized CustomEarlyStoppingCallback


{'loss': 2.8115, 'grad_norm': 7.2612762451171875, 'learning_rate': 1.285391697893014e-06, 'epoch': 0.02}
{'loss': 2.7058, 'grad_norm': 6.481954574584961, 'learning_rate': 2.713604695551918e-06, 'epoch': 0.04}
{'loss': 2.5877, 'grad_norm': 5.4879150390625, 'learning_rate': 4.141817693210823e-06, 'epoch': 0.06}
{'loss': 2.5881, 'grad_norm': 6.2297821044921875, 'learning_rate': 5.541466430916549e-06, 'epoch': 0.08}
{'eval_loss': 2.3953311443328857, 'eval_runtime': 108.8703, 'eval_samples_per_second': 91.852, 'eval_steps_per_second': 45.926, 'epoch': 0.08}
{'loss': 2.5421, 'grad_norm': 8.587467193603516, 'learning_rate': 6.969679428575454e-06, 'epoch': 0.1}
{'loss': 2.4847, 'grad_norm': 9.54176139831543, 'learning_rate': 8.397892426234358e-06, 'epoch': 0.12}
{'loss': 2.4725, 'grad_norm': 12.026006698608398, 'learning_rate': 9.826105423893262e-06, 'epoch': 0.14}
{'loss': 2.4494, 'grad_norm': 5.933807849884033, 'learning_rate': 1.1254318421552166e-05, 'epoch': 0.16}
{'eval_loss': 2.294660091

2025-07-22 21:32:07,175 - INFO - Early stopping triggered after 3 evaluations with eval_loss=2.087785243988037


{'eval_loss': 2.087785243988037, 'eval_runtime': 110.1594, 'eval_samples_per_second': 90.778, 'eval_steps_per_second': 45.389, 'epoch': 1.52}
{'train_runtime': 9319.292, 'train_samples_per_second': 42.922, 'train_steps_per_second': 1.341, 'train_loss': 2.2421884657207287, 'epoch': 1.52}


2025-07-22 21:34:30,456 - INFO - Early stopping triggered after 3 evaluations with eval_loss=2.0840227603912354


{'eval_loss': 2.0840227603912354, 'eval_runtime': 142.0121, 'eval_samples_per_second': 70.417, 'eval_steps_per_second': 35.208, 'epoch': 1.52}


2025-07-22 21:34:30,816 - INFO - VRAM usage after training: 4.43GB / 7.96GB
[I 2025-07-22 21:34:31,256] Trial 1 finished with value: 2.0840227603912354 and parameters: {'learning_rate': 1.2853916978930139e-05, 'weight_decay': 0.011430983876313222, 'warmup_steps': 450, 'lr_scheduler_type': 'cosine'}. Best is trial 1 with value: 2.0840227603912354.
2025-07-22 21:34:32,467 - INFO - Gradient checkpointing enabled: True
2025-07-22 21:34:32,497 - INFO - Trial 2 parameters: learning_rate=1.0336843570697396e-05, weight_decay=0.09330606024425668, warmup_steps=450, lr_scheduler_type=linear
2025-07-22 21:34:32,497 - INFO - VRAM usage before training (trial 2): 1.53GB / 7.96GB
2025-07-22 21:34:32,537 - INFO - Starting new training for trial 2
2025-07-22 21:34:43,479 - INFO - Initialized CustomEarlyStoppingCallback


{'loss': 2.8171, 'grad_norm': 7.255630016326904, 'learning_rate': 1.0336843570697397e-06, 'epoch': 0.02}
{'loss': 2.7232, 'grad_norm': 6.611057281494141, 'learning_rate': 2.1822225315916726e-06, 'epoch': 0.04}
{'loss': 2.6048, 'grad_norm': 5.606945037841797, 'learning_rate': 3.3307607061136057e-06, 'epoch': 0.06}
{'loss': 2.6048, 'grad_norm': 6.262466907501221, 'learning_rate': 4.4563281171450995e-06, 'epoch': 0.08}
{'eval_loss': 2.413343906402588, 'eval_runtime': 115.5106, 'eval_samples_per_second': 86.572, 'eval_steps_per_second': 43.286, 'epoch': 0.08}
{'loss': 2.557, 'grad_norm': 8.0410737991333, 'learning_rate': 5.6048662916670335e-06, 'epoch': 0.1}
{'loss': 2.5013, 'grad_norm': 9.267122268676758, 'learning_rate': 6.7304337026985265e-06, 'epoch': 0.12}
{'loss': 2.4841, 'grad_norm': 11.373714447021484, 'learning_rate': 7.87897187722046e-06, 'epoch': 0.14}
{'loss': 2.4614, 'grad_norm': 5.951822280883789, 'learning_rate': 9.027510051742392e-06, 'epoch': 0.16}
{'eval_loss': 2.30895400

2025-07-22 23:31:12,132 - INFO - Early stopping triggered after 3 evaluations with eval_loss=2.107339382171631


{'eval_loss': 2.107339382171631, 'eval_runtime': 109.5999, 'eval_samples_per_second': 91.241, 'eval_steps_per_second': 45.62, 'epoch': 1.12}
{'train_runtime': 6988.6631, 'train_samples_per_second': 57.236, 'train_steps_per_second': 1.789, 'train_loss': 2.3091955402919226, 'epoch': 1.12}


2025-07-22 23:33:25,696 - INFO - Early stopping triggered after 3 evaluations with eval_loss=2.1087207794189453


{'eval_loss': 2.1087207794189453, 'eval_runtime': 132.2933, 'eval_samples_per_second': 75.59, 'eval_steps_per_second': 37.795, 'epoch': 1.12}


2025-07-22 23:33:26,056 - INFO - VRAM usage after training: 4.43GB / 7.96GB
[I 2025-07-22 23:33:26,166] Trial 2 finished with value: 2.1087207794189453 and parameters: {'learning_rate': 1.0336843570697396e-05, 'weight_decay': 0.09330606024425668, 'warmup_steps': 450, 'lr_scheduler_type': 'linear'}. Best is trial 1 with value: 2.0840227603912354.
2025-07-22 23:33:27,296 - INFO - Gradient checkpointing enabled: True
2025-07-22 23:33:27,316 - INFO - Trial 3 parameters: learning_rate=1.34336568680343e-05, weight_decay=0.02014847788415866, warmup_steps=300, lr_scheduler_type=linear
2025-07-22 23:33:27,326 - INFO - VRAM usage before training (trial 3): 1.53GB / 7.96GB
2025-07-22 23:33:27,366 - INFO - Starting new training for trial 3
2025-07-22 23:33:37,929 - INFO - Initialized CustomEarlyStoppingCallback


{'loss': 2.7971, 'grad_norm': 7.976375102996826, 'learning_rate': 2.015048530205145e-06, 'epoch': 0.02}
{'loss': 2.67, 'grad_norm': 6.334558486938477, 'learning_rate': 4.253991341544195e-06, 'epoch': 0.04}
{'loss': 2.5543, 'grad_norm': 5.585394382476807, 'learning_rate': 6.492934152883245e-06, 'epoch': 0.06}
{'loss': 2.556, 'grad_norm': 5.596184253692627, 'learning_rate': 8.642319251768733e-06, 'epoch': 0.08}
{'eval_loss': 2.3708176612854004, 'eval_runtime': 109.3592, 'eval_samples_per_second': 91.442, 'eval_steps_per_second': 45.721, 'epoch': 0.08}
{'loss': 2.5172, 'grad_norm': 14.524303436279297, 'learning_rate': 1.0881262063107785e-05, 'epoch': 0.1}
{'loss': 2.4598, 'grad_norm': 8.856276512145996, 'learning_rate': 1.3120204874446833e-05, 'epoch': 0.12}
{'loss': 2.453, 'grad_norm': 9.52880573272705, 'learning_rate': 1.3386308733171556e-05, 'epoch': 0.14}
{'loss': 2.4283, 'grad_norm': 5.561245918273926, 'learning_rate': 1.3331252762400925e-05, 'epoch': 0.16}
{'eval_loss': 2.2647476196

2025-07-23 01:30:02,535 - INFO - Early stopping triggered after 3 evaluations with eval_loss=2.0984644889831543


{'eval_loss': 2.0984644889831543, 'eval_runtime': 110.5059, 'eval_samples_per_second': 90.493, 'eval_steps_per_second': 45.246, 'epoch': 1.12}
{'train_runtime': 6984.6061, 'train_samples_per_second': 57.269, 'train_steps_per_second': 1.79, 'train_loss': 2.2848770414079937, 'epoch': 1.12}


2025-07-23 01:32:36,880 - INFO - Early stopping triggered after 3 evaluations with eval_loss=2.099980354309082


{'eval_loss': 2.099980354309082, 'eval_runtime': 153.0305, 'eval_samples_per_second': 65.346, 'eval_steps_per_second': 32.673, 'epoch': 1.12}


2025-07-23 01:32:37,223 - INFO - VRAM usage after training: 4.43GB / 7.96GB
[I 2025-07-23 01:32:37,623] Trial 3 finished with value: 2.099980354309082 and parameters: {'learning_rate': 1.34336568680343e-05, 'weight_decay': 0.02014847788415866, 'warmup_steps': 300, 'lr_scheduler_type': 'linear'}. Best is trial 1 with value: 2.0840227603912354.
2025-07-23 01:32:38,773 - INFO - Gradient checkpointing enabled: True
2025-07-23 01:32:38,797 - INFO - Trial 4 parameters: learning_rate=2.6771137242145903e-05, weight_decay=0.013787764619353767, warmup_steps=200, lr_scheduler_type=cosine
2025-07-23 01:32:38,798 - INFO - VRAM usage before training (trial 4): 1.53GB / 7.96GB
2025-07-23 01:32:38,849 - INFO - Starting new training for trial 4
2025-07-23 01:32:49,513 - INFO - Initialized CustomEarlyStoppingCallback


{'loss': 2.7474, 'grad_norm': 7.37462854385376, 'learning_rate': 6.023505879482828e-06, 'epoch': 0.02}
{'loss': 2.593, 'grad_norm': 6.3125081062316895, 'learning_rate': 1.2716290190019304e-05, 'epoch': 0.04}
{'loss': 2.4957, 'grad_norm': 5.938993453979492, 'learning_rate': 1.940907450055578e-05, 'epoch': 0.06}
{'loss': 2.5078, 'grad_norm': 4.848028659820557, 'learning_rate': 2.5968003124881524e-05, 'epoch': 0.08}
{'eval_loss': 2.3206734657287598, 'eval_runtime': 108.642, 'eval_samples_per_second': 92.045, 'eval_steps_per_second': 46.023, 'epoch': 0.08}
{'loss': 2.4865, 'grad_norm': 13.955236434936523, 'learning_rate': 2.6770291968879354e-05, 'epoch': 0.1}
{'loss': 2.4308, 'grad_norm': 7.160529136657715, 'learning_rate': 2.6767279517753222e-05, 'epoch': 0.12}
{'loss': 2.4295, 'grad_norm': 7.3774824142456055, 'learning_rate': 2.67622099417975e-05, 'epoch': 0.14}
{'loss': 2.4013, 'grad_norm': 5.519293308258057, 'learning_rate': 2.6754877149203204e-05, 'epoch': 0.16}
{'eval_loss': 2.251462

2025-07-23 03:42:36,932 - INFO - Early stopping triggered after 3 evaluations with eval_loss=2.1261348724365234


{'eval_loss': 2.1261348724365234, 'eval_runtime': 109.041, 'eval_samples_per_second': 91.709, 'eval_steps_per_second': 45.854, 'epoch': 1.28}
{'train_runtime': 7787.4205, 'train_samples_per_second': 51.365, 'train_steps_per_second': 1.605, 'train_loss': 2.2299020862579346, 'epoch': 1.28}


2025-07-23 03:45:06,410 - INFO - Early stopping triggered after 3 evaluations with eval_loss=2.1252169609069824


{'eval_loss': 2.1252169609069824, 'eval_runtime': 148.1575, 'eval_samples_per_second': 67.496, 'eval_steps_per_second': 33.748, 'epoch': 1.28}


2025-07-23 03:45:06,740 - INFO - VRAM usage after training: 4.43GB / 7.96GB
[I 2025-07-23 03:45:07,150] Trial 4 finished with value: 2.1252169609069824 and parameters: {'learning_rate': 2.6771137242145903e-05, 'weight_decay': 0.013787764619353767, 'warmup_steps': 200, 'lr_scheduler_type': 'cosine'}. Best is trial 1 with value: 2.0840227603912354.
2025-07-23 03:45:08,290 - INFO - Gradient checkpointing enabled: True
2025-07-23 03:45:08,320 - INFO - Trial 5 parameters: learning_rate=3.538461259525519e-05, weight_decay=0.015837031559118753, warmup_steps=300, lr_scheduler_type=linear
2025-07-23 03:45:08,320 - INFO - VRAM usage before training (trial 5): 1.53GB / 7.96GB
2025-07-23 03:45:08,370 - INFO - Starting new training for trial 5
2025-07-23 03:45:18,840 - INFO - Initialized CustomEarlyStoppingCallback


{'loss': 2.7539, 'grad_norm': 7.405422210693359, 'learning_rate': 5.307691889288278e-06, 'epoch': 0.02}
{'loss': 2.601, 'grad_norm': 6.280384063720703, 'learning_rate': 1.120512732183081e-05, 'epoch': 0.04}
{'loss': 2.5, 'grad_norm': 5.811661243438721, 'learning_rate': 1.710256275437334e-05, 'epoch': 0.06}
{'loss': 2.5089, 'grad_norm': 5.000411033630371, 'learning_rate': 2.2999998186915873e-05, 'epoch': 0.08}
{'eval_loss': 2.3226265907287598, 'eval_runtime': 109.8387, 'eval_samples_per_second': 91.043, 'eval_steps_per_second': 45.521, 'epoch': 0.08}
{'loss': 2.4902, 'grad_norm': 11.01403522491455, 'learning_rate': 2.8897433619458402e-05, 'epoch': 0.1}
{'loss': 2.4434, 'grad_norm': 7.720127105712891, 'learning_rate': 3.4794869052000935e-05, 'epoch': 0.12}
{'loss': 2.4482, 'grad_norm': 7.812938213348389, 'learning_rate': 3.525409558158417e-05, 'epoch': 0.14}
{'loss': 2.4263, 'grad_norm': 6.838233470916748, 'learning_rate': 3.510907667750525e-05, 'epoch': 0.16}
{'eval_loss': 2.27621531486

2025-07-23 07:32:22,962 - INFO - Early stopping triggered after 3 evaluations with eval_loss=2.060640811920166


{'eval_loss': 2.060640811920166, 'eval_runtime': 110.0397, 'eval_samples_per_second': 90.876, 'eval_steps_per_second': 45.438, 'epoch': 2.16}
{'train_runtime': 13624.1323, 'train_samples_per_second': 29.36, 'train_steps_per_second': 0.917, 'train_loss': 2.103348623205114, 'epoch': 2.16}


2025-07-23 07:34:36,636 - INFO - Early stopping triggered after 3 evaluations with eval_loss=2.059054374694824


{'eval_loss': 2.059054374694824, 'eval_runtime': 132.4042, 'eval_samples_per_second': 75.526, 'eval_steps_per_second': 37.763, 'epoch': 2.16}


2025-07-23 07:34:36,986 - INFO - VRAM usage after training: 4.43GB / 7.96GB
[I 2025-07-23 07:34:37,307] Trial 5 pruned. 
2025-07-23 07:34:38,467 - INFO - Gradient checkpointing enabled: True
2025-07-23 07:34:38,487 - INFO - Trial 6 parameters: learning_rate=2.658616083788978e-05, weight_decay=0.014808945119975192, warmup_steps=100, lr_scheduler_type=cosine
2025-07-23 07:34:38,487 - INFO - VRAM usage before training (trial 6): 1.53GB / 7.96GB
2025-07-23 07:34:38,537 - INFO - Starting new training for trial 6
2025-07-23 07:34:49,009 - INFO - Initialized CustomEarlyStoppingCallback


{'loss': 2.7117, 'grad_norm': 7.753420352935791, 'learning_rate': 1.1963772377050402e-05, 'epoch': 0.02}
{'loss': 2.5561, 'grad_norm': 6.3083343505859375, 'learning_rate': 2.525685279599529e-05, 'epoch': 0.04}
{'loss': 2.4823, 'grad_norm': 6.894830703735352, 'learning_rate': 2.658529692026355e-05, 'epoch': 0.06}
{'loss': 2.4849, 'grad_norm': 7.655059814453125, 'learning_rate': 2.6582391307993265e-05, 'epoch': 0.08}
{'eval_loss': 2.296781301498413, 'eval_runtime': 109.8006, 'eval_samples_per_second': 91.074, 'eval_steps_per_second': 45.537, 'epoch': 0.08}
{'loss': 2.4665, 'grad_norm': 12.592074394226074, 'learning_rate': 2.657731520677366e-05, 'epoch': 0.1}
{'loss': 2.4071, 'grad_norm': 8.354461669921875, 'learning_rate': 2.657010740048031e-05, 'epoch': 0.12}
{'loss': 2.4092, 'grad_norm': 7.120022773742676, 'learning_rate': 2.6560769045741158e-05, 'epoch': 0.14}
{'loss': 2.3885, 'grad_norm': 6.286619663238525, 'learning_rate': 2.6549301641070685e-05, 'epoch': 0.16}
{'eval_loss': 2.25602

2025-07-23 08:41:50,807 - INFO - Early stopping triggered after 3 evaluations with eval_loss=2.270174503326416


{'eval_loss': 2.270174503326416, 'eval_runtime': 109.7792, 'eval_samples_per_second': 91.092, 'eval_steps_per_second': 45.546, 'epoch': 0.64}
{'train_runtime': 4021.7987, 'train_samples_per_second': 99.458, 'train_steps_per_second': 3.108, 'train_loss': 2.35186598777771, 'epoch': 0.64}


2025-07-23 08:45:17,880 - INFO - Early stopping triggered after 3 evaluations with eval_loss=2.2632651329040527


{'eval_loss': 2.2632651329040527, 'eval_runtime': 205.8023, 'eval_samples_per_second': 48.59, 'eval_steps_per_second': 24.295, 'epoch': 0.64}


2025-07-23 08:45:18,220 - INFO - VRAM usage after training: 4.43GB / 7.96GB
[I 2025-07-23 08:45:18,540] Trial 6 pruned. 
2025-07-23 08:45:19,740 - INFO - Gradient checkpointing enabled: True
2025-07-23 08:45:19,760 - INFO - Trial 7 parameters: learning_rate=3.67320780046205e-05, weight_decay=0.020165721691808594, warmup_steps=100, lr_scheduler_type=linear
2025-07-23 08:45:19,760 - INFO - VRAM usage before training (trial 7): 1.53GB / 7.96GB
2025-07-23 08:45:19,810 - INFO - Starting new training for trial 7
2025-07-23 08:45:30,221 - INFO - Initialized CustomEarlyStoppingCallback


{'loss': 2.6955, 'grad_norm': 8.338322639465332, 'learning_rate': 1.6529435102079224e-05, 'epoch': 0.02}
{'loss': 2.5488, 'grad_norm': 7.6975998878479, 'learning_rate': 3.489547410438947e-05, 'epoch': 0.04}
{'loss': 2.508, 'grad_norm': 103.20072937011719, 'learning_rate': 3.66017383729912e-05, 'epoch': 0.06}
{'loss': 2.6326, 'grad_norm': 30.514934539794922, 'learning_rate': 3.6453625155230634e-05, 'epoch': 0.08}
{'eval_loss': 2.4320242404937744, 'eval_runtime': 109.7171, 'eval_samples_per_second': 91.144, 'eval_steps_per_second': 45.572, 'epoch': 0.08}
{'loss': 2.539, 'grad_norm': 26.09661293029785, 'learning_rate': 3.630551193747007e-05, 'epoch': 0.1}
{'loss': 2.4611, 'grad_norm': 21.795413970947266, 'learning_rate': 3.61573987197095e-05, 'epoch': 0.12}
{'loss': 2.4481, 'grad_norm': 7.859288215637207, 'learning_rate': 3.6012247766304144e-05, 'epoch': 0.14}
{'loss': 2.4409, 'grad_norm': 5.966324806213379, 'learning_rate': 3.586413454854357e-05, 'epoch': 0.16}
{'eval_loss': 2.3034112453

2025-07-23 10:41:49,729 - INFO - Early stopping triggered after 3 evaluations with eval_loss=2.11639404296875


{'eval_loss': 2.11639404296875, 'eval_runtime': 110.2734, 'eval_samples_per_second': 90.684, 'eval_steps_per_second': 45.342, 'epoch': 1.12}
{'train_runtime': 6979.5075, 'train_samples_per_second': 57.311, 'train_steps_per_second': 1.791, 'train_loss': 2.2704624230521064, 'epoch': 1.12}


2025-07-23 10:44:04,197 - INFO - Early stopping triggered after 3 evaluations with eval_loss=2.118257761001587


{'eval_loss': 2.118257761001587, 'eval_runtime': 133.171, 'eval_samples_per_second': 75.091, 'eval_steps_per_second': 37.546, 'epoch': 1.12}


2025-07-23 10:44:04,547 - INFO - VRAM usage after training: 4.43GB / 7.96GB
[I 2025-07-23 10:44:04,845] Trial 7 pruned. 
2025-07-23 10:44:06,075 - INFO - Gradient checkpointing enabled: True
2025-07-23 10:44:06,104 - INFO - Trial 8 parameters: learning_rate=1.2170293883738781e-05, weight_decay=0.03127353036780371, warmup_steps=100, lr_scheduler_type=linear
2025-07-23 10:44:06,104 - INFO - VRAM usage before training (trial 8): 1.53GB / 7.96GB
2025-07-23 10:44:06,162 - INFO - Starting new training for trial 8
2025-07-23 10:44:17,034 - INFO - Initialized CustomEarlyStoppingCallback


{'loss': 2.7523, 'grad_norm': 7.356632709503174, 'learning_rate': 5.476632247682452e-06, 'epoch': 0.02}
{'loss': 2.5992, 'grad_norm': 6.268283367156982, 'learning_rate': 1.1561779189551843e-05, 'epoch': 0.04}
{'loss': 2.4985, 'grad_norm': 5.718626976013184, 'learning_rate': 1.2126127494644567e-05, 'epoch': 0.06}
{'loss': 2.5066, 'grad_norm': 5.504232883453369, 'learning_rate': 1.2078035204297536e-05, 'epoch': 0.08}
{'eval_loss': 2.3337972164154053, 'eval_runtime': 109.6069, 'eval_samples_per_second': 91.235, 'eval_steps_per_second': 45.618, 'epoch': 0.08}
{'loss': 2.4791, 'grad_norm': 11.100144386291504, 'learning_rate': 1.2028961438637299e-05, 'epoch': 0.1}
{'loss': 2.4267, 'grad_norm': 7.685903072357178, 'learning_rate': 1.1979887672977061e-05, 'epoch': 0.12}
{'loss': 2.4212, 'grad_norm': 11.275552749633789, 'learning_rate': 1.1930813907316826e-05, 'epoch': 0.14}
{'loss': 2.4013, 'grad_norm': 5.556790828704834, 'learning_rate': 1.1881740141656589e-05, 'epoch': 0.16}
{'eval_loss': 2.2

2025-07-23 12:44:40,977 - INFO - Early stopping triggered after 3 evaluations with eval_loss=2.095647096633911


{'eval_loss': 2.095647096633911, 'eval_runtime': 109.4003, 'eval_samples_per_second': 91.407, 'eval_steps_per_second': 45.704, 'epoch': 1.12}
{'train_runtime': 7223.9438, 'train_samples_per_second': 55.371, 'train_steps_per_second': 1.73, 'train_loss': 2.276690583910261, 'epoch': 1.12}


2025-07-23 12:46:57,240 - INFO - Early stopping triggered after 3 evaluations with eval_loss=2.0971078872680664


{'eval_loss': 2.0971078872680664, 'eval_runtime': 134.9924, 'eval_samples_per_second': 74.078, 'eval_steps_per_second': 37.039, 'epoch': 1.12}


2025-07-23 12:46:57,590 - INFO - VRAM usage after training: 4.43GB / 7.96GB
[I 2025-07-23 12:46:58,010] Trial 8 finished with value: 2.0971078872680664 and parameters: {'learning_rate': 1.2170293883738781e-05, 'weight_decay': 0.03127353036780371, 'warmup_steps': 100, 'lr_scheduler_type': 'linear'}. Best is trial 1 with value: 2.0840227603912354.
2025-07-23 12:46:59,200 - INFO - Gradient checkpointing enabled: True
2025-07-23 12:46:59,220 - INFO - Trial 9 parameters: learning_rate=2.9045790726652738e-05, weight_decay=0.020497980520950188, warmup_steps=300, lr_scheduler_type=linear
2025-07-23 12:46:59,220 - INFO - VRAM usage before training (trial 9): 1.53GB / 7.96GB
2025-07-23 12:46:59,270 - INFO - Starting new training for trial 9
2025-07-23 12:47:09,690 - INFO - Initialized CustomEarlyStoppingCallback


{'loss': 2.7638, 'grad_norm': 7.634951114654541, 'learning_rate': 4.356868608997911e-06, 'epoch': 0.02}
{'loss': 2.6142, 'grad_norm': 6.317503929138184, 'learning_rate': 9.1978337301067e-06, 'epoch': 0.04}
{'loss': 2.5087, 'grad_norm': 5.960926055908203, 'learning_rate': 1.403879885121549e-05, 'epoch': 0.06}
{'loss': 2.5146, 'grad_norm': 5.8559489250183105, 'learning_rate': 1.887976397232428e-05, 'epoch': 0.08}
{'eval_loss': 2.323354959487915, 'eval_runtime': 108.8195, 'eval_samples_per_second': 91.895, 'eval_steps_per_second': 45.948, 'epoch': 0.08}
{'loss': 2.4934, 'grad_norm': 8.447162628173828, 'learning_rate': 2.3720729093433068e-05, 'epoch': 0.1}
{'loss': 2.4375, 'grad_norm': 6.878093242645264, 'learning_rate': 2.8561694214541856e-05, 'epoch': 0.12}
{'loss': 2.4361, 'grad_norm': 7.386343002319336, 'learning_rate': 2.8938654613316725e-05, 'epoch': 0.14}
{'loss': 2.4112, 'grad_norm': 5.265451431274414, 'learning_rate': 2.881961448738782e-05, 'epoch': 0.16}
{'eval_loss': 2.253026485

2025-07-23 14:43:58,211 - INFO - Early stopping triggered after 3 evaluations with eval_loss=2.1152210235595703


{'eval_loss': 2.1152210235595703, 'eval_runtime': 110.3425, 'eval_samples_per_second': 90.627, 'eval_steps_per_second': 45.313, 'epoch': 1.12}
{'train_runtime': 7008.5215, 'train_samples_per_second': 57.073, 'train_steps_per_second': 1.784, 'train_loss': 2.2598079817635672, 'epoch': 1.12}


2025-07-23 14:46:12,376 - INFO - Early stopping triggered after 3 evaluations with eval_loss=2.116762399673462


{'eval_loss': 2.116762399673462, 'eval_runtime': 132.8909, 'eval_samples_per_second': 75.25, 'eval_steps_per_second': 37.625, 'epoch': 1.12}


2025-07-23 14:46:12,723 - INFO - VRAM usage after training: 4.43GB / 7.96GB
[I 2025-07-23 14:46:13,097] Trial 9 pruned. 
2025-07-23 14:46:14,267 - INFO - Gradient checkpointing enabled: True
2025-07-23 14:46:14,316 - INFO - Trial 10 parameters: learning_rate=1.777063571721834e-05, weight_decay=0.010139048090380883, warmup_steps=500, lr_scheduler_type=cosine
2025-07-23 14:46:14,317 - INFO - VRAM usage before training (trial 10): 1.53GB / 7.96GB
2025-07-23 14:46:14,376 - INFO - Starting new training for trial 10
2025-07-23 14:46:24,949 - INFO - Initialized CustomEarlyStoppingCallback


{'loss': 2.805, 'grad_norm': 7.441847801208496, 'learning_rate': 1.5993572145496504e-06, 'epoch': 0.02}
{'loss': 2.6883, 'grad_norm': 6.414226055145264, 'learning_rate': 3.3764207862714844e-06, 'epoch': 0.04}
{'loss': 2.5711, 'grad_norm': 5.537060737609863, 'learning_rate': 5.153484357993318e-06, 'epoch': 0.06}
{'loss': 2.5739, 'grad_norm': 5.890913963317871, 'learning_rate': 6.8950066582807155e-06, 'epoch': 0.08}
{'eval_loss': 2.3837172985076904, 'eval_runtime': 114.0084, 'eval_samples_per_second': 87.713, 'eval_steps_per_second': 43.856, 'epoch': 0.08}
{'loss': 2.5295, 'grad_norm': 9.38420295715332, 'learning_rate': 8.67207023000255e-06, 'epoch': 0.1}
{'loss': 2.4724, 'grad_norm': 9.816500663757324, 'learning_rate': 1.0449133801724382e-05, 'epoch': 0.12}
{'loss': 2.4626, 'grad_norm': 10.634675025939941, 'learning_rate': 1.2226197373446217e-05, 'epoch': 0.14}
{'loss': 2.4386, 'grad_norm': 5.767257213592529, 'learning_rate': 1.4003260945168053e-05, 'epoch': 0.16}
{'eval_loss': 2.283661

2025-07-23 18:24:12,837 - INFO - Early stopping triggered after 3 evaluations with eval_loss=2.0440101623535156


{'eval_loss': 2.0440101623535156, 'eval_runtime': 117.5975, 'eval_samples_per_second': 85.036, 'eval_steps_per_second': 42.518, 'epoch': 2.08}
{'train_runtime': 13067.8884, 'train_samples_per_second': 30.609, 'train_steps_per_second': 0.957, 'train_loss': 2.165654651935284, 'epoch': 2.08}


2025-07-23 18:27:41,070 - INFO - Early stopping triggered after 3 evaluations with eval_loss=2.044877290725708


{'eval_loss': 2.044877290725708, 'eval_runtime': 206.9434, 'eval_samples_per_second': 48.322, 'eval_steps_per_second': 24.161, 'epoch': 2.08}


2025-07-23 18:27:41,428 - INFO - VRAM usage after training: 4.43GB / 7.96GB
[I 2025-07-23 18:27:41,453] Trial 10 pruned. 
2025-07-23 18:27:47,859 - INFO - Gradient checkpointing enabled: True
2025-07-23 18:27:47,899 - INFO - Trial 11 parameters: learning_rate=1.8217753575067095e-05, weight_decay=0.08084716496554312, warmup_steps=400, lr_scheduler_type=cosine
2025-07-23 18:27:47,899 - INFO - VRAM usage before training (trial 11): 1.53GB / 7.96GB
2025-07-23 18:27:47,960 - INFO - Starting new training for trial 11
2025-07-23 18:27:58,650 - INFO - Initialized CustomEarlyStoppingCallback


{'loss': 2.7965, 'grad_norm': 8.028253555297852, 'learning_rate': 2.049497277195048e-06, 'epoch': 0.02}
{'loss': 2.6687, 'grad_norm': 6.364377498626709, 'learning_rate': 4.326716474078435e-06, 'epoch': 0.04}
{'loss': 2.5534, 'grad_norm': 5.582976818084717, 'learning_rate': 6.603935670961822e-06, 'epoch': 0.06}
{'loss': 2.5553, 'grad_norm': 5.4468464851379395, 'learning_rate': 8.83561048390754e-06, 'epoch': 0.08}
{'eval_loss': 2.369004249572754, 'eval_runtime': 111.9859, 'eval_samples_per_second': 89.297, 'eval_steps_per_second': 44.648, 'epoch': 0.08}
{'loss': 2.5191, 'grad_norm': 18.223304748535156, 'learning_rate': 1.1112829680790928e-05, 'epoch': 0.1}
{'loss': 2.4607, 'grad_norm': 8.88865852355957, 'learning_rate': 1.3390048877674315e-05, 'epoch': 0.12}
{'loss': 2.452, 'grad_norm': 10.940591812133789, 'learning_rate': 1.56672680745577e-05, 'epoch': 0.14}
{'loss': 2.4312, 'grad_norm': 5.937512397766113, 'learning_rate': 1.7944487271441088e-05, 'epoch': 0.16}
{'eval_loss': 2.265991926

2025-07-23 20:26:30,131 - INFO - Early stopping triggered after 3 evaluations with eval_loss=2.0929675102233887


{'eval_loss': 2.0929675102233887, 'eval_runtime': 111.6971, 'eval_samples_per_second': 89.528, 'eval_steps_per_second': 44.764, 'epoch': 1.12}
{'train_runtime': 7111.4809, 'train_samples_per_second': 56.247, 'train_steps_per_second': 1.758, 'train_loss': 2.278565913609096, 'epoch': 1.12}


2025-07-23 20:29:17,183 - INFO - Early stopping triggered after 3 evaluations with eval_loss=2.0943987369537354


{'eval_loss': 2.0943987369537354, 'eval_runtime': 165.7739, 'eval_samples_per_second': 60.323, 'eval_steps_per_second': 30.162, 'epoch': 1.12}


2025-07-23 20:29:17,530 - INFO - VRAM usage after training: 4.43GB / 7.96GB
[I 2025-07-23 20:29:17,566] Trial 11 pruned. 
2025-07-23 20:29:18,742 - INFO - Gradient checkpointing enabled: True
2025-07-23 20:29:18,785 - INFO - Trial 12 parameters: learning_rate=1.7067425832878712e-05, weight_decay=0.055640433189825755, warmup_steps=400, lr_scheduler_type=cosine
2025-07-23 20:29:18,786 - INFO - VRAM usage before training (trial 12): 1.53GB / 7.96GB
2025-07-23 20:29:18,829 - INFO - Starting new training for trial 12
2025-07-23 20:29:29,396 - INFO - Initialized CustomEarlyStoppingCallback


{'loss': 2.7988, 'grad_norm': 7.836755275726318, 'learning_rate': 1.9200854061988552e-06, 'epoch': 0.02}
{'loss': 2.6739, 'grad_norm': 6.441662788391113, 'learning_rate': 4.053513635308694e-06, 'epoch': 0.04}
{'loss': 2.5583, 'grad_norm': 6.105821132659912, 'learning_rate': 6.186941864418533e-06, 'epoch': 0.06}
{'loss': 2.5603, 'grad_norm': 5.595389366149902, 'learning_rate': 8.277701528946175e-06, 'epoch': 0.08}
{'eval_loss': 2.374535083770752, 'eval_runtime': 110.3739, 'eval_samples_per_second': 90.601, 'eval_steps_per_second': 45.301, 'epoch': 0.08}
{'loss': 2.5204, 'grad_norm': 9.701536178588867, 'learning_rate': 1.0411129758056014e-05, 'epoch': 0.1}
{'loss': 2.4616, 'grad_norm': 10.57320499420166, 'learning_rate': 1.2544557987165854e-05, 'epoch': 0.12}
{'loss': 2.4532, 'grad_norm': 11.511216163635254, 'learning_rate': 1.4677986216275691e-05, 'epoch': 0.14}
{'loss': 2.433, 'grad_norm': 5.691565036773682, 'learning_rate': 1.681141444538553e-05, 'epoch': 0.16}
{'eval_loss': 2.2705593

2025-07-23 22:27:49,276 - INFO - Early stopping triggered after 3 evaluations with eval_loss=2.1045470237731934


{'eval_loss': 2.1045470237731934, 'eval_runtime': 109.9518, 'eval_samples_per_second': 90.949, 'eval_steps_per_second': 45.474, 'epoch': 1.12}
{'train_runtime': 7099.8801, 'train_samples_per_second': 56.339, 'train_steps_per_second': 1.761, 'train_loss': 2.281704698290144, 'epoch': 1.12}


2025-07-23 22:30:33,961 - INFO - Early stopping triggered after 3 evaluations with eval_loss=2.1059975624084473


{'eval_loss': 2.1059975624084473, 'eval_runtime': 163.3983, 'eval_samples_per_second': 61.2, 'eval_steps_per_second': 30.6, 'epoch': 1.12}


2025-07-23 22:30:34,295 - INFO - VRAM usage after training: 4.43GB / 7.96GB
[I 2025-07-23 22:30:34,597] Trial 12 pruned. 
2025-07-23 22:30:35,804 - INFO - Gradient checkpointing enabled: True
2025-07-23 22:30:35,851 - INFO - Trial 13 parameters: learning_rate=1.4123623126220836e-05, weight_decay=0.036053084954851045, warmup_steps=400, lr_scheduler_type=cosine
2025-07-23 22:30:35,851 - INFO - VRAM usage before training (trial 13): 1.53GB / 7.96GB
2025-07-23 22:30:35,898 - INFO - Starting new training for trial 13
2025-07-23 22:30:46,550 - INFO - Initialized CustomEarlyStoppingCallback


{'loss': 2.8052, 'grad_norm': 7.4107985496521, 'learning_rate': 1.5889076016998441e-06, 'epoch': 0.02}
{'loss': 2.6888, 'grad_norm': 6.402698040008545, 'learning_rate': 3.354360492477448e-06, 'epoch': 0.04}
{'loss': 2.5716, 'grad_norm': 5.511441230773926, 'learning_rate': 5.119813383255053e-06, 'epoch': 0.06}
{'loss': 2.573, 'grad_norm': 5.848320484161377, 'learning_rate': 6.849957216217105e-06, 'epoch': 0.08}
{'eval_loss': 2.3838706016540527, 'eval_runtime': 109.8067, 'eval_samples_per_second': 91.069, 'eval_steps_per_second': 45.535, 'epoch': 0.08}
{'loss': 2.5301, 'grad_norm': 9.32917594909668, 'learning_rate': 8.61541010699471e-06, 'epoch': 0.1}
{'loss': 2.4713, 'grad_norm': 10.759674072265625, 'learning_rate': 1.0380862997772315e-05, 'epoch': 0.12}
{'loss': 2.4609, 'grad_norm': 13.287156105041504, 'learning_rate': 1.2146315888549918e-05, 'epoch': 0.14}
{'loss': 2.4395, 'grad_norm': 5.726579666137695, 'learning_rate': 1.3911768779327524e-05, 'epoch': 0.16}
{'eval_loss': 2.275125741

2025-07-24 01:11:36,273 - INFO - Early stopping triggered after 3 evaluations with eval_loss=2.075291872024536


{'eval_loss': 2.075291872024536, 'eval_runtime': 110.6569, 'eval_samples_per_second': 90.369, 'eval_steps_per_second': 45.185, 'epoch': 1.52}
{'train_runtime': 9649.7327, 'train_samples_per_second': 41.452, 'train_steps_per_second': 1.295, 'train_loss': 2.2328046738473994, 'epoch': 1.52}
{'eval_loss': 2.07149338722229, 'eval_runtime': 133.5055, 'eval_samples_per_second': 74.903, 'eval_steps_per_second': 37.452, 'epoch': 1.52}


2025-07-24 01:13:51,392 - INFO - VRAM usage after training: 4.43GB / 7.96GB
[I 2025-07-24 01:13:51,694] Trial 13 pruned. 
2025-07-24 01:13:52,922 - INFO - Gradient checkpointing enabled: True
2025-07-24 01:13:52,964 - INFO - Trial 14 parameters: learning_rate=2.06855389280456e-05, weight_decay=0.050876264401976144, warmup_steps=500, lr_scheduler_type=linear
2025-07-24 01:13:52,965 - INFO - VRAM usage before training (trial 14): 1.53GB / 7.96GB
2025-07-24 01:13:53,013 - INFO - Starting new training for trial 14
2025-07-24 01:14:03,552 - INFO - Initialized CustomEarlyStoppingCallback


{'loss': 2.7999, 'grad_norm': 7.756847858428955, 'learning_rate': 1.8616985035241038e-06, 'epoch': 0.02}
{'loss': 2.6764, 'grad_norm': 6.367199420928955, 'learning_rate': 3.930252396328664e-06, 'epoch': 0.04}
{'loss': 2.5601, 'grad_norm': 5.619592666625977, 'learning_rate': 5.998806289133223e-06, 'epoch': 0.06}
{'loss': 2.567, 'grad_norm': 5.711325645446777, 'learning_rate': 8.025989104081693e-06, 'epoch': 0.08}
{'eval_loss': 2.372591018676758, 'eval_runtime': 110.1708, 'eval_samples_per_second': 90.768, 'eval_steps_per_second': 45.384, 'epoch': 0.08}
{'loss': 2.5197, 'grad_norm': 11.38612174987793, 'learning_rate': 1.0094542996886252e-05, 'epoch': 0.1}
{'loss': 2.4635, 'grad_norm': 9.956722259521484, 'learning_rate': 1.2163096889690812e-05, 'epoch': 0.12}
{'loss': 2.457, 'grad_norm': 13.943159103393555, 'learning_rate': 1.423165078249537e-05, 'epoch': 0.14}
{'loss': 2.433, 'grad_norm': 5.7905473709106445, 'learning_rate': 1.6300204675299932e-05, 'epoch': 0.16}
{'eval_loss': 2.27673053

2025-07-24 03:12:12,208 - INFO - Early stopping triggered after 3 evaluations with eval_loss=2.100008249282837


{'eval_loss': 2.100008249282837, 'eval_runtime': 118.5215, 'eval_samples_per_second': 84.373, 'eval_steps_per_second': 42.186, 'epoch': 1.12}
{'train_runtime': 7088.6564, 'train_samples_per_second': 56.428, 'train_steps_per_second': 1.763, 'train_loss': 2.2784713472638813, 'epoch': 1.12}


2025-07-24 03:15:31,921 - INFO - Early stopping triggered after 3 evaluations with eval_loss=2.101543426513672


{'eval_loss': 2.101543426513672, 'eval_runtime': 198.3846, 'eval_samples_per_second': 50.407, 'eval_steps_per_second': 25.204, 'epoch': 1.12}


2025-07-24 03:15:32,287 - INFO - VRAM usage after training: 4.43GB / 7.96GB
[I 2025-07-24 03:15:32,311] Trial 14 pruned. 
2025-07-24 03:15:32,345 - INFO - Saved trial results to optuna_trials.csv
2025-07-24 03:15:32,350 - INFO - Best hyperparameters: {'learning_rate': 1.2853916978930139e-05, 'weight_decay': 0.011430983876313222, 'warmup_steps': 450, 'lr_scheduler_type': 'cosine'}
2025-07-24 03:15:32,355 - INFO - Best objective value (eval_loss): 2.0840227603912354
2025-07-24 03:15:32,360 - INFO - Deleted non-best trial directory: trial_0
2025-07-24 03:15:32,360 - INFO - Deleted non-best trial directory: trial_10
2025-07-24 03:15:32,361 - INFO - Deleted non-best trial directory: trial_11
2025-07-24 03:15:32,362 - INFO - Deleted non-best trial directory: trial_12
2025-07-24 03:15:32,363 - INFO - Deleted non-best trial directory: trial_13
2025-07-24 03:15:32,363 - INFO - Deleted non-best trial directory: trial_14
2025-07-24 03:15:32,364 - INFO - Deleted non-best trial directory: trial_2
2

In [23]:
# Train final model with best hyperparameters
logger.info("Training final model with best hyperparameters")
best_params = study.best_params
model = BartForConditionalGeneration.from_pretrained(MODEL_PATH, trust_remote_code=True, use_safetensors=True)
model.to(device)
model.gradient_checkpointing_enable()
model.generation_config.no_repeat_ngram_size = 3
model.generation_config.min_length = 5

# Log VRAM usage before final training
if torch.cuda.is_available():
    vram_used = torch.cuda.memory_allocated(device) / 1024**3
    vram_total = torch.cuda.get_device_properties(device).total_memory / 1024**3
    logger.info(f"VRAM usage before final training: {vram_used:.2f}GB / {vram_total:.2f}GB")

# Define training arguments with best parameters
training_args = Seq2SeqTrainingArguments(
    output_dir=os.path.join(OUTPUT_DIR, "final_model"),
    num_train_epochs=5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=16,
    dataloader_num_workers=0,
    dataloader_pin_memory=torch.cuda.is_available(),
    lr_scheduler_type=best_params["lr_scheduler_type"],
    learning_rate=best_params["learning_rate"],
    warmup_steps=best_params["warmup_steps"],
    weight_decay=best_params["weight_decay"],
    fp16=torch.cuda.is_available(),
    logging_strategy="steps",
    logging_steps=50,
    eval_strategy="epoch",  # Changed from evaluation_strategy
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    predict_with_generate=True,
    generation_max_length=64,
    generation_num_beams=3,
    group_by_length=True,
    skip_memory_metrics=True,
    disable_tqdm=True,
)

# Initialize data collator
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model, padding=True)

# Initialize trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=processed_train_dataset,
    eval_dataset=processed_val_dataset,
    compute_metrics=compute_metrics,
    data_collator=data_collator,
    callbacks=[CustomEarlyStoppingCallback(early_stopping_patience=3)]
)

# Train the model
try:
    logger.info("Starting final training")
    with torch.amp.autocast('cuda'):  # Updated to use correct autocast API
        trainer.train()
    logger.info("Final training completed successfully")
except Exception as e:
    logger.error(f"Final training failed: {e}")
    raise


# Log VRAM usage after training
if torch.cuda.is_available():
    vram_used = torch.cuda.memory_allocated(device) / 1024**3
    logger.info(f"VRAM usage after final training: {vram_used:.2f}GB / {vram_total:.2f}GB")

# Clean up memory
torch.cuda.empty_cache()
gc.collect()


2025-07-24 03:18:04,084 - INFO - Training final model with best hyperparameters
2025-07-24 03:18:05,551 - INFO - VRAM usage before final training: 5.95GB / 7.96GB
2025-07-24 03:18:05,734 - INFO - Starting final training
2025-07-24 03:18:16,375 - INFO - Initialized CustomEarlyStoppingCallback


{'loss': 2.8115, 'grad_norm': 7.2612762451171875, 'learning_rate': 1.285391697893014e-06, 'epoch': 0.02}
{'loss': 2.7058, 'grad_norm': 6.481954574584961, 'learning_rate': 2.713604695551918e-06, 'epoch': 0.04}
{'loss': 2.5877, 'grad_norm': 5.4879150390625, 'learning_rate': 4.141817693210823e-06, 'epoch': 0.06}
{'loss': 2.5881, 'grad_norm': 6.2297821044921875, 'learning_rate': 5.541466430916549e-06, 'epoch': 0.08}
{'loss': 2.5418, 'grad_norm': 8.759868621826172, 'learning_rate': 6.969679428575454e-06, 'epoch': 0.1}
{'loss': 2.4848, 'grad_norm': 9.387632369995117, 'learning_rate': 8.397892426234358e-06, 'epoch': 0.12}
{'loss': 2.4736, 'grad_norm': 13.297496795654297, 'learning_rate': 9.826105423893262e-06, 'epoch': 0.14}
{'loss': 2.4506, 'grad_norm': 5.881244659423828, 'learning_rate': 1.1254318421552166e-05, 'epoch': 0.16}
{'loss': 2.4455, 'grad_norm': 6.311375141143799, 'learning_rate': 1.268253141921107e-05, 'epoch': 0.18}
{'loss': 2.4191, 'grad_norm': 12.817657470703125, 'learning_rat

2025-07-24 05:12:17,566 - INFO - Using default tokenizer.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'eval_loss': 2.1029350757598877, 'eval_bleu-1': 0.2700138367266258, 'eval_bleu-2': 0.148682834700573, 'eval_bleu-3': 0.10192163977284457, 'eval_bleu-4': 0.07242951093638948, 'eval_rouge-l': 0.2205406619320127, 'eval_meteor': 0.23111950149719418, 'eval_bertscore': 0.8767882585525513, 'eval_runtime': 2078.7477, 'eval_samples_per_second': 4.811, 'eval_steps_per_second': 2.405, 'epoch': 1.0}
{'loss': 2.1202, 'grad_norm': 5.572562217712402, 'learning_rate': 1.1920584233401736e-05, 'epoch': 1.02}
{'loss': 2.1231, 'grad_norm': 8.109247207641602, 'learning_rate': 1.187663767737425e-05, 'epoch': 1.04}
{'loss': 2.0864, 'grad_norm': 5.420586585998535, 'learning_rate': 1.1831765079852086e-05, 'epoch': 1.06}
{'loss': 2.1186, 'grad_norm': 7.683208465576172, 'learning_rate': 1.1785974065850561e-05, 'epoch': 1.08}
{'loss': 2.1085, 'grad_norm': 9.568700790405273, 'learning_rate': 1.173927241644771e-05, 'epoch': 1.1}
{'loss': 2.1213, 'grad_norm': 4.8004841804504395, 'learning_rate': 1.1691668067462093e

2025-07-24 07:10:10,170 - INFO - Using default tokenizer.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'eval_loss': 2.057978630065918, 'eval_bleu-1': 0.2645873183010486, 'eval_bleu-2': 0.14799917622265413, 'eval_bleu-3': 0.10225881430572212, 'eval_bleu-4': 0.07299661793154594, 'eval_rouge-l': 0.22071261938481257, 'eval_meteor': 0.2357890755636357, 'eval_bertscore': 0.8763830065727234, 'eval_runtime': 2268.5744, 'eval_samples_per_second': 4.408, 'eval_steps_per_second': 2.204, 'epoch': 2.0}
{'loss': 1.9341, 'grad_norm': 4.783074378967285, 'learning_rate': 8.771053030405965e-06, 'epoch': 2.02}
{'loss': 1.9291, 'grad_norm': 5.118368148803711, 'learning_rate': 8.692847717630605e-06, 'epoch': 2.04}
{'loss': 1.9138, 'grad_norm': 5.140848159790039, 'learning_rate': 8.61425737164957e-06, 'epoch': 2.06}
{'loss': 1.9574, 'grad_norm': 4.8419084548950195, 'learning_rate': 8.535295346996024e-06, 'epoch': 2.08}
{'loss': 1.9885, 'grad_norm': 5.008954048156738, 'learning_rate': 8.455975061360957e-06, 'epoch': 2.1}
{'loss': 1.9541, 'grad_norm': 4.736142158508301, 'learning_rate': 8.37630999331317e-06, 

2025-07-24 09:05:56,368 - INFO - Using default tokenizer.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'eval_loss': 2.0274946689605713, 'eval_bleu-1': 0.27264447494418337, 'eval_bleu-2': 0.1524797718063744, 'eval_bleu-3': 0.10511095432569854, 'eval_bleu-4': 0.07490457733100264, 'eval_rouge-l': 0.23465419073840293, 'eval_meteor': 0.24675927683619292, 'eval_bertscore': 0.879502534866333, 'eval_runtime': 2173.4878, 'eval_samples_per_second': 4.601, 'eval_steps_per_second': 2.3, 'epoch': 3.0}
{'loss': 1.8691, 'grad_norm': 8.324563026428223, 'learning_rate': 4.663682593389126e-06, 'epoch': 3.02}
{'loss': 1.8803, 'grad_norm': 4.338119983673096, 'learning_rate': 4.583269859507152e-06, 'epoch': 3.04}
{'loss': 1.8481, 'grad_norm': 4.917848587036133, 'learning_rate': 4.503170416025539e-06, 'epoch': 3.06}
{'loss': 1.865, 'grad_norm': 4.235669136047363, 'learning_rate': 4.423397873912161e-06, 'epoch': 3.08}
{'loss': 1.8531, 'grad_norm': 5.017637729644775, 'learning_rate': 4.343965788585893e-06, 'epoch': 3.1}
{'loss': 1.8491, 'grad_norm': 5.7430548667907715, 'learning_rate': 4.264887657613198e-06, 

2025-07-24 11:00:17,091 - INFO - Using default tokenizer.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'eval_loss': 2.0270698070526123, 'eval_bleu-1': 0.2761500317205484, 'eval_bleu-2': 0.15425701872424658, 'eval_bleu-3': 0.10619294363686237, 'eval_bleu-4': 0.07559452968756013, 'eval_rouge-l': 0.2341221712719213, 'eval_meteor': 0.24466703876821397, 'eval_bertscore': 0.8797650337219238, 'eval_runtime': 2073.9605, 'eval_samples_per_second': 4.822, 'eval_steps_per_second': 2.411, 'epoch': 4.0}
{'loss': 1.8125, 'grad_norm': 6.050045967102051, 'learning_rate': 1.2771246392009642e-06, 'epoch': 4.02}
{'loss': 1.8176, 'grad_norm': 5.4402899742126465, 'learning_rate': 1.2274398493454873e-06, 'epoch': 4.04}
{'loss': 1.8312, 'grad_norm': 4.517246723175049, 'learning_rate': 1.178638592235167e-06, 'epoch': 4.06}
{'loss': 1.7958, 'grad_norm': 6.796860218048096, 'learning_rate': 1.1307291604662274e-06, 'epoch': 4.08}
{'loss': 1.8404, 'grad_norm': 4.849963188171387, 'learning_rate': 1.0837196950907004e-06, 'epoch': 4.1}
{'loss': 1.8371, 'grad_norm': 4.885039329528809, 'learning_rate': 1.03761818423303

2025-07-24 12:55:46,301 - INFO - Using default tokenizer.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'eval_loss': 2.0252857208251953, 'eval_bleu-1': 0.27332187196680857, 'eval_bleu-2': 0.15301182716600392, 'eval_bleu-3': 0.10548378318198912, 'eval_bleu-4': 0.07516052867158196, 'eval_rouge-l': 0.23357904550707687, 'eval_meteor': 0.2461827601471226, 'eval_bertscore': 0.8796655535697937, 'eval_runtime': 2139.2749, 'eval_samples_per_second': 4.674, 'eval_steps_per_second': 2.337, 'epoch': 5.0}


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].
2025-07-24 12:56:23,212 - INFO - Final training completed successfully
2025-07-24 12:56:23,222 - INFO - VRAM usage after final training: 4.43GB / 7.96GB


{'train_runtime': 34686.8464, 'train_samples_per_second': 11.532, 'train_steps_per_second': 0.36, 'train_loss': 2.013610836791992, 'epoch': 5.0}


1615

In [46]:
# Save final model and tokenizer
reload_model_dir = os.path.join(final_model_dir, "reload_model")
os.makedirs(reload_model_dir, exist_ok=True)
try:
    model.save_pretrained(reload_model_dir, safe_serialization=True)
    tokenizer.save_pretrained(reload_model_dir)
    necessary_files = [
        "model.safetensors",
        "config.json",
        "tokenizer_config.json",
        "vocab.json",
        "merges.txt",
        "special_tokens_map.json"
    ]
    for file in os.listdir(reload_model_dir):
        if file not in necessary_files:
            os.remove(os.path.join(reload_model_dir, file))
    logger.info(f"Saved model and tokenizer to {reload_model_dir}")
except Exception as e:
    logger.error(f"Failed to save model/tokenizer to {reload_model_dir}: {e}")
    raise

2025-07-24 16:09:04,593 - INFO - Saved model and tokenizer to D:\A_CSE499\outputLarge_B_phase2\final_model\reload_model


In [49]:
import os
import time
from tqdm import tqdm
import pandas as pd
import numpy as np
import torch

# Set output directory
OUTPUT_DIR = r"D:\A_CSE499\outputLarge_B_phase2"  # Use raw string to handle backslashes

# Evaluate on test set and save results
model.eval()
torch.cuda.empty_cache()

logger.info("Starting prediction on test set...")
start = time.time()
test_results = trainer.predict(processed_test_dataset)
end = time.time()
logger.info(f"Prediction completed in {(end - start)/60:.2f} minutes.")

# Ensure output directory exists
os.makedirs(OUTPUT_DIR, exist_ok=True)  # Create directory if it doesn't exist

# Save test metrics
test_metrics = test_results.metrics
test_results_df = pd.DataFrame([test_metrics])
test_results_path = os.path.join(OUTPUT_DIR, "test_results.csv")
try:
    test_results_df.to_csv(test_results_path, index=False)
    logger.info(f"Saved test results to {test_results_path}")
except Exception as e:
    logger.error(f"Failed to save test results: {e}")
    raise

# Handle predictions
test_preds = test_results.predictions[0] if isinstance(test_results.predictions, tuple) else test_results.predictions
test_preds = np.clip(test_preds, 0, tokenizer.vocab_size - 1)

# Decode with progress bars
logger.info("Decoding predictions...")
decoded_preds = tokenizer.batch_decode(
    list(tqdm(test_preds, desc="Decoding Predictions")), skip_special_tokens=True
)

label_ids = np.clip(test_results.label_ids, 0, tokenizer.vocab_size - 1)
logger.info("Decoding references...")
decoded_refs = tokenizer.batch_decode(
    list(tqdm(label_ids, desc="Decoding References")), skip_special_tokens=True
)

# Reconstruct context from input_ids
logger.info("Decoding original context inputs...")
decoded_contexts = [
    tokenizer.decode(sample["input_ids"], skip_special_tokens=True)
    for sample in tqdm(processed_test_dataset, desc="Decoding Contexts")
]

# Save predictions and references
pred_ref_df = pd.DataFrame({
    "context": decoded_contexts,
    "predicted_question": decoded_preds,
    "reference_question": decoded_refs
})
pred_ref_path = os.path.join(OUTPUT_DIR, "test_predictions.csv")
try:
    pred_ref_df.to_csv(pred_ref_path, index=False)
    logger.info(f"Saved test predictions to {pred_ref_path}")
except Exception as e:
    logger.error(f"Failed to save test predictions: {e}")
    raise

2025-07-24 19:36:56,669 - INFO - Starting prediction on test set...
2025-07-24 20:19:05,396 - INFO - Using default tokenizer.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
2025-07-24 20:20:19,691 - INFO - Prediction completed in 43.38 minutes.
2025-07-24 20:20:19,696 - INFO - Saved test results to D:\A_CSE499\outputLarge_B_phase2\test_results.csv
2025-07-24 20:20:19,696 - INFO - Decoding predictions...
Decoding Predictions: 100%|██████████| 10000/10000 [00:00<00:00, 999643.45it/s]
2025-07-24 20:20:19,865 - INFO - Decoding references...
Decoding References: 100%|██████████| 10000/10000 [00:00<00:00, 1466796.29it/s]
2025-07-24 20:20:20,032 - INFO - Decoding original context inputs...
Decoding Contexts: 100%|██████████| 10000/10000 [00:01<00:00, 5434.00it/s]
2025

In [51]:
try:
    num_examples = min(5, len(decoded_preds), len(decoded_refs), len(decoded_contexts))
    logger.info("\nExample Predictions and References:")
    for i in range(num_examples):
        logger.info(f"\nExample {i+1}:")
        logger.info(f"Context: {decoded_contexts[i]}")
        logger.info(f"Prediction: {decoded_preds[i]}")
        logger.info(f"Reference: {decoded_refs[i]}")
except Exception as e:
    logger.error(f"Failed to print example predictions: {e}")
    raise

2025-07-24 20:33:57,316 - INFO - 
Example Predictions and References:
2025-07-24 20:33:57,316 - INFO - 
Example 1:
2025-07-24 20:33:57,317 - INFO - Context: we used it on our ford f250, it runs off the truck battery — lifts 1000lbs easily probably more, works great, goes flat with ground and brings whatever level w truck bed.
2025-07-24 20:33:57,318 - INFO - Prediction: What may happen after seeing JM?
2025-07-24 20:33:57,318 - INFO - Reference: How would you feel if you two decided to be more than friends?
2025-07-24 20:33:57,319 - INFO - 
Example 2:
2025-07-24 20:33:57,319 - INFO - Context: Michelle Trachtenberg: Michelle Christine Trachtenberg (born October 11, 1985) is an American actress. She is known for portraying Nona F. Mecklenberg in"The Adventures of Pete & Pete"(1994 — 96), Dawn Summers in"Buffy the Vampire Slayer"(2000 —
2025-07-24 20:33:57,320 - INFO - Prediction: Are Erin Wiedner and Jessie Bear both directors?
2025-07-24 20:33:57,320 - INFO - Reference: Which director i