# In [None]:  (notebook cell marker kept from the original export)
import os
import json
import random
import numpy as np
import pandas as pd
import torch
import time
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from torch.optim.lr_scheduler import OneCycleLR
import transformers
from transformers import (
    T5ForConditionalGeneration,
    T5Tokenizer,
    Trainer,
    TrainingArguments,
    DataCollatorForSeq2Seq,
    EarlyStoppingCallback,
    TrainerCallback,
    TrainerState,
    TrainerControl,
    get_cosine_schedule_with_warmup
)
from transformers.trainer_utils import set_seed
import evaluate
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer
import bert_score
from collections import Counter
import nltk
import warnings
from typing import List, Tuple
from tqdm import tqdm
from datetime import datetime
import logging
from tabulate import tabulate
from transformers.utils.logging import set_verbosity_error
import gc
import optuna  # --- NEW ---

warnings.filterwarnings("ignore")


# Setup logging for VS Code
def setup_logging():
    """Configure logging for console and file output.

    Calls ``logging.basicConfig`` at INFO level with a stream handler and a
    UTF-8 file handler writing to ``training.log``.

    Returns:
        The logger named after this module (``__name__``).
    """
    console_handler = logging.StreamHandler()
    file_handler = logging.FileHandler('training.log', encoding='utf-8')

    logging.basicConfig(
        format='%(asctime)s | %(levelname)s | %(message)s',
        datefmt='%H:%M:%S',
        level=logging.INFO,
        handlers=[console_handler, file_handler],
    )

    module_logger = logging.getLogger(__name__)
    return module_logger

# Module-wide logger; setup_logging() also configures the console/file handlers.
logger = setup_logging()
# set_verbosity_error()  # currently disabled

# Ensure NLTK data is available: look up the 'punkt' tokenizer resource and
# download it only when the lookup raises LookupError.
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')

class QuestionGenerationDataset(Dataset):
    """Dataset that tokenizes (context, question) pairs on access.

    Each item is a dict with ``input_ids`` and ``attention_mask`` for the
    prompted context and ``labels`` holding the tokenized question. No padding
    is applied here; batching/padding is left to the collator.

    Args:
        contexts: Sequence of context strings.
        questions: Sequence of target questions, aligned with ``contexts``.
        tokenizer: Callable tokenizer returning ``input_ids`` and
            ``attention_mask`` tensors when ``return_tensors="pt"``.
        max_length: Truncation length for the prompted context.
        max_target_length: Truncation length for the question (defaults to
            128, the value that was previously hard-coded).

    Raises:
        ValueError: If ``contexts`` and ``questions`` differ in length.
    """

    def __init__(self, contexts, questions, tokenizer, max_length=512, max_target_length=128):
        # Fail early: a mismatch would otherwise surface as an IndexError (or
        # silently ignored rows) only once a specific index is requested.
        if len(contexts) != len(questions):
            raise ValueError(
                f"contexts ({len(contexts)}) and questions ({len(questions)}) must have the same length"
            )
        self.contexts = contexts
        self.questions = questions
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.max_target_length = max_target_length

    def __len__(self):
        return len(self.contexts)

    def __getitem__(self, idx):
        context = self.contexts[idx]
        question = self.questions[idx]

        # The instruction prefix is part of the model input and counts toward
        # max_length.
        input_text = f"Generate a question from context: {context}"

        input_encoding = self.tokenizer(
            input_text,
            max_length=self.max_length,
            padding=False,
            truncation=True,
            return_tensors="pt"
        )

        target_encoding = self.tokenizer(
            question,
            max_length=self.max_target_length,
            padding=False,
            truncation=True,
            return_tensors="pt"
        )

        # flatten() drops the leading batch dimension added by return_tensors="pt".
        return {
            "input_ids": input_encoding.input_ids.flatten(),
            "attention_mask": input_encoding.attention_mask.flatten(),
            "labels": target_encoding.input_ids.flatten()
        }

class DataProcessor:
    """Load the merged Phase 2 CSV splits and filter unusable rows.

    Args:
        data_dir: Directory containing ``merged_train.csv`` and
            ``merged_val.csv``.
        tokenizer: Tokenizer exposing ``encode``; used to measure context
            length in tokens.
    """

    def __init__(self, data_dir, tokenizer):
        self.data_dir = data_dir
        self.tokenizer = tokenizer

    def load_merged_csv(self, file_path: str, split_name: str) -> Tuple[List[str], List[str]]:
        """Read one CSV split and return its usable (context, question) pairs.

        Rows are dropped when either field is missing/blank or when the
        context alone encodes to more than 512 tokens.

        Args:
            file_path: Path to the CSV file.
            split_name: Human-readable split name used in log messages.

        Returns:
            ``(contexts, questions)`` as two aligned lists. Both are empty
            when the file does not exist (an error is logged, nothing raised).

        Raises:
            ValueError: If the CSV lacks a ``context`` or ``question`` column.
            Exception: Any other read/parse error is logged and re-raised.
        """
        contexts, questions = [], []

        if not os.path.exists(file_path):
            logger.error(f"File not found: {file_path}")
            return contexts, questions

        # Counters for statistics
        total_processed = 0
        dropped_length = 0
        dropped_empty = 0

        logger.info(f"Loading {split_name} data from: {file_path}")
        try:
            df = pd.read_csv(file_path, encoding='utf-8')

            if 'context' not in df.columns or 'question' not in df.columns:
                raise ValueError(f"CSV must contain 'context' and 'question' columns. Found: {df.columns}")

            # Progress bar: tokenizing every context can take a while.
            for _, row in tqdm(df.iterrows(), total=len(df), desc=f"Checking {split_name}"):
                total_processed += 1

                # 1. Missing values (NaN) count as empty rows.
                if pd.isna(row['context']) or pd.isna(row['question']):
                    dropped_empty += 1
                    continue

                context = str(row['context']).strip()
                question = str(row['question']).strip()
                if not context or not question:
                    dropped_empty += 1
                    continue

                # 2. Token-length filter (special tokens included, no truncation).
                # NOTE: only the raw context is measured; QuestionGenerationDataset
                # in this file prepends an instruction prefix, so a context close
                # to the limit can still be truncated at training time.
                token_ids = self.tokenizer.encode(context, add_special_tokens=True)
                if len(token_ids) > 512:
                    dropped_length += 1
                    continue

                contexts.append(context)
                questions.append(question)

            # A CSV with a header but no rows would otherwise divide by zero.
            kept_ratio = len(contexts) / total_processed if total_processed else 0.0
            long_ratio = dropped_length / total_processed if total_processed else 0.0

            # --- LOGGING THE STATISTICS ---
            logger.info(f"--- DATA STATS FOR {split_name.upper()} ---")
            logger.info(f"  Total Rows:      {total_processed}")
            logger.info(f"  Kept:            {len(contexts)} ({kept_ratio:.1%})")
            logger.info(f"  Dropped (Empty): {dropped_empty}")
            logger.info(f"  Dropped (>512):  {dropped_length} ({long_ratio:.1%})")
            logger.info(f"---------------------------------------")

        except Exception as e:
            logger.error(f"Error reading {file_path}: {e}")
            # Bare raise keeps the original traceback intact.
            raise

        return contexts, questions

    def load_phase2_datasets(self) -> Tuple[List[str], List[str], List[str], List[str]]:
        """Load and shuffle ``merged_train.csv`` and ``merged_val.csv``.

        Returns:
            ``(train_contexts, train_questions, val_contexts, val_questions)``
            as lists; a split whose file is missing yields empty lists.
        """
        train_path = os.path.join(self.data_dir, "merged_train.csv")
        val_path = os.path.join(self.data_dir, "merged_val.csv")

        train_contexts, train_questions = self.load_merged_csv(train_path, "Phase 2 Train")
        val_contexts, val_questions = self.load_merged_csv(val_path, "Phase 2 Validation")

        # Shuffle pairs together so contexts stay aligned with their questions.
        train_combined = list(zip(train_contexts, train_questions))
        val_combined = list(zip(val_contexts, val_questions))

        # NOTE: this reseeds the process-wide `random` generator, which makes
        # the shuffle reproducible but also affects later users of `random`.
        random.seed(42)
        random.shuffle(train_combined)
        random.shuffle(val_combined)

        # zip(*[]) cannot be unpacked into two names, hence the guards.
        if train_combined:
            train_contexts, train_questions = zip(*train_combined)
        if val_combined:
            val_contexts, val_questions = zip(*val_combined)

        return (list(train_contexts), list(train_questions),
                list(val_contexts), list(val_questions))

class AdvancedEvaluationMetrics:
    """Quality and diversity metrics for generated questions.

    All ``compute_*`` methods take plain strings; tokenisation for BLEU,
    self-BLEU and distinct-n is whitespace splitting. Methods that average
    per-example scores return 0.0 for empty input instead of NaN.

    Args:
        tokenizer: Stored on the instance; not used by the metric methods
            in this class.
    """

    def __init__(self, tokenizer):
        self.tokenizer = tokenizer
        self.rouge_scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
        self.smoothing = SmoothingFunction().method1

    def compute_bleu(self, references, predictions):
        """Return mean sentence-level BLEU-1..4 as ``{"bleu_1": ..., ..., "bleu_4": ...}``."""
        bleu_scores = {f"bleu_{i}": [] for i in range(1, 5)}
        for ref, pred in zip(references, predictions):
            ref_tokens, pred_tokens = ref.split(), pred.split()
            for i in range(1, 5):
                # Uniform weights over the first i n-gram orders.
                weights = [1/i] * i
                score = sentence_bleu([ref_tokens], pred_tokens, weights=weights, smoothing_function=self.smoothing)
                bleu_scores[f"bleu_{i}"].append(score)
        # np.mean([]) is NaN (with a warning); report 0.0 for empty input.
        return {k: float(np.mean(v)) if v else 0.0 for k, v in bleu_scores.items()}

    def compute_rouge_l(self, references, predictions):
        """Return the mean ROUGE-L F-measure as ``{"rouge_l": ...}``."""
        scores = [self.rouge_scorer.score(ref, pred)['rougeL'].fmeasure for ref, pred in zip(references, predictions)]
        return {"rouge_l": float(np.mean(scores)) if scores else 0.0}

    def compute_meteor(self, references, predictions):
        """Return METEOR as ``{"meteor": ...}``; 0.0 (with a warning) on failure."""
        try:
            meteor = evaluate.load("meteor")
            return {"meteor": meteor.compute(predictions=predictions, references=references)["meteor"]}
        except Exception as exc:
            # Best-effort metric: keep evaluation going, but make the failure visible.
            logger.warning(f"METEOR computation failed, reporting 0.0: {exc}")
            return {"meteor": 0.0}

    def compute_bert_score(self, references, predictions):
        """Return mean BERTScore F1 as ``{"bert_score": ...}``; 0.0 (with a warning) on failure."""
        try:
            P, R, F1 = bert_score.score(predictions, references, lang="en", verbose=False)
            return {"bert_score": F1.mean().item()}
        except Exception as exc:
            # Best-effort metric: keep evaluation going, but make the failure visible.
            logger.warning(f"BERTScore computation failed, reporting 0.0: {exc}")
            return {"bert_score": 0.0}

    # --- MODIFICATION 1: SELF-BLEU FIXED ---
    def compute_self_bleu(self, predictions):
        """Return mean BLEU of each prediction against all other predictions.

        Every other prediction is used as a reference, so the cost grows
        quadratically with the number of predictions.
        """
        if len(predictions) < 2:
            return {"self_bleu": 0.0}

        # Tokenise once instead of re-splitting every other sentence per iteration.
        tokenized = [pred.split() for pred in predictions]
        scores = []
        for i, pred_tokens in enumerate(tokenized):
            other_tokens = tokenized[:i] + tokenized[i+1:]
            scores.append(sentence_bleu(other_tokens, pred_tokens, smoothing_function=self.smoothing))
        return {"self_bleu": float(np.mean(scores)) if scores else 0.0}

    def compute_distinct_n(self, predictions, n):
        """Return the ratio of unique n-grams to total n-grams over all predictions."""
        all_ngrams = []
        for pred in predictions:
            tokens = pred.split()
            all_ngrams.extend(tuple(tokens[i:i+n]) for i in range(len(tokens) - n + 1))
        if not all_ngrams:
            return 0.0
        return len(set(all_ngrams)) / len(all_ngrams)

    def compute_all_metrics(self, references, predictions):
        """Compute every metric and merge the results into one dict.

        Returns a dict of zeros when either input is empty.
        """
        if not references or not predictions:
            return {m: 0.0 for m in ["bleu_1", "bleu_2", "bleu_3", "bleu_4", "rouge_l", "meteor", "bert_score", "self_bleu", "distinct_1", "distinct_2"]}

        metrics = {}
        logger.info("Computing quality and diversity metrics...")
        metrics.update(self.compute_bleu(references, predictions))
        metrics.update(self.compute_rouge_l(references, predictions))
        metrics.update(self.compute_meteor(references, predictions))
        metrics.update(self.compute_bert_score(references, predictions))
        metrics.update(self.compute_self_bleu(predictions))
        metrics["distinct_1"] = self.compute_distinct_n(predictions, 1)
        metrics["distinct_2"] = self.compute_distinct_n(predictions, 2)
        return metrics

class DiverseDecoder:
    """Thin wrapper that calls ``model.generate`` with fixed decoding settings.

    The settings use 8 beams split into 4 beam groups with a diversity
    penalty, block repeated bigrams, and cap the output at 128 tokens.
    """

    def __init__(self, model, tokenizer):
        self.tokenizer = tokenizer
        self.model = model

    def diverse_generate(self, input_ids, attention_mask, num_return_sequences=1):
        """Generate sequences for the given encoder inputs.

        Args:
            input_ids: Encoder input ids, forwarded to ``generate``.
            attention_mask: Matching attention mask, forwarded to ``generate``.
            num_return_sequences: Number of sequences requested per input.

        Returns:
            Whatever ``self.model.generate`` returns.
        """
        generation_kwargs = {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "max_length": 128,
            "num_beams": 8,
            "num_return_sequences": num_return_sequences,
            "num_beam_groups": 4,
            "diversity_penalty": 1.5,
            "no_repeat_ngram_size": 2,
            "early_stopping": True,
            # Pad/EOS ids come from the tokenizer.
            "pad_token_id": self.tokenizer.pad_token_id,
            "eos_token_id": self.tokenizer.eos_token_id,
        }
        return self.model.generate(**generation_kwargs)

class MetricsLogger:
    """Collect evaluation records and print them as tables."""

    def __init__(self):
        # One dict per evaluation: epoch, step, timestamp plus the metrics.
        self.evaluation_history = []

    def log_epoch_progress(self, epoch, loss, learning_rate):
        """Log one training-progress line (epoch, loss, learning rate)."""
        logger.info(f"Epoch {epoch:>.2f} | Loss: {loss:.4f} | LR: {learning_rate:.2e}")

    def log_evaluation(self, metrics, epoch=None, step=None):
        """Append an evaluation record to the history and print its table."""
        eval_record = {'epoch': epoch, 'step': step, 'timestamp': datetime.now(), **metrics}
        self.evaluation_history.append(eval_record)
        self.display_metrics_table(metrics, epoch, step)

    def display_metrics_table(self, metrics, epoch=None, step=None):
        """Print metrics grouped into QUALITY, DIVERSITY and OTHER tables.

        Args:
            metrics: Mapping of metric name to a numeric value.
            epoch: Epoch shown in the header.
            step: Accepted for interface symmetry; not printed.
        """
        print("\n" + "="*80)
        print(f"EVALUATION RESULTS - Epoch {epoch}")
        print("="*80)

        # Classify diversity first: 'self_bleu' contains the substring 'bleu'
        # and would otherwise also be listed as a quality metric.
        diversity = {k: v for k, v in metrics.items() if any(tag in k for tag in ('self_bleu', 'distinct'))}
        quality = {
            k: v for k, v in metrics.items()
            if k not in diversity and any(tag in k for tag in ('bleu', 'rouge', 'meteor', 'bert_score'))
        }
        other = {k: v for k, v in metrics.items() if k not in quality and k not in diversity}

        for title, metric_dict in [("QUALITY", quality), ("DIVERSITY", diversity), ("OTHER", other)]:
            if metric_dict:
                print(f"\n{title} METRICS:")
                table_data = [[k.replace('eval_', '').replace('_', '-').upper(), f"{v:.4f}"] for k, v in metric_dict.items()]
                print(tabulate(table_data, headers=['Metric', 'Score'], tablefmt='grid'))
        print("="*80 + "\n")

    def display_training_summary(self):
        """Print the best BLEU-4, ROUGE-L and METEOR seen in the history.

        Does nothing when no evaluation has been recorded; prints only the
        header when no record contains ``eval_bleu_4``.
        """
        if not self.evaluation_history:
            return
        print("\n" + "="*80)
        print("TRAINING SUMMARY")
        print("="*80)

        eval_logs = [log for log in self.evaluation_history if 'eval_bleu_4' in log]
        if not eval_logs:
            return

        best_bleu4 = max(eval_logs, key=lambda x: x.get('eval_bleu_4', 0))
        best_rouge = max(eval_logs, key=lambda x: x.get('eval_rouge_l', 0))
        best_meteor = max(eval_logs, key=lambda x: x.get('eval_meteor', 0))

        summary_data = [
            ['Best BLEU-4', f"{best_bleu4.get('eval_bleu_4', 0):.4f}", f"Epoch {best_bleu4.get('epoch', 'N/A')}"],
            ['Best ROUGE-L', f"{best_rouge.get('eval_rouge_l', 0):.4f}", f"Epoch {best_rouge.get('epoch', 'N/A')}"],
            ['Best METEOR', f"{best_meteor.get('eval_meteor', 0):.4f}", f"Epoch {best_meteor.get('epoch', 'N/A')}"],
        ]
        print(tabulate(summary_data, headers=['Metric', 'Best Score', 'Achieved At'], tablefmt='grid'))
        print("="*80 + "\n")

class CustomLoggingCallback(TrainerCallback):
    """Forward Trainer log events to a ``MetricsLogger``.

    Evaluation logs (numeric ``eval_*`` entries) are recorded and tabulated;
    training logs carrying a loss and a learning rate become one progress line.
    """

    def __init__(self, metrics_logger=None):
        if metrics_logger is None:
            metrics_logger = MetricsLogger()
        self.metrics_logger = metrics_logger

    def on_log(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, logs=None, **kwargs):
        # Only the main process reports, and only when there is something to report.
        if not (state.is_world_process_zero and logs):
            return

        eval_metrics = {}
        for key, value in logs.items():
            if key.startswith("eval_") and isinstance(value, (int, float)):
                eval_metrics[key] = value

        current_epoch = logs.get('epoch', state.epoch or 0)

        if eval_metrics:
            self.metrics_logger.log_evaluation(
                metrics=eval_metrics,
                epoch=int(current_epoch),
                step=state.global_step
            )
            return

        has_loss = 'train_loss' in logs or 'loss' in logs
        if has_loss and 'learning_rate' in logs:
            # 'train_loss' takes precedence over 'loss' when both are present.
            loss_value = logs.get('train_loss', logs.get('loss', 0))
            self.metrics_logger.log_epoch_progress(
                epoch=current_epoch,
                loss=loss_value,
                learning_rate=logs['learning_rate']
            )

    def on_train_end(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs):
        # Print the best-score summary once, from the main process only.
        if not state.is_world_process_zero:
            return
        self.metrics_logger.display_training_summary()

# --- MODIFICATION 2, PART 1: ADD NEW CALLBACK CLASS ---
class AverageTrainLossLogger(TrainerCallback):
    """Log the mean of the training losses reported during each epoch.

    The average is taken over the ``loss`` values seen in log events, i.e.
    one sample per logging step, not one per optimisation step.
    """

    def __init__(self):
        self.epoch_train_losses = []

    def on_epoch_begin(self, args, state, control, **kwargs):
        """Reset the list of losses at the start of each epoch."""
        self.epoch_train_losses = []

    def on_log(self, args, state, control, logs=None, **kwargs):
        """On each log step, if it's a training log, append the loss."""
        # `logs` defaults to None; a membership test on None would raise TypeError.
        if not logs:
            return
        # Training logs carry both a loss and the current learning rate.
        if 'loss' in logs and 'learning_rate' in logs:
            self.epoch_train_losses.append(logs['loss'])

    def on_epoch_end(self, args, state, control, **kwargs):
        """At the end of the epoch, calculate and log the average."""
        if self.epoch_train_losses:
            avg_epoch_loss = np.mean(self.epoch_train_losses)
            logger.info(f"===== Average Training Loss for Epoch {int(state.epoch)}: {avg_epoch_loss:.4f} =====")

# --- NEW: CALLBACK FOR OPTUNA PRUNING ---
class OptunaPruningCallback(TrainerCallback):
    """Report evaluation results to an Optuna trial and prune when asked.

    ``eval_bleu_4`` is reported when present; otherwise ``eval_loss`` is used
    as a fallback. Nothing is reported when neither key is in the metrics.
    """

    def __init__(self, trial: optuna.Trial):
        self.trial = trial

    def on_evaluate(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, metrics: dict, **kwargs):
        # Pick the value to report together with the label used in the log line.
        # NOTE(review): BLEU-4 and loss improve in opposite directions; if the
        # study maximises, the loss fallback may mislead the pruner — confirm.
        if "eval_bleu_4" in metrics:
            label, value = "BLEU-4", metrics["eval_bleu_4"]
        elif "eval_loss" in metrics:
            label, value = "Loss", metrics["eval_loss"]
        else:
            return

        step = state.global_step
        self.trial.report(value, step)

        if not self.trial.should_prune():
            return

        logger.warning(f"Trial {self.trial.number} pruned at step {step} with {label}: {value}.")
        raise optuna.TrialPruned()

def log_gpu_memory(prefix=""):
    """Log allocated and reserved CUDA memory in GB, or that no GPU is available.

    Args:
        prefix: Text placed at the start of the log line.
    """
    if not torch.cuda.is_available():
        logger.info(f"{prefix} GPU not available")
        return

    bytes_per_gb = 1024**3
    allocated = torch.cuda.memory_allocated() / bytes_per_gb
    reserved = torch.cuda.memory_reserved() / bytes_per_gb
    logger.info(f"{prefix} GPU Memory - Allocated: {allocated:.2f}GB, Reserved: {reserved:.2f}GB")

def log_model_state(model, prefix=""):
    """Log whether the model is in training mode and the device of its first parameter.

    Args:
        model: Object exposing ``training`` and ``parameters()``.
        prefix: Text placed at the start of the log line.
    """
    is_training = model.training
    first_param = next(model.parameters())
    device = first_param.device
    logger.info(f"{prefix} Model - Training mode: {is_training}, Device: {device}")

# --- MODIFIED: CustomTrainer now accepts an `is_tuning_trial` flag ---
class CustomTrainer(Trainer):
    """Trainer whose evaluation generates text and scores it with ``evaluator``.

    Args:
        evaluator: Object providing ``compute_bleu`` and ``compute_all_metrics``.
        metrics_logger: Optional ``MetricsLogger``; a new one is created when
            omitted.
        is_tuning_trial: When True, evaluation computes only loss and BLEU-4
            and skips the verbose before/after logging.
        *args, **kwargs: Forwarded to ``Trainer.__init__``.
    """

    def __init__(self, evaluator, metrics_logger=None, is_tuning_trial=False, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.evaluator = evaluator
        self.decoder = DiverseDecoder(self.model, self.processing_class)
        self.metrics_logger = metrics_logger or MetricsLogger()
        self.is_tuning_trial = is_tuning_trial  # <-- Store the flag

    def evaluate(self, eval_dataset=None, ignore_keys=None, metric_key_prefix="eval"):
        """Run generation-based evaluation and return the metrics dict.

        Args:
            eval_dataset: Optional dataset overriding the trainer's eval set.
            ignore_keys: Accepted for interface compatibility; unused.
            metric_key_prefix: Prefix applied to every metric name.

        Returns:
            Metrics keyed ``"{prefix}_<name>"``. If the evaluation loop raises,
            only ``"{prefix}_loss": inf`` is returned and no callbacks fire.
        """
        if self.is_tuning_trial:
            logger.info(f"=== TUNING TRIAL EVALUATION (BLEU-4 Only) ===")
        else:
            logger.info("=== BEFORE EVALUATION ===")
            log_gpu_memory("BEFORE EVAL")
            log_model_state(self.model, "BEFORE EVAL")

        eval_dataloader = self.get_eval_dataloader(eval_dataset)
        self.model.eval()

        predictions, references = [], []
        total_loss = 0.0
        num_batches = 0
        # Pre-bound so the cleanup below is valid even for an empty dataloader.
        batch_device = None

        epoch = self.state.epoch if self.state.epoch is not None else 0
        if not self.is_tuning_trial:
            logger.info("Starting evaluation on full validation set...")

        eval_desc = f"Evaluating Epoch {int(epoch)}"
        if self.is_tuning_trial:
            eval_desc = f"Tuning Trial {self.state.trial_number} Eval" if hasattr(self.state, 'trial_number') else "Tuning Trial Eval"

        # The progress bar is shown in both modes; only its description differs.
        eval_progress = tqdm(eval_dataloader, desc=eval_desc, unit="batch", disable=False)

        try:
            for batch_idx, batch in enumerate(eval_progress):
                batch_device = {k: v.to(self.args.device) for k, v in batch.items()}

                with torch.no_grad():
                    # Forward pass with labels gives the evaluation loss.
                    outputs = self.model(**batch_device)
                    total_loss += outputs.loss.item()
                    num_batches += 1

                    inputs = {k: v for k, v in batch_device.items() if k != "labels"}
                    generated_ids = self.decoder.diverse_generate(inputs["input_ids"], inputs["attention_mask"])
                    batch_predictions = self.processing_class.batch_decode(generated_ids, skip_special_tokens=True)
                    predictions.extend(batch_predictions)

                    # -100 marks ignored label positions; map them back to the
                    # pad id so the labels can be decoded.
                    labels = batch["labels"].cpu().numpy()
                    labels = np.where(labels != -100, labels, self.processing_class.pad_token_id)
                    batch_references = self.processing_class.batch_decode(labels, skip_special_tokens=True)
                    references.extend(batch_references)

                    if (batch_idx + 1) % 20 == 0:
                        torch.cuda.empty_cache()

        except Exception as e:
            logger.exception(f"Error during evaluation: {e}")
            # Do not leave the model in eval mode when training continues.
            self.model.train()
            return {f"{metric_key_prefix}_loss": float('inf')}

        if self.is_tuning_trial:
            # For tuning, ONLY compute loss and BLEU-4.
            bleu_scores = self.evaluator.compute_bleu(references, predictions)
            metrics = {
                f"{metric_key_prefix}_bleu_4": bleu_scores.get('bleu_4', 0.0)
            }
        else:
            # Full evaluation for the final run.
            logger.info(f"Computing metrics for {len(predictions)} predictions...")
            metrics = self.evaluator.compute_all_metrics(references, predictions)
            metrics = {f"{metric_key_prefix}_{k}": v for k, v in metrics.items()}

        avg_loss = total_loss / max(num_batches, 1)
        metrics[f"{metric_key_prefix}_loss"] = avg_loss

        self.log(metrics)

        # Release per-evaluation buffers before handing control back.
        del predictions, references, batch_device
        torch.cuda.empty_cache()
        gc.collect()

        self.model.train()

        if not self.is_tuning_trial:
            logger.info("=== AFTER EVALUATION ===")
            log_gpu_memory("AFTER EVAL")
            log_model_state(self.model, "AFTER EVAL")
            logger.info(f"Evaluation completed successfully. Average loss: {avg_loss:.4f}")

        # The stock Trainer.evaluate fires the on_evaluate event; without this
        # call, callbacks that implement on_evaluate (early stopping, Optuna
        # pruning) are never invoked. Fired last so that a callback raising
        # (e.g. optuna.TrialPruned) propagates after cleanup is done.
        self.control = self.callback_handler.on_evaluate(self.args, self.state, self.control, metrics)

        return metrics

def generate_sample_predictions(model, tokenizer, eval_contexts, eval_questions, num_samples=20):
    """Print generated questions next to the reference for random eval samples.

    Args:
        model: Generation model; switched to eval mode while sampling and
            returned to training mode afterwards if it was training before.
        tokenizer: Tokenizer used to encode prompts and decode outputs.
        eval_contexts: Sequence of context strings.
        eval_questions: Reference questions aligned with ``eval_contexts``.
        num_samples: Upper bound on the number of samples printed.
    """
    logger.info("Generating sample predictions...")
    # Clamp to [0, dataset size] so random.sample never receives an invalid k.
    sample_count = max(0, min(num_samples, len(eval_contexts)))
    indices = random.sample(range(len(eval_contexts)), sample_count)

    was_training = model.training
    model.eval()
    decoder = DiverseDecoder(model, tokenizer)

    print("\n" + "="*100)
    # Report the number of pairs actually shown rather than a fixed "20".
    print(f"SAMPLE PREDICTIONS - {len(indices)} Context-Question Pairs")
    print("="*100)

    try:
        for i, idx in enumerate(indices, 1):
            context, actual_question = eval_contexts[idx], eval_questions[idx]
            input_text = f"Generate a question from context: {context}"
            input_encoding = tokenizer(input_text, max_length=512, truncation=True, return_tensors="pt").to(model.device)

            with torch.no_grad():
                generated_ids = decoder.diverse_generate(input_encoding["input_ids"], input_encoding["attention_mask"])
                # Only the first returned sequence is shown.
                predicted_question = tokenizer.decode(generated_ids[0], skip_special_tokens=True)

            print(f"\n--- SAMPLE {i:2d} ---")
            print(f"CONTEXT: {context[:200]}{'...' if len(context) > 200 else ''}")
            print(f"ACTUAL:   {actual_question}")
            print(f"PREDICTED: {predicted_question}")
            print("-" * 80)
    finally:
        if was_training:
            model.train()
    print("="*100 + "\n")

def setup_training_args(output_dir, num_train_epochs=5, train_dataset_size=0):
    """Build the default ``TrainingArguments`` for a training run.

    Args:
        output_dir: Directory for checkpoints and outputs.
        num_train_epochs: Number of training epochs.
        train_dataset_size: Number of training examples; used to derive the
            warmup and logging intervals.

    Returns:
        A configured ``TrainingArguments`` instance.
    """
    per_device_batch = 8
    accumulation_steps = 2
    effective_batch_size = per_device_batch * accumulation_steps
    steps_per_epoch = max(1, train_dataset_size // effective_batch_size)

    logger.info(f"Training Configuration: Dataset size: {train_dataset_size:,}, Effective batch size: {effective_batch_size}, Steps per epoch: {steps_per_epoch}")

    # learning_rate, weight_decay, warmup_steps and lr_scheduler_type are
    # defaults here; the Optuna objective in this file supplies its own values.
    training_kwargs = dict(
        output_dir=output_dir,
        num_train_epochs=num_train_epochs,
        per_device_train_batch_size=per_device_batch,
        per_device_eval_batch_size=2,
        gradient_accumulation_steps=accumulation_steps,
        torch_compile=True,
        learning_rate=5e-5,
        weight_decay=0.01,
        warmup_steps=min(500, steps_per_epoch),
        logging_steps=max(10, steps_per_epoch // 10),
        eval_strategy="epoch",
        save_strategy="epoch",
        # Best checkpoint is selected by the highest eval BLEU-4.
        load_best_model_at_end=True,
        metric_for_best_model="eval_bleu_4",
        greater_is_better=True,
        bf16=torch.cuda.is_bf16_supported(),
        dataloader_pin_memory=True,
        dataloader_num_workers=0,
        remove_unused_columns=False,
        gradient_checkpointing=True,
        lr_scheduler_type="cosine",
        save_total_limit=3,
        report_to="none",
        seed=42,
        data_seed=42,
        group_by_length=True,
    )
    return TrainingArguments(**training_kwargs)

# --- CONFIGURATION FOR PHASE 2 ---
# Path (or model identifier) passed to `from_pretrained` for both the tokenizer
# and the model. REPLACE THIS PATH with the actual folder where Phase 1 saved
# the 'final' model.
# Example: "./t5-flan-question-generation-tuned/final" or "E:/Models/Phase1/final"
g_model_name = "./t5-flan-question-generation-tuned/final" 

# Module-level state shared between main() and objective(). main() assigns the
# tokenizer, datasets and eval lists before they are used; the remaining ones
# are presumably set later in main() — confirm, as that part is not shown here.
g_tokenizer = None
g_train_dataset = None
g_eval_dataset = None
g_data_collator = None
g_evaluator = None
g_eval_contexts = None
g_eval_questions = None

# --- NEW: Optuna Objective Function ---
def objective(trial: optuna.Trial):
    """Optuna objective: train one configuration and return its best BLEU-4.

    Reads the module-level ``g_tokenizer``, ``g_model_name``,
    ``g_train_dataset``, ``g_eval_dataset`` and ``g_evaluator``.

    Args:
        trial: The Optuna trial supplying hyperparameter suggestions.

    Returns:
        The trainer's best metric (``eval_bleu_4``), or 0.0 when none was
        recorded or training failed with an unexpected error.

    Raises:
        optuna.TrialPruned: Re-raised so Optuna records the trial as pruned
            instead of as a completed trial with a score of 0.0.
    """
    logger.info(f"--- Starting Optuna Trial {trial.number} ---")

    # --- 1. Define Search Space ---
    learning_rate = trial.suggest_float("learning_rate", 1e-5, 1e-4, log=True)
    weight_decay = trial.suggest_float("weight_decay", 0.0, 0.1)
    lr_scheduler_type = trial.suggest_categorical("lr_scheduler_type", ["linear", "cosine", "constant"])

    # Calculate steps_per_epoch for warmup suggestion
    effective_batch_size = 8 * 2
    steps_per_epoch = max(1, len(g_train_dataset) // effective_batch_size)
    # Keep low <= high when an epoch has fewer than 100 steps.
    warmup_low = min(100, steps_per_epoch)
    warmup_steps = trial.suggest_int("warmup_steps", warmup_low, steps_per_epoch)

    # --- USE SMALL EVAL SUBSET FOR TUNING (30% of the eval data, max 5000 samples) ---
    eval_total = len(g_eval_dataset)
    eval_subset_size = min(5000, int(eval_total * 0.3))
    if eval_total > 0:
        # Never evaluate on an empty subset when eval data exists.
        eval_subset_size = max(1, eval_subset_size)
    # A dedicated, fixed-seed generator gives every trial the same subset, so
    # BLEU-4 values are comparable across trials and global RNG state is untouched.
    eval_indices = random.Random(42).sample(range(eval_total), eval_subset_size)
    eval_subset = torch.utils.data.Subset(g_eval_dataset, eval_indices)

    logger.info(f"Trial {trial.number}: Using {len(g_train_dataset)} train samples and {eval_subset_size} eval samples (subset for speed)")

    # --- 2. Configure Training ---
    model = T5ForConditionalGeneration.from_pretrained(g_model_name)
    # Grow the embedding matrix only when the tokenizer holds more entries
    # than the model has embedding rows.
    if len(g_tokenizer) > model.get_input_embeddings().num_embeddings:
        model.resize_token_embeddings(len(g_tokenizer))

    output_dir = f"./optuna-trials/trial_{trial.number}"

    training_args = TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=8,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=2,
        gradient_accumulation_steps=2,
        torch_compile=True,
        learning_rate=learning_rate,
        weight_decay=weight_decay,
        warmup_steps=warmup_steps,
        lr_scheduler_type=lr_scheduler_type,
        logging_steps=max(10, steps_per_epoch // 10),
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="eval_bleu_4",
        greater_is_better=True,
        bf16=torch.cuda.is_bf16_supported(),
        dataloader_pin_memory=True,
        dataloader_num_workers=0,
        remove_unused_columns=False,
        gradient_checkpointing=True,
        save_total_limit=1,
        report_to="none",
        seed=42,
        data_seed=42,
        group_by_length=True,
        logging_dir=f"{output_dir}/logs",
    )

    pruning_callback = OptunaPruningCallback(trial)
    trial_data_collator = DataCollatorForSeq2Seq(tokenizer=g_tokenizer, model=model, padding=True, max_length=512, label_pad_token_id=-100)

    trainer = CustomTrainer(
        evaluator=g_evaluator,
        metrics_logger=None,
        is_tuning_trial=True,
        model=model,
        args=training_args,
        train_dataset=g_train_dataset,  # Full training data
        eval_dataset=eval_subset,       # Subset for fast evaluation
        # CustomTrainer reads `self.processing_class`, so pass the tokenizer
        # under the matching keyword.
        processing_class=g_tokenizer,
        data_collator=trial_data_collator,
        callbacks=[
            pruning_callback,
            EarlyStoppingCallback(early_stopping_patience=4)
        ]
    )

    # NOTE(review): train() may replace `trainer.state`, in which case this
    # attribute is lost and the eval progress bar uses its generic label — confirm.
    trainer.state.trial_number = trial.number

    # --- 3. Train ---
    try:
        trainer.train()
        best_metric = trainer.state.best_metric
    except optuna.TrialPruned:
        logger.info(f"Trial {trial.number} was pruned.")
        raise
    except Exception as e:
        # Best effort: an unexpected failure scores 0.0 so the study continues.
        logger.exception(f"Error in trial {trial.number}: {e}")
        return 0.0
    finally:
        # Single cleanup path for success, pruning and failure.
        del model, trainer, trial_data_collator
        torch.cuda.empty_cache()
        gc.collect()

    # --- 4. Report & Return ---
    logger.info(f"--- Finished Optuna Trial {trial.number} | Best BLEU-4: {best_metric} ---")
    return best_metric if best_metric is not None else 0.0


# --- MODIFIED: Main function now orchestrates setup, tuning, and final training ---
def main(data_dir="E:/A_CSE499/data", output_dir="./t5-phase2-tuned", n_trials=10):
    """Run the Phase 2 pipeline: setup, Optuna tuning, final training, evaluation.

    The function populates the module-level ``g_*`` globals (tokenizer,
    datasets, collator, evaluator, eval contexts/questions) that the Optuna
    ``objective`` reads, runs (or resumes) the hyperparameter study, then
    trains a fresh copy of the Phase 1 model with the best parameters found
    and saves/evaluates it.

    Args:
        data_dir: Directory handed to ``DataProcessor``; expected to contain
            the merged Phase 2 CSV files.
        output_dir: Directory that receives the final (or interrupted) model.
        n_trials: Target number of COMPLETE Optuna trials for the study.
            Trials already stored in the SQLite study count towards it.

    Returns:
        None. The function returns early (after logging an error) when the
        Phase 1 model cannot be loaded, when no training data is loaded, or
        when the study holds no completed trial to take parameters from.
    """
    global g_tokenizer, g_model_name, g_train_dataset, g_eval_dataset, g_data_collator, g_evaluator, g_eval_contexts, g_eval_questions

    set_seed(42)
    logger.info("Starting PHASE 2: T5 Question Generation Training")

    # --- 1. SETUP ---
    # The tokenizer and model are loaded from the path held in g_model_name
    # (the Phase 1 output directory).
    logger.info(f"Loading Phase 1 model from: {g_model_name}...")
    try:
        g_tokenizer = T5Tokenizer.from_pretrained(g_model_name)

        # Loaded here to verify the checkpoint is readable; the model is
        # re-loaded for the final training run below.
        dummy_model = T5ForConditionalGeneration.from_pretrained(g_model_name)

        # Make sure a pad token exists; grow the embeddings if one was added.
        if g_tokenizer.pad_token is None:
            g_tokenizer.add_special_tokens({"pad_token": "<pad>"})
            dummy_model.resize_token_embeddings(len(g_tokenizer))

    except OSError as e:
        logger.error(f"Could not load Phase 1 model from {g_model_name}. Check the path.")
        logger.error(str(e))
        return

    # Load Phase 2 data
    logger.info("Loading Phase 2 (Merged) datasets...")
    data_processor = DataProcessor(data_dir, g_tokenizer)

    train_contexts, train_questions, g_eval_contexts, g_eval_questions = data_processor.load_phase2_datasets()

    if not train_contexts:
        logger.error("No training data loaded! Please check merged_train.csv path.")
        return

    g_train_dataset = QuestionGenerationDataset(train_contexts, train_questions, g_tokenizer)
    g_eval_dataset = QuestionGenerationDataset(g_eval_contexts, g_eval_questions, g_tokenizer)

    # Setup collator (using the dummy model for its config)
    g_data_collator = DataCollatorForSeq2Seq(tokenizer=g_tokenizer, model=dummy_model, padding=True, max_length=512, label_pad_token_id=-100)
    # NOTE(review): this only drops the local name. g_data_collator was built
    # with model=dummy_model and therefore still references it, so the weights
    # are not actually released here.
    del dummy_model

    g_evaluator = AdvancedEvaluationMetrics(g_tokenizer)

    # --- 2. OPTUNA TUNING ---
    logger.info("=== STARTING PHASE 2 HYPERPARAMETER TUNING ===")

    sampler = optuna.samplers.TPESampler(seed=42)
    pruner = optuna.pruners.MedianPruner(n_startup_trials=3, n_warmup_steps=3)

    # The study is persisted to SQLite so an interrupted run can be resumed.
    study = optuna.create_study(
        study_name="t5-phase2-tuning",
        direction="maximize",
        sampler=sampler,
        pruner=pruner,
        storage="sqlite:///t5_phase2_tuning.db",
        load_if_exists=True
    )

    n_completed_trials = sum(
        1 for t in study.trials if t.state == optuna.trial.TrialState.COMPLETE
    )

    if n_completed_trials >= n_trials:
        logger.info(f"Study already has {n_completed_trials} completed trials. Skipping optimization.")
    else:
        n_remaining_trials = n_trials - n_completed_trials
        logger.info(f"Resuming study. {n_completed_trials} trials complete, running {n_remaining_trials} more.")
        try:
            study.optimize(objective, n_trials=n_remaining_trials)
        except KeyboardInterrupt:
            logger.warning("Optuna optimization interrupted.")

    logger.info("=== TUNING COMPLETE ===")
    # study.best_trial raises ValueError when no trial has completed (e.g. the
    # optimization was interrupted during the very first trial). Stop cleanly
    # instead of crashing with a traceback.
    try:
        best_trial = study.best_trial
    except ValueError:
        logger.error("No completed Optuna trial is available; cannot select hyperparameters for final training.")
        return
    logger.info(f"Best parameters: {json.dumps(best_trial.params, indent=2)}")

    # --- 3. FINAL PHASE 2 TRAINING ---
    logger.info("=== STARTING PHASE 2 FINAL TRAINING ===")

    # Clean up memory
    torch.cuda.empty_cache()
    gc.collect()

    # Load the Phase 1 model again for final training
    model = T5ForConditionalGeneration.from_pretrained(g_model_name)
    # Every tokenizer id must index into the embedding matrix. Comparing the
    # tokenizer size (rather than `pad_token_id > vocab_size`) also catches the
    # boundary case where the highest id equals vocab_size, and mirrors the
    # resize applied to the dummy model during setup.
    if len(g_tokenizer) > model.config.vocab_size:
        model.resize_token_embeddings(len(g_tokenizer))

    shared_metrics_logger = MetricsLogger()
    custom_logging_callback = CustomLoggingCallback(shared_metrics_logger)
    avg_loss_callback = AverageTrainLossLogger()

    best_params = best_trial.params

    training_args = setup_training_args(
        output_dir,
        num_train_epochs=8,
        train_dataset_size=len(g_train_dataset)
    )

    # Override the default arguments with the tuned hyperparameters.
    training_args.learning_rate = best_params["learning_rate"]
    training_args.weight_decay = best_params["weight_decay"]
    training_args.warmup_steps = best_params["warmup_steps"]
    training_args.lr_scheduler_type = best_params["lr_scheduler_type"]

    final_data_collator = DataCollatorForSeq2Seq(tokenizer=g_tokenizer, model=model, padding=True, max_length=512, label_pad_token_id=-100)

    trainer = CustomTrainer(
        evaluator=g_evaluator,
        metrics_logger=shared_metrics_logger,
        is_tuning_trial=False,
        model=model,
        args=training_args,
        train_dataset=g_train_dataset,
        eval_dataset=g_eval_dataset,
        tokenizer=g_tokenizer,
        data_collator=final_data_collator,
        callbacks=[
            custom_logging_callback,
            avg_loss_callback,
            EarlyStoppingCallback(early_stopping_patience=4)
        ]
    )

    logger.info("Starting Phase 2 training...")
    try:
        trainer.train()
    except KeyboardInterrupt:
        # Keep whatever progress was made so the run is not lost.
        logger.warning("Training interrupted. Saving current state...")
        trainer.save_model(os.path.join(output_dir, "interrupted"))
        return

    final_dir = os.path.join(output_dir, "final")
    logger.info(f"Saving Phase 2 model to {final_dir}")
    trainer.save_model(final_dir)
    g_tokenizer.save_pretrained(final_dir)

    logger.info("Performing final evaluation...")
    final_metrics = trainer.evaluate()

    generate_sample_predictions(model, g_tokenizer, g_eval_contexts, g_eval_questions)

    # Print final table (numeric metrics only, "eval_" prefix stripped)
    final_results = [[k.replace('eval_', '').upper(), f"{v:.4f}"] for k, v in sorted(final_metrics.items()) if isinstance(v, (int, float))]
    if final_results:
        print("\nPHASE 2 FINAL EVALUATION RESULTS:")
        print(tabulate(final_results, headers=['Metric', 'Final Score'], tablefmt='fancy_grid'))

    logger.info("Phase 2 pipeline completed successfully!")

# Script entry point: run the pipeline only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()

23:44:11 | INFO | Starting PHASE 2: T5 Question Generation Training
23:44:11 | INFO | Loading Phase 1 model from: ./t5-flan-question-generation-tuned/final...
23:44:11 | INFO | Loading Phase 2 (Merged) datasets...
23:44:11 | INFO | Loading Phase 2 Train data from: E:/A_CSE499/data\merged_train.csv
Checking Phase 2 Train:   0%|          | 0/90000 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (547 > 512). Running this sequence through the model will result in indexing errors
Checking Phase 2 Train: 100%|██████████| 90000/90000 [01:38<00:00, 916.94it/s]
23:45:50 | INFO | --- DATA STATS FOR PHASE 2 TRAIN ---
23:45:50 | INFO |   Total Rows:      90000
23:45:50 | INFO |   Kept:            67260 (74.7%)
23:45:50 | INFO |   Dropped (Empty): 0
23:45:50 | INFO |   Dropped (>512):  22740 (25.3%)
23:45:50 | INFO | ---------------------------------------
23:45:50 | INFO | Loading Phase 2 Validation data from: E:/A_CSE499/data\merge

Epoch,Training Loss,Validation Loss


00:26:58 | INFO | === TUNING TRIAL EVALUATION (BLEU-4 Only) ===
Tuning Trial Eval:   0%|          | 0/1109 [00:00<?, ?batch/s]Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.
Group Beam Search is scheduled to be moved to a `custom_generate` repository in v4.55.0. To prevent loss of backward compatibility, add `trust_remote_code=True` to your `generate` call.
Tuning Trial Eval: 100%|██████████| 1109/1109 [31:20<00:00,  1.70s/batch]
01:37:20 | INFO | === TUNING TRIAL EVALUATION (BLEU-4 Only) ===
Tuning Trial Eval: 100%|██████████| 1109/1109 [30:16<00:00,  1.64s/batch]
02:46:32 | INFO | === TUNING TRIAL EVALUATION (BLEU-4 Only) ===
Tuning Trial Eval: 100%|██████████| 1109/1109 [30:25<00:00,  1.65s/batch]
03:55:58 | INFO | === TUNING TRIAL EVALUATION (BLEU-4 Only) ===
Tuning Trial Eval: 100%|██████████| 1

Epoch,Training Loss,Validation Loss


09:42:09 | INFO | === TUNING TRIAL EVALUATION (BLEU-4 Only) ===
Tuning Trial Eval: 100%|██████████| 1109/1109 [30:41<00:00,  1.66s/batch] 
10:51:54 | INFO | === TUNING TRIAL EVALUATION (BLEU-4 Only) ===
Tuning Trial Eval: 100%|██████████| 1109/1109 [31:53<00:00,  1.73s/batch]
12:02:41 | INFO | === TUNING TRIAL EVALUATION (BLEU-4 Only) ===
Tuning Trial Eval: 100%|██████████| 1109/1109 [31:08<00:00,  1.69s/batch]
13:12:44 | INFO | === TUNING TRIAL EVALUATION (BLEU-4 Only) ===
Tuning Trial Eval: 100%|██████████| 1109/1109 [29:57<00:00,  1.62s/batch]
14:21:37 | INFO | === TUNING TRIAL EVALUATION (BLEU-4 Only) ===
Tuning Trial Eval: 100%|██████████| 1109/1109 [30:51<00:00,  1.67s/batch]
15:31:24 | INFO | === TUNING TRIAL EVALUATION (BLEU-4 Only) ===
Tuning Trial Eval: 100%|██████████| 1109/1109 [31:05<00:00,  1.68s/batch]
16:41:26 | INFO | === TUNING TRIAL EVALUATION (BLEU-4 Only) ===
Tuning Trial Eval: 100%|██████████| 1109/1109 [30:33<00:00,  1.65s/batch]
17:50:58 | INFO | === TUNING TRIA

Epoch,Training Loss,Validation Loss


19:02:25 | INFO | === TUNING TRIAL EVALUATION (BLEU-4 Only) ===
Tuning Trial Eval: 100%|██████████| 1109/1109 [30:47<00:00,  1.67s/batch] 
20:12:11 | INFO | === TUNING TRIAL EVALUATION (BLEU-4 Only) ===
Tuning Trial Eval: 100%|██████████| 1109/1109 [28:31<00:00,  1.54s/batch]
21:19:40 | INFO | === TUNING TRIAL EVALUATION (BLEU-4 Only) ===
Tuning Trial Eval: 100%|██████████| 1109/1109 [28:14<00:00,  1.53s/batch]
22:26:55 | INFO | === TUNING TRIAL EVALUATION (BLEU-4 Only) ===
Tuning Trial Eval: 100%|██████████| 1109/1109 [25:54<00:00,  1.40s/batch]
23:31:46 | INFO | === TUNING TRIAL EVALUATION (BLEU-4 Only) ===
Tuning Trial Eval: 100%|██████████| 1109/1109 [29:56<00:00,  1.62s/batch]
00:40:42 | INFO | === TUNING TRIAL EVALUATION (BLEU-4 Only) ===
Tuning Trial Eval: 100%|██████████| 1109/1109 [28:19<00:00,  1.53s/batch]
01:47:57 | INFO | === TUNING TRIAL EVALUATION (BLEU-4 Only) ===
Tuning Trial Eval: 100%|██████████| 1109/1109 [28:39<00:00,  1.55s/batch]
02:55:33 | INFO | === TUNING TRIA

Epoch,Training Loss,Validation Loss


04:01:03 | INFO | === TUNING TRIAL EVALUATION (BLEU-4 Only) ===
Tuning Trial Eval: 100%|██████████| 1109/1109 [31:54<00:00,  1.73s/batch] 
05:11:51 | INFO | === TUNING TRIAL EVALUATION (BLEU-4 Only) ===
Tuning Trial Eval: 100%|██████████| 1109/1109 [30:14<00:00,  1.64s/batch]
06:21:00 | INFO | === TUNING TRIAL EVALUATION (BLEU-4 Only) ===
Tuning Trial Eval: 100%|██████████| 1109/1109 [30:29<00:00,  1.65s/batch]
07:30:25 | INFO | === TUNING TRIAL EVALUATION (BLEU-4 Only) ===
Tuning Trial Eval: 100%|██████████| 1109/1109 [28:15<00:00,  1.53s/batch]
08:37:35 | INFO | === TUNING TRIAL EVALUATION (BLEU-4 Only) ===
Tuning Trial Eval: 100%|██████████| 1109/1109 [30:25<00:00,  1.65s/batch] 
09:46:53 | INFO | === TUNING TRIAL EVALUATION (BLEU-4 Only) ===
Tuning Trial Eval: 100%|██████████| 1109/1109 [29:56<00:00,  1.62s/batch]
10:55:43 | INFO | === TUNING TRIAL EVALUATION (BLEU-4 Only) ===
Tuning Trial Eval: 100%|██████████| 1109/1109 [29:44<00:00,  1.61s/batch]
12:04:22 | INFO | === TUNING TRI

Epoch,Training Loss,Validation Loss


13:13:38 | INFO | === TUNING TRIAL EVALUATION (BLEU-4 Only) ===
Tuning Trial Eval: 100%|██████████| 1109/1109 [30:58<00:00,  1.68s/batch] 
14:23:30 | INFO | === TUNING TRIAL EVALUATION (BLEU-4 Only) ===
Tuning Trial Eval: 100%|██████████| 1109/1109 [29:34<00:00,  1.60s/batch]
15:31:56 | INFO | === TUNING TRIAL EVALUATION (BLEU-4 Only) ===
Tuning Trial Eval: 100%|██████████| 1109/1109 [29:44<00:00,  1.61s/batch]
16:40:41 | INFO | === TUNING TRIAL EVALUATION (BLEU-4 Only) ===
Tuning Trial Eval: 100%|██████████| 1109/1109 [27:33<00:00,  1.49s/batch]
17:47:09 | INFO | === TUNING TRIAL EVALUATION (BLEU-4 Only) ===
Tuning Trial Eval: 100%|██████████| 1109/1109 [30:58<00:00,  1.68s/batch] 
18:57:03 | INFO | === TUNING TRIAL EVALUATION (BLEU-4 Only) ===
Tuning Trial Eval: 100%|██████████| 1109/1109 [29:05<00:00,  1.57s/batch]
20:05:03 | INFO | === TUNING TRIAL EVALUATION (BLEU-4 Only) ===
Tuning Trial Eval: 100%|██████████| 1109/1109 [30:09<00:00,  1.63s/batch]
21:14:05 | INFO | === TUNING TRI

Epoch,Training Loss,Validation Loss


22:22:22 | INFO | === TUNING TRIAL EVALUATION (BLEU-4 Only) ===
Tuning Trial Eval: 100%|██████████| 1109/1109 [30:47<00:00,  1.67s/batch] 
23:32:06 | INFO | === TUNING TRIAL EVALUATION (BLEU-4 Only) ===
Tuning Trial Eval: 100%|██████████| 1109/1109 [28:40<00:00,  1.55s/batch]
00:39:38 | INFO | === TUNING TRIAL EVALUATION (BLEU-4 Only) ===
Tuning Trial Eval: 100%|██████████| 1109/1109 [28:49<00:00,  1.56s/batch]
01:47:26 | INFO | === TUNING TRIAL EVALUATION (BLEU-4 Only) ===
Tuning Trial Eval: 100%|██████████| 1109/1109 [26:46<00:00,  1.45s/batch]
02:53:04 | INFO | === TUNING TRIAL EVALUATION (BLEU-4 Only) ===
Tuning Trial Eval: 100%|██████████| 1109/1109 [30:45<00:00,  1.66s/batch] 
04:02:43 | INFO | === TUNING TRIAL EVALUATION (BLEU-4 Only) ===
Tuning Trial Eval: 100%|██████████| 1109/1109 [28:49<00:00,  1.56s/batch]
05:10:26 | INFO | === TUNING TRIAL EVALUATION (BLEU-4 Only) ===
Tuning Trial Eval: 100%|██████████| 1109/1109 [29:30<00:00,  1.60s/batch]
06:18:54 | INFO | === TUNING TRI

Epoch,Training Loss,Validation Loss


07:26:27 | INFO | === TUNING TRIAL EVALUATION (BLEU-4 Only) ===
Tuning Trial Eval: 100%|██████████| 1109/1109 [31:25<00:00,  1.70s/batch] 
08:36:46 | INFO | === TUNING TRIAL EVALUATION (BLEU-4 Only) ===
Tuning Trial Eval: 100%|██████████| 1109/1109 [30:42<00:00,  1.66s/batch]
09:46:22 | INFO | === TUNING TRIAL EVALUATION (BLEU-4 Only) ===
Tuning Trial Eval: 100%|██████████| 1109/1109 [30:13<00:00,  1.63s/batch]
10:55:27 | INFO | === TUNING TRIAL EVALUATION (BLEU-4 Only) ===
Tuning Trial Eval: 100%|██████████| 1109/1109 [29:15<00:00,  1.58s/batch]
12:03:41 | INFO | === TUNING TRIAL EVALUATION (BLEU-4 Only) ===
Tuning Trial Eval: 100%|██████████| 1109/1109 [30:39<00:00,  1.66s/batch]
13:13:12 | INFO | === TUNING TRIAL EVALUATION (BLEU-4 Only) ===
Tuning Trial Eval: 100%|██████████| 1109/1109 [30:33<00:00,  1.65s/batch]
14:22:42 | INFO | === TUNING TRIAL EVALUATION (BLEU-4 Only) ===
Tuning Trial Eval: 100%|██████████| 1109/1109 [30:22<00:00,  1.64s/batch]
15:31:59 | INFO | === TUNING TRIA

Epoch,Training Loss,Validation Loss


16:42:24 | INFO | === TUNING TRIAL EVALUATION (BLEU-4 Only) ===
Tuning Trial Eval: 100%|██████████| 1109/1109 [30:50<00:00,  1.67s/batch] 
17:52:12 | INFO | === TUNING TRIAL EVALUATION (BLEU-4 Only) ===
Tuning Trial Eval: 100%|██████████| 1109/1109 [32:18<00:00,  1.75s/batch]
19:03:28 | INFO | === TUNING TRIAL EVALUATION (BLEU-4 Only) ===
Tuning Trial Eval: 100%|██████████| 1109/1109 [30:48<00:00,  1.67s/batch]
20:13:16 | INFO | === TUNING TRIAL EVALUATION (BLEU-4 Only) ===
Tuning Trial Eval: 100%|██████████| 1109/1109 [30:12<00:00,  1.63s/batch]
21:22:22 | INFO | === TUNING TRIAL EVALUATION (BLEU-4 Only) ===
Tuning Trial Eval: 100%|██████████| 1109/1109 [31:01<00:00,  1.68s/batch] 
22:32:19 | INFO | === TUNING TRIAL EVALUATION (BLEU-4 Only) ===
Tuning Trial Eval: 100%|██████████| 1109/1109 [31:24<00:00,  1.70s/batch]
23:42:43 | INFO | === TUNING TRIAL EVALUATION (BLEU-4 Only) ===
Tuning Trial Eval: 100%|██████████| 1109/1109 [30:44<00:00,  1.66s/batch]
00:52:26 | INFO | === TUNING TRI

Epoch,Training Loss,Validation Loss


02:04:03 | INFO | === TUNING TRIAL EVALUATION (BLEU-4 Only) ===
Tuning Trial Eval: 100%|██████████| 1109/1109 [32:03<00:00,  1.73s/batch] 
03:15:02 | INFO | === TUNING TRIAL EVALUATION (BLEU-4 Only) ===
Tuning Trial Eval: 100%|██████████| 1109/1109 [29:15<00:00,  1.58s/batch]
04:23:12 | INFO | === TUNING TRIAL EVALUATION (BLEU-4 Only) ===
Tuning Trial Eval: 100%|██████████| 1109/1109 [29:59<00:00,  1.62s/batch]
05:32:03 | INFO | === TUNING TRIAL EVALUATION (BLEU-4 Only) ===
Tuning Trial Eval: 100%|██████████| 1109/1109 [27:50<00:00,  1.51s/batch]
06:38:46 | INFO | === TUNING TRIAL EVALUATION (BLEU-4 Only) ===
Tuning Trial Eval: 100%|██████████| 1109/1109 [30:03<00:00,  1.63s/batch]
07:47:41 | INFO | === TUNING TRIAL EVALUATION (BLEU-4 Only) ===
Tuning Trial Eval: 100%|██████████| 1109/1109 [29:09<00:00,  1.58s/batch]
08:55:45 | INFO | === TUNING TRIAL EVALUATION (BLEU-4 Only) ===
Tuning Trial Eval: 100%|██████████| 1109/1109 [29:34<00:00,  1.60s/batch]
10:04:17 | INFO | === TUNING TRIA

Epoch,Training Loss,Validation Loss


11:13:28 | INFO | === TUNING TRIAL EVALUATION (BLEU-4 Only) ===
Tuning Trial Eval: 100%|██████████| 1109/1109 [31:29<00:00,  1.70s/batch]
12:23:55 | INFO | === TUNING TRIAL EVALUATION (BLEU-4 Only) ===
Tuning Trial Eval: 100%|██████████| 1109/1109 [28:55<00:00,  1.57s/batch]
13:31:48 | INFO | === TUNING TRIAL EVALUATION (BLEU-4 Only) ===
Tuning Trial Eval: 100%|██████████| 1109/1109 [29:18<00:00,  1.59s/batch]
14:40:02 | INFO | === TUNING TRIAL EVALUATION (BLEU-4 Only) ===
Tuning Trial Eval: 100%|██████████| 1109/1109 [27:22<00:00,  1.48s/batch]
15:46:20 | INFO | === TUNING TRIAL EVALUATION (BLEU-4 Only) ===
Tuning Trial Eval: 100%|██████████| 1109/1109 [29:46<00:00,  1.61s/batch]
16:55:06 | INFO | === TUNING TRIAL EVALUATION (BLEU-4 Only) ===
Tuning Trial Eval: 100%|██████████| 1109/1109 [29:23<00:00,  1.59s/batch]
18:03:28 | INFO | === TUNING TRIAL EVALUATION (BLEU-4 Only) ===
Tuning Trial Eval: 100%|██████████| 1109/1109 [28:56<00:00,  1.57s/batch]
19:11:21 | INFO | === TUNING TRIAL

Epoch,Training Loss,Validation Loss


19:45:41 | INFO | Epoch 0.10 | Loss: 1.7660 | LR: 1.34e-05
19:49:31 | INFO | Epoch 0.20 | Loss: 1.5576 | LR: 2.36e-05
19:53:23 | INFO | Epoch 0.30 | Loss: 1.4929 | LR: 2.33e-05
19:57:15 | INFO | Epoch 0.40 | Loss: 1.4635 | LR: 2.30e-05
20:01:05 | INFO | Epoch 0.50 | Loss: 1.4497 | LR: 2.27e-05
20:05:00 | INFO | Epoch 0.60 | Loss: 1.4319 | LR: 2.24e-05
20:08:50 | INFO | Epoch 0.70 | Loss: 1.4148 | LR: 2.21e-05
20:12:43 | INFO | Epoch 0.80 | Loss: 1.4173 | LR: 2.18e-05
20:16:34 | INFO | Epoch 0.90 | Loss: 1.3920 | LR: 2.15e-05
20:20:22 | INFO | Epoch 1.00 | Loss: 1.3890 | LR: 2.12e-05
20:20:24 | INFO | ===== Average Training Loss for Epoch 1: 1.4775 =====
20:20:24 | INFO | === BEFORE EVALUATION ===
20:20:24 | INFO | BEFORE EVAL GPU Memory - Allocated: 2.80GB, Reserved: 6.36GB
20:20:24 | INFO | BEFORE EVAL Model - Training mode: True, Device: cuda:0
20:20:32 | INFO | Starting evaluation on full validation set...
Evaluating Epoch 1: 100%|██████████| 3696/3696 [1:45:25<00:00,  1.71s/batch] 


EVALUATION RESULTS - Epoch 1

QUALITY METRICS:
+------------+---------+
| Metric     |   Score |
| BLEU-1     |  0.2363 |
+------------+---------+
| BLEU-2     |  0.1375 |
+------------+---------+
| BLEU-3     |  0.0946 |
+------------+---------+
| BLEU-4     |  0.0714 |
+------------+---------+
| ROUGE-L    |  0.2576 |
+------------+---------+
| METEOR     |  0.2824 |
+------------+---------+
| BERT-SCORE |  0.8836 |
+------------+---------+
| SELF-BLEU  |  0.9166 |
+------------+---------+

DIVERSITY METRICS:
+------------+---------+
| Metric     |   Score |
| SELF-BLEU  |  0.9166 |
+------------+---------+
| DISTINCT-1 |  0.122  |
+------------+---------+
| DISTINCT-2 |  0.3106 |
+------------+---------+

OTHER METRICS:
+----------+---------+
| Metric   |   Score |
| LOSS     |  1.3436 |
+----------+---------+



22:42:42 | INFO | === AFTER EVALUATION ===
22:42:42 | INFO | AFTER EVAL GPU Memory - Allocated: 2.81GB, Reserved: 3.08GB
22:42:42 | INFO | AFTER EVAL Model - Training mode: True, Device: cuda:0
22:42:42 | INFO | Evaluation completed successfully. Average loss: 1.3436
22:46:54 | INFO | Epoch 1.10 | Loss: 1.3584 | LR: 2.09e-05
22:50:45 | INFO | Epoch 1.20 | Loss: 1.3341 | LR: 2.06e-05
22:54:35 | INFO | Epoch 1.30 | Loss: 1.3365 | LR: 2.03e-05
22:58:28 | INFO | Epoch 1.40 | Loss: 1.3107 | LR: 2.00e-05
23:02:17 | INFO | Epoch 1.50 | Loss: 1.3283 | LR: 1.97e-05
23:06:11 | INFO | Epoch 1.60 | Loss: 1.3167 | LR: 1.94e-05
23:10:02 | INFO | Epoch 1.70 | Loss: 1.3297 | LR: 1.91e-05
23:13:53 | INFO | Epoch 1.80 | Loss: 1.3042 | LR: 1.88e-05
23:17:45 | INFO | Epoch 1.90 | Loss: 1.2927 | LR: 1.85e-05
23:21:34 | INFO | Epoch 2.00 | Loss: 1.2909 | LR: 1.82e-05
23:21:38 | INFO | ===== Average Training Loss for Epoch 2: 1.3202 =====
23:21:38 | INFO | === BEFORE EVALUATION ===
23:21:38 | INFO | BEFORE E


EVALUATION RESULTS - Epoch 2

QUALITY METRICS:
+------------+---------+
| Metric     |   Score |
| BLEU-1     |  0.245  |
+------------+---------+
| BLEU-2     |  0.146  |
+------------+---------+
| BLEU-3     |  0.1033 |
+------------+---------+
| BLEU-4     |  0.0795 |
+------------+---------+
| ROUGE-L    |  0.2651 |
+------------+---------+
| METEOR     |  0.2931 |
+------------+---------+
| BERT-SCORE |  0.8847 |
+------------+---------+
| SELF-BLEU  |  0.9165 |
+------------+---------+

DIVERSITY METRICS:
+------------+---------+
| Metric     |   Score |
| SELF-BLEU  |  0.9165 |
+------------+---------+
| DISTINCT-1 |  0.1213 |
+------------+---------+
| DISTINCT-2 |  0.3043 |
+------------+---------+

OTHER METRICS:
+----------+---------+
| Metric   |   Score |
| LOSS     |  1.3139 |
+----------+---------+



01:39:29 | INFO | === AFTER EVALUATION ===
01:39:29 | INFO | AFTER EVAL GPU Memory - Allocated: 2.82GB, Reserved: 3.08GB
01:39:29 | INFO | AFTER EVAL Model - Training mode: True, Device: cuda:0
01:39:29 | INFO | Evaluation completed successfully. Average loss: 1.3139
01:43:39 | INFO | Epoch 2.10 | Loss: 1.2534 | LR: 1.79e-05
01:47:30 | INFO | Epoch 2.20 | Loss: 1.2744 | LR: 1.76e-05
01:51:20 | INFO | Epoch 2.30 | Loss: 1.2677 | LR: 1.73e-05
01:55:15 | INFO | Epoch 2.40 | Loss: 1.2544 | LR: 1.70e-05
01:59:04 | INFO | Epoch 2.50 | Loss: 1.2575 | LR: 1.67e-05
02:02:57 | INFO | Epoch 2.60 | Loss: 1.2487 | LR: 1.64e-05
02:06:49 | INFO | Epoch 2.70 | Loss: 1.2430 | LR: 1.61e-05
02:10:38 | INFO | Epoch 2.80 | Loss: 1.2484 | LR: 1.58e-05
02:14:33 | INFO | Epoch 2.90 | Loss: 1.2347 | LR: 1.55e-05
02:18:24 | INFO | Epoch 3.00 | Loss: 1.2395 | LR: 1.51e-05
02:18:29 | INFO | ===== Average Training Loss for Epoch 3: 1.2522 =====
02:18:29 | INFO | === BEFORE EVALUATION ===
02:18:29 | INFO | BEFORE E


EVALUATION RESULTS - Epoch 3

QUALITY METRICS:
+------------+---------+
| Metric     |   Score |
| BLEU-1     |  0.2543 |
+------------+---------+
| BLEU-2     |  0.1545 |
+------------+---------+
| BLEU-3     |  0.109  |
+------------+---------+
| BLEU-4     |  0.0835 |
+------------+---------+
| ROUGE-L    |  0.274  |
+------------+---------+
| METEOR     |  0.3013 |
+------------+---------+
| BERT-SCORE |  0.8872 |
+------------+---------+
| SELF-BLEU  |  0.9173 |
+------------+---------+

DIVERSITY METRICS:
+------------+---------+
| Metric     |   Score |
| SELF-BLEU  |  0.9173 |
+------------+---------+
| DISTINCT-1 |  0.1231 |
+------------+---------+
| DISTINCT-2 |  0.3091 |
+------------+---------+

OTHER METRICS:
+----------+---------+
| Metric   |   Score |
| LOSS     |  1.3077 |
+----------+---------+



04:37:44 | INFO | === AFTER EVALUATION ===
04:37:44 | INFO | AFTER EVAL GPU Memory - Allocated: 2.81GB, Reserved: 3.07GB
04:37:44 | INFO | AFTER EVAL Model - Training mode: True, Device: cuda:0
04:37:44 | INFO | Evaluation completed successfully. Average loss: 1.3077
04:41:50 | INFO | Epoch 3.10 | Loss: 1.2085 | LR: 1.48e-05
04:45:42 | INFO | Epoch 3.20 | Loss: 1.1894 | LR: 1.45e-05
04:49:29 | INFO | Epoch 3.30 | Loss: 1.2188 | LR: 1.42e-05
04:53:23 | INFO | Epoch 3.40 | Loss: 1.2114 | LR: 1.39e-05
04:57:13 | INFO | Epoch 3.50 | Loss: 1.1973 | LR: 1.36e-05
05:01:05 | INFO | Epoch 3.60 | Loss: 1.2033 | LR: 1.33e-05
05:04:58 | INFO | Epoch 3.70 | Loss: 1.1939 | LR: 1.30e-05
05:08:48 | INFO | Epoch 3.80 | Loss: 1.2085 | LR: 1.27e-05
05:12:42 | INFO | Epoch 3.90 | Loss: 1.2060 | LR: 1.24e-05
05:16:33 | INFO | Epoch 4.00 | Loss: 1.1912 | LR: 1.21e-05
05:16:41 | INFO | ===== Average Training Loss for Epoch 4: 1.2028 =====
05:16:41 | INFO | === BEFORE EVALUATION ===
05:16:41 | INFO | BEFORE E


EVALUATION RESULTS - Epoch 4

QUALITY METRICS:
+------------+---------+
| Metric     |   Score |
| BLEU-1     |  0.2518 |
+------------+---------+
| BLEU-2     |  0.1522 |
+------------+---------+
| BLEU-3     |  0.108  |
+------------+---------+
| BLEU-4     |  0.0834 |
+------------+---------+
| ROUGE-L    |  0.2685 |
+------------+---------+
| METEOR     |  0.3008 |
+------------+---------+
| BERT-SCORE |  0.8856 |
+------------+---------+
| SELF-BLEU  |  0.9173 |
+------------+---------+

DIVERSITY METRICS:
+------------+---------+
| Metric     |   Score |
| SELF-BLEU  |  0.9173 |
+------------+---------+
| DISTINCT-1 |  0.1213 |
+------------+---------+
| DISTINCT-2 |  0.3069 |
+------------+---------+

OTHER METRICS:
+----------+---------+
| Metric   |   Score |
| LOSS     |  1.3057 |
+----------+---------+



07:37:25 | INFO | === AFTER EVALUATION ===
07:37:25 | INFO | AFTER EVAL GPU Memory - Allocated: 2.82GB, Reserved: 3.08GB
07:37:25 | INFO | AFTER EVAL Model - Training mode: True, Device: cuda:0
07:37:25 | INFO | Evaluation completed successfully. Average loss: 1.3057
07:41:27 | INFO | Epoch 4.10 | Loss: 1.1860 | LR: 1.18e-05
07:45:20 | INFO | Epoch 4.20 | Loss: 1.1604 | LR: 1.15e-05
07:49:08 | INFO | Epoch 4.30 | Loss: 1.1794 | LR: 1.12e-05
07:53:01 | INFO | Epoch 4.40 | Loss: 1.1618 | LR: 1.09e-05
07:56:52 | INFO | Epoch 4.50 | Loss: 1.1543 | LR: 1.06e-05
08:00:42 | INFO | Epoch 4.60 | Loss: 1.1781 | LR: 1.03e-05
08:04:36 | INFO | Epoch 4.70 | Loss: 1.1435 | LR: 1.00e-05
08:08:25 | INFO | Epoch 4.80 | Loss: 1.1682 | LR: 9.70e-06
08:12:19 | INFO | Epoch 4.90 | Loss: 1.1638 | LR: 9.40e-06
08:16:09 | INFO | Epoch 5.00 | Loss: 1.1679 | LR: 9.10e-06
08:16:18 | INFO | ===== Average Training Loss for Epoch 5: 1.1663 =====
08:16:18 | INFO | === BEFORE EVALUATION ===
08:16:18 | INFO | BEFORE E


EVALUATION RESULTS - Epoch 5

QUALITY METRICS:
+------------+---------+
| Metric     |   Score |
| BLEU-1     |  0.2569 |
+------------+---------+
| BLEU-2     |  0.1567 |
+------------+---------+
| BLEU-3     |  0.1114 |
+------------+---------+
| BLEU-4     |  0.0861 |
+------------+---------+
| ROUGE-L    |  0.2727 |
+------------+---------+
| METEOR     |  0.3048 |
+------------+---------+
| BERT-SCORE |  0.8867 |
+------------+---------+
| SELF-BLEU  |  0.9163 |
+------------+---------+

DIVERSITY METRICS:
+------------+---------+
| Metric     |   Score |
| SELF-BLEU  |  0.9163 |
+------------+---------+
| DISTINCT-1 |  0.1189 |
+------------+---------+
| DISTINCT-2 |  0.3    |
+------------+---------+

OTHER METRICS:
+----------+---------+
| Metric   |   Score |
| LOSS     |  1.3046 |
+----------+---------+



10:34:12 | INFO | === AFTER EVALUATION ===
10:34:12 | INFO | AFTER EVAL GPU Memory - Allocated: 2.82GB, Reserved: 3.08GB
10:34:12 | INFO | AFTER EVAL Model - Training mode: True, Device: cuda:0
10:34:12 | INFO | Evaluation completed successfully. Average loss: 1.3046
10:38:12 | INFO | Epoch 5.10 | Loss: 1.1329 | LR: 8.80e-06
10:42:06 | INFO | Epoch 5.20 | Loss: 1.1313 | LR: 8.49e-06
10:45:56 | INFO | Epoch 5.29 | Loss: 1.1428 | LR: 8.19e-06
10:49:49 | INFO | Epoch 5.39 | Loss: 1.1464 | LR: 7.89e-06
10:53:42 | INFO | Epoch 5.49 | Loss: 1.1411 | LR: 7.59e-06
10:57:30 | INFO | Epoch 5.59 | Loss: 1.1425 | LR: 7.28e-06
11:01:24 | INFO | Epoch 5.69 | Loss: 1.1428 | LR: 6.98e-06
11:05:15 | INFO | Epoch 5.79 | Loss: 1.1361 | LR: 6.68e-06
11:09:07 | INFO | Epoch 5.89 | Loss: 1.1428 | LR: 6.38e-06
11:12:59 | INFO | Epoch 5.99 | Loss: 1.1516 | LR: 6.07e-06
11:13:09 | INFO | ===== Average Training Loss for Epoch 6: 1.1410 =====
11:13:09 | INFO | === BEFORE EVALUATION ===
11:13:09 | INFO | BEFORE E


EVALUATION RESULTS - Epoch 6

QUALITY METRICS:
+------------+---------+
| Metric     |   Score |
| BLEU-1     |  0.2579 |
+------------+---------+
| BLEU-2     |  0.1579 |
+------------+---------+
| BLEU-3     |  0.1125 |
+------------+---------+
| BLEU-4     |  0.0871 |
+------------+---------+
| ROUGE-L    |  0.2743 |
+------------+---------+
| METEOR     |  0.3082 |
+------------+---------+
| BERT-SCORE |  0.8873 |
+------------+---------+
| SELF-BLEU  |  0.9161 |
+------------+---------+

DIVERSITY METRICS:
+------------+---------+
| Metric     |   Score |
| SELF-BLEU  |  0.9161 |
+------------+---------+
| DISTINCT-1 |  0.1199 |
+------------+---------+
| DISTINCT-2 |  0.3036 |
+------------+---------+

OTHER METRICS:
+----------+---------+
| Metric   |   Score |
| LOSS     |  1.3032 |
+----------+---------+



13:33:03 | INFO | === AFTER EVALUATION ===
13:33:03 | INFO | AFTER EVAL GPU Memory - Allocated: 2.82GB, Reserved: 3.08GB
13:33:03 | INFO | AFTER EVAL Model - Training mode: True, Device: cuda:0
13:33:03 | INFO | Evaluation completed successfully. Average loss: 1.3032
13:37:04 | INFO | Epoch 6.09 | Loss: 1.1304 | LR: 5.77e-06
13:40:57 | INFO | Epoch 6.19 | Loss: 1.1122 | LR: 5.47e-06
13:44:48 | INFO | Epoch 6.29 | Loss: 1.1196 | LR: 5.17e-06
13:48:39 | INFO | Epoch 6.39 | Loss: 1.1175 | LR: 4.86e-06
13:52:32 | INFO | Epoch 6.49 | Loss: 1.1300 | LR: 4.56e-06
13:56:22 | INFO | Epoch 6.59 | Loss: 1.1201 | LR: 4.26e-06
14:00:15 | INFO | Epoch 6.69 | Loss: 1.1208 | LR: 3.96e-06
14:04:07 | INFO | Epoch 6.79 | Loss: 1.1127 | LR: 3.65e-06
14:07:58 | INFO | Epoch 6.89 | Loss: 1.1327 | LR: 3.35e-06
14:11:51 | INFO | Epoch 6.99 | Loss: 1.1173 | LR: 3.05e-06
14:12:03 | INFO | ===== Average Training Loss for Epoch 7: 1.1213 =====
14:12:03 | INFO | === BEFORE EVALUATION ===
14:12:03 | INFO | BEFORE E


EVALUATION RESULTS - Epoch 7

QUALITY METRICS:
+------------+---------+
| Metric     |   Score |
| BLEU-1     |  0.2592 |
+------------+---------+
| BLEU-2     |  0.1586 |
+------------+---------+
| BLEU-3     |  0.1131 |
+------------+---------+
| BLEU-4     |  0.0875 |
+------------+---------+
| ROUGE-L    |  0.2762 |
+------------+---------+
| METEOR     |  0.3071 |
+------------+---------+
| BERT-SCORE |  0.8871 |
+------------+---------+
| SELF-BLEU  |  0.9162 |
+------------+---------+

DIVERSITY METRICS:
+------------+---------+
| Metric     |   Score |
| SELF-BLEU  |  0.9162 |
+------------+---------+
| DISTINCT-1 |  0.1203 |
+------------+---------+
| DISTINCT-2 |  0.3025 |
+------------+---------+

OTHER METRICS:
+----------+---------+
| Metric   |   Score |
| LOSS     |  1.3025 |
+----------+---------+



16:26:13 | INFO | === AFTER EVALUATION ===
16:26:13 | INFO | AFTER EVAL GPU Memory - Allocated: 2.82GB, Reserved: 3.08GB
16:26:13 | INFO | AFTER EVAL Model - Training mode: True, Device: cuda:0
16:26:13 | INFO | Evaluation completed successfully. Average loss: 1.3025
16:30:13 | INFO | Epoch 7.09 | Loss: 1.1222 | LR: 2.75e-06
16:34:06 | INFO | Epoch 7.19 | Loss: 1.0950 | LR: 2.44e-06
16:37:58 | INFO | Epoch 7.29 | Loss: 1.1060 | LR: 2.14e-06
16:41:46 | INFO | Epoch 7.39 | Loss: 1.1134 | LR: 1.84e-06
16:45:40 | INFO | Epoch 7.49 | Loss: 1.1026 | LR: 1.54e-06
16:49:31 | INFO | Epoch 7.59 | Loss: 1.1228 | LR: 1.23e-06
16:53:23 | INFO | Epoch 7.69 | Loss: 1.1080 | LR: 9.31e-07
16:57:14 | INFO | Epoch 7.79 | Loss: 1.1116 | LR: 6.29e-07
17:01:03 | INFO | Epoch 7.89 | Loss: 1.1096 | LR: 3.26e-07
17:04:57 | INFO | Epoch 7.99 | Loss: 1.1057 | LR: 2.38e-08
17:05:12 | INFO | ===== Average Training Loss for Epoch 8: 1.1097 =====
17:05:12 | INFO | === BEFORE EVALUATION ===
17:05:12 | INFO | BEFORE E


EVALUATION RESULTS - Epoch 8

QUALITY METRICS:
+------------+---------+
| Metric     |   Score |
| BLEU-1     |  0.2583 |
+------------+---------+
| BLEU-2     |  0.1578 |
+------------+---------+
| BLEU-3     |  0.1123 |
+------------+---------+
| BLEU-4     |  0.087  |
+------------+---------+
| ROUGE-L    |  0.2746 |
+------------+---------+
| METEOR     |  0.3065 |
+------------+---------+
| BERT-SCORE |  0.887  |
+------------+---------+
| SELF-BLEU  |  0.917  |
+------------+---------+

DIVERSITY METRICS:
+------------+---------+
| Metric     |   Score |
| SELF-BLEU  |  0.917  |
+------------+---------+
| DISTINCT-1 |  0.1204 |
+------------+---------+
| DISTINCT-2 |  0.3038 |
+------------+---------+

OTHER METRICS:
+----------+---------+
| Metric   |   Score |
| LOSS     |  1.3066 |
+----------+---------+



19:19:46 | INFO | === AFTER EVALUATION ===
19:19:46 | INFO | AFTER EVAL GPU Memory - Allocated: 2.82GB, Reserved: 3.08GB
19:19:46 | INFO | AFTER EVAL Model - Training mode: True, Device: cuda:0
19:19:46 | INFO | Evaluation completed successfully. Average loss: 1.3066
There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight'].



TRAINING SUMMARY
+--------------+--------------+---------------+
| Metric       |   Best Score | Achieved At   |
| Best BLEU-4  |       0.0875 | Epoch 7       |
+--------------+--------------+---------------+
| Best ROUGE-L |       0.2762 | Epoch 7       |
+--------------+--------------+---------------+
| Best METEOR  |       0.3082 | Epoch 6       |
+--------------+--------------+---------------+



19:20:08 | INFO | Saving Phase 2 model to ./t5-phase2-tuned\final
19:20:17 | INFO | Performing final evaluation...
19:20:17 | INFO | === BEFORE EVALUATION ===
19:20:17 | INFO | BEFORE EVAL GPU Memory - Allocated: 2.80GB, Reserved: 3.08GB
19:20:17 | INFO | BEFORE EVAL Model - Training mode: True, Device: cuda:0
19:20:24 | INFO | Starting evaluation on full validation set...
Evaluating Epoch 8: 100%|██████████| 3696/3696 [1:38:46<00:00,  1.60s/batch]  
20:59:11 | INFO | Computing metrics for 7392 predictions...
20:59:11 | INFO | Computing quality and diversity metrics...
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Adeptus_Mechanicus\AppData\Roaming\nltk_data.
[nltk_data]     ..
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Adeptus_Mechanicus\AppData\Roaming\nltk_data.
[nltk_data]     ..
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk


EVALUATION RESULTS - Epoch 8

QUALITY METRICS:
+------------+---------+
| Metric     |   Score |
| BLEU-1     |  0.259  |
+------------+---------+
| BLEU-2     |  0.1581 |
+------------+---------+
| BLEU-3     |  0.1127 |
+------------+---------+
| BLEU-4     |  0.0871 |
+------------+---------+
| ROUGE-L    |  0.2759 |
+------------+---------+
| METEOR     |  0.3067 |
+------------+---------+
| BERT-SCORE |  0.8871 |
+------------+---------+
| SELF-BLEU  |  0.9176 |
+------------+---------+

DIVERSITY METRICS:
+------------+---------+
| Metric     |   Score |
| SELF-BLEU  |  0.9176 |
+------------+---------+
| DISTINCT-1 |  0.1205 |
+------------+---------+
| DISTINCT-2 |  0.3027 |
+------------+---------+

OTHER METRICS:
+----------+---------+
| Metric   |   Score |
| LOSS     |  1.3048 |
+----------+---------+



21:34:34 | INFO | === AFTER EVALUATION ===
21:34:34 | INFO | AFTER EVAL GPU Memory - Allocated: 2.82GB, Reserved: 3.08GB
21:34:34 | INFO | AFTER EVAL Model - Training mode: True, Device: cuda:0
21:34:34 | INFO | Evaluation completed successfully. Average loss: 1.3048
21:34:34 | INFO | Generating sample predictions...



SAMPLE PREDICTIONS - 20 Context-Question Pairs

--- SAMPLE  1 ---
CONTEXT: Last Thursday Kayla and I had appointments to get our hair done . We had two different goals in mind . Kayla wanted to get a perm and add some curl to her hair . Though she would n't let me take the c...
ACTUAL:   What do we know about Kayla 's hair before she got her hair done ?
PREDICTED: What may be a fact about this person ?
--------------------------------------------------------------------------------

--- SAMPLE  2 ---
CONTEXT: Britain is facing a sharp rise in its rat population as growing numbers of people leave what they cannot finish of the fast food in the street, an environment group warned .Keep Britain Tidy said the ...
ACTUAL:   What was the rat population in Britain in 2000 according to the writer?
PREDICTED: How many people get Weil's Disease every year?
--------------------------------------------------------------------------------

--- SAMPLE  3 ---
CONTEXT: Turtle Bay Resort: The Turtle B

21:34:54 | INFO | Phase 2 pipeline completed successfully!



--- SAMPLE 20 ---
CONTEXT: Daniel comes from Sydney. He is now staying in Beijing with his family. He usually gets up at about 6:30 am and has breakfast at 7:00 am. Then he leaves home at 7:15 am.
He gets to school at 7:45 am. ...
ACTUAL:   How many classes do they have a day?
PREDICTED: How many lessons does Daniel have in the morning?
--------------------------------------------------------------------------------


PHASE 2 FINAL EVALUATION RESULTS:
╒════════════╤═══════════════╕
│ Metric     │   Final Score │
╞════════════╪═══════════════╡
│ EPOCH      │        8      │
├────────────┼───────────────┤
│ BERT_SCORE │        0.8871 │
├────────────┼───────────────┤
│ BLEU_1     │        0.259  │
├────────────┼───────────────┤
│ BLEU_2     │        0.1581 │
├────────────┼───────────────┤
│ BLEU_3     │        0.1127 │
├────────────┼───────────────┤
│ BLEU_4     │        0.0871 │
├────────────┼───────────────┤
│ DISTINCT_1 │        0.1205 │
├────────────┼───────────────┤
│ DISTINCT_2 │  