In [1]:
import os
%pwd

'd:\\MLOPs\\End to end NLP Project with HuggingFace and Transformers\\research'

In [2]:
# Step from research/ up to the project root, but only while we are still
# inside research/. An unconditional chdir climbs one more level every time
# this cell is re-run, which would break relative paths such as the
# "artifacts" directory created later in the notebook.
if os.path.basename(os.getcwd()) == "research":
    os.chdir("../")

In [3]:
%pwd

'd:\\MLOPs\\End to end NLP Project with HuggingFace and Transformers'

In [4]:
from dataclasses import dataclass
from pathlib import Path


# @dataclass
# class ModelTrainerConfig:
#     root_dir: Path
#     data_path: Path
#     model_ckpt: Path
#     num_train_epoches: int
#     warmup_steps: int
#     per_device_train_batch_size: int
#     weight_decay: float
#     logging_steps: int
#     evaluation_strategy: str 
#     eval_steps: int
#     save_steps: float
#     gradient_accumulation_steps: int

@dataclass
class ModelTrainerConfig:
    """Settings consumed by the model-training stage.

    The ``@dataclass`` decorator is required: ``ConfigurationManager`` builds
    this object with keyword arguments, and a plain class with only
    annotations has no ``__init__`` that accepts them.

    Attributes:
        root_dir: Directory for this stage; used as the trainer output
            directory and as the parent of the saved model/tokenizer folders.
        data_path: Location of the dataset that is loaded from disk.
        model_ckpt: Checkpoint identifier passed to ``from_pretrained``.
        num_train_epochs: Taken from ``params.TrainingArguments``.
        warmup_steps: Taken from ``params.TrainingArguments``.
        per_device_train_batch_size: Taken from ``params.TrainingArguments``.
        weight_decay: Taken from ``params.TrainingArguments``.
        logging_steps: Taken from ``params.TrainingArguments``.
        evaluation_strategy: Taken from ``params.TrainingArguments``.
        eval_steps: Taken from ``params.TrainingArguments``.
        save_steps: Taken from ``params.TrainingArguments``.
        gradient_accumulation_steps: Taken from ``params.TrainingArguments``.
    """

    root_dir: Path
    data_path: Path
    model_ckpt: str
    num_train_epochs: int
    warmup_steps: int
    per_device_train_batch_size: int
    weight_decay: float
    logging_steps: int
    evaluation_strategy: str
    eval_steps: int
    save_steps: float
    gradient_accumulation_steps: int


In [5]:
from src.textSummarizer.constants import *
from src.textSummarizer.utils.common import read_yaml, create_directories

In [6]:
class ConfigurationManager:
    """Reads the project YAML files and builds per-stage config objects.

    Both files are parsed with ``read_yaml`` when the manager is constructed,
    and the directory named by ``artifacts_root`` in the config file is
    created via ``create_directories``.
    """

    def __init__(
        self,
        config_filepath=CONFIG_FILE_PATH,
        params_filepath=PARAMS_FILE_PATH
    ):
        """Load both YAML files and create the artifacts root directory.

        Args:
            config_filepath: Path of the config YAML; defaults to
                ``CONFIG_FILE_PATH``.
            params_filepath: Path of the params YAML; defaults to
                ``PARAMS_FILE_PATH``.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        artifacts_dir = self.config.artifacts_root
        create_directories([artifacts_dir])

    def get_model_trainer_config(self) -> ModelTrainerConfig:
        """Assemble the ``ModelTrainerConfig`` for the training stage.

        Creates the stage's ``root_dir`` first, then copies the location
        fields from ``config.model_trainer`` and the hyperparameters from
        ``params.TrainingArguments``.

        Returns:
            A populated ``ModelTrainerConfig``.
        """
        trainer_section = self.config.model_trainer
        hyperparams = self.params.TrainingArguments

        create_directories([trainer_section.root_dir])

        # Fields read from the config file's model_trainer section.
        location_fields = ("root_dir", "data_path", "model_ckpt")
        # Fields read from the params file's TrainingArguments section.
        hyperparam_fields = (
            "num_train_epochs",
            "warmup_steps",
            "per_device_train_batch_size",
            "weight_decay",
            "logging_steps",
            "evaluation_strategy",
            "eval_steps",
            "save_steps",
            "gradient_accumulation_steps",
        )

        field_values = {
            name: getattr(trainer_section, name) for name in location_fields
        }
        field_values.update(
            (name, getattr(hyperparams, name)) for name in hyperparam_fields
        )

        return ModelTrainerConfig(**field_values)


In [7]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from transformers import TrainingArguments, Trainer
from transformers import DataCollatorForSeq2Seq
import torch
from datasets import load_from_disk

  from .autonotebook import tqdm as notebook_tqdm


In [8]:
# import os
# import torch
# from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Trainer, TrainingArguments
# from datasets import load_from_disk
# from transformers import TrainingArguments, Trainer


# class ModelTrainer:
#     def __init__(self, config: "ModelTrainerConfig"):
#         self.config = config

#     def train(self):
#         device = "cuda" if torch.cuda.is_available() else "cpu"

#         # Load tokenizer and model
#         tokenizer = AutoTokenizer.from_pretrained(self.config.model_ckpt)
#         model_pegasus = AutoModelForSeq2SeqLM.from_pretrained(self.config.model_ckpt).to(device)

#         # Data collator
#         seq2seq_data_collator = DataCollatorForSeq2Seq(tokenizer, model=model_pegasus)

#         # Load dataset
#         # dataset_samsum_pt = load_from_disk(self.config.data_path)
#         from pathlib import Path

#         dataset_path = Path(self.config.data_path)
#         print("Trying to load dataset from:", dataset_path.resolve())
#         print("Exists?", dataset_path.exists())
#         print("Contents:", list(dataset_path.glob("*")))

#         dataset_samsum_pt = load_from_disk(dataset_path)


#         # Training arguments
#         trainer_args = TrainingArguments(
#             output_dir=self.config.root_dir,
#             num_train_epochs=1,
#             warmup_steps=500,
#             per_device_train_batch_size=1,
#             per_device_eval_batch_size=1,
#             weight_decay=0.01,
#             logging_steps=10,
#             eval_strategy="steps",
#             eval_steps=500,
#             save_steps=1e6,
#             gradient_accumulation_steps=16
#         )

#         # Trainer
#         trainer = Trainer(
#             model=model_pegasus, 
#             args=trainer_args,
#             tokenizer=tokenizer, 
#             data_collator=seq2seq_data_collator,
#             train_dataset=dataset_samsum_pt["test"],
#             eval_dataset=dataset_samsum_pt["validation"]
#         )

#         # Train
#         trainer.train()

#         # Save model and tokenizer
#         model_pegasus.save_pretrained(os.path.join(self.config.root_dir, "pegasus-samsum-model"))
#         tokenizer.save_pretrained(os.path.join(self.config.root_dir, "tokenizer"))


In [9]:
import os
import torch
from pathlib import Path
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Trainer,
    TrainingArguments
)
from datasets import load_from_disk


class ModelTrainer:
    """Fine-tunes a seq2seq checkpoint on a dataset stored on disk.

    All locations and hyperparameters come from the ``ModelTrainerConfig``
    passed to the constructor.
    """

    def __init__(self, config: "ModelTrainerConfig"):
        """Store the training-stage configuration.

        Args:
            config: Object exposing the ``ModelTrainerConfig`` attributes.
        """
        self.config = config

    def _training_arguments_kwargs(self) -> dict:
        """Build the keyword arguments passed to ``TrainingArguments``.

        Hyperparameters are read from ``self.config`` rather than being
        hard-coded, so edits to the params file actually take effect.

        Returns:
            Mapping of ``TrainingArguments`` keyword names to values.
        """
        cfg = self.config
        return {
            "output_dir": str(Path(cfg.root_dir).resolve()),
            "num_train_epochs": cfg.num_train_epochs,
            "warmup_steps": cfg.warmup_steps,
            "per_device_train_batch_size": cfg.per_device_train_batch_size,
            # The config has no eval batch size field, so this stays fixed.
            "per_device_eval_batch_size": 1,
            "weight_decay": cfg.weight_decay,
            "logging_strategy": "steps",
            "logging_steps": cfg.logging_steps,
            # The config field is named evaluation_strategy, while the keyword
            # used here is eval_strategy.
            "eval_strategy": cfg.evaluation_strategy,
            "eval_steps": cfg.eval_steps,
            # Checkpointing is switched off, which is why cfg.save_steps is
            # not forwarded.
            "save_strategy": "no",
            "gradient_accumulation_steps": cfg.gradient_accumulation_steps,
            # Do not report to external experiment trackers.
            "report_to": "none",
        }

    def train(self):
        """Run fine-tuning and save the resulting model and tokenizer.

        Raises:
            FileNotFoundError: If the configured dataset directory is missing.
        """
        device = "cuda" if torch.cuda.is_available() else "cpu"
        print(f"🔧 Using device: {device}")

        # Resolve to an absolute path and fail early, before the model is
        # loaded, if the dataset directory is missing.
        dataset_path = Path(self.config.data_path).resolve()
        print(f"📂 Trying to load dataset from: {dataset_path}")
        if not dataset_path.exists():
            raise FileNotFoundError(f"❌ Dataset folder not found at: {dataset_path}")

        # Listing the directory contents helps debug a wrong data_path.
        print(f"📑 Found files: {list(dataset_path.glob('*'))}")

        dataset_samsum_pt = load_from_disk(dataset_path)

        tokenizer = AutoTokenizer.from_pretrained(self.config.model_ckpt)
        model_pegasus = AutoModelForSeq2SeqLM.from_pretrained(self.config.model_ckpt).to(device)

        seq2seq_data_collator = DataCollatorForSeq2Seq(tokenizer, model=model_pegasus)

        trainer_args = TrainingArguments(**self._training_arguments_kwargs())

        trainer = Trainer(
            model=model_pegasus,
            args=trainer_args,
            tokenizer=tokenizer,
            data_collator=seq2seq_data_collator,
            train_dataset=dataset_samsum_pt["train"],
            eval_dataset=dataset_samsum_pt["validation"]
        )

        print("🚀 Starting training...")
        trainer.train()

        # Model and tokenizer go to sibling folders under the stage root.
        save_model_path = Path(self.config.root_dir) / "pegasus-samsum-model"
        save_tokenizer_path = Path(self.config.root_dir) / "tokenizer"

        model_pegasus.save_pretrained(save_model_path)
        tokenizer.save_pretrained(save_tokenizer_path)

        print(f"✅ Model saved at: {save_model_path}")
        print(f"✅ Tokenizer saved at: {save_tokenizer_path}")


In [10]:
from pathlib import Path
from src.textSummarizer.config.configuration import ConfigurationManager

# Locate the YAML files relative to the project root instead of a
# machine-specific absolute path. This relies on the working directory having
# been moved to the project root by the earlier os.chdir("../") cell.
project_root = Path.cwd()
config_path = project_root / "config" / "config.yaml"
params_path = project_root / "params.yaml"

# NOTE(review): the ConfigurationManager defined in this notebook names its
# first parameter `config_filepath`; confirm that the imported class really
# accepts `config_path`.
config_manager = ConfigurationManager(config_path=config_path, params_filepath=params_path)


[2025-09-19 16:07:13,557:INFO:common:yaml file: D:\MLOPs\End to end NLP Project with HuggingFace and Transformers\config\config.yaml loaded successfully]
[2025-09-19 16:07:13,580:INFO:common:yaml file: D:\MLOPs\End to end NLP Project with HuggingFace and Transformers\params.yaml loaded successfully]
[2025-09-19 16:07:13,590:INFO:common:created directory at: artifacts]


In [11]:
import importlib
import src.textSummarizer.config.configuration as cfg
# Re-execute the configuration module so edits made on disk are picked up
# without restarting the kernel, then rebind the class name to the fresh
# definition (importlib.reload returns the reloaded module).
ConfigurationManager = importlib.reload(cfg).ConfigurationManager


In [None]:
# Build the stage configuration, then hand it to the trainer. The trainer
# gets its own name so that `model_trainer_config` keeps referring to the
# configuration object instead of being rebound to a ModelTrainer.
config = ConfigurationManager()
model_trainer_config = config.get_model_trainer_config()
model_trainer = ModelTrainer(config=model_trainer_config)
model_trainer.train()

[2025-09-19 16:07:13,683:INFO:common:yaml file: D:\MLOPs\End to end NLP Project with HuggingFace and Transformers\config\config.yaml loaded successfully]
[2025-09-19 16:07:13,700:INFO:common:yaml file: D:\MLOPs\End to end NLP Project with HuggingFace and Transformers\params.yaml loaded successfully]
[2025-09-19 16:07:13,705:INFO:common:created directory at: artifacts]
[2025-09-19 16:07:13,710:INFO:common:created directory at: artifacts\model_trainer]
🔧 Using device: cpu
📂 Trying to load dataset from: D:\MLOPs\End to end NLP Project with HuggingFace and Transformers\artifacts\data_transformation\samsum_dataset
📑 Found files: [WindowsPath('D:/MLOPs/End to end NLP Project with HuggingFace and Transformers/artifacts/data_transformation/samsum_dataset/dataset_dict.json'), WindowsPath('D:/MLOPs/End to end NLP Project with HuggingFace and Transformers/artifacts/data_transformation/samsum_dataset/test'), WindowsPath('D:/MLOPs/End to end NLP Project with HuggingFace and Transformers/artifacts/d

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-cnn_dailymail and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None}.


🚀 Starting training...




In [None]:
from transformers import TrainingArguments

# Diagnostic cell: print the module that provides TrainingArguments and the
# variable names recorded on its __init__ code object — presumably to confirm
# keyword spellings such as `eval_strategy` in the installed release.
init_code = TrainingArguments.__init__.__code__
print(TrainingArguments.__module__)
print(init_code.co_varnames)


transformers.training_args
('self', 'output_dir', 'overwrite_output_dir', 'do_train', 'do_eval', 'do_predict', 'eval_strategy', 'prediction_loss_only', 'per_device_train_batch_size', 'per_device_eval_batch_size', 'per_gpu_train_batch_size', 'per_gpu_eval_batch_size', 'gradient_accumulation_steps', 'eval_accumulation_steps', 'eval_delay', 'torch_empty_cache_steps', 'learning_rate', 'weight_decay', 'adam_beta1', 'adam_beta2', 'adam_epsilon', 'max_grad_norm', 'num_train_epochs', 'max_steps', 'lr_scheduler_type', 'lr_scheduler_kwargs', 'warmup_ratio', 'warmup_steps', 'log_level', 'log_level_replica', 'log_on_each_node', 'logging_dir', 'logging_strategy', 'logging_first_step', 'logging_steps', 'logging_nan_inf_filter', 'save_strategy', 'save_steps', 'save_total_limit', 'save_safetensors', 'save_on_each_node', 'save_only_model', 'restore_callback_states_from_checkpoint', 'no_cuda', 'use_cpu', 'use_mps_device', 'seed', 'data_seed', 'jit_mode_eval', 'use_ipex', 'bf16', 'fp16', 'fp16_opt_level'