In [1]:
import os 
# Display the kernel's current working directory (IPython line magic).
%pwd 

'd:\\VS_code\\MLops\\11_End-to-End_Projects\\End-to-End_TextSummarizer_Project\\research'

In [2]:
os.chdir("..")
%pwd

'd:\\VS_code\\MLops\\11_End-to-End_Projects\\End-to-End_TextSummarizer_Project'

In [4]:
from pathlib import Path 
from dataclasses import dataclass 

@dataclass
class ModelTrainerConfig:
    """Typed bundle of settings consumed by the model-training stage.

    The declaration order of the fields below is also the positional order of
    the generated constructor, so it must not be rearranged.
    """

    # --- locations / checkpoint ------------------------------------------
    root_dir: Path      # stage directory; used as the training output dir
    data_path: Path     # dataset location handed to `load_from_disk`
    model_ckpt: Path    # checkpoint identifier handed to `from_pretrained`

    # --- values forwarded to `TrainingArguments` ---------------------------
    num_train_epochs: int
    warmup_steps: int
    per_device_train_batch_size: int
    weight_decay: float
    logging_steps: int
    evaluation_strategy: str
    eval_steps: int
    save_steps: float
    gradient_accumulation_steps: int

In [5]:
from src.TextSummarizer.constants import * 
from src.TextSummarizer.utils.common import read_yaml, create_directories

In [6]:
class ConfigurationManager:
    """Reads the project's YAML files and builds typed per-stage config objects."""

    def __init__(self,
                 config_file_path = CONFIG_FILE_PATH,
                 params_file_path = PARAMS_FILE_PATH):
        """Load both YAML files and make sure the artifacts root directory exists.

        Args:
            config_file_path: Path of the configuration YAML (paths, checkpoint).
            params_file_path: Path of the parameters YAML (training hyper-parameters).
        """
        self.config = read_yaml(config_file_path)
        # Read the path that was passed in, not the module-level constant,
        # so a caller-supplied params file is actually honoured.
        self.params = read_yaml(params_file_path)

        create_directories([self.config.artifacts_root_dir])

    def get_model_trainer_config(self) -> ModelTrainerConfig:
        """Build the configuration for the model-training stage.

        Creates the stage's root directory as a side effect.

        Returns:
            ModelTrainerConfig: locations and checkpoint taken from the
            ``model_trainer`` section of the config file, hyper-parameters
            taken from the ``TrainingArguments`` section of the params file.
        """
        config = self.config.model_trainer
        params = self.params.TrainingArguments

        create_directories([config.root_dir])

        # NOTE(review): assumes the hyper-parameter keys live under
        # `TrainingArguments` in the params file (that section is loaded here
        # for exactly this purpose) — confirm against params.yaml.
        return ModelTrainerConfig(
            root_dir=config.root_dir,
            data_path=config.data_path,
            model_ckpt=config.model_ckpt,
            num_train_epochs=params.num_train_epochs,
            warmup_steps=params.warmup_steps,
            per_device_train_batch_size=params.per_device_train_batch_size,
            weight_decay=params.weight_decay,
            logging_steps=params.logging_steps,
            evaluation_strategy=params.evaluation_strategy,
            eval_steps=params.eval_steps,
            save_steps=params.save_steps,
            gradient_accumulation_steps=params.gradient_accumulation_steps,
        )

In [10]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from transformers import TrainingArguments, Trainer 
from transformers import DataCollatorForSeq2Seq
import torch 
from datasets import load_from_disk

In [11]:
from src.TextSummarizer.logging import logger

In [8]:
class ModelTrainer:
    """Fine-tunes a seq2seq checkpoint on a dataset stored on disk and saves the result."""

    def __init__(self, config: ModelTrainerConfig):
        """Keep a reference to the training-stage configuration.

        Args:
            config: Locations, checkpoint and hyper-parameters for this stage.
        """
        self.config = config

    def train(self):
        """Run fine-tuning and persist the trained model and tokenizer.

        Steps:
            1. Load the tokenizer and model from ``config.model_ckpt`` and move
               the model to CUDA when available, otherwise CPU.
            2. Load the dataset from ``config.data_path``; its ``"train"`` and
               ``"validation"`` splits are used for training and evaluation.
            3. Train with a ``Trainer`` configured from the hyper-parameters
               in ``config``.
            4. Save the model to ``<root_dir>/pegasus-samsum-model`` and the
               tokenizer to ``<root_dir>/tokenizer``.
        """
        # `is_available` (not `is_availabel`) is the torch API; the misspelt
        # name raises AttributeError before any work is done.
        device = "cuda" if torch.cuda.is_available() else "cpu"
        tokenizer = AutoTokenizer.from_pretrained(self.config.model_ckpt)
        model_pegasus = AutoModelForSeq2SeqLM.from_pretrained(self.config.model_ckpt).to(device)
        seq2seq_data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model_pegasus)

        # Loading the data
        logger.info("Loading the dataset for training")
        dataset_samsum_pt = load_from_disk(self.config.data_path)

        # NOTE(review): recent transformers releases rename `evaluation_strategy`
        # to `eval_strategy` — confirm against the installed version.
        trainer_args = TrainingArguments(
            output_dir=self.config.root_dir,
            num_train_epochs=self.config.num_train_epochs,
            warmup_steps=self.config.warmup_steps,
            per_device_train_batch_size=self.config.per_device_train_batch_size,
            weight_decay=self.config.weight_decay,
            logging_steps=self.config.logging_steps,
            evaluation_strategy=self.config.evaluation_strategy,
            eval_steps=self.config.eval_steps,
            save_steps=self.config.save_steps,
            gradient_accumulation_steps=self.config.gradient_accumulation_steps,
        )

        trainer = Trainer(
            model=model_pegasus,
            args=trainer_args,
            data_collator=seq2seq_data_collator,
            tokenizer=tokenizer,
            train_dataset=dataset_samsum_pt["train"],
            eval_dataset=dataset_samsum_pt["validation"],
        )
        logger.info("Training gets started")
        trainer.train()
        logger.info("Training successfully completed")

        # save the model
        model_pegasus.save_pretrained(os.path.join(self.config.root_dir, "pegasus-samsum-model"))
        # save tokenizer
        tokenizer.save_pretrained(os.path.join(self.config.root_dir, "tokenizer"))
        logger.info("Saved our trained model and tokenizer into local disk")


[2025-01-22 19:37:31,285: INFO: config: PyTorch version 2.5.1 available.]


In [None]:
!pip install --upgrade accelerate 
!pip uninstall -y transformers accelerate 
!pip install transformers accelerate

In [None]:
# Build the training-stage configuration from the YAML files, then run the stage.
config = ConfigurationManager()
model_trainer_config = config.get_model_trainer_config()
model_trainer = ModelTrainer(model_trainer_config)
# Fine-tunes the model and saves the model and tokenizer under the stage's root_dir.
model_trainer.train()