In [1]:
import os

In [2]:
%pwd

'c:\\1. Python Important\\Self Practice\\ML Practice\\NLP\\Text-Summarization\\research'

In [3]:
# Move from the `research/` notebook folder up to the project root so that
# relative paths such as config/config.yaml and params.yaml resolve.
# Guarded so that re-running this cell does not keep climbing the directory tree.
if os.path.basename(os.getcwd()) == "research":
    os.chdir("../")

In [4]:
%pwd

'c:\\1. Python Important\\Self Practice\\ML Practice\\NLP\\Text-Summarization'

In [5]:
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class ModelTrainerConfig:
    """Immutable settings for the model-training stage.

    Instances are built by ``ConfigurationManager.get_model_trainer_config``
    from the ``model_trainer`` section of the config file and the
    ``TrainingArguments`` section of the params file.
    """

    # --- values taken from the config file's ``model_trainer`` section ---
    root_dir: Path  # directory created for this stage; used as the training output dir
    data_path: Path  # location handed to ``load_from_disk``
    # Checkpoint identifier handed to ``from_pretrained``. This is a model id
    # string rather than a filesystem path, hence ``str`` and not ``Path``.
    model_ckpt: str

    # --- values taken from the params file's ``TrainingArguments`` section ---
    num_train_epochs: int
    warmup_steps: int
    per_device_train_batch_size: int
    weight_decay: float
    logging_steps: int
    evaluation_strategy: str
    eval_steps: int
    save_steps: float
    gradient_accumulation_steps: int

In [6]:
from textsumarrizer.constants import *
from textsumarrizer.utils.common import read_yaml, create_directories

In [7]:
class ConfigurationManager:
    """Reads the project's config and params YAML files and builds stage configs."""

    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH):
        """Read both YAML files and create the artifacts root directory.

        Args:
            config_filepath: Path of the config YAML file (paths and checkpoints).
            params_filepath: Path of the params YAML file (training arguments).
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        create_directories([self.config.artifacts_root])

    def get_model_trainer_config(self) -> ModelTrainerConfig:
        """Build the ``ModelTrainerConfig`` for the training stage.

        Creates the stage's root directory as a side effect.

        Returns:
            A ``ModelTrainerConfig`` combining the ``model_trainer`` section of
            the config file with the ``TrainingArguments`` section of the
            params file.
        """
        config = self.config.model_trainer
        params = self.params.TrainingArguments

        create_directories([config.root_dir])

        model_trainer_config = ModelTrainerConfig(
            root_dir=config.root_dir,
            data_path=config.data_path,
            model_ckpt=config.model_ckpt,
            num_train_epochs=params.num_train_epochs,
            warmup_steps=params.warmup_steps,
            per_device_train_batch_size=params.per_device_train_batch_size,
            weight_decay=params.weight_decay,
            logging_steps=params.logging_steps,
            evaluation_strategy=params.evaluation_strategy,
            # Fixed: this previously read params.evaluation_strategy, which put
            # the strategy string into the integer step-count field.
            eval_steps=params.eval_steps,
            save_steps=params.save_steps,
            gradient_accumulation_steps=params.gradient_accumulation_steps,
        )

        return model_trainer_config

In [8]:
from transformers import TrainingArguments, Trainer
from transformers import DataCollatorForSeq2Seq
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from datasets import load_dataset, load_from_disk
import torch

  from .autonotebook import tqdm as notebook_tqdm


[2024-06-26 00:08:23,198: INFO: config: PyTorch version 2.3.1+cu121 available.]


In [9]:
class ModelTrainer:
    """Fine-tunes a seq2seq checkpoint on a dataset stored on disk and saves the result."""

    def __init__(self, config: ModelTrainerConfig):
        """Keep the training-stage configuration for use by ``train``."""
        self.config = config

    def train(self):
        """Fine-tune the checkpoint, then save the model and tokenizer.

        Loads the tokenizer and model from ``config.model_ckpt``, reads the
        dataset from ``config.data_path``, trains on its ``"train"`` split while
        evaluating on ``"validation"``, and writes the model to
        ``<root_dir>/pegasus-samsum-model`` and the tokenizer to
        ``<root_dir>/tokenizer``.
        """
        use_cuda = torch.cuda.is_available()
        device = "cuda" if use_cuda else "cpu"

        tokenizer = AutoTokenizer.from_pretrained(self.config.model_ckpt)
        model_pegasus = AutoModelForSeq2SeqLM.from_pretrained(self.config.model_ckpt).to(device)
        seq2seq_data_collator = DataCollatorForSeq2Seq(tokenizer, model=model_pegasus)

        # Load the dataset from disk.
        dataset_samsum_pt = load_from_disk(self.config.data_path)

        # NOTE: apart from output_dir, the hyper-parameters below are hard-coded
        # to small values for a quick, low-memory run; the values carried on
        # self.config are NOT used. To drive training from params.yaml instead,
        # build the arguments like this:
        #
        #   trainer_args = TrainingArguments(
        #       output_dir=self.config.root_dir,
        #       num_train_epochs=self.config.num_train_epochs,
        #       warmup_steps=self.config.warmup_steps,
        #       per_device_train_batch_size=self.config.per_device_train_batch_size,
        #       per_device_eval_batch_size=self.config.per_device_train_batch_size,
        #       weight_decay=self.config.weight_decay,
        #       logging_steps=self.config.logging_steps,
        #       evaluation_strategy=self.config.evaluation_strategy,
        #       eval_steps=self.config.eval_steps,
        #       save_steps=1e6,
        #       gradient_accumulation_steps=self.config.gradient_accumulation_steps,
        #   )
        trainer_args = TrainingArguments(
            output_dir=self.config.root_dir,
            num_train_epochs=0.01,
            warmup_steps=10,
            per_device_train_batch_size=1,
            per_device_eval_batch_size=1,
            weight_decay=0.01,
            logging_steps=10,
            evaluation_strategy='steps',
            eval_steps=10,
            save_steps=1e6,
            gradient_accumulation_steps=2,  # Adjust according to your memory constraints
            # Mixed precision needs a GPU: with fp16=True on a CPU-only machine
            # TrainingArguments refuses to initialise, so enable it only when
            # CUDA is actually available.
            fp16=use_cuda,
            gradient_checkpointing=True,  # Enable gradient checkpointing
            report_to="none",  # Disable reporting to minimize memory overhead
        )

        trainer = Trainer(
            model=model_pegasus,
            args=trainer_args,
            tokenizer=tokenizer,
            data_collator=seq2seq_data_collator,
            # Trains on the full "train" split. For a cheaper smoke run,
            # dataset_samsum_pt["test"] can be swapped in here because it has
            # fewer datapoints.
            train_dataset=dataset_samsum_pt["train"],
            eval_dataset=dataset_samsum_pt["validation"],
        )

        trainer.train()

        # Save the model.
        model_pegasus.save_pretrained(os.path.join(self.config.root_dir, "pegasus-samsum-model"))
        # Save the tokenizer.
        tokenizer.save_pretrained(os.path.join(self.config.root_dir, "tokenizer"))

In [10]:
# Clear PyTorch's CUDA cache before training; this can sometimes free up
# enough GPU memory for the run to fit.
torch.cuda.empty_cache()

WARNING: "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference."

In [11]:
# Build the training-stage configuration from the YAML files, then run training.
# Exceptions propagate unchanged, so no try/except wrapper is needed (the
# previous `except Exception as e: raise e` only re-raised the same error).
config = ConfigurationManager()
model_trainer_config = config.get_model_trainer_config()
# Keep the trainer under its own name instead of overwriting the config object.
model_trainer = ModelTrainer(config=model_trainer_config)
model_trainer.train()

[2024-06-26 00:08:23,769: INFO: common: yaml file: config\config.yaml loaded successfully]
[2024-06-26 00:08:23,774: INFO: common: yaml file: params.yaml loaded successfully]
[2024-06-26 00:08:23,775: INFO: common: created directory at: artifacts]
[2024-06-26 00:08:23,776: INFO: common: created directory at: artifacts/model_trainer]


Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-cnn_dailymail and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 14%|█▎        | 10/74 [00:54<05:15,  4.92s/it]

{'loss': 3.1975, 'grad_norm': 23.971405029296875, 'learning_rate': 4e-05, 'epoch': 0.0}


                                               
 14%|█▎        | 10/74 [04:50<05:15,  4.92s/it]  

{'eval_loss': 2.4174280166625977, 'eval_runtime': 235.965, 'eval_samples_per_second': 3.467, 'eval_steps_per_second': 3.467, 'epoch': 0.0}


 27%|██▋       | 20/74 [05:38<07:00,  7.79s/it]  

{'loss': 2.7978, 'grad_norm': 19.381608963012695, 'learning_rate': 4.453125e-05, 'epoch': 0.0}


                                               
 27%|██▋       | 20/74 [09:45<07:00,  7.79s/it]  

{'eval_loss': 2.105271816253662, 'eval_runtime': 247.1257, 'eval_samples_per_second': 3.31, 'eval_steps_per_second': 3.31, 'epoch': 0.0}


 41%|████      | 30/74 [11:13<08:38, 11.78s/it]  

{'loss': 2.788, 'grad_norm': 154.64639282226562, 'learning_rate': 3.671875e-05, 'epoch': 0.0}


                                               
 41%|████      | 30/74 [16:21<08:38, 11.78s/it]  

{'eval_loss': 1.9806020259857178, 'eval_runtime': 308.6695, 'eval_samples_per_second': 2.65, 'eval_steps_per_second': 2.65, 'epoch': 0.0}


 54%|█████▍    | 40/74 [17:34<06:21, 11.22s/it]   

{'loss': 1.914, 'grad_norm': 12.137225151062012, 'learning_rate': 2.890625e-05, 'epoch': 0.01}


                                               
 54%|█████▍    | 40/74 [22:15<06:21, 11.22s/it]  

{'eval_loss': 1.9035159349441528, 'eval_runtime': 281.0449, 'eval_samples_per_second': 2.911, 'eval_steps_per_second': 2.911, 'epoch': 0.01}


 68%|██████▊   | 50/74 [23:24<04:16, 10.69s/it]

{'loss': 2.4647, 'grad_norm': 20.3756103515625, 'learning_rate': 2.1875e-05, 'epoch': 0.01}


                                               
 68%|██████▊   | 50/74 [29:15<04:16, 10.69s/it]  

{'eval_loss': 1.8697023391723633, 'eval_runtime': 351.2388, 'eval_samples_per_second': 2.329, 'eval_steps_per_second': 2.329, 'epoch': 0.01}


 81%|████████  | 60/74 [30:57<03:19, 14.25s/it] 

{'loss': 2.805, 'grad_norm': 11.69636058807373, 'learning_rate': 1.4062500000000001e-05, 'epoch': 0.01}


                                               
 81%|████████  | 60/74 [37:16<03:19, 14.25s/it]  

{'eval_loss': 1.8455625772476196, 'eval_runtime': 378.9315, 'eval_samples_per_second': 2.159, 'eval_steps_per_second': 2.159, 'epoch': 0.01}


 95%|█████████▍| 70/74 [38:39<00:52, 13.24s/it] 

{'loss': 2.4953, 'grad_norm': 41.65949249267578, 'learning_rate': 6.25e-06, 'epoch': 0.01}


                                               
 95%|█████████▍| 70/74 [45:05<00:52, 13.24s/it]  

{'eval_loss': 1.8350369930267334, 'eval_runtime': 385.2346, 'eval_samples_per_second': 2.123, 'eval_steps_per_second': 2.123, 'epoch': 0.01}


100%|██████████| 74/74 [45:43<00:00, 37.07s/it] 
Non-default generation parameters: {'max_length': 128, 'min_length': 32, 'num_beams': 8, 'length_penalty': 0.8, 'forced_eos_token_id': 1}


{'train_runtime': 2743.1187, 'train_samples_per_second': 0.054, 'train_steps_per_second': 0.027, 'train_loss': 2.6631710722639754, 'epoch': 0.01}


In [None]:
#torch.cuda.is_available()

In [None]:
#torch.__version__
# to check if torch version installed is CPU or CUDA

In [None]:
# To install the CUDA 12.1 (cu121) builds of PyTorch, torchvision and torchaudio:
# pip install torch==2.3.1+cu121 torchvision==0.18.1+cu121 torchaudio==2.3.1+cu121 -f https://download.pytorch.org/whl/torch_stable.html

In [None]:
"""
A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.0.0 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.


Because of this compatibility issue between NumPy and PyTorch, NumPy was downgraded from 2.0.0 to 1.26.4 in the `textS` virtual environment.
"""