In [1]:
import os

In [2]:
%pwd

'c:\\Users\\91931\\Text-Summarization-Project\\research'

In [3]:
# Step up one directory (research/ -> project root, per the %pwd outputs around
# this cell) so the relative config/artifact paths used later resolve.
# NOTE: this is relative to the current directory, so re-running the cell
# without restarting the kernel climbs one more level each time.
os.chdir(os.pardir)

In [4]:
%pwd

'c:\\Users\\91931\\Text-Summarization-Project'

In [5]:
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class ModelTrainerConfig:
    """Immutable bundle of settings for the model-training stage.

    ``ConfigurationManager.get_model_trainer_config`` fills ``root_dir``,
    ``data_path`` and ``model_ckpt`` from the ``model_trainer`` section of the
    config file, and the remaining fields from the ``TrainingArguments``
    section of the params file.

    The dataclass performs no validation or coercion: the annotations below
    are documentation only, and values are stored exactly as the YAML loader
    produced them (presumably plain strings/numbers rather than ``Path``
    objects -- verify against ``read_yaml``).
    """
    # --- locations / identifiers (config.model_trainer) ---
    root_dir: Path  # created by ConfigurationManager; used by ModelTrainer as the output directory
    data_path: Path  # handed to datasets.load_from_disk by ModelTrainer
    model_ckpt: Path  # handed to from_pretrained for both tokenizer and model
    # --- hyper-parameters (params.TrainingArguments) ---
    num_train_epochs: int
    warmup_steps: int
    per_device_train_batch_size: int
    weight_decay: float
    logging_steps: int
    evaluation_strategy: str
    eval_steps: int
    save_steps: float  # NOTE(review): float while the other step counts are int -- confirm intended type
    gradient_accumulation_steps: int

In [6]:
from textSummarization.constants import *
from textSummarization.utils.common import read_yaml, create_directories

In [7]:
class ConfigurationManager:
    """Read the project's config and params YAML files and build stage configs."""

    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH):
        """Load both YAML files and make sure the artifacts root directory exists.

        Args:
            config_filepath: Path of the config YAML (defaults to CONFIG_FILE_PATH).
            params_filepath: Path of the params YAML (defaults to PARAMS_FILE_PATH).
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        create_directories([self.config.artifacts_root])

    def get_model_trainer_config(self) -> ModelTrainerConfig:
        """Build the ``ModelTrainerConfig`` for the training stage.

        Creates the stage's ``root_dir`` as a side effect.

        Returns:
            ModelTrainerConfig: paths taken from ``config.model_trainer`` and
            hyper-parameters taken from ``params.TrainingArguments``.
        """
        config = self.config.model_trainer
        params = self.params.TrainingArguments

        create_directories([config.root_dir])

        return ModelTrainerConfig(
            root_dir=config.root_dir,
            data_path=config.data_path,
            model_ckpt=config.model_ckpt,
            num_train_epochs=params.num_train_epochs,
            warmup_steps=params.warmup_steps,
            per_device_train_batch_size=params.per_device_train_batch_size,
            weight_decay=params.weight_decay,
            logging_steps=params.logging_steps,
            evaluation_strategy=params.evaluation_strategy,
            # Fixed: this previously read params.evaluation_strategy, which put
            # the strategy string into the integer eval_steps field.
            eval_steps=params.eval_steps,
            save_steps=params.save_steps,
            gradient_accumulation_steps=params.gradient_accumulation_steps,
        )

In [9]:
from transformers import TrainingArguments, Trainer

# helps in preparing the inputs, attention masks, and labels in the correct format.
from transformers import DataCollatorForSeq2Seq
from transformers import AutoModelForSeq2SeqLM, T5Tokenizer, T5ForConditionalGeneration
from datasets import load_dataset, load_from_disk
import torch

In [10]:
# Ask PyTorch to release cached GPU memory it is not currently using, so the
# training cell below starts with as much free device memory as possible.
torch.cuda.empty_cache()

In [11]:
class ModelTrainer:
    """Fine-tune a T5 checkpoint on a dataset stored on disk and save the result."""

    def __init__(self, config: ModelTrainerConfig):
        """Keep the stage configuration (paths and model checkpoint) for ``train``."""
        self.config = config

    def train(self):
        """Run one fine-tuning pass and write the model and tokenizer to disk.

        Loads the tokenizer/model from ``config.model_ckpt`` and the dataset
        from ``config.data_path``, trains with Hugging Face ``Trainer``, then
        saves the model to ``<root_dir>/t5-samsum-model`` and the tokenizer to
        ``<root_dir>/tokenizer``.
        """
        use_cuda = torch.cuda.is_available()
        device = "cuda" if use_cuda else "cpu"
        tokenizer = T5Tokenizer.from_pretrained(self.config.model_ckpt)
        model_t5 = T5ForConditionalGeneration.from_pretrained(self.config.model_ckpt).to(device)
        seq2seq_data_collator = DataCollatorForSeq2Seq(tokenizer, model=model_t5)

        # loading data
        dataset_samsum_pt = load_from_disk(self.config.data_path)

        # NOTE: the hyper-parameters below are hard-coded; the values carried in
        # self.config (num_train_epochs, warmup_steps, ...) are currently NOT
        # used. Gradient accumulation was reduced 16 -> 8 -> 2.
        trainer_args = TrainingArguments(
            output_dir=self.config.root_dir,
            num_train_epochs=1,
            warmup_steps=500,
            per_device_train_batch_size=1,
            per_device_eval_batch_size=1,
            weight_decay=0.01,
            logging_steps=10,
            evaluation_strategy='steps',
            eval_steps=500,
            save_steps=1e6,
            gradient_accumulation_steps=2,
            # fp16 mixed precision is rejected by TrainingArguments when no
            # CUDA device is present, so enable it only on the GPU path;
            # otherwise the CPU fallback chosen above could never run.
            fp16=use_cuda,
            gradient_checkpointing=True,
        )

        # NOTE(review): training runs on the "test" split (presumably to keep
        # the run short on a small GPU). A model trained this way must not be
        # scored on that same split -- confirm this is intentional.
        trainer = Trainer(
            model=model_t5,
            args=trainer_args,
            tokenizer=tokenizer,
            data_collator=seq2seq_data_collator,
            train_dataset=dataset_samsum_pt["test"],
            eval_dataset=dataset_samsum_pt["validation"],
        )

        trainer.train()

        ## Save model
        model_t5.save_pretrained(os.path.join(self.config.root_dir, "t5-samsum-model"))
        ## Save tokenizer
        tokenizer.save_pretrained(os.path.join(self.config.root_dir, "tokenizer"))

        # os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'


In [13]:
import torch
print(torch.cuda.is_available())  # Should return True if CUDA is available
print(torch.cuda.device_count())  # Should return the number of GPUs available
# get_device_name(0) raises when there is no CUDA device, so only ask for the
# name when a device exists; on a GPU machine the output is unchanged.
if torch.cuda.is_available():
    print(torch.cuda.get_device_name(0))  # Should print the name of the GPU


True
1
NVIDIA GeForce GTX 1650


In [14]:
# TODO: load the T5 tokenizer with legacy=False (see the tokenizer warning in
# this cell's output) -- not done yet.
# Exceptions are left to propagate unchanged; the previous
# `except Exception as e: raise e` wrapper did nothing but re-raise.
config = ConfigurationManager()
model_trainer_config = config.get_model_trainer_config()
# Separate name for the trainer so `model_trainer_config` keeps referring to
# the config object instead of being rebound to a ModelTrainer.
model_trainer = ModelTrainer(config=model_trainer_config)
model_trainer.train()

[2024-08-26 09:40:58,416: INFO: common: yaml file: config\config.yaml loaded successfully]
[2024-08-26 09:40:58,419: INFO: common: yaml file: params.yaml loaded successfully]
[2024-08-26 09:40:58,420: INFO: common: created directory at: artifacts]
[2024-08-26 09:40:58,422: INFO: common: created directory at: artifacts/model_trainer]


You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
  return fn(*args, **kwargs)
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  2%|▏         | 10/409 [00:21<13:50,  2.08s/it]

{'loss': 16.4981, 'grad_norm': 85.5018539428711, 'learning_rate': 6.000000000000001e-07, 'epoch': 0.02}


  5%|▍         | 20/409 [00:42<13:25,  2.07s/it]

{'loss': 16.6054, 'grad_norm': 108.46190643310547, 'learning_rate': 1.4000000000000001e-06, 'epoch': 0.05}


  7%|▋         | 30/409 [01:03<13:04,  2.07s/it]

{'loss': 17.9602, 'grad_norm': 94.32849884033203, 'learning_rate': 2.4000000000000003e-06, 'epoch': 0.07}


 10%|▉         | 40/409 [01:23<12:44,  2.07s/it]

{'loss': 16.421, 'grad_norm': 97.03682708740234, 'learning_rate': 3.4000000000000005e-06, 'epoch': 0.1}


 12%|█▏        | 50/409 [01:44<12:23,  2.07s/it]

{'loss': 15.2807, 'grad_norm': 91.72180938720703, 'learning_rate': 4.4e-06, 'epoch': 0.12}


 15%|█▍        | 60/409 [02:05<12:01,  2.07s/it]

{'loss': 14.7229, 'grad_norm': 123.54859161376953, 'learning_rate': 5.4e-06, 'epoch': 0.15}


 17%|█▋        | 70/409 [02:25<11:45,  2.08s/it]

{'loss': 15.276, 'grad_norm': 90.87459564208984, 'learning_rate': 6.4000000000000006e-06, 'epoch': 0.17}


 20%|█▉        | 80/409 [02:46<11:20,  2.07s/it]

{'loss': 14.5196, 'grad_norm': 83.3479232788086, 'learning_rate': 7.2e-06, 'epoch': 0.2}


 22%|██▏       | 90/409 [03:07<10:59,  2.07s/it]

{'loss': 13.7047, 'grad_norm': 76.27350616455078, 'learning_rate': 8.200000000000001e-06, 'epoch': 0.22}


 24%|██▍       | 100/409 [03:27<10:38,  2.07s/it]

{'loss': 13.5821, 'grad_norm': 83.12466430664062, 'learning_rate': 9.2e-06, 'epoch': 0.24}


 27%|██▋       | 110/409 [03:48<10:17,  2.07s/it]

{'loss': 11.3257, 'grad_norm': 89.51921844482422, 'learning_rate': 1.02e-05, 'epoch': 0.27}


 29%|██▉       | 120/409 [04:09<09:58,  2.07s/it]

{'loss': 10.9819, 'grad_norm': 62.97394943237305, 'learning_rate': 1.1200000000000001e-05, 'epoch': 0.29}


 32%|███▏      | 130/409 [04:30<09:39,  2.08s/it]

{'loss': 11.1954, 'grad_norm': 99.53350830078125, 'learning_rate': 1.22e-05, 'epoch': 0.32}


 34%|███▍      | 140/409 [04:50<09:16,  2.07s/it]

{'loss': 8.8964, 'grad_norm': 63.42692947387695, 'learning_rate': 1.32e-05, 'epoch': 0.34}


 37%|███▋      | 150/409 [05:11<08:56,  2.07s/it]

{'loss': 7.8294, 'grad_norm': 62.864864349365234, 'learning_rate': 1.42e-05, 'epoch': 0.37}


 39%|███▉      | 160/409 [05:32<08:34,  2.07s/it]

{'loss': 6.7895, 'grad_norm': 87.91905975341797, 'learning_rate': 1.52e-05, 'epoch': 0.39}


 42%|████▏     | 170/409 [05:52<08:14,  2.07s/it]

{'loss': 4.3732, 'grad_norm': 78.04576110839844, 'learning_rate': 1.62e-05, 'epoch': 0.42}


 44%|████▍     | 180/409 [06:13<07:53,  2.07s/it]

{'loss': 3.5391, 'grad_norm': 80.87353515625, 'learning_rate': 1.7199999999999998e-05, 'epoch': 0.44}


 46%|████▋     | 190/409 [06:34<07:32,  2.07s/it]

{'loss': 3.1368, 'grad_norm': 33.541908264160156, 'learning_rate': 1.8200000000000002e-05, 'epoch': 0.46}


 49%|████▉     | 200/409 [06:54<07:11,  2.07s/it]

{'loss': 2.3396, 'grad_norm': 40.85650634765625, 'learning_rate': 1.9200000000000003e-05, 'epoch': 0.49}


 51%|█████▏    | 210/409 [07:15<06:51,  2.07s/it]

{'loss': 1.9186, 'grad_norm': 4.90938663482666, 'learning_rate': 2.0200000000000003e-05, 'epoch': 0.51}


 54%|█████▍    | 220/409 [07:36<06:30,  2.07s/it]

{'loss': 1.705, 'grad_norm': 4.60228967666626, 'learning_rate': 2.12e-05, 'epoch': 0.54}


 56%|█████▌    | 230/409 [07:56<06:10,  2.07s/it]

{'loss': 1.408, 'grad_norm': 4.656174659729004, 'learning_rate': 2.22e-05, 'epoch': 0.56}


 59%|█████▊    | 240/409 [08:17<05:49,  2.07s/it]

{'loss': 1.4552, 'grad_norm': 11.213784217834473, 'learning_rate': 2.32e-05, 'epoch': 0.59}


 61%|██████    | 250/409 [08:38<05:28,  2.07s/it]

{'loss': 1.1437, 'grad_norm': 5.566282749176025, 'learning_rate': 2.4200000000000002e-05, 'epoch': 0.61}


 64%|██████▎   | 260/409 [08:58<05:07,  2.07s/it]

{'loss': 1.1864, 'grad_norm': 8.311253547668457, 'learning_rate': 2.5200000000000003e-05, 'epoch': 0.63}


 66%|██████▌   | 270/409 [09:19<04:47,  2.07s/it]

{'loss': 0.9916, 'grad_norm': 2.5509586334228516, 'learning_rate': 2.6200000000000003e-05, 'epoch': 0.66}


 68%|██████▊   | 280/409 [09:40<04:26,  2.07s/it]

{'loss': 0.8335, 'grad_norm': 4.856007099151611, 'learning_rate': 2.7200000000000004e-05, 'epoch': 0.68}


 71%|███████   | 290/409 [10:00<04:05,  2.07s/it]

{'loss': 0.7661, 'grad_norm': 2.9420177936553955, 'learning_rate': 2.8199999999999998e-05, 'epoch': 0.71}


 73%|███████▎  | 300/409 [10:21<03:46,  2.08s/it]

{'loss': 0.716, 'grad_norm': 3.070924758911133, 'learning_rate': 2.9199999999999998e-05, 'epoch': 0.73}


 76%|███████▌  | 310/409 [10:42<03:25,  2.07s/it]

{'loss': 0.8011, 'grad_norm': 3.9735524654388428, 'learning_rate': 3.02e-05, 'epoch': 0.76}


 78%|███████▊  | 320/409 [11:03<03:04,  2.07s/it]

{'loss': 0.8002, 'grad_norm': 2.7419612407684326, 'learning_rate': 3.12e-05, 'epoch': 0.78}


 81%|████████  | 330/409 [11:23<02:43,  2.07s/it]

{'loss': 0.7545, 'grad_norm': 6.2368316650390625, 'learning_rate': 3.2200000000000003e-05, 'epoch': 0.81}


 83%|████████▎ | 340/409 [11:44<02:22,  2.07s/it]

{'loss': 0.5719, 'grad_norm': 1.5200495719909668, 'learning_rate': 3.32e-05, 'epoch': 0.83}


 86%|████████▌ | 350/409 [12:05<02:01,  2.07s/it]

{'loss': 0.7931, 'grad_norm': 1.843642234802246, 'learning_rate': 3.4200000000000005e-05, 'epoch': 0.85}


 88%|████████▊ | 360/409 [12:25<01:41,  2.07s/it]

{'loss': 0.6575, 'grad_norm': 1.5774235725402832, 'learning_rate': 3.52e-05, 'epoch': 0.88}


 90%|█████████ | 370/409 [12:46<01:20,  2.07s/it]

{'loss': 0.637, 'grad_norm': 1.8393052816390991, 'learning_rate': 3.62e-05, 'epoch': 0.9}


 93%|█████████▎| 380/409 [13:07<00:59,  2.07s/it]

{'loss': 0.5576, 'grad_norm': 1.5030531883239746, 'learning_rate': 3.72e-05, 'epoch': 0.93}


 95%|█████████▌| 390/409 [13:27<00:39,  2.07s/it]

{'loss': 0.7194, 'grad_norm': 5.110471725463867, 'learning_rate': 3.82e-05, 'epoch': 0.95}


 98%|█████████▊| 400/409 [13:48<00:18,  2.07s/it]

{'loss': 0.5756, 'grad_norm': 1.3574801683425903, 'learning_rate': 3.9200000000000004e-05, 'epoch': 0.98}


100%|██████████| 409/409 [14:08<00:00,  2.08s/it]


{'train_runtime': 848.907, 'train_samples_per_second': 0.965, 'train_steps_per_second': 0.482, 'train_loss': 5.97938637628532, 'epoch': 1.0}
