In [17]:
!pip install torch
!pip install nltk

Collecting nltk
  Using cached nltk-3.8.1-py3-none-any.whl (1.5 MB)
Collecting click (from nltk)
  Obtaining dependency information for click from https://files.pythonhosted.org/packages/00/2e/d53fa4befbf2cfa713304affc7ca780ce4fc1fd8710527771b58311a3229/click-8.1.7-py3-none-any.whl.metadata
  Downloading click-8.1.7-py3-none-any.whl.metadata (3.0 kB)
Collecting joblib (from nltk)
  Obtaining dependency information for joblib from https://files.pythonhosted.org/packages/10/40/d551139c85db202f1f384ba8bcf96aca2f329440a844f924c8a0040b6d02/joblib-1.3.2-py3-none-any.whl.metadata
  Downloading joblib-1.3.2-py3-none-any.whl.metadata (5.4 kB)
Downloading click-8.1.7-py3-none-any.whl (97 kB)
   ---------------------------------------- 0.0/97.9 kB ? eta -:--:--
   ---------------------------------------- 97.9/97.9 kB 2.8 MB/s eta 0:00:00
Downloading joblib-1.3.2-py3-none-any.whl (302 kB)
   ---------------------------------------- 0.0/302.2 kB ? eta -:--:--
   ------------------------------------

In [21]:
!pip install transformers[torch]

Collecting accelerate>=0.20.3 (from transformers[torch])
  Obtaining dependency information for accelerate>=0.20.3 from https://files.pythonhosted.org/packages/4d/a7/05c67003d659a0035f2b3a8cf389c1d9645865aee84a73ce99ddab16682f/accelerate-0.22.0-py3-none-any.whl.metadata
  Downloading accelerate-0.22.0-py3-none-any.whl.metadata (17 kB)
Downloading accelerate-0.22.0-py3-none-any.whl (251 kB)
   ---------------------------------------- 0.0/251.2 kB ? eta -:--:--
   ---------------------------------------- 251.2/251.2 kB 7.5 MB/s eta 0:00:00
Installing collected packages: accelerate
Successfully installed accelerate-0.22.0


In [22]:
!pip install accelerate -U



In [1]:
import os
os.chdir("../")

In [12]:

import sys
from dataclasses import dataclass
from pathlib import Path
from src.constants.constants import CONFIG_PATH, PARAMS_PATH
from src.loging import logger
from src.utils.common import get_size, create_directories
from src.utils.common import read_yaml
from transformers import pipeline, set_seed
from datasets import load_dataset, load_from_disk, load_metric
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
import pandas as pd
from tqdm import tqdm
import py7zr
import nltk
import torch

nltk.download("punkt")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\I585498\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [13]:
# Create entity
@dataclass
class ModelTrainingConfig:
    root_dir: Path
    data_path: Path
    model_ckpt: str
    num_epochs: int
    warmup_steps: int
    per_device_train_batch_size: int
    per_device_eval_batch_size: int
    weight_decay: float
    logging_steps: int
    fp16: bool
    learning_rate: float

In [14]:
class ConfigurationManager:
    def __init__(self, config_path, params_path):
        self.config_path = config_path
        self.params_path = params_path
        
        self.config = read_yaml(self.config_path)
        self.params = read_yaml(self.params_path)
        
        create_directories([self.config["model_training"]["root_dir"]])
        
    def get_model_training_config(self):
        return ModelTrainingConfig(
            root_dir = Path(self.config["model_training"]["root_dir"]),
            data_path = Path(self.config["model_training"]["data_dir"]),
            model_ckpt = self.config["model_training"]["model_ckpt"],
            num_epochs = self.params["model_params"]["num_epochs"],
            warmup_steps = self.params["model_params"]["warmup_steps"],
            per_device_train_batch_size = self.params["model_params"]["per_device_train_batch_size"],
            per_device_eval_batch_size = self.params["model_params"]["per_device_eval_batch_size"],
            weight_decay = self.params["model_params"]["weight_decay"],
            logging_steps = self.params["model_params"]["logging_steps"],
            fp16 = self.params["model_params"]["fp16"],
            learning_rate = self.params["model_params"]["learning_rate"]
        )

In [15]:
torch.cuda.is_available()

False

In [16]:
class ModelTraining:
    def __init__(self, config: ConfigurationManager):
        self.config = config.get_model_training_config()
        
    def train_model(self):
        device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
        tokenizer = AutoTokenizer.from_pretrained(self.config.model_ckpt)
        model = AutoModelForSeq2SeqLM.from_pretrained(self.config.model_ckpt).to(device)
        seq2seq_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
        
        train_dataset = load_from_disk(self.config.data_path)
        
        training_args = Seq2SeqTrainingArguments(
            output_dir=self.config.root_dir,
            per_device_train_batch_size=self.config.per_device_train_batch_size,
            per_device_eval_batch_size=self.config.per_device_eval_batch_size,
            predict_with_generate=True,
            do_train=True,
            do_eval=True,
            logging_steps=self.config.logging_steps,
            save_steps=self.config.logging_steps,
            eval_steps=self.config.logging_steps,
            warmup_steps=self.config.warmup_steps,
            num_train_epochs=self.config.num_epochs,
            #fp16=self.config.fp16,
            learning_rate=self.config.learning_rate,
            weight_decay=self.config.weight_decay,
            overwrite_output_dir=True,
            save_total_limit=1,
        )
        
        trainer = Seq2SeqTrainer(
            model=model,
            tokenizer=tokenizer,
            args=training_args,
            compute_metrics=self.compute_metrics,
            train_dataset=train_dataset["test"],
            data_collator=seq2seq_collator,
            eval_dataset=train_dataset["validation"],
        )
        
        trainer.train()
        
        # save model
        model.save_model(os.path.join(self.config.root_dir, "model_pegasus"))
        tokenizer.save_pretrained(os.path.join(self.config.root_dir, "tokenizer_pegasus"))
        
   
        

In [17]:
try:
    logger.info(">>>>> stage 4: model training started")
    config_manager = ConfigurationManager(CONFIG_PATH, PARAMS_PATH)
    model_training = ModelTraining(config_manager)
    model_training.train_model()
    logger.info("stage 4: model training completed! ")

except Exception as e:
    logger.error(e)
    raise e

[2023-09-08 15:33:32,199]: INFO: 1828045357: >>>>> stage 4: model training started]
[2023-09-08 15:33:32,216]: INFO: common: Successfully read yaml file from config\config.yaml]
[2023-09-08 15:33:32,225]: INFO: common: Successfully read yaml file from params.yaml]
[2023-09-08 15:33:32,230]: INFO: common: Created directory: models/model_training]


Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-cnn_dailymail and are newly initialized: ['model.encoder.embed_positions.weight', 'model.decoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[2023-09-08 15:34:12,617]: ERROR: 1828045357: 'ModelTraining' object has no attribute 'compute_metrics']


AttributeError: 'ModelTraining' object has no attribute 'compute_metrics'