In [1]:
import os

In [2]:
# Move to the project root (MLOPS_Project/), presumably so that project-relative
# paths such as config/config.yaml resolve from this notebook.
# The location can be overridden with the MLOPS_PROJECT_ROOT environment variable;
# when it is unset, the original hard-coded path is used.
os.chdir(os.environ.get("MLOPS_PROJECT_ROOT", "/home/nicola/Projects/MLOPS_Project"))

In [3]:
# Display the current working directory to confirm the chdir above took effect.
os.getcwd()

'/home/nicola/Projects/MLOPS_Project'

In [4]:
from dataclasses import dataclass 
from pathlib import Path

### Update: added `predict_with_generate` and `eval_dataset_dimension` to the configuration
- `predict_with_generate = True` is required for the trainer to use the model's `generate` method during evaluation.
- `eval_dataset_dimension` is the number of validation examples used for evaluation (more than 50 may exhaust the available memory, so keep it low).

In [5]:
@dataclass(frozen=True)
class ModelTrainerConfig:
    """Immutable bundle of settings for the model-training stage.

    The dataclass is frozen, so fields cannot be reassigned after construction.
    Field types are annotations only; nothing is validated or converted here.
    """

    # Locations and checkpoint
    root_dir: Path      # stage directory; also used as the trainer's output_dir
    data_path: Path     # dataset location handed to load_from_disk
    model_ckpt: str     # checkpoint name/path handed to from_pretrained

    # Values forwarded to Seq2SeqTrainingArguments
    num_train_epochs: int
    warmup_steps: int
    per_device_train_batch_size: int
    per_device_eval_batch_size: int
    weight_decay: float
    logging_steps: int
    evaluation_strategy: str
    eval_steps: int
    gradient_accumulation_steps: int
    predict_with_generate: bool

    # Number of validation examples selected for evaluation
    eval_dataset_dimension: int
    

In [6]:
from mlopsProject.constants import *
from mlopsProject.utils.common import read_yaml, create_directory

[2024-01-28 23:10:54,743: INFO: config: PyTorch version 2.1.2 available.]
[2024-01-28 23:10:54,745: INFO: config: TensorFlow version 2.13.1 available.]


  from .autonotebook import tqdm as notebook_tqdm


In [7]:
class ConfigurationManager:
    """Reads the project config and params files and builds stage configurations."""

    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH,
    ):
        """Load both files and create the artifacts root directory.

        Args:
            config_filepath: Path of the config file passed to ``read_yaml``.
            params_filepath: Path of the params file passed to ``read_yaml``.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        # Create the top-level artifacts directory named in the config.
        create_directory([self.config.artifacts_root])

    def get_model_trainer_config(self) -> ModelTrainerConfig:
        """Build the ``ModelTrainerConfig`` for the training stage.

        Paths and the checkpoint come from the ``model_trainer`` section of the
        config; the training values come from the ``TrainingArguments`` section
        of the params. The stage's ``root_dir`` is created as a side effect.

        Returns:
            The populated ``ModelTrainerConfig``.
        """
        trainer_section = self.config.model_trainer
        training_args = self.params.TrainingArguments

        create_directory([trainer_section.root_dir])

        return ModelTrainerConfig(
            # From the config file
            root_dir=trainer_section.root_dir,
            data_path=trainer_section.data_path,
            model_ckpt=trainer_section.model_ckpt,
            # From the params file
            num_train_epochs=training_args.num_train_epochs,
            warmup_steps=training_args.warmup_steps,
            per_device_train_batch_size=training_args.per_device_train_batch_size,
            per_device_eval_batch_size=training_args.per_device_eval_batch_size,
            weight_decay=training_args.weight_decay,
            logging_steps=training_args.logging_steps,
            evaluation_strategy=training_args.evaluation_strategy,
            eval_steps=training_args.eval_steps,
            gradient_accumulation_steps=training_args.gradient_accumulation_steps,
            predict_with_generate=training_args.predict_with_generate,
            eval_dataset_dimension=training_args.eval_dataset_dimension,
        )

In [24]:
import torch
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq, AutoTokenizer, AutoModelForSeq2SeqLM 
from datasets import load_from_disk

In [25]:
class ModelTrainer:
    """Fine-tunes a pretrained seq2seq checkpoint with ``Seq2SeqTrainer``."""

    def __init__(self, config: ModelTrainerConfig):
        """Store the training configuration.

        Args:
            config: Settings for the run (checkpoint, data path, training values).
        """
        self.config = config

    def train(self):
        """Fine-tune the model, then save the model and tokenizer.

        Loads the model and tokenizer from ``config.model_ckpt``, loads the
        dataset from ``config.data_path``, trains on its ``"train"`` split and
        evaluates on a shuffled subset of its ``"validation"`` split. The model
        is saved to ``<root_dir>/trainer_model`` and the tokenizer to
        ``<root_dir>/trainer_tokenizer``.
        """
        device = "cuda" if torch.cuda.is_available() else "cpu"
        model = AutoModelForSeq2SeqLM.from_pretrained(self.config.model_ckpt).to(device)
        tokenizer = AutoTokenizer.from_pretrained(self.config.model_ckpt)
        seq2seq_data_collator = DataCollatorForSeq2Seq(
            tokenizer=tokenizer,
            model=model,
            # -100 is the label id ignored by PyTorch's cross-entropy loss by default.
            label_pad_token_id=-100,
            pad_to_multiple_of=8,
        )
        trainer_args = Seq2SeqTrainingArguments(
            output_dir=self.config.root_dir,
            num_train_epochs=self.config.num_train_epochs,
            warmup_steps=self.config.warmup_steps,
            per_device_train_batch_size=self.config.per_device_train_batch_size,
            per_device_eval_batch_size=self.config.per_device_eval_batch_size,
            weight_decay=self.config.weight_decay,
            logging_steps=self.config.logging_steps,
            evaluation_strategy=self.config.evaluation_strategy,
            eval_steps=self.config.eval_steps,
            gradient_accumulation_steps=self.config.gradient_accumulation_steps,
            predict_with_generate=self.config.predict_with_generate
        )

        tokenized_data = load_from_disk(self.config.data_path)

        # Evaluate on a small, reproducibly shuffled subset of the validation split.
        # The requested size is clamped to the split length: selecting indices past
        # the end of the dataset would raise instead of using what is available.
        validation_data = tokenized_data["validation"]
        eval_size = min(self.config.eval_dataset_dimension, len(validation_data))
        eval_subset = validation_data.shuffle(seed=42).select(range(eval_size))

        trainer = Seq2SeqTrainer(
            model = model,
            args = trainer_args,
            tokenizer = tokenizer,
            data_collator = seq2seq_data_collator,
            train_dataset = tokenized_data["train"],
            eval_dataset = eval_subset
        )

        trainer.train()

        model.save_pretrained(os.path.join(self.config.root_dir,"trainer_model"))
        tokenizer.save_pretrained(os.path.join(self.config.root_dir,"trainer_tokenizer"))

In [26]:
# Run the model-training stage: load the configuration, build the trainer
# and launch the fine-tuning.
try:
    config = ConfigurationManager() 
    model_trainer_config = config.get_model_trainer_config()
    model_trainer = ModelTrainer(config = model_trainer_config)
    model_trainer.train()
except Exception:
    # Bare `raise` re-raises the active exception without adding a re-raise frame.
    raise

[2024-01-28 23:19:11,177: INFO: common: file: config/config.yaml loaded correctly]
[2024-01-28 23:19:11,182: INFO: common: file: params.yaml loaded correctly]
[2024-01-28 23:19:11,183: INFO: common: directory artifacts created]
[2024-01-28 23:19:11,184: INFO: common: directory artifacts/model_trainer created]


Step,Training Loss,Validation Loss
