In [1]:
import os

In [2]:
%pwd

'/home/aman/Desktop/TextCraft/research'

In [3]:
# Move the working directory up one level from where the notebook runs (the
# `%pwd` output above shows `.../TextCraft/research`), presumably so the
# `TextCraft` package imports and the relative config/artifact paths used by
# later cells resolve -- TODO confirm the parent is the project root.
# NOTE: the path is relative, so re-running this cell without restarting the
# kernel moves up one more directory each time.
os.chdir("../")

In [4]:
from pathlib import Path
from dataclasses import dataclass

In [5]:
@dataclass(frozen=True)
class ModelTrainerConfig:
    """Immutable settings bundle for the model-training stage.

    Field order is significant for positional construction and is kept
    stable. Fields are grouped by the component that consumes them.
    """

    # Output directory, dataset location, and model/tokenizer identifiers.
    root_dir: Path
    data_path: Path
    model: str
    tokenizer: str

    # Values forwarded to the training-arguments object.
    num_train_epochs: int
    warmup_steps: int
    per_device_train_batch_size: int
    per_device_eval_batch_size: int
    weight_decay: float
    logging_steps: int
    evaluation_strategy: str
    eval_steps: int
    save_steps: float
    gradient_accumulation_steps: int
    optim: str
    lr_scheduler_type: str
    fp16: bool
    bf16: bool

    # 4-bit quantization settings.
    use_4bit: bool
    bnb_4bit_compute_dtype: str
    bnb_4bit_quant_type: str
    # Was annotated with the value `False`, which is not a type.
    use_nested_quant: bool

    # LoRA adapter settings.
    lora_r: int
    lora_alpha: int
    # Dropout is a probability, so it is a float rather than an int.
    lora_dropout: float


In [6]:
from TextCraft.utils.common import read_yaml, create_directories
from TextCraft.constants import *


In [12]:
class ConfigurationManager:
    """Loads the config and params YAML files and builds stage config objects
    from them.
    """

    def __init__(self,config_path = CONFIG_FILE_PATH, params_path = PARAMS_FILE_PATH):
        """Read both YAML files and create the top-level artifacts directory.

        Args:
            config_path: Path of the config YAML (defaults to CONFIG_FILE_PATH).
            params_path: Path of the params YAML (defaults to PARAMS_FILE_PATH).
        """
        self.config = read_yaml(config_path)
        self.params = read_yaml(params_path)

        artifacts_dir = self.config.artifacts_root
        create_directories([artifacts_dir])

    def get_model_config(self) -> ModelTrainerConfig:
        """Assemble the model-trainer settings.

        Creates the trainer's root directory, then copies each field from the
        section of the loaded config/params that owns it.

        Returns:
            ModelTrainerConfig populated from the `model_trainer` config
            section and the `TrainingArguments`, `bnb_config` and
            `peft_config` params sections.
        """
        trainer_section = self.config.model_trainer
        training_section = self.params.TrainingArguments
        quant_section = self.params.bnb_config
        lora_section = self.params.peft_config

        create_directories([trainer_section.root_dir])

        # Each entry pairs a loaded section with the field names read from it;
        # the names match the ModelTrainerConfig fields one-to-one.
        field_sources = (
            (trainer_section, ("root_dir", "data_path", "model", "tokenizer")),
            (
                training_section,
                (
                    "num_train_epochs",
                    "warmup_steps",
                    "per_device_train_batch_size",
                    "per_device_eval_batch_size",
                    "weight_decay",
                    "logging_steps",
                    "evaluation_strategy",
                    "eval_steps",
                    "save_steps",
                    "gradient_accumulation_steps",
                    "optim",
                    "lr_scheduler_type",
                    "fp16",
                    "bf16",
                ),
            ),
            (
                quant_section,
                (
                    "use_4bit",
                    "bnb_4bit_compute_dtype",
                    "bnb_4bit_quant_type",
                    "use_nested_quant",
                ),
            ),
            (lora_section, ("lora_r", "lora_alpha", "lora_dropout")),
        )

        settings = {
            field_name: getattr(section, field_name)
            for section, field_names in field_sources
            for field_name in field_names
        }
        return ModelTrainerConfig(**settings)




In [8]:
from transformers import (GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments, BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    Trainer,
    logging)
from peft import LoraConfig, PeftModel
from trl import SFTTrainer

from datasets import load_dataset, load_from_disk
import torch

  from .autonotebook import tqdm as notebook_tqdm


[2024-09-12 16:45:54,214: INFO: config: PyTorch version 2.4.1 available.]


In [13]:
class ModelTrainer:
    """Fine-tunes a GPT-2 language-model head with LoRA adapters via
    `SFTTrainer`, then merges the adapters into the base model and saves it.
    """

    def __init__(self, config: ModelTrainerConfig):
        """Store the training settings.

        Args:
            config: Settings for data location, model/tokenizer identifiers,
                training arguments, quantization and LoRA.
        """
        self.config = config

    def train(self):
        """Run fine-tuning and persist the results under `config.root_dir`.

        Steps:
            1. Load the tokenizer and the on-disk dataset ("train" and
               "validation" splits), formatted as torch tensors.
            2. Load the base model, 4-bit quantized when `use_4bit` is set.
            3. Train with `SFTTrainer` using the configured LoRA settings.
            4. Save the adapter to `<root_dir>/gpt2` and the tokenizer to
               `<root_dir>/tokenizer`.
            5. Reload the base model in float16, merge the adapter into it and
               save the merged model to `<root_dir>/gpt2`.
        """
        cfg = self.config
        cuda_available = torch.cuda.is_available()
        device = 'cuda' if cuda_available else 'cpu'

        tokenizer = GPT2Tokenizer.from_pretrained(cfg.tokenizer)

        dataset_pt = load_from_disk(cfg.data_path)
        tokenized_train_dataset = dataset_pt["train"]
        tokenized_eval_dataset = dataset_pt["validation"]

        tensor_columns = ["input_ids", "attention_mask", "labels"]
        tokenized_train_dataset.set_format(type="torch", columns=tensor_columns)
        tokenized_eval_dataset.set_format(type="torch", columns=tensor_columns)

        # Only hand a quantization config to the loader when 4-bit loading is
        # actually requested.
        bnb_config = None
        if cfg.use_4bit:
            bnb_config = BitsAndBytesConfig(
                load_in_4bit=cfg.use_4bit,
                bnb_4bit_quant_type=cfg.bnb_4bit_quant_type,
                bnb_4bit_compute_dtype=cfg.bnb_4bit_compute_dtype,
                bnb_4bit_use_double_quant=cfg.use_nested_quant,
            )

        model = GPT2LMHeadModel.from_pretrained(cfg.model, quantization_config=bnb_config)
        if bnb_config is None:
            # Quantized models are placed on a device by the loader, and calling
            # `.to()` on them can be rejected depending on library versions, so
            # only the unquantized model is moved explicitly.
            model = model.to(device)

        # Hint about bf16 based on the *configured* compute dtype. Guarded by
        # `cuda_available` because querying device capability needs a GPU.
        if (
            bnb_config is not None
            and cuda_available
            and bnb_config.bnb_4bit_compute_dtype == torch.float16
        ):
            major, _ = torch.cuda.get_device_capability()
            if major >= 8:
                print("=" * 80)
                print("Your GPU supports bfloat16: accelerate training with bf16=True")
                print("=" * 80)

        peft_config = LoraConfig(
            lora_alpha=cfg.lora_alpha,
            lora_dropout=cfg.lora_dropout,
            r=cfg.lora_r,
            bias="none",
            task_type="CAUSAL_LM",
        )

        trainer_args = TrainingArguments(
            output_dir=cfg.root_dir,
            num_train_epochs=cfg.num_train_epochs,
            warmup_steps=cfg.warmup_steps,
            per_device_train_batch_size=cfg.per_device_train_batch_size,
            # Previously the train batch size was passed here by mistake.
            per_device_eval_batch_size=cfg.per_device_eval_batch_size,
            weight_decay=cfg.weight_decay,
            logging_steps=cfg.logging_steps,
            evaluation_strategy=cfg.evaluation_strategy,
            eval_steps=cfg.eval_steps,
            # Honour the configured value instead of a hard-coded 1e6. Coerced
            # with float() in case the loaded value arrives as a string such
            # as "1e6" -- TODO confirm what the params file actually yields.
            save_steps=float(cfg.save_steps),
            gradient_accumulation_steps=cfg.gradient_accumulation_steps,
            optim=cfg.optim,
            lr_scheduler_type=cfg.lr_scheduler_type,
            fp16=cfg.fp16,
            bf16=cfg.bf16,
        )

        trainer = SFTTrainer(
            model=model,
            args=trainer_args,
            peft_config=peft_config,
            train_dataset=tokenized_train_dataset,
            eval_dataset=tokenized_eval_dataset,
            tokenizer=tokenizer,
            packing=True
        )

        trainer.train()

        model_dir = os.path.join(cfg.root_dir, "gpt2")
        trainer.model.save_pretrained(model_dir)
        tokenizer.save_pretrained(os.path.join(cfg.root_dir, "tokenizer"))

        # Reload a fresh base model from the configured checkpoint identifier.
        # The previous call passed the in-memory model object as `model=` and
        # omitted the required checkpoint argument.
        base_model = GPT2LMHeadModel.from_pretrained(
            cfg.model,
            low_cpu_mem_usage=True,
            return_dict=True,
            torch_dtype=torch.float16
        )

        # NOTE(review): the merged model is written into the same directory as
        # the adapter files saved above; confirm downstream loaders are fine
        # with both being present.
        final_model = PeftModel.from_pretrained(base_model, model_dir)
        final_model = final_model.merge_and_unload()
        final_model.save_pretrained(model_dir)


        


In [16]:
# Build the training settings and the trainer. The trainer gets its own name
# so `model_trainer_config` keeps referring to the config object.
config = ConfigurationManager()
model_trainer_config = config.get_model_config()
model_trainer = ModelTrainer(config=model_trainer_config)

# Training is expensive, so skip it when the merged model file already exists.
# The location is derived from the configured output directory so it follows
# the same `<root_dir>/gpt2` path the trainer saves to.
model_path = os.path.join(model_trainer_config.root_dir, "gpt2", "model.safetensors")  # or 'pytorch_model.bin'

if os.path.exists(model_path):
    print(f"Model already exists at {model_path}. Skipping training...")
else:
    print("Model not found. Starting training...")
    model_trainer.train()

[2024-09-12 17:00:19,311: INFO: common: yaml file: config/config.yaml loaded successfully]
[2024-09-12 17:00:19,319: INFO: common: yaml file: params.yaml loaded successfully]
[2024-09-12 17:00:19,327: INFO: common: created directory at: artifacts]
[2024-09-12 17:00:19,334: INFO: common: created directory at: artifacts/model_trainer]
Model already exists at artifacts/model_trainer/gpt2/model.safetensors. Skipping training...
