[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/your-repo/your-notebook.ipynb)

## Install Necessary Packages
First, we need to install the required packages for our fine-tuning process.

In [None]:
!git clone https://github.com/JulianLopezB/LLMFinetuner.git
!pip install transformers datasets torch accelerate huggingface_hub scipy peft bitsandbytes python-dotenv hydra-core

## Import Necessary Libraries
Next, we import all the necessary libraries and modules that we will use throughout the notebook.

In [4]:
import os
import torch
from dotenv import load_dotenv
import hydra
from omegaconf import DictConfig, OmegaConf
from src.data_loader import DataLoader
from src.model_setup import ModelSetup
from src.trainer import CustomTrainer
from src.evaluator import Evaluator
from src.huggingface_integration import HuggingFaceIntegration
from src.config import Config

# Load environment variables via python-dotenv.
# NOTE(review): presumably this is how HUGGINGFACE_TOKEN (checked in the
# "Check Environment" section) is supplied from a local .env file — confirm.
load_dotenv()

ModuleNotFoundError: No module named 'src'

## Check Environment
Ensure that CUDA is available and the necessary environment variables are set.

In [None]:
# Warn (without aborting) when PyTorch cannot see a CUDA device.
if not torch.cuda.is_available():
    print("CUDA is not available. Please check your installation of CUDA and NVIDIA drivers.")

# Require a usable HUGGINGFACE_TOKEN. An empty or whitespace-only value is
# treated like a missing one so the problem is reported here instead of
# surfacing later in the pipeline.
if not os.environ.get('HUGGINGFACE_TOKEN', '').strip():
    raise EnvironmentError("HUGGINGFACE_TOKEN is not set. Please set this environment variable.")

## Load Configuration
Load the YAML configuration file with OmegaConf (Hydra itself is only used by the `main` function at the end of the notebook).

In [None]:
# Directory and base name of the YAML configuration file.
# Both names are reused by the Hydra-decorated `main` function further down.
config_path, config_name = './config', 'finetuning_config'

# Parse <config_path>/<config_name>.yaml and echo the result as YAML.
config = OmegaConf.load(os.path.join(config_path, config_name + '.yaml'))
print(OmegaConf.to_yaml(config))

## Load Dataset
Load the dataset using the `DataLoader` class.

In [None]:
# Load the dataset described by the configuration.
data_loader = DataLoader(config.dataset.path, from_huggingface=config.dataset.from_huggingface)

# Hold out 20% of the 'train' split for evaluation.
# A fixed seed makes the split reproducible across kernel restarts, and the
# resulting splits are selected by key instead of relying on `.values()` order.
# Assumes `get_dataset()` returns a Hugging Face `DatasetDict`, whose
# `train_test_split` yields 'train' and 'test' entries — TODO confirm.
SPLIT_SEED = 42
dataset_split = data_loader.get_dataset()['train'].train_test_split(test_size=0.2, seed=SPLIT_SEED)
train_dataset, eval_dataset = dataset_split['train'], dataset_split['test']

## Setup Model and Tokenizer
Set up the model and tokenizer, applying the quantization and device-map settings from the configuration.

In [None]:
# Build the model/tokenizer pair from the `model` section of the configuration.
model_cfg = config.model
model_setup = ModelSetup(
    model_cfg.name,
    quantization_config=model_cfg.quantization,
    device_map=model_cfg.device_map,
)
model, tokenizer = model_setup.get_model_and_tokenizer()

## Configure PEFT
Configure PEFT (Parameter-Efficient Fine-Tuning) if enabled.

In [None]:
# Use the configured LoRA settings only when PEFT is switched on; otherwise
# pass no PEFT configuration to the trainer.
lora_config = config.training.lora_config if config.training.peft_enabled else None

## Setup and Run Trainer
Set up the `CustomTrainer` and start the training process.

In [None]:
# Create the trainer from the objects prepared above and launch training.
# Extra trainer options come straight from `config.training.trainer_args`.
training_cfg = config.training
trainer = CustomTrainer(
    model, tokenizer, train_dataset, eval_dataset,
    training_cfg.output_dir,
    peft_config=lora_config,
    **training_cfg.trainer_args,
)
trainer.train()

## Evaluate Model
Evaluate the model on the evaluation dataset.

In [None]:
# Run the evaluator over the held-out split and report what it returns.
evaluator = Evaluator(
    model,
    tokenizer,
    eval_dataset,
)
results = evaluator.evaluate()
print("Evaluation Results:", results)

## Push Model to Hugging Face
If enabled, save and push the model to Hugging Face.

In [None]:
# Publishing is opt-in: nothing happens unless `training.hf_push` is truthy.
if config.training.hf_push:
    hf_integration = HuggingFaceIntegration(
        model, config.model.name, config.model.new_model, config.training.hf_org
    )
    hf_integration.save_and_push_model()

## Main Function
Define the main function to run the entire fine-tuning pipeline using Hydra for configuration management.

In [None]:
@hydra.main(version_base=None, config_path=config_path, config_name=config_name)
def main(config: DictConfig):
    """Run the whole fine-tuning pipeline from a Hydra-provided configuration.

    Steps, in order: load the dataset and split it into train/eval parts,
    build the model and tokenizer, train with `CustomTrainer` (optionally with
    a PEFT/LoRA configuration), evaluate, and optionally push the model to
    Hugging Face.

    Args:
        config: Configuration with `dataset`, `model` and `training` sections,
            as read by the statements below.
    """
    # Load the dataset
    data_loader = DataLoader(config.dataset.path, from_huggingface=config.dataset.from_huggingface)

    # Hold out 20% of the 'train' split for evaluation. The seed is fixed so
    # that repeated runs train and evaluate on the same examples, and the
    # splits are picked by key instead of relying on `.values()` ordering.
    # Assumes `get_dataset()` returns a Hugging Face `DatasetDict` — TODO confirm.
    split_seed = 42
    dataset_split = data_loader.get_dataset()['train'].train_test_split(test_size=0.2, seed=split_seed)
    train_dataset = dataset_split['train']
    eval_dataset = dataset_split['test']

    # Set up the model and tokenizer with quantization and device configuration
    model_setup = ModelSetup(
        config.model.name,
        quantization_config=config.model.quantization,
        device_map=config.model.device_map
    )
    model, tokenizer = model_setup.get_model_and_tokenizer()

    # Only hand a LoRA configuration to the trainer when PEFT is enabled
    lora_config = config.training.lora_config if config.training.peft_enabled else None

    # Set up and run the trainer
    trainer = CustomTrainer(
        model,
        tokenizer,
        train_dataset,
        eval_dataset,
        config.training.output_dir,
        peft_config=lora_config,
        **config.training.trainer_args
    )
    trainer.train()

    # Evaluate the model
    evaluator = Evaluator(model, tokenizer, eval_dataset)
    results = evaluator.evaluate()
    print("Evaluation Results:", results)

    # Saving and pushing to Hugging Face is opt-in via `training.hf_push`
    if config.training.hf_push:
        hf_integration = HuggingFaceIntegration(
            model,
            config.model.name,
            config.model.new_model,
            config.training.hf_org
        )
        hf_integration.save_and_push_model()

# Script-style entry point: run the Hydra-decorated pipeline.
# NOTE(review): inside a notebook kernel `__name__` is typically "__main__",
# so executing this cell would run the full pipeline again after the cells
# above, and Hydra would parse the kernel's own command-line arguments —
# confirm this is intended; Hydra's Compose API may fit notebooks better.
if __name__ == "__main__":
    main()