INSTALL REQUIRED LIBRARIES

In [5]:
# Install necessary libraries
!pip install transformers==4.28.1 datasets accelerate==0.15.0 bitsandbytes optuna



In [6]:
import os
import torch

# Set CUDA_LAUNCH_BLOCKING=1 in the kernel's environment for better error tracking.
# NOTE(review): this is a debugging aid; it is expected to make CUDA kernel launches
# synchronous and therefore slow training down — consider removing it once the
# notebook runs cleanly.
%env CUDA_LAUNCH_BLOCKING=1

import warnings
# Hide FutureWarning messages so they do not clutter the cell outputs.
warnings.filterwarnings("ignore", category=FutureWarning)

env: CUDA_LAUNCH_BLOCKING=1


DATA LOADING AND PREPROCESSING

In [7]:
from datasets import load_dataset
from transformers import AutoTokenizer

# Load the CNN/DailyMail dataset (configuration '3.0.0')
dataset = load_dataset("cnn_dailymail", '3.0.0')

# Pick out the train and validation splits (the test split is not used in the cells shown)
train_dataset = dataset['train']
val_dataset = dataset['validation']

# Load the fast tokenizer of the Llama-2-7b-hf checkpoint
tokenizer = AutoTokenizer.from_pretrained('meta-llama/Llama-2-7b-hf', use_fast=True)

# When the tokenizer has no padding token, register the EOS token as the padding
# token so that padding='max_length' can be used during preprocessing.
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': tokenizer.eos_token})


def preprocess_data(examples, max_source_length=512, max_target_length=128):
    """Tokenize a batch of articles and highlights for causal-LM fine-tuning.

    The notebook trains a decoder-only model (``AutoModelForCausalLM``), whose
    loss is computed position by position, so ``labels`` must have exactly the
    same length as ``input_ids``. Each example is therefore encoded as a single
    sequence ``article tokens + highlight tokens + EOS`` that is right-padded to
    ``max_source_length + max_target_length``. Article and padding positions are
    set to -100 in ``labels`` so that only the summary (including its EOS)
    contributes to the loss.

    Args:
        examples: Batch mapping with an ``'article'`` and a ``'highlights'``
            list of strings, as passed by ``Dataset.map(..., batched=True)``.
        max_source_length: Maximum number of article tokens (special tokens
            added by the tokenizer included).
        max_target_length: Maximum number of summary tokens, including the
            EOS token appended here.

    Returns:
        A dict with ``'input_ids'``, ``'attention_mask'`` and ``'labels'``;
        every inner list has length ``max_source_length + max_target_length``.
    """
    eos_id = tokenizer.eos_token_id
    pad_id = tokenizer.pad_token_id
    if pad_id is None:
        # Padded positions are masked out anyway, so EOS is a safe filler.
        pad_id = eos_id
    total_length = max_source_length + max_target_length

    sources = tokenizer(examples['article'], max_length=max_source_length, truncation=True)
    # No special tokens on the summary: it continues the article sequence.
    # One slot is reserved for the EOS token appended below.
    targets = tokenizer(
        examples['highlights'],
        max_length=max_target_length - 1,
        truncation=True,
        add_special_tokens=False,
    )

    input_ids, attention_mask, labels = [], [], []
    for source_ids, target_ids in zip(sources['input_ids'], targets['input_ids']):
        source_ids = list(source_ids)
        target_ids = list(target_ids) + [eos_id]
        sequence = source_ids + target_ids
        padding = total_length - len(sequence)

        input_ids.append(sequence + [pad_id] * padding)
        attention_mask.append([1] * len(sequence) + [0] * padding)
        # Mask by position rather than by token id: the pad token is the EOS
        # token here, and the real EOS must stay in the labels.
        labels.append([-100] * len(source_ids) + target_ids + [-100] * padding)

    return {'input_ids': input_ids, 'attention_mask': attention_mask, 'labels': labels}

# Tokenize both splits; batched=True hands preprocess_data whole batches at a time
train_dataset, val_dataset = [
    split.map(preprocess_data, batched=True)
    for split in (train_dataset, val_dataset)
]


README.md:   0%|          | 0.00/15.6k [00:00<?, ?B/s]

train-00000-of-00003.parquet:   0%|          | 0.00/257M [00:00<?, ?B/s]

train-00001-of-00003.parquet:   0%|          | 0.00/257M [00:00<?, ?B/s]

train-00002-of-00003.parquet:   0%|          | 0.00/259M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/34.7M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/30.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/287113 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/13368 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/11490 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/776 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

Using pad_token, but it is not set yet.


Map:   0%|          | 0/287113 [00:00<?, ? examples/s]

Map:   0%|          | 0/13368 [00:00<?, ? examples/s]

HYPERPARAMETER SEARCH WITH OPTUNA

In [8]:
# Ask PyTorch to release its cached, currently unused GPU memory before the
# model is loaded in the next cells.
torch.cuda.empty_cache()

In [9]:
# pip install --upgrade torch accelerate

In [10]:
import optuna
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments, AutoModelForCausalLM, AutoTokenizer
import torch
from accelerate import Accelerator

# Accelerator configured for fp16 mixed precision with automatic device placement.
# It is created here because later cells reference `accelerator`.
accelerator = Accelerator(mixed_precision="fp16", device_placement=True)

# Reload the tokenizer so this cell does not depend on state left by earlier cells
tokenizer = AutoTokenizer.from_pretrained('meta-llama/Llama-2-7b-hf', use_fast=True)

# A freshly loaded tokenizer has no padding token again. Register EOS as the
# padding token (same setup as during preprocessing); without it, any padding
# requested through this tokenizer fails.
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': tokenizer.eos_token})

# Load the pretrained causal language model
model = AutoModelForCausalLM.from_pretrained('meta-llama/Llama-2-7b-hf', ignore_mismatched_sizes=True)

# Keep the embedding matrix in sync with the tokenizer's vocabulary size
model.resize_token_embeddings(len(tokenizer))

# Enable gradient checkpointing to reduce memory usage
model.gradient_checkpointing_enable()

# Define Optuna objective function
def objective(trial):
    """Run one Optuna trial and return its validation loss.

    Samples a learning rate, a per-device batch size and a weight decay,
    fine-tunes a fresh copy of the module-level ``model`` on ``train_dataset``
    with them, and evaluates the result on ``val_dataset``.

    Every trial trains its own copy of ``model`` so that all trials start from
    the same weights; training the shared model directly would let each trial
    continue from the previous trial's fine-tuned state and make the losses
    incomparable. The Trainer handles device placement and fp16 itself, so
    neither the model nor the datasets are passed through ``accelerator``.

    Args:
        trial: The Optuna trial used to sample the hyperparameters.

    Returns:
        The ``eval_loss`` entry of ``trainer.evaluate()``.
    """
    import copy
    import gc

    # Suggest hyperparameters
    learning_rate = trial.suggest_float("learning_rate", 1e-5, 1e-3, log=True)
    batch_size = trial.suggest_int("batch_size", 8, 64)
    weight_decay = trial.suggest_float("weight_decay", 0.0, 0.1)

    # Define training arguments
    training_args = Seq2SeqTrainingArguments(
        output_dir="./results",
        evaluation_strategy="steps",
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        logging_dir='./logs',
        logging_steps=50,
        save_total_limit=3,
        learning_rate=learning_rate,
        num_train_epochs=3,
        weight_decay=weight_decay,
        predict_with_generate=True,
        fp16=True,  # Enable FP16 to save memory
        dataloader_pin_memory=False  # Avoid potential CUDA memory issues with pinning
    )

    # Independent copy of the untouched model for this trial only
    trial_model = copy.deepcopy(model)

    trainer = Seq2SeqTrainer(
        model=trial_model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        tokenizer=tokenizer,
    )

    try:
        # Train, then report the validation loss that the study minimizes
        trainer.train()
        eval_results = trainer.evaluate()
        return eval_results['eval_loss']
    finally:
        # Drop this trial's model and optimizer state before the next trial
        # starts, whether the trial succeeded or raised.
        del trainer, trial_model
        gc.collect()
        torch.cuda.empty_cache()

# Create the Optuna study; the objective's eval loss is minimized.
# Seeding the sampler makes the sequence of suggested hyperparameters
# repeatable from one run of the notebook to the next.
study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=10)  # You can increase n_trials for more optimization

# Output the best trial's hyperparameters
best_params = study.best_trial.params
print(f"Best hyperparameters: {best_params}")


config.json:   0%|          | 0.00/609 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

FINE TUNE WITH BEST PARAMETERS

In [None]:
# Extract best hyperparameters from Optuna
best_params = study.best_trial.params

# Define the final training arguments with best hyperparameters.
# NOTE(review): the search trials do not set gradient_accumulation_steps, so the
# effective batch size here is presumably 8x the tuned one — confirm this is intended.
training_args = Seq2SeqTrainingArguments(
    output_dir="./best_model",
    evaluation_strategy="steps",
    per_device_train_batch_size=best_params['batch_size'],  # Use the best batch size
    per_device_eval_batch_size=best_params['batch_size'],   # Use the best batch size
    gradient_accumulation_steps=8,
    logging_dir='./logs',
    logging_steps=50,
    save_total_limit=3,
    learning_rate=best_params['learning_rate'],              # Use the best learning rate
    # The search does not sample the number of epochs (its trials use a fixed 3),
    # so fall back to 3 when the key is absent instead of raising KeyError.
    num_train_epochs=best_params.get('num_train_epochs', 3),
    weight_decay=best_params['weight_decay'],                # Use the best weight decay
    predict_with_generate=True,
    fp16=True,                                               # Enable FP16 for memory efficiency
    dataloader_pin_memory=False                              # Avoid potential CUDA memory issues with pinning
)

# Load the model again so the final run starts from the pretrained weights.
# No explicit `config` is passed: none is defined in this notebook, and
# from_pretrained loads the checkpoint's own configuration.
model = AutoModelForCausalLM.from_pretrained(
    'meta-llama/Llama-2-7b-hf',
    ignore_mismatched_sizes=True  # To handle size mismatches if any
)

# Same model preparation as for the search: embeddings sized to the tokenizer,
# gradient checkpointing on to reduce memory usage.
model.resize_token_embeddings(len(tokenizer))
model.gradient_checkpointing_enable()

# Initialize the final Trainer with best hyperparameters.
# The Trainer takes care of device placement and fp16 on its own, so it is not
# passed through accelerator.prepare().
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
)

# Fine-tune the model with best hyperparameters
trainer.train()

# Save the best model
trainer.save_model("./best_model")

EVALUATE FINAL MODEL

In [None]:
# Evaluate the fine-tuned model with the trainer from the previous cell
# and print the metrics it returns.
best_results = trainer.evaluate()
print(best_results)
