In [1]:
!pip install -U transformers datasets accelerate peft trl bitsandbytes wandb



In [2]:
# Import libraries for garbage collection management
import gc

# Import library for interacting with the operating system
import os

# Import libraries for working with PyTorch (deep learning framework)
import torch
import wandb  # Likely for experiment tracking and visualization

# Import library for loading datasets (potentially for training)
from datasets import load_dataset

# Imports commented out as they might be Colab specific
# from google.colab import userdata  # Potentially for accessing user data on Google Colab

# PEFT (parameter-efficient fine-tuning): LoRA configuration, adapter loading, and k-bit training preparation
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training

# Imports from Transformers library for various NLP tasks
from transformers import (
    AutoModelForCausalLM,  # For autoregressive causal language models
    AutoTokenizer,  # For tokenization tasks
    BitsAndBytesConfig,  # Potentially for a specific model configuration
    TrainingArguments,  # For defining training arguments
    pipeline,  # For creating NLP pipelines
)

# Imports related to a Reinforcement Learning library (TRL)
from trl import ORPOConfig, ORPOTrainer, setup_chat_format


2024-04-25 07:33:18.537425: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-25 07:33:18.537482: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-25 07:33:18.538897: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [3]:
# Import the class to access Kaggle User Secrets (secure storage for secrets)
from kaggle_secrets import UserSecretsClient

# Create a client object to interact with Kaggle User Secrets
user_secrets = UserSecretsClient()

# Fetch the secret stored under the label "junaid".
# NOTE(review): secret_value_0 is not referenced by any later cell visible here.
# Presumably it is an API key (e.g. for wandb, since training reports to wandb) —
# confirm, then either pass it to the relevant login call or drop this cell.
secret_value_0 = user_secrets.get_secret("junaid")


In [4]:
 #Check if the GPU has a compute capability of at least 8
if torch.cuda.get_device_capability()[0] >= 8:
    # If the GPU supports it, install the Flash Attention library
    !pip install -qqq flash-attn
    # Use the Flash Attention implementation for attention
    attn_implementation = "flash_attention_2"
    # Use bfloat16 data type for higher precision and lower memory usage
    torch_dtype = torch.bfloat16
else:
    # If the GPU doesn't support Flash Attention, use the eager implementation
    attn_implementation = "eager"
    # Use float16 data type for lower precision and memory usage
    torch_dtype = torch.float16

In [6]:
# Import function for logging into Hugging Face Hub from a notebook
from huggingface_hub import notebook_login

In [8]:
# Open the Hugging Face login widget; later cells download from and push to the Hub
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [9]:
# --- Model identifiers -------------------------------------------------------
base_model = "meta-llama/Meta-Llama-3-8B"  # Hub id of the checkpoint to fine-tune
new_model = "OrpoLlama3-8B-FT"  # name used when saving, reloading and pushing the result

# --- 4-bit quantization settings (QLoRA) -------------------------------------
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch_dtype,  # dtype chosen by the GPU-capability cell
)

# --- LoRA adapter settings ---------------------------------------------------
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    # Names of the projection modules that receive LoRA adapters
    target_modules=[
        "up_proj", "down_proj", "gate_proj", "k_proj", "q_proj", "v_proj", "o_proj"
    ],
)

# --- Tokenizer and quantized base model --------------------------------------
tokenizer = AutoTokenizer.from_pretrained(base_model)

model = AutoModelForCausalLM.from_pretrained(
    base_model,
    device_map="auto",
    attn_implementation=attn_implementation,  # chosen by the GPU-capability cell
    quantization_config=bnb_config,
)

# Configure the model/tokenizer pair for chat-formatted input. The tokenizer must
# have a chat template because process() later calls apply_chat_template on it.
model, tokenizer = setup_chat_format(model, tokenizer)

# Final PEFT preparation step for training a k-bit quantized model
model = prepare_model_for_kbit_training(model)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [10]:
# Load the train_prefs and test_prefs splits from the Hugging Face Hub.
# Passing a list to `split` returns a list of Dataset objects in the same order.
dataset = load_dataset("HuggingFaceH4/ultrafeedback_binarized", split=["train_prefs", "test_prefs"])


In [11]:
# Show a summary of the two loaded splits (column names and row counts)
dataset

[Dataset({
     features: ['prompt', 'prompt_id', 'chosen', 'rejected', 'messages', 'score_chosen', 'score_rejected'],
     num_rows: 61135
 }),
 Dataset({
     features: ['prompt', 'prompt_id', 'chosen', 'rejected', 'messages', 'score_chosen', 'score_rejected'],
     num_rows: 2000
 })]

In [12]:
from datasets import load_dataset  # Import library for loading datasets

# Reload both preference splits so this cell can run on its own.
# The result is a two-element list: [train_prefs, test_prefs].
dataset = load_dataset("HuggingFaceH4/ultrafeedback_binarized", split=["train_prefs", "test_prefs"])

# Desired number of training samples (lower it for a quicker run)
train_samples = 5000

# Measure the split sizes instead of hard-coding them, so the test/train ratio
# stays correct if the dataset on the Hub ever changes.
original_train_samples = len(dataset[0])
original_test_samples = len(dataset[1])

# select() fails on out-of-range indices, so never ask for more rows than exist.
train_samples = min(train_samples, original_train_samples)

# Shrink the test split by the same factor as the training split.
test_samples = int((original_test_samples / original_train_samples) * train_samples)

# Shuffle with a fixed seed for reproducibility, then keep the first N rows.
train_subset = dataset[0].shuffle(seed=42).select(range(train_samples))
test_subset = dataset[1].shuffle(seed=42).select(range(test_samples))

# Printing a Dataset shows its column names and row count, not the rows themselves.
print(train_subset)
print(test_subset)


In [13]:
import multiprocessing

In [14]:
import multiprocessing  # Import library for working with multiple processes

def process(row):
  """
  Render both preference conversations of one example as plain text.

  Args:
      row (dict): One dataset example. Its "chosen" and "rejected" entries are
          handed to the tokenizer's chat template.

  Returns:
      dict: The same ``row`` object, with "chosen" and "rejected" overwritten
          by the untokenized chat-template output.
  """
  for field in ("chosen", "rejected"):
    # tokenize=False asks for the templated result without tokenizing it
    row[field] = tokenizer.apply_chat_template(row[field], tokenize=False)
  return row

# Options shared by both map() calls: one worker per CPU core, and always
# recompute instead of reusing a previously cached result.
map_options = dict(
    num_proc=multiprocessing.cpu_count(),
    load_from_cache_file=False,
)

# Apply the chat template to each subset and store the results back into the
# two-element `dataset` list (index 0 = train, index 1 = test).
dataset[0] = train_subset.map(process, **map_options)
dataset[1] = test_subset.map(process, **map_options)

# Printing the list shows a summary of each split, not the rows themselves.
print(dataset)


Map (num_proc=4):   0%|          | 0/5000 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/163 [00:00<?, ? examples/s]

In [15]:
orpo_args = ORPOConfig(
    # --- Outputs and logging ---
    output_dir="./results/",  # training outputs are written here
    report_to="wandb",  # send metrics to Weights & Biases
    logging_steps=1,  # log every step

    # --- Optimisation schedule ---
    max_steps=1000,  # total number of training steps
    learning_rate=8e-6,
    lr_scheduler_type="linear",
    warmup_steps=10,
    optim="paged_adamw_8bit",

    # --- Batch sizing: 2 x 4 = 8 sequences per device per optimizer update ---
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=4,

    # --- ORPO-specific settings ---
    beta=0.1,  # weight of the odds-ratio preference term in the ORPO loss (not an optimizer momentum)
    max_length=1024,  # maximum total sequence length
    max_prompt_length=512,  # maximum prompt length

    # --- Evaluation cadence ---
    evaluation_strategy="steps",
    eval_steps=0.2,  # a value below 1 is a fraction of total steps: every 200 of the 1000 steps
)


In [16]:
# Assemble the ORPO trainer from the pieces prepared in the earlier cells.
trainer = ORPOTrainer(
    model=model,  # quantized base model prepared for k-bit training
    tokenizer=tokenizer,  # tokenizer returned by setup_chat_format
    peft_config=peft_config,  # LoRA adapter settings
    args=orpo_args,  # ORPOConfig with the training hyperparameters
    train_dataset=dataset[0],  # chat-templated training subset
    eval_dataset=dataset[1],  # chat-templated evaluation subset
)


max_steps is given, it will override any value given in num_train_epochs


In [None]:
# Run the fine-tuning loop with the configuration held by `trainer`
trainer.train()

In [None]:

# Save the trained model under the name in `new_model`; the merge cell reloads it
# from that same name with PeftModel.from_pretrained.
trainer.save_model(new_model)

## How to Merge the LoRA Adapter

In [None]:
# Drop the training objects so their memory can be reclaimed before reloading.
del trainer, model
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()  # release cached GPU memory as well

# Half precision on GPU, full precision on CPU.
merge_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

# Reload the tokenizer and the base model, this time without a quantization
# config, so the adapter can be merged into ordinary weights.
tokenizer = AutoTokenizer.from_pretrained(base_model)
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    torch_dtype=merge_dtype,
    device_map="auto",
    low_cpu_mem_usage=True,
    return_dict=True,
)

# Apply the same chat-format setup that was used before training.
model, tokenizer = setup_chat_format(model, tokenizer)

# Load the adapter saved under `new_model`, merge it into the base model,
# and keep the merged result.
model = PeftModel.from_pretrained(model, new_model).merge_and_unload()

# Publish the merged model and its tokenizer to the Hugging Face Hub.
model.push_to_hub(new_model, use_temp_dir=False)
tokenizer.push_to_hub(new_model, use_temp_dir=False)
