**Author:** Pratik Vyas

**Classification:** Binary classification (Cancer, Non-Cancer)

**Use case:** Distributed fine-tuning of the LLM 'meta-llama/Llama-3.1-8B-Instruct' with LoRA

In [None]:
# Install the notebook's runtime dependencies (IPython shell escapes).
!pip3 install -q -U transformers accelerate
!pip3 install -q -U bitsandbytes
!pip3 install -q -U wandb
# Unpinned trl install kept for reference; the pinned release below is the one in use.
# NOTE(review): presumably pinned because the SFTTrainer arguments used later
# in this notebook differ between trl releases — confirm before unpinning.
# !pip3 install -q  -U trl
!pip3 install -q  "trl == 0.12.0"

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline

In [None]:
import os
from getpass import getpass

# SECURITY: credentials must never be stored in the notebook itself. The keys
# that used to be hard-coded in this cell were committed in plain text and
# should be revoked and re-issued.
# Each secret is taken from the environment when already set; otherwise the
# user is prompted once (input is not echoed) and the value is kept only in
# this process's environment for the cells below.
for _secret_name, _secret_label in (
    ("HF_TOKEN", "Hugging Face access token"),
    ("WB_KEY", "Weights & Biases API key"),
):
    if not os.environ.get(_secret_name):
        os.environ[_secret_name] = getpass(f"Enter {_secret_label}: ")

In [None]:
# Base checkpoint to fine-tune, and the name under which the fine-tuned
# adapter is saved later in this notebook.
model_id = "meta-llama/Llama-3.1-8B-Instruct"
new_model = "Llama-3.1-8B-Inst_cancer_classification_finetuned"

# `token` is the current keyword for authenticated downloads; the older
# `use_auth_token` spelling is deprecated in transformers.
tokenizer = AutoTokenizer.from_pretrained(model_id, token=os.environ["HF_TOKEN"])

In [None]:

# Quantization settings used to load the base model in 4-bit precision.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,  # Activate 4-bit precision base model loading
    bnb_4bit_quant_type="nf4",  # Quantization type (fp4 or nf4)
    bnb_4bit_compute_dtype="float16",  # Compute dtype for 4-bit base models
    bnb_4bit_use_double_quant=False,  # Nested (double) quantization is left off
    llm_int8_enable_fp32_cpu_offload=False,  # CPU offloading of specific layers is left off
)

# Load the base model with the quantization settings above.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map="auto",  # Let Transformers automatically decide device placement
    # `token` replaces the deprecated `use_auth_token` keyword.
    token=os.environ["HF_TOKEN"],
)

In [None]:
# Print the loaded model object to inspect its module structure.
print(model)

In [None]:
# integrate Weights & Biases (W&B) with training process for tracking, monitoring, and collaboration
import os
import wandb

# Authenticate with the API key stored in the WB_KEY environment variable.
wandb.login(key=os.environ["WB_KEY"])
# Start a run in the "cancer_classification" project; the handle is kept in `run`.
run = wandb.init(
    project="cancer_classification",
    job_type="training",
    anonymous="allow",
)


In [None]:

from transformers import (AutoModelForCausalLM,
                          AutoTokenizer,
                          BitsAndBytesConfig,
                          TrainingArguments,
                          pipeline,
                          logging)
from sklearn.metrics import (accuracy_score,
                             classification_report,
                             confusion_matrix)
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
import os
from tqdm import tqdm
import bitsandbytes as bnb
import torch
import torch.nn as nn
import transformers
from datasets import Dataset
from peft import LoraConfig, PeftConfig
from trl import SFTTrainer
from trl import setup_chat_format

# Load, preprocess cancer data from zip file

In [None]:
import zipfile
import pandas as pd
import io
import os

def load_text_files_from_zip_to_dataframe(zip_path, directory_in_zip, **pandas_kwargs):
    """
    Load the ``.txt`` files of one directory inside a ZIP archive into a DataFrame.

    Args:
        zip_path: Path of the ZIP archive on disk.
        directory_in_zip: Directory inside the archive whose text files are
            read (searched recursively). A trailing ``/`` is optional; an
            empty string selects the whole archive.
        **pandas_kwargs: Accepted for backward compatibility; currently unused.

    Returns:
        A DataFrame with two columns: ``texts`` (the stripped file contents,
        decoded as UTF-8 with undecodable bytes dropped) and ``filename``
        (the file name without directory or extension). ``None`` is returned,
        after printing a message, when the archive is missing or invalid,
        when no matching text file is found, or on any other error.
    """
    # Anchor the prefix on a path separator so that 'Dataset/Cancer' does not
    # also select members of a sibling such as 'Dataset/CancerExtra/'.
    directory = directory_in_zip.rstrip('/')
    prefix = f"{directory}/" if directory else ""

    texts = []
    stems = []
    try:
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            for member in zip_ref.namelist():
                # Only process text files below the requested directory.
                if not (member.startswith(prefix) and member.endswith('.txt')):
                    continue
                try:
                    content = zip_ref.read(member).decode('utf-8', errors='ignore').strip()
                except Exception as e:
                    # Best effort: one unreadable member must not abort the load.
                    print(f"Error reading file {member}: {e}")
                    continue
                texts.append(content)
                # Keep only the bare file name (no directory, no extension).
                stems.append(os.path.splitext(os.path.basename(member))[0])

        if not texts:
            print(f"No text files found in directory: {directory_in_zip}")
            return None

        return pd.DataFrame({'texts': texts, 'filename': stems})

    except FileNotFoundError:
        print(f"Error: ZIP file not found: {zip_path}")
        return None
    except zipfile.BadZipFile as e:
        print(f"Error: Invalid ZIP file: {e}")
        return None
    except Exception as e:
        print(f"An error occurred: {e}")
        return None

In [None]:
zip_path = "Dataset.zip"

# The loader signals every failure by returning None, so check the result
# before using it; otherwise the label assignment fails with an unrelated
# "NoneType does not support item assignment" error.
cancer_df_row = load_text_files_from_zip_to_dataframe(zip_path, 'Dataset/Cancer')
if cancer_df_row is None:
    raise RuntimeError(f"Could not load 'Dataset/Cancer' from {zip_path}")
cancer_df_row['labels'] = True  # positive class

non_cancer_df_row = load_text_files_from_zip_to_dataframe(zip_path, 'Dataset/Non-Cancer')
if non_cancer_df_row is None:
    raise RuntimeError(f"Could not load 'Dataset/Non-Cancer' from {zip_path}")
non_cancer_df_row['labels'] = False  # negative class

display(cancer_df_row.head())
display(non_cancer_df_row.head())

# Concatenate cancer_df_row and non_cancer_df_row

In [None]:
# Stack the cancer and non-cancer frames into one frame with a fresh integer index.
data_df = pd.concat([cancer_df_row, non_cancer_df_row], ignore_index=True)
display(data_df)

# Split the data into train, test, and validation sets

In [None]:
from sklearn.model_selection import train_test_split

def split_train_test_val(df, train_size=0.6, val_size=0.2, test_size=0.2, random_state=42):
    """
    Split a DataFrame into train, test, and validation sets with the given ratios.

    Args:
        df: The DataFrame to split.
        train_size: Fraction of rows for the training set.
        val_size: Fraction of rows for the validation set.
        test_size: Fraction of rows for the test set.
        random_state: Seed passed to both underlying ``train_test_split`` calls.

    Returns:
        The tuple ``(train_df, test_df, val_df)`` — note the order.

    Raises:
        TypeError: If ``df`` is not a pandas DataFrame.
        ValueError: If the three fractions do not sum to 1.
    """
    if not isinstance(df, pd.DataFrame):
        raise TypeError("Input must be a Pandas DataFrame.")

    # Compare with a tolerance: ratios such as 0.7 / 0.2 / 0.1 do not add up
    # to exactly 1.0 in binary floating point.
    if abs((train_size + test_size + val_size) - 1.0) > 1e-9:
        raise ValueError("Train, test, and validation sizes must sum to 1.")

    # First split into training and remaining data.
    train_df, remaining_df = train_test_split(df, train_size=train_size, random_state=random_state)

    # Share of the remaining rows that belongs to the test set.
    remaining_proportion = 1 - train_size
    test_proportion = test_size / remaining_proportion

    # train_test_split returns (first part, second part) and `train_size`
    # sizes the FIRST part, so this hands `test_proportion` of the remainder
    # to test_df and the rest to val_df.
    test_df, val_df = train_test_split(remaining_df, train_size=test_proportion, random_state=random_state)

    return train_df, test_df, val_df

In [None]:
train_df, test_df, val_df = split_train_test_val(data_df, train_size=0.6, val_size=0.2, test_size=0.2)


def _filename_as_leading_id(frame):
    """Rename 'filename' to 'ID', make it the first column and replace the old index with 0..n-1."""
    renamed = frame.rename(columns={'filename': 'ID'})
    # set_index discards the shuffled integer index; reset_index then moves
    # 'ID' back out as the leading column.
    return renamed.set_index('ID').reset_index()


train_df = _filename_as_leading_id(train_df)
test_df = _filename_as_leading_id(test_df)
val_df = _filename_as_leading_id(val_df)

# Preview each split together with its shape.
for heading, shape_label, frame in (
    ("Cancer Train DataFrame:", "Train", train_df),
    ("\nCancer Test DataFrame:", "Test", test_df),
    ("\nCancer Validation DataFrame:", "Validation", val_df),
):
    print(heading)
    display(frame.head())
    print(f"{shape_label} shape: {frame.shape}")

Preprocess text, add prompt and label

In [None]:
# Function to perform the text processing

import re
# Compiled once instead of on every row: matches an "Abstract:" header at the
# start of any line, case-insensitively.
_ABSTRACT_HEADER_RE = re.compile(r"^\s*Abstract:\s*", re.IGNORECASE | re.MULTILINE)


def process_text(row,isTestSetOrValSet):
  """Turn one raw record into the prompt string used for training or inference.

  Args:
    row: Mapping (e.g. a DataFrame row) with a 'texts' entry holding the raw
      record and a 'labels' entry holding the class label.
    isTestSetOrValSet: When falsy, the label is appended after "Label: " so
      the example can be used for supervised fine-tuning; when truthy, the
      prompt ends at "Label: ".

  Returns:
    The prompt string, or the sentinel "Missing Abstract" when the record has
    no "Abstract:" header.
  """
  raw_text = row['texts']
  if not _ABSTRACT_HEADER_RE.search(raw_text):
    return "Missing Abstract"

  # Remove identifiers such as '<ID:123>', 'Title:', 'Abstract:'
  normalized_text = re.sub(r"<(?:ID:\d+)>|Title:|Abstract:", "", raw_text, flags=re.IGNORECASE)

  # Standardize common abbreviations. No trailing \b after the period: a
  # period followed by a space is not a word boundary, so the usual
  # "et al. " / "vol. " spellings would never match with it.
  normalized_text = re.sub(r"\bet al\.", "and others", normalized_text, flags=re.IGNORECASE)
  normalized_text = re.sub(r"\bvol\.", "volume", normalized_text, flags=re.IGNORECASE)

  # Remove common punctuation marks, but leave periods at the end of sentences
  normalized_text = re.sub(r"[,;@#$%^&*()]", "", normalized_text)

  # Collapse all whitespace runs to single spaces and add a space after a
  # period that is directly followed by a capital letter.
  normalized_text = " ".join(normalized_text.split())
  normalized_text = re.sub(r"\.(?=[A-Z])", ". ", normalized_text)

  # Instruction prompt. The literal is kept exactly as it was (including the
  # spaces introduced by the line continuations) so the prompt text seen by
  # the model does not change.
  prompt=f"You are a medical text classifier. Your task is to determine if the following text is related to cancer or not.\
          \nLabel text only with 'True' or 'False'.\
          \nIf text is related to cancer, Label as 'True' otherwise Label as 'False'.\
          \nDo not provide any additional explanation.\
          \n\nText:\n"

  text = prompt + normalized_text.strip() + "\n\nLabel: "

  # Append the actual label only for training examples.
  if not isTestSetOrValSet:
    text = text + str(row['labels'])

  return text

In [None]:
# Build the prompt column for every split, then discard the raw 'texts' column.
# Only the training prompts carry the label after "Label: "; test and
# validation prompts stop at "Label: ".
train_df['text'] = train_df.apply(process_text, axis=1, isTestSetOrValSet=False)
train_df.drop(columns='texts', inplace=True)

test_df['text'] = test_df.apply(process_text, axis=1, isTestSetOrValSet=True)
test_df.drop(columns='texts', inplace=True)

val_df['text'] = val_df.apply(process_text, axis=1, isTestSetOrValSet=True)
val_df.drop(columns='texts', inplace=True)


Remove rows with a missing abstract

In [None]:
# Remove rows where 'text' column is "Missing Abstract".
# The index is renumbered afterwards: without it the filtered frames keep
# gaps in their index, and a label-based lookup such as df["text"][0] raises
# KeyError whenever row 0 was one of the removed rows.
train_df = train_df[train_df['text'] != "Missing Abstract"].reset_index(drop=True)
test_df = test_df[test_df['text'] != "Missing Abstract"].reset_index(drop=True)
val_df = val_df[val_df['text'] != "Missing Abstract"].reset_index(drop=True)

In [None]:
# Positional access: the frame was filtered, so index label 0 may no longer exist.
train_df["text"].iloc[0]

In [None]:
# Positional access: the frame was filtered, so index label 0 may no longer exist.
test_df["text"].iloc[0]

In [None]:
# Positional access: the frame was filtered, so index label 0 may no longer exist.
val_df["text"].iloc[0]

In [None]:
# Longest training prompt measured in whitespace-separated words. This is a
# word count, not a tokenizer token count, so the message says "words".
# `default=0` keeps the cell from failing on an empty training frame.
word_counts_text = max((len(text.split()) for text in train_df["text"]), default=0)
print(f"Maximum number of words in text: {word_counts_text}")

Convert into a dictionary (expected input for model training)

In [None]:
from datasets import Dataset
import pandas as pd

# Column-oriented dictionaries ({column name: list of values}) for each split.
train_df_dict, test_df_dict, val_df_dict = (
    frame.to_dict(orient='list') for frame in (train_df, test_df, val_df)
)

# Rebuild a DataFrame from each dictionary and wrap it in a `datasets.Dataset`.
training_dataset, test_dataset, val_dataset = (
    Dataset.from_pandas(pd.DataFrame(columns))
    for columns in (train_df_dict, test_df_dict, val_df_dict)
)

print(training_dataset[0])

# Model evaluation before fine-tuning

In [None]:
from transformers import pipeline, logging


def predict(test, model, tokenizer):
    """Generate a 'True' / 'False' / 'none' prediction for every row of ``test``.

    Args:
        test: DataFrame with a 'text' column holding prompts that end in
            "Label: ".
        model: Causal language model handed to the text-generation pipeline.
        tokenizer: Tokenizer matching ``model``.

    Returns:
        A list with one entry per row, in row order: "True" or "False" when
        that word occurs (case-insensitively) in the text generated after the
        last "Label:", otherwise "none". "True" is checked first.
    """
    categories = ["True", "False"]
    y_pred = []

    total_iterations = len(test)
    update_interval = max(1, total_iterations // 5)  # Update every 20% (minimum 1)

    # Suppress Transformers warnings and info messages
    logging.set_verbosity_error()

    # Build the pipeline once; it does not depend on the individual prompt.
    pipe = pipeline(task="text-generation",
                    model=model,
                    tokenizer=tokenizer,
                    max_new_tokens=3,
                    temperature=0.1)

    reported = 0  # rows already reflected in the progress bar
    with tqdm(total=total_iterations, desc="Processing") as pbar:
        for i in range(total_iterations):
            # Read from the `test` argument, not from a notebook global.
            prompt = test.iloc[i]["text"]

            result = pipe(prompt)
            # The pipeline output contains the prompt followed by the
            # generated tokens, so keep only what follows the last "Label:".
            answer = result[0]['generated_text'].split("Label:")[-1].strip().lower()

            # First category contained in the answer, or "none".
            y_pred.append(next((c for c in categories if c.lower() in answer), "none"))

            # Refresh roughly every 20%, advancing by the number of rows
            # actually processed so the bar never overshoots the total.
            done = i + 1
            if done % update_interval == 0 or done == total_iterations:
                pbar.update(done - reported)
                reported = done

    return y_pred

# Predictions on the test split with the model as loaded above (this cell
# sits under the "Model evaluation before fine-tuning" heading).
y_pred = predict(test_df, model, tokenizer)

In [None]:
# Tally how often each predicted label ("True", "False", "none") occurs.
y_pred_series = pd.Series(y_pred)
value_counts = y_pred_series.value_counts()

# Heading and counts, one after the other.
print("Value Counts:", value_counts, sep="\n")

In [None]:
from sklearn.metrics import (accuracy_score,
                             classification_report,
                             confusion_matrix)
from sklearn.model_selection import train_test_split


def evaluate(y_true, y_pred):
    """Print accuracy, per-class accuracy, a classification report and a confusion matrix.

    Labels are normalised through their string form, so booleans
    (``True`` / ``False``) and strings (``"True"`` / ``"False"``) are treated
    alike. Anything else — e.g. the ``"none"`` that ``predict`` emits for an
    unparsable answer — is mapped to a separate class 2 instead of being
    counted as a positive.

    Args:
        y_true: Iterable of ground-truth labels.
        y_pred: Iterable of predicted labels, aligned with ``y_true``.

    Returns:
        None. All results are printed.
    """
    label_codes = {'true': 1, 'false': 0}
    unparsed_code = 2

    def map_func(x):
        # str() makes the lookup independent of bool / numpy.bool_ / str input.
        return label_codes.get(str(x).strip().lower(), unparsed_code)

    y_true = np.array([map_func(value) for value in y_true], dtype=int)
    y_pred = np.array([map_func(value) for value in y_pred], dtype=int)

    # Calculate accuracy
    accuracy = accuracy_score(y_true=y_true, y_pred=y_pred)
    print(f'Accuracy: {accuracy:.3f}')

    # Accuracy restricted to the rows of each true class.
    for label in sorted(set(y_true.tolist())):
        rows_with_label = y_true == label
        accuracy = accuracy_score(y_true[rows_with_label], y_pred[rows_with_label])
        print(f'Accuracy for label {label}: {accuracy:.3f}')

    # Generate classification report
    class_report = classification_report(y_true=y_true, y_pred=y_pred)
    print('\nClassification Report:')
    print(class_report)

    # Confusion matrix over 0 = False, 1 = True, 2 = unparsable prediction.
    conf_matrix = confusion_matrix(y_true=y_true, y_pred=y_pred, labels=[0, 1, 2])
    print('\nConfusion Matrix:')
    print(conf_matrix)

In [None]:
# Ground-truth labels: the 'labels' column of the test split.
y_true = test_df.loc[:,'labels']

In [None]:
logging.set_verbosity_error()  # Suppress warnings and informational messages
# Print the metrics for the predictions currently held in `y_pred`.
evaluate(y_true, y_pred)

# LoRA hyperparameter tuning (optional)
## (r, lora_alpha, lora_dropout, target_modules)

In [None]:
# Install Optuna, which the hyperparameter search in the next cell imports.
!pip install -q optuna

In [None]:
import optuna
from accelerate import Accelerator
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments
from datasets import load_dataset
from datasets import DatasetDict
import time


# Wall-clock start; the elapsed time is printed once the study has finished.
start_time = time.time()


# Define the objective function
def objective(trial):
    """Run one Optuna trial and return the validation loss to minimise.

    A LoRA configuration (rank/alpha pair, dropout, target modules) is sampled
    from ``trial``; a fresh 4-bit copy of ``model_id`` is loaded, fine-tuned
    for one epoch on the notebook-level ``training_dataset`` with
    ``SFTTrainer`` and evaluated on ``val_dataset``.

    Args:
        trial: The ``optuna`` trial used to sample the hyperparameters.

    Returns:
        The ``eval_loss`` value reported by ``trainer.evaluate()``.
    """
    import gc  # local import: only needed for the per-trial cleanup below

    # Clear GPU cache before loading the model for this trial
    torch.cuda.empty_cache()

    num_epochs = 1  # desired number of epochs

    train_dataset = training_dataset
    validation_dataset = val_dataset

    # Categorical choices are sampled as plain strings and looked up here:
    # Optuna only documents None/bool/int/float/str as supported choice types
    # and warns about tuples and lists.
    lora_rank_alpha_choices = {
        "r32_a16": (32, 16),
        "r32_a32": (32, 32),
        "r64_a32": (64, 32),
    }
    lora_r, lora_alpha = lora_rank_alpha_choices[
        trial.suggest_categorical("lora_combination", list(lora_rank_alpha_choices))
    ]

    # Higher rates for a smaller dataset or when overfitting is observed.
    lora_dropout = trial.suggest_categorical("lora_dropout", [0.2, 0.3, 0.4])

    target_module_choices = {
        "q_v": ["q_proj", "v_proj"],
        "q_k_v": ["q_proj", "k_proj", "v_proj"],
        "attn_mlp": [
            "q_proj",
            "o_proj",
            "k_proj",
            "v_proj",
            "gate_proj",
            "up_proj",
            "down_proj",
        ],
    }
    target_modules = target_module_choices[
        trial.suggest_categorical("target_modules", list(target_module_choices))
    ]

    lora_config = LoraConfig(
        r=lora_r,
        lora_alpha=lora_alpha,
        lora_dropout=lora_dropout,
        # Actually apply the sampled modules; otherwise the study would
        # report a "best" value for a parameter that never affected training.
        target_modules=target_modules,
        task_type="CAUSAL_LM",
    )

    # Define training arguments
    training_arguments = transformers.TrainingArguments(
        per_device_train_batch_size=1,
        per_device_eval_batch_size=1,
        gradient_accumulation_steps=3,
        num_train_epochs=num_epochs,
        warmup_steps=3,
        eval_strategy="steps",
        eval_steps=0.2,
        learning_rate=1e-4,
        weight_decay=1e-2,
        fp16=True,
        bf16=False,
        logging_steps=1,
        output_dir="outputs",
        optim="paged_adamw_8bit",
        # The string "none" disables reporting; None does not.
        report_to="none",
        label_names=["labels"],  # target column name
    )

    # Initialize the Accelerator for distributed processing
    accelerator = Accelerator()

    # Quantization settings for this trial's copy of the base model.
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",  # Quantization type (fp4 or nf4)
        bnb_4bit_compute_dtype="float16",  # Compute dtype for 4-bit base models
        bnb_4bit_use_double_quant=False,  # Nested (double) quantization is left off
        llm_int8_enable_fp32_cpu_offload=False,  # CPU offloading is left off
    )

    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        quantization_config=bnb_config,
        device_map="auto",  # Let Transformers automatically decide device placement
        # `token` replaces the deprecated `use_auth_token` keyword.
        token=os.environ["HF_TOKEN"],
    )

    trainer = None
    try:
        # Prepare the model and datasets with the Accelerator
        model, train_dataset, validation_dataset = accelerator.prepare(
            model, train_dataset, validation_dataset
        )

        # Note: these two assignments modify the notebook-level tokenizer.
        tokenizer.pad_token = tokenizer.eos_token  # Ensure pad token is set
        tokenizer.padding_side = "left"

        trainer = SFTTrainer(
            model=model,
            processing_class=tokenizer,
            train_dataset=train_dataset,
            eval_dataset=validation_dataset,
            dataset_text_field="text",
            args=training_arguments,
            peft_config=lora_config,
            formatting_func=lambda example: example['text'],  # preprocessing function before input
        )

        # Train, then evaluate on the validation set.
        trainer.train()
        eval_results = trainer.evaluate()

        # Return the evaluation metric to optimize
        return eval_results["eval_loss"]
    finally:
        # Drop this trial's model and trainer before the next trial loads
        # another copy of the base model (best effort).
        del trainer, model
        gc.collect()
        torch.cuda.empty_cache()


# Create an Optuna study and minimise the value returned by `objective`.
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=10)

# Print the best hyperparameters
best_params = study.best_params
print("Best hyperparameters: ", best_params)

# Print the best performance metric (the objective value of the best trial).
best_trial = study.best_trial
print("Best eval_loss: ", best_trial.value)

end_time = time.time()
print("\n\n--->Execution Time:", (end_time - start_time) / 60, "minutes")


# Fine-tune model

In [None]:
################################################################################
# LoRA settings and training arguments for the final fine-tuning run.
#
# Modules that receive LoRA adapters (original author's descriptions):
#   q_proj, k_proj, v_proj : query / key / value projections of the attention mechanism
#   o_proj                 : output projection of the attention mechanism
#   gate_proj              : gating projection, possibly of a gated feed-forward network
#   up_proj, down_proj     : up / down projections, likely of the feed-forward network
################################################################################
best_lora_r = 16
best_lora_alpha = 8
best_lora_dropout = 0
best_target_modules = [
    "q_proj", "k_proj", "v_proj",
    "o_proj", "gate_proj", "up_proj", "down_proj",
]

# LoRA configuration built from the values above.
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=best_target_modules,
    r=best_lora_r,
    lora_alpha=best_lora_alpha,
    lora_dropout=best_lora_dropout,
)

# Training arguments, collected in a dict and expanded into the constructor.
_final_training_kwargs = {
    "output_dir": "outputs_model_training",
    "num_train_epochs": 1,
    "per_device_train_batch_size": 1,
    "per_device_eval_batch_size": 1,
    "gradient_accumulation_steps": 3,
    "gradient_checkpointing": True,
    "warmup_steps": 3,
    "learning_rate": 1e-4,
    "weight_decay": 1e-2,
    "optim": "paged_adamw_8bit",
    "fp16": True,
    "bf16": False,
    "eval_strategy": "steps",
    "eval_steps": 0.2,
    "logging_steps": 1,
    "report_to": "wandb",
    "label_names": ["labels"],  # target column name
}
training_arguments = transformers.TrainingArguments(**_final_training_kwargs)

Distributed training using Accelerator

In [None]:
# from transformers import AdamW
from accelerate import Accelerator
from torch.optim import AdamW


# Initialize the Accelerator
accelerator = Accelerator()

# Initialize the optimizer
# NOTE(review): this optimizer is built over the base model's parameters and
# is not passed to the SFTTrainer constructed later in this file — confirm
# whether it is still needed.
optimizer = AdamW(model.parameters(), lr=training_arguments.learning_rate)

# Prepare the model, optimizer, and datasets with the Accelerator
# (the tokenizer is not part of this call).
model, optimizer, training_dataset, val_dataset = accelerator.prepare(
    model, optimizer, training_dataset, val_dataset
)

In [None]:

def create_prompt(example):
    """Return the example's 'text' field wrapped in a one-element list."""
    prompt_text = example['text']
    return [prompt_text]

In [None]:
from accelerate import DistributedType
import time



start_time = time.time()

# Clear GPU cache before training
torch.cuda.empty_cache()

SAVE_MODEL = True

tokenizer.pad_token = tokenizer.eos_token  # Ensure pad token is set
# Left padding, as generally recommended for decoder-only models.
tokenizer.padding_side = "left"

# Training and evaluation are limited to a subset because of time/compute
# constraints. The limits are capped at the dataset sizes, because
# select(range(k)) fails when the dataset has fewer than k rows.
train_subset = training_dataset.select(range(min(300, len(training_dataset))))
eval_subset = val_dataset.select(range(min(100, len(val_dataset))))

trainer = SFTTrainer(
    model=model,
    processing_class=tokenizer,
    train_dataset=train_subset,
    eval_dataset=eval_subset,
    dataset_text_field="text",
    args=training_arguments,
    peft_config=lora_config,
    formatting_func=lambda example: example['text'],  # preprocessing function before input
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False, return_tensors="pt"),
)

model.config.use_cache = False

# Train the final model
trainer.train()

# Save the final model.
# wait_for_everyone() synchronises all processes of a distributed run so that
# every process has finished training before the main process saves.
accelerator.wait_for_everyone()
if accelerator.is_local_main_process and SAVE_MODEL:
    print(f"saving model {new_model}")
    trainer.model.save_pretrained(new_model)
    # Save through the tokenizer object itself (the same one given to the
    # trainer as `processing_class`); `trainer.tokenizer` is a deprecated
    # accessor.
    tokenizer.save_pretrained(new_model)

end_time = time.time()
print("\n\n--->Execution Time:", (end_time - start_time) / 60, "minutes")

# Evaluate model after fine-tuning

In [None]:
# Predictions on the test split after training; overwrites the earlier `y_pred`.
# NOTE(review): this passes `model` rather than `trainer.model` — confirm that
# `model` carries the trained adapter weights at this point.
y_pred = predict(test_df, model, tokenizer)


In [None]:
# Tally how often each predicted label ("True", "False", "none") occurs.
y_pred_series = pd.Series(y_pred)
value_counts = y_pred_series.value_counts()

# Heading and counts, one after the other.
print("Value Counts:", value_counts, sep="\n")

In [None]:
# Print the metrics for the predictions currently held in `y_pred`.
evaluate(y_true, y_pred)

In [None]:
# Close the Weights & Biases run.
wandb.finish()
# Re-enable `use_cache`, which was set to False before training.
model.config.use_cache = True

# Merge the fine-tuned LoRA adapter with the pre-trained model


In [None]:
# Clear GPU cache before loading the model for the second time
torch.cuda.empty_cache()


from peft import LoraConfig, PeftModel

# Reload the base checkpoint.
# NOTE(review): the base model is reloaded with the 4-bit `bnb_config`, so the
# adapter is merged into quantized weights — confirm this is intended rather
# than merging into a non-quantized (e.g. float16) base model.
base_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map="auto",
)
# Attach the adapter saved under `new_model`, then merge it into the base model.
# Note that this rebinds the notebook-level name `model`.
model = PeftModel.from_pretrained(base_model, new_model)
# NOTE(review): `finetuned_model` is not used in the visible remainder of the file.
finetuned_model = model.merge_and_unload()

# Push model to the Hugging Face Hub


In [None]:
# Upload the trainer's model to the Hub under the name `new_model`.
# NOTE(review): this pushes `trainer.model`, not the merged `finetuned_model`
# created in the previous cell — confirm which artifact should be published.
trainer.model.push_to_hub(new_model, use_temp_dir=False)