<a href="https://colab.research.google.com/github/Jeremy-su1/ai-algorithm/blob/main/llama3_1b_instrunct_intent_classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [177]:
!pip install datasets
!pip install scikit-multilearn
!pip install peft
!pip install transformers
!pip install bitsandbytes
!pip install evaluate
!pip install huggingface_hub
!pip install trl



In [178]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [179]:
from huggingface_hub import login

# 로그인 함수 호출
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [180]:
import os
import random
import functools
import csv
import json
import numpy as np
import torch
import torch.nn.functional as F
from functools import partial
from sklearn.metrics import f1_score
from skmultilearn.model_selection import iterative_train_test_split
from datasets import load_dataset, Dataset, DatasetDict
from peft import (
    LoraConfig,
    prepare_model_for_kbit_training,
    get_peft_model,
    AutoPeftModelForCausalLM,
    PeftModel
)
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    DataCollatorForLanguageModeling,
    Trainer,
    pipeline
)
from trl import SFTTrainer
import bitsandbytes as bnb

In [181]:
#model.save_pretrained('/content/drive/My Drive/AiExpertCource/pj/intent/intent_llama3_1b')
#tokenizer.save_pretrained('/content/drive/My Drive/AiExpertCource/pj/intent/intent_llama3_1b')

In [182]:
def create_bnb_config(load_in_4bit, bnb_4bit_use_double_quant, bnb_4bit_quant_type, bnb_4bit_compute_dtype):
    """
    Configures model quantization method using bitsandbytes to speed up training and inference

    :param load_in_4bit: Load model in 4-bit precision mode
    :param bnb_4bit_use_double_quant: Nested quantization for 4-bit model
    :param bnb_4bit_quant_type: Quantization data type for 4-bit model
    :param bnb_4bit_compute_dtype: Computation data type for 4-bit model
    """

    bnb_config = BitsAndBytesConfig(
        load_in_4bit = load_in_4bit,
        bnb_4bit_use_double_quant = bnb_4bit_use_double_quant,
        bnb_4bit_quant_type = bnb_4bit_quant_type,
        bnb_4bit_compute_dtype = bnb_4bit_compute_dtype,
    )

    return bnb_config

In [183]:
def load_model(model_name, bnb_config):
    """
    Loads model and model tokenizer

    :param model_name: Hugging Face model name
    :param bnb_config: Bitsandbytes configuration
    """

    # Get number of GPU device and set maximum memory
    n_gpus = torch.cuda.device_count()
    max_memory = f'{40960}MB'

    # Load model
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config = bnb_config,
        device_map = "auto", # dispatch the model efficiently on the available resources
        max_memory = {i: max_memory for i in range(n_gpus)},
    )

    # Load model tokenizer with the user authentication token
    tokenizer = AutoTokenizer.from_pretrained(model_name, use_auth_token = True)

    # Set padding token as EOS token
    tokenizer.pad_token = tokenizer.eos_token

    return model, tokenizer

In [184]:
################################################################################
# transformers parameters
################################################################################

# The pre-trained model from the Hugging Face Hub to load and fine-tune
model_name = "meta-llama/Llama-3.2-1B-Instruct"

################################################################################
# bitsandbytes parameters
################################################################################

# Activate 4-bit precision base model loading
load_in_4bit = True

# Activate nested quantization for 4-bit base models (double quantization)
bnb_4bit_use_double_quant = True

# Quantization type (fp4 or nf4)
bnb_4bit_quant_type = "nf4"

# Compute data type for 4-bit base models
bnb_4bit_compute_dtype = torch.bfloat16

In [186]:
# Load model from Hugging Face Hub with model name and bitsandbytes configuration

bnb_config = create_bnb_config(load_in_4bit, bnb_4bit_use_double_quant, bnb_4bit_quant_type, bnb_4bit_compute_dtype)

model, tokenizer = load_model(model_name, bnb_config)



In [187]:
dataset_train = load_dataset(
    'csv',
    data_files='/content/drive/My Drive/AiExpertCource/pj/intent/intent_dataset_train.csv',
    split='train'
)
dataset_valid = load_dataset(
    'csv',
    data_files='/content/drive/My Drive/AiExpertCource/pj/intent/intent_dataset_valid.csv',
    split='train'
)

In [188]:
dataset = DatasetDict({
    'train': Dataset.from_dict({'Concat_Text': dataset_train['text'], 'intents': dataset_train['intents']}),
    'val': Dataset.from_dict({'Concat_Text': dataset_valid['text'], 'intents': dataset_valid['intents']}),
})

dataset = dataset['train']

In [189]:
def create_prompt_formats(sample):
  sample["text"] = f"""
            Classify the text into Discrepancy, Error, Review, Conceptual, Learning, How-to, or Other. The descriptions for each category are as follows, and determine which category the text belongs to.
            Discrepancy : Seeking explanations for software behavior discrepancies not explicitly related to errors.
            Error : Seeking solutions for errors or exceptions.
            Review : Looking for im-proved solutions or guidance to make well-informed decisions.
            Conceptual : Seeking information or explanations with-out concrete imple-mentations.
            Learning : Seeking learning re-sources for libraries, tools, or program-ming languages.
            How-to : Requesting step-by-step instructions for specific tasks.
            Other: Does not belong to Discrepancy, Error, Review, Conceptual, Learning, or How-to.

            [text] = {sample["Concat_Text"]}
            [categories] = {sample["intents"]}
            """.strip()
  return sample

In [190]:
def get_max_length(model):
    """
    Extracts maximum token length from the model configuration

    :param model: Hugging Face model
    """

    # Pull model configuration
    conf = model.config
    # Initialize a "max_length" variable to store maximum sequence length as null
    max_length = None
    # Find maximum sequence length in the model configuration and save it in "max_length" if found
    for length_setting in ["n_positions", "max_position_embeddings", "seq_length"]:
        max_length = getattr(model.config, length_setting, None)
        if max_length:
            print(f"Found max lenth: {max_length}")
            break
    # Set "max_length" to 1024 (default value) if maximum sequence length is not found in the model configuration
    if not max_length:
        max_length = 2048
        print(f"Using default max length: {max_length}")
    return max_length

In [191]:
def preprocess_batch(batch, tokenizer, max_length):
    """
    Tokenizes dataset batch

    :param batch: Dataset batch
    :param tokenizer: Model tokenizer
    :param max_length: Maximum number of tokens to emit from the tokenizer
    """

    return tokenizer(
        batch["text"],
        max_length = max_length,
        truncation = True,
    )

In [192]:
def preprocess_dataset(tokenizer: AutoTokenizer, max_length: int, seed, dataset: str):
    """
    Tokenizes dataset for fine-tuning

    :param tokenizer (AutoTokenizer): Model tokenizer
    :param max_length (int): Maximum number of tokens to emit from the tokenizer
    :param seed: Random seed for reproducibility
    :param dataset (str): Instruction dataset
    """

    # Add prompt to each sample
    print("Preprocessing dataset...")
    dataset = dataset.map(create_prompt_formats)

    # Apply preprocessing to each batch of the dataset & and remove "instruction", "input", "output", and "text" fields
    _preprocessing_function = partial(preprocess_batch, max_length = max_length, tokenizer = tokenizer)
    dataset = dataset.map(
        _preprocessing_function,
        batched = True,
        remove_columns = ["Concat_Text", "intents", "text"],
    )

    # Filter out samples that have "input_ids" exceeding "max_length"
    dataset = dataset.filter(lambda sample: len(sample["input_ids"]) < max_length)

    # Shuffle dataset
    dataset = dataset.shuffle(seed = seed)

    return dataset

In [193]:
# Random seed
seed = 33

max_length = get_max_length(model)
preprocessed_dataset = preprocess_dataset(tokenizer, max_length, seed, dataset)

Found max lenth: 131072
Preprocessing dataset...


Map:   0%|          | 0/705 [00:00<?, ? examples/s]

Map:   0%|          | 0/705 [00:00<?, ? examples/s]

Filter:   0%|          | 0/705 [00:00<?, ? examples/s]

In [194]:
def create_peft_config(r, lora_alpha, target_modules, lora_dropout, bias, task_type):
    """
    Creates Parameter-Efficient Fine-Tuning configuration for the model

    :param r: LoRA attention dimension
    :param lora_alpha: Alpha parameter for LoRA scaling
    :param modules: Names of the modules to apply LoRA to
    :param lora_dropout: Dropout Probability for LoRA layers
    :param bias: Specifies if the bias parameters should be trained
    """
    config = LoraConfig(
        r = r,
        lora_alpha = lora_alpha,
        target_modules = target_modules,
        lora_dropout = lora_dropout,
        bias = bias,
        task_type = task_type,
    )

    return config

In [195]:
def find_all_linear_names(model):
    """
    Find modules to apply LoRA to.

    :param model: PEFT model
    """

    cls = bnb.nn.Linear4bit
    lora_module_names = set()
    for name, module in model.named_modules():
        if isinstance(module, cls):
            names = name.split('.')
            lora_module_names.add(names[0] if len(names) == 1 else names[-1])

    if 'lm_head' in lora_module_names:
        lora_module_names.remove('lm_head')
    print(f"LoRA module names: {list(lora_module_names)}")
    return list(lora_module_names)

In [196]:
def print_trainable_parameters(model, use_4bit = False):
    """
    Prints the number of trainable parameters in the model.

    :param model: PEFT model
    """

    trainable_params = 0
    all_param = 0

    for _, param in model.named_parameters():
        num_params = param.numel()
        if num_params == 0 and hasattr(param, "ds_numel"):
            num_params = param.ds_numel
        all_param += num_params
        if param.requires_grad:
            trainable_params += num_params

    if use_4bit:
        trainable_params /= 2

    print(
        f"All Parameters: {all_param:,d} || Trainable Parameters: {trainable_params:,d} || Trainable Parameters %: {100 * trainable_params / all_param}"
    )

In [197]:
def fine_tune(model,
          tokenizer,
          dataset,
          lora_r,
          lora_alpha,
          lora_dropout,
          bias,
          task_type,
          per_device_train_batch_size,
          gradient_accumulation_steps,
          warmup_steps,
          max_steps,
          learning_rate,
          fp16,
          logging_steps,
          output_dir,
          optim):
    """
    Prepares and fine-tune the pre-trained model.

    :param model: Pre-trained Hugging Face model
    :param tokenizer: Model tokenizer
    :param dataset: Preprocessed training dataset
    """

    # Enable gradient checkpointing to reduce memory usage during fine-tuning
    model.gradient_checkpointing_enable()

    # Prepare the model for training
    model = prepare_model_for_kbit_training(model)

    # Get LoRA module names
    target_modules = find_all_linear_names(model)

    # Create PEFT configuration for these modules and wrap the model to PEFT
    peft_config = create_peft_config(lora_r, lora_alpha, target_modules, lora_dropout, bias, task_type)
    model = get_peft_model(model, peft_config)

    # Print information about the percentage of trainable parameters
    print_trainable_parameters(model)

    # Training parameters
    trainer = Trainer(
        model = model,
        train_dataset = dataset,
        args = TrainingArguments(
            per_device_train_batch_size = per_device_train_batch_size,
            gradient_accumulation_steps = gradient_accumulation_steps,
            warmup_steps = warmup_steps,
            max_steps = max_steps,
            learning_rate = learning_rate,
            fp16 = fp16,
            logging_steps = logging_steps,
            output_dir = output_dir,
            optim = optim,
        ),
        data_collator = DataCollatorForLanguageModeling(tokenizer, mlm = False)
    )

    model.config.use_cache = False

    do_train = True

    # Launch training and log metrics
    print("Training...")

    if do_train:
        train_result = trainer.train()
        metrics = train_result.metrics
        trainer.log_metrics("train", metrics)
        trainer.save_metrics("train", metrics)
        trainer.save_state()
        print(metrics)

    # Save model
    print("Saving last checkpoint of the model...")
    os.makedirs(output_dir, exist_ok = True)
    trainer.model.save_pretrained(output_dir)

    # Free memory for merging weights
    del model
    del trainer
    torch.cuda.empty_cache()

In [198]:
################################################################################
# QLoRA parameters
################################################################################

# LoRA attention dimension
lora_r = 16

# Alpha parameter for LoRA scaling
lora_alpha = 64

# Dropout probability for LoRA layers
lora_dropout = 0.1

# Bias
bias = "none"

# Task type
task_type = "CAUSAL_LM"

In [199]:
################################################################################
# TrainingArguments parameters
################################################################################

# Output directory where the model predictions and checkpoints will be stored
output_dir = "/content/drive/My Drive/AiExpertCource/pj/intent/llama3_lb_finetune"

# Batch size per GPU for training
per_device_train_batch_size = 1

# Number of update steps to accumulate the gradients for
gradient_accumulation_steps = 4

# Initial learning rate (AdamW optimizer)
learning_rate = 2e-4

# Optimizer to use
optim = "paged_adamw_32bit"

# Number of training steps (overrides num_train_epochs)
max_steps = 10

# Linear warmup steps from 0 to learning_rate
warmup_steps = 2

# Enable fp16/bf16 training (set bf16 to True with an A100)
fp16 = True

# Log every X updates steps
logging_steps = 1

In [200]:
fine_tune(model,
      tokenizer,
      preprocessed_dataset,
      lora_r,
      lora_alpha,
      lora_dropout,
      bias,
      task_type,
      per_device_train_batch_size,
      gradient_accumulation_steps,
      warmup_steps,
      max_steps,
      learning_rate,
      fp16,
      logging_steps,
      output_dir,
      optim)

LoRA module names: ['v_proj', 'down_proj', 'o_proj', 'k_proj', 'gate_proj', 'up_proj', 'q_proj']


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
max_steps is given, it will override any value given in num_train_epochs


All Parameters: 760,547,328 || Trainable Parameters: 11,272,192 || Trainable Parameters %: 1.482115784910101
Training...


  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Step,Training Loss
1,3.0371
2,2.9245
3,2.8718
4,2.4689
5,2.0402
6,1.7315
7,1.8274
8,1.378
9,1.1232
10,1.5863


***** train metrics *****
  epoch                    =     0.0567
  total_flos               =    68326GF
  train_loss               =     2.0989
  train_runtime            = 0:00:14.97
  train_samples_per_second =      2.671
  train_steps_per_second   =      0.668
{'train_runtime': 14.9745, 'train_samples_per_second': 2.671, 'train_steps_per_second': 0.668, 'total_flos': 73364758523904.0, 'train_loss': 2.0989105701446533, 'epoch': 0.05673758865248227}
Saving last checkpoint of the model...


In [None]:
model

In [201]:
# Load fine-tuned weights
model = AutoPeftModelForCausalLM.from_pretrained(output_dir, device_map = "auto", torch_dtype = torch.bfloat16)
# Merge the LoRA layers with the base model
model = model.merge_and_unload()

# Save fine-tuned model at a new location
output_merged_dir = output_dir + "/merged"
model.save_pretrained(output_merged_dir, safe_serialization = True)

# Save tokenizer for easy inference
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.save_pretrained(output_merged_dir)

('/content/drive/My Drive/AiExpertCource/pj/intent/llama3_lb_finetune/merged/tokenizer_config.json',
 '/content/drive/My Drive/AiExpertCource/pj/intent/llama3_lb_finetune/merged/special_tokens_map.json',
 '/content/drive/My Drive/AiExpertCource/pj/intent/llama3_lb_finetune/merged/tokenizer.json')

In [None]:
model

In [202]:
base_model_reload = AutoModelForCausalLM.from_pretrained(model_name)

model = PeftModel.from_pretrained(base_model_reload, output_dir)
model = model.merge_and_unload()


In [203]:
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    torch_dtype=torch.float16,
    device_map="auto",
)


Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [207]:
  txt = "Synapse notebook access CSV / ZIP files from on premise I am trying to access different types of files (CSV/ZIP/ ETC...) within my company's on premise shared drives. As different departments submit their documents, I seek python / pyspark code to fetch these files. Company procedures wont allow to change and upload them into blob storage."

  prompt = f"""
            Classify the text into Discrepancy, Error, Review, Conceptual, Learning, How-to, or Other. The descriptions for each category are as follows, and determine which category the text belongs to.
            Discrepancy : Seeking explanations for software behavior discrepancies not explicitly related to errors.
            Error : Seeking solutions for errors or exceptions.
            Review : Looking for im-proved solutions or guidance to make well-informed decisions.
            Conceptual : Seeking information or explanations with-out concrete imple-mentations.
            Learning : Seeking learning re-sources for libraries, tools, or program-ming languages.
            How-to : Requesting step-by-step instructions for specific tasks.
            Other: Does not belong to Discrepancy, Error, Review, Conceptual, Learning, or How-to.

            [text] = {txt}
            [categories] =
            """.strip()


In [208]:
output = pipe(prompt, max_new_tokens=100)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


In [209]:
print(output);

[{'generated_text': "Classify the text into Discrepancy, Error, Review, Conceptual, Learning, How-to, or Other. The descriptions for each category are as follows, and determine which category the text belongs to.\n          Discrepancy : Seeking explanations for software behavior discrepancies not explicitly related to errors.\n          Error : Seeking solutions for errors or exceptions.\n          Review : Looking for im-proved solutions or guidance to make well-informed decisions.\n          Conceptual : Seeking information or explanations with-out concrete imple-mentations.\n          Learning : Seeking learning re-sources for libraries, tools, or program-ming languages.\n          How-to : Requesting step-by-step instructions for specific tasks.\n          Other: Does not belong to Discrepancy, Error, Review, Conceptual, Learning, or How-to.\n\n          [text] = Synapse notebook access CSV / ZIP files from on premise I am trying to access different types of files (CSV/ZIP/ ETC...