<a href="https://colab.research.google.com/github/Jeremy-su1/ai-algorithm/blob/main/llama3_1b_intent_classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install datasets
!pip install scikit-multilearn
!pip install peft
!pip install transformers
!pip install bitsandbytes
!pip install evaluate
!pip install huggingface_hub



In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
import os
import random
import functools
import csv
import json
import numpy as np
import torch
import torch.nn.functional as F
from sklearn.metrics import f1_score
from skmultilearn.model_selection import iterative_train_test_split
from datasets import load_dataset, Dataset, DatasetDict
from peft import (
    LoraConfig,
    prepare_model_for_kbit_training,
    get_peft_model
)
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    Trainer
)

In [4]:
dataset_train = load_dataset(
    'csv',
    data_files='/content/drive/My Drive/AiExpertCource/pj/intent/intent_dataset_train.csv',
    split='train'
)
dataset_valid = load_dataset(
    'csv',
    data_files='/content/drive/My Drive/AiExpertCource/pj/intent/intent_dataset_valid.csv',
    split='train'
)

In [5]:
dataset = DatasetDict({
    'train': Dataset.from_dict({'Concat_Text': dataset_train['text'], 'intents': dataset_train['intents']}),
    'val': Dataset.from_dict({'Concat_Text': dataset_valid['text'], 'intents': dataset_valid['intents']}),
})

In [6]:
classes = ['Discrepancy', 'Errors', 'Review', 'Conceptual', 'Learning', 'How-to', 'Other']
class2id = {'Discrepancy' : 0, 'Errors' : 1, 'Review' : 2, 'Conceptual' : 3, 'Learning' : 4, 'How-to' : 5, 'Other' : 6}
id2class = {0 : 'Discrepancy', 1 : 'Errors', 2 : 'Review', 3 : 'Conceptual', 4: 'Learning', 5: 'How-to', 6 :'Other'}

In [7]:
from huggingface_hub import login

# 로그인 함수 호출
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [8]:
model_path = 'meta-llama/Llama-3.2-1B'
tokenizer = AutoTokenizer.from_pretrained(model_path)
tokenizer.pad_token = tokenizer.eos_token

# preprocess dataset with tokenizer
def tokenize_examples(examples, tokenizer):
   all_labels = examples['intents'].split(' ')
   labels = [0. for i in range(len(classes))]
   for label in all_labels:
       label_id = class2id[label]
       labels[label_id] = 1.

   tokenized_inputs = tokenizer(examples['Concat_Text'])
   tokenized_inputs['labels'] = labels
   return tokenized_inputs

tokenized_ds = dataset.map(functools.partial(tokenize_examples, tokenizer=tokenizer))
tokenized_ds = tokenized_ds.with_format('torch')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Map:   0%|          | 0/705 [00:00<?, ? examples/s]

Map:   0%|          | 0/79 [00:00<?, ? examples/s]

In [9]:
tokenized_ds['train'][0]

{'Concat_Text': 'Did Domino 10 change NDX file names? I\'ve just upgraded a server from IBM Domino 9.0.1 to Domino 10.0.1, and am having some trouble with NDX files (view indexes stored outside databases). I tested some custom app functionality after the upgrade, and found that code which used views sometimes produced the error message "Attempt to Reopen an Open Container". To resolve the error, I ran the server command load updall -R to rebuild all used views. After that completed, I found there are now two NDX files for every NIFNSF-enabled database. For each NSF, the older NDX file was last updated before the upgrade, and is named to match the NSF base filename with the extension replaced by "NDX". The newer NDX is updated within the last 15 minutes, and named like the NSF base filename, plus "_nsf", then the "NDX" extension. E.g. A database file "Arc2001.nsf" has both "Arc2001.ndx" (older) and "Arc2001_nsf.ndx" (newer). So the main question is: Did the default name of NDX files cha

In [10]:
# define custom batch preprocessor
def collate_fn(batch, tokenizer):
    dict_keys = ['input_ids', 'attention_mask', 'labels']
    d = {k: [dic[k] for dic in batch] for k in dict_keys}
    d['input_ids'] = torch.nn.utils.rnn.pad_sequence(
        d['input_ids'], batch_first=True, padding_value=tokenizer.pad_token_id
    )
    d['attention_mask'] = torch.nn.utils.rnn.pad_sequence(
        d['attention_mask'], batch_first=True, padding_value=0
    )
    d['labels'] = torch.stack(d['labels'])
    return d


# define which metrics to compute for evaluation
import evaluate
import numpy as np

clf_metrics = evaluate.combine(["accuracy", "f1", "precision", "recall"])

def sigmoid(x):
   return 1/(1 + np.exp(-x))

def compute_metrics(eval_pred):

   predictions, labels = eval_pred
   predictions = sigmoid(predictions)
   predictions = (predictions > 0.5).astype(int).reshape(-1)
   return clf_metrics.compute(predictions=predictions, references=labels.astype(int).reshape(-1))


# create custom trainer class to be able to pass label weights and calculate mutilabel loss
class CustomTrainer(Trainer):

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.pop("labels")

        # forward pass
        outputs = model(**inputs)
        logits = outputs.get("logits")

        # compute custom loss
        loss = F.binary_cross_entropy_with_logits(logits, labels.to(torch.float32))
        return (loss, outputs) if return_outputs else loss

In [11]:
# qunatization config
quantization_config = BitsAndBytesConfig(
    load_in_4bit = True, # enable 4-bit quantization
    bnb_4bit_quant_type = 'nf4', # information theoretically optimal dtype for normally distributed weights
    bnb_4bit_use_double_quant = True, # quantize quantized weights //insert xzibit meme
    bnb_4bit_compute_dtype = torch.bfloat16 # optimized fp format for ML
)

# lora config
lora_config = LoraConfig(
    r = 16, # the dimension of the low-rank matrices
    lora_alpha = 8, # scaling factor for LoRA activations vs pre-trained weight activations
    target_modules = ['q_proj', 'k_proj', 'v_proj', 'o_proj'],
    lora_dropout = 0.05, # dropout probability of the LoRA layers
    bias = 'none', # wether to train bias weights, set to 'none' for attention layers
    task_type = 'SEQ_CLS'
)

# load model
model = AutoModelForSequenceClassification.from_pretrained(
    model_path,
    quantization_config=quantization_config,
    num_labels=len(classes)
)
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, lora_config)
model.config.pad_token_id = tokenizer.pad_token_id

`low_cpu_mem_usage` was None, now set to True since model is quantized.
Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
# define training args
training_args = TrainingArguments(
    output_dir = 'multilabel_classification',
    learning_rate = 1e-4,
    per_device_train_batch_size = 4, # tested with 16gb gpu ram
    per_device_eval_batch_size = 4,
    num_train_epochs = 20,
    weight_decay = 0.01,
    evaluation_strategy = 'epoch',
    save_strategy = 'epoch',
    load_best_model_at_end = True
)

# train
trainer = CustomTrainer(
    model = model,
    args = training_args,
    train_dataset = tokenized_ds['train'],
    eval_dataset = tokenized_ds['val'],
    tokenizer = tokenizer,
    data_collator = functools.partial(collate_fn, tokenizer=tokenizer),
    compute_metrics = compute_metrics
)

trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mebinna-lee[0m ([33mebinna-lee-x[0m). Use [1m`wandb login --relogin`[0m to force relogin


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.439719,0.819168,0.404762,0.478873,0.350515
2,No log,0.321491,0.877034,0.56962,0.737705,0.463918
3,0.341800,0.335153,0.871609,0.559006,0.703125,0.463918
4,0.341800,0.365412,0.878843,0.612717,0.697368,0.546392
5,0.341800,0.411772,0.851718,0.554348,0.586207,0.525773
6,0.096800,0.469768,0.862568,0.573034,0.62963,0.525773
7,0.096800,0.474345,0.882459,0.640884,0.690476,0.597938
8,0.096800,0.476895,0.887884,0.643678,0.727273,0.57732
9,0.009600,0.511841,0.886076,0.644068,0.7125,0.587629
10,0.009600,0.570791,0.882459,0.624277,0.710526,0.556701


We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)

Cannot access gated repo for url https://huggingface.co/meta-llama/Llama-3.2-1B/resolve/main/config.json.
Access to model meta-llama/Llama-3.2-1B is restricted. You must have access to it and be authenticated to access it. Please log in. - silently ignoring the lookup for the file config.json in meta-llama/Llama-3.2-1B.
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]

Cannot access gated repo for url https://huggingface.co/meta-llama/Llama-3.2-1B/resolve/main/config.json.
Access to model meta-llama/Llama-3.2-1B is restricted. You must have access to it and be authenticated to access it. Please log in. - silently ignoring the l

TrainOutput(global_step=3540, training_loss=0.0641036698057995, metrics={'train_runtime': 2499.3459, 'train_samples_per_second': 5.641, 'train_steps_per_second': 1.416, 'total_flos': 3.121275553811251e+16, 'train_loss': 0.0641036698057995, 'epoch': 20.0})

In [13]:
#model.save_pretrained('/content/drive/My Drive/AiExpertCource/pj/intent/intent_llama3_1b')
#tokenizer.save_pretrained('/content/drive/My Drive/AiExpertCource/pj/intent/intent_llama3_1b')

In [14]:
# 모델을 평가 모드로 설정
model.eval()

PeftModelForSequenceClassification(
  (base_model): LoraModel(
    (model): LlamaForSequenceClassification(
      (model): LlamaModel(
        (embed_tokens): Embedding(128256, 2048)
        (layers): ModuleList(
          (0-15): 16 x LlamaDecoderLayer(
            (self_attn): LlamaSdpaAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=2048, out_features=2048, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=2048, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=2048, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
           

In [15]:
1

1

In [16]:
# 예측 함수 정의
def predict(texts):
    # 텍스트를 토큰화하고 텐서로 변환
    print(texts)
    inputs = tokenizer(texts, padding='max_length', truncation=True, max_length=2048, return_tensors='pt')
    print(inputs)

    # 모델을 사용해 예측 수행
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits

    print(logits)

    # 시그모이드를 사용해 확률로 변환
    probabilities = torch.sigmoid(logits).numpy()

    # 각 클래스에 대해 threshold를 0.5로 설정하여 예측값(0 또는 1)으로 변환
    predictions = (probabilities > 0.5).astype(int)

    return predictions, probabilities

In [17]:
# 샘플 텍스트 입력 및 예측 수행
sample_texts = [
  "how to make this horizontal menu start scroll from right to left? you can see a full demo code for the menu here :"
]

# 예측 수행
predictions, probabilities = predict(sample_texts)

['how to make this horizontal menu start scroll from right to left? you can see a full demo code for the menu here :']
{'input_ids': tensor([[128000,   5269,    311,  ..., 128001, 128001, 128001]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0]])}
tensor([[-1.0923, -4.5601, -3.5322, -2.3779, -4.1942, -1.8041, -1.8939]])


In [18]:
print(predictions)
print(probabilities)

[[0 0 0 0 0 0 0]]
[[0.2511851  0.01035238 0.02841033 0.08487048 0.01485825 0.14135592
  0.13079964]]


In [19]:
arr = np.array(predictions[0])
print(arr)

# 값이 1인 인덱스에 해당하는 한글 매핑 가져오기
indices = np.where(arr == 1)[0]
indices

for idx in indices:
    tag = id2class[idx]
    print(tag)

[0 0 0 0 0 0 0]


In [20]:
tag

NameError: name 'tag' is not defined

In [None]:
arr