<a href="https://colab.research.google.com/github/Jeremy-su1/ai-algorithm/blob/main/multi_cls_llama3_2_1b.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%%capture
!pip install torch
!pip install transformers
!pip install peft
!pip install huggingface_hub

In [None]:
from google.colab import userdata
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from peft import get_peft_model, LoraConfig, TaskType  # PEFT 라이브러리 필요
import pandas as pd
from google.colab import drive
from sklearn.metrics import f1_score



### 1. Import data

In [None]:
from google.colab import drive

# Mount Google Drive so the dataset CSVs are reachable under /content/drive.
drive.mount('/content/drive', force_remount=False)

# Locations of the training and validation splits on the mounted drive.
train_path = '/content/drive/MyDrive/DataSet_new/rev_tag_training_samples.csv'
valid_path = '/content/drive/MyDrive/DataSet_new/rev_tag_validation_samples.csv'

Mounted at /content/drive


In [None]:
# Load the training and validation splits from the CSV paths defined earlier.
df_train = pd.read_csv(train_path)
df_valid = pd.read_csv(valid_path)
# Label columns are selected positionally: everything from the 6th column onward.
# NOTE(review): this assumes the first five columns are non-label metadata
# (title, body and tag columns) — confirm against the CSV schema.
label_names = list(df_train.columns[5:])

In [None]:

# Preview the first row to inspect the column layout.
df_train.head(1)

Unnamed: 0,Title,Body,Tags_filtered,Tags_list,Tags_new,Algorithms,Backend,Data Science,Databases,Dev Tools,Frontend,Mobile,Systems,iOS/macOS
0,Why threads are needed in my given assignment ...,<p><strong>I'm not asking to do my assignment....,"['java', 'multithreading']","['java', 'multithreading']",['Algorithms'],1,0,0,0,0,0,0,0,0


### 2. Data Preparation

In [None]:
def combine_title_and_body(df):
    """Return one text string per row: the title, a space, then the body.

    Missing titles or bodies are treated as empty strings, so a NaN in either
    column does not turn the whole combined text into NaN (which the tokenizer
    cannot process).
    """
    return df['Title'].fillna('') + ' ' + df['Body'].fillna('')


# Model input text: title and body concatenated into a single string per sample.
df_train['text'] = combine_title_and_body(df_train)
df_valid['text'] = combine_title_and_body(df_valid)

# Plain Python lists of texts, and one label vector (array over label_names) per sample.
text_train = list(df_train['text'])
label_train = list(df_train[label_names].values)
text_valid = list(df_valid['text'])
label_valid = list(df_valid[label_names].values)


In [None]:
# Sanity check: print the first combined text together with its label vector.
print(f'Text example 1:\n {text_train[0]} , and its Label:\n {label_train[0]}')


Text example 1:
 Why threads are needed in my given assignment in java? <p><strong>I'm not asking to do my assignment. Read carefully</strong></p>

<blockquote>
  <p>Write a program to simulate a bus traveling between 5 different stations and
  repeats the cycle, the bus can take up to a maximum of 50 persons, at each
  station random number of persons get off the bus and random number of
  persons get on the bus, consider these cases.</p>
  
  <ul>
  <li>If bus does not have enough space for all persons, persons will have to
  stay in station for next cycle</li>
  <li>Persons cannot mount on bus until persons on bus dismount first.</li>
  <li>You can simulate bus trip with a fixed delay between each stop to
  simulate travel time.</li>
  <li>Persons can not mount/dismount the bus until bus arrives to the
  designated station.</li>
  </ul>
  
  <p>Use semaphores to control access to the bus and other utilities to control access
  to bus. Use thread pools to manage thread management. Yo

### 3. Tokenizer and model setting

In [None]:
# 1. Load the tokenizer
# The Hugging Face access token is read from Colab secrets; the model repo is gated.
token = userdata.get('Hugging_Face_llama3.2')
model_path = 'meta-llama/Llama-3.2-1B'
# `token=` replaces the deprecated `use_auth_token=` keyword.
tokenizer = AutoTokenizer.from_pretrained(model_path, token=token)
# Reuse the EOS token as the padding token so batched padding works.
tokenizer.pad_token = tokenizer.eos_token



tokenizer_config.json:   0%|          | 0.00/50.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/301 [00:00<?, ?B/s]

In [None]:
# 2. Tokenizing Dataset
class MultiLabelDataset(torch.utils.data.Dataset):
    """Pairs pre-tokenized encodings with their multi-label target vectors.

    `encodings` is a mapping from field name to an indexable collection of
    per-sample values; `labels` is an indexable collection with one label
    vector per sample.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The number of samples is defined by the number of label vectors.
        return len(self.labels)

    def __getitem__(self, idx):
        # Pick the idx-th entry of every encoding field.
        sample = {}
        for field_name, field_values in self.encodings.items():
            sample[field_name] = field_values[idx]
        # Targets are returned as a float tensor.
        sample["labels"] = torch.tensor(self.labels[idx], dtype=torch.float)
        return sample
def generate_dataset(texts, labels, max_length=512):
    """Tokenize `texts` and wrap them with `labels` in a MultiLabelDataset.

    Args:
        texts: list of input strings.
        labels: indexable collection with one label vector per text.
        max_length: maximum number of tokens kept per text; longer inputs are
            truncated. Defaults to 512, the value previously hard-coded here.

    Returns:
        A MultiLabelDataset over the tokenized texts and their labels.

    Uses the module-level `tokenizer`. All texts are tokenized in a single
    call and padded to the longest sequence in that call.
    """
    encodings = tokenizer(
        texts,
        truncation=True,
        padding=True,
        max_length=max_length,
        return_tensors="pt",
    )
    return MultiLabelDataset(encodings, labels)

# Build the tokenized training and validation datasets.
train_dataset = generate_dataset(text_train, label_train)
valid_dataset = generate_dataset(text_valid, label_valid)




In [None]:
# 3. Configure LoRA
lora_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,  # sequence classification task
    r=8,  # rank of the low-rank update matrices
    lora_alpha=16,  # scaling factor
    lora_dropout=0.1  # dropout rate
)

# One output logit per label column.
num_labels = len(label_names)
model = AutoModelForSequenceClassification.from_pretrained(
    model_path,
    num_labels=num_labels,
    # State the multi-label objective explicitly instead of relying on it being
    # inferred from the label dtype at training time.
    problem_type="multi_label_classification",
    # `token=` replaces the deprecated `use_auth_token=` keyword.
    token=token,
)
# The tokenizer's pad token was set manually, so mirror its id in the model config.
model.config.pad_token_id = tokenizer.pad_token_id


# Apply LoRA to the model
model = get_peft_model(model, lora_config)

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:

import wandb

# Initialize Weights & Biases (the API key must already be available, e.g. via an environment variable)
# Template: wandb.init(project="my_project", entity="my_entity", name="my_experiment")
wandb.init(project="Multi output classification", entity ="ai_expert", name = "multi_cls_llama3_2_1b_re") # Starts the run; the Trainer below is configured with report_to="wandb"



[34m[1mwandb[0m: Currently logged in as: [33mneurogii[0m ([33mai_expert[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [None]:
from sklearn.metrics import accuracy_score, f1_score, precision_recall_fscore_support

# 4. Define the evaluation metrics
def compute_metrics(pred):
    """Compute multi-label classification metrics for the Trainer.

    Args:
        pred: evaluation output exposing `predictions` (raw model logits) and
            `label_ids` (the 0/1 label matrix).

    Returns:
        dict with element-wise ("flat") accuracy, exact-match accuracy, and
        micro / macro / weighted precision, recall and F1.

    The Trainer passes raw logits, not probabilities, so a sigmoid is applied
    before thresholding at 0.5. Thresholding the logits themselves at 0.5 would
    correspond to a probability cut-off of about 0.62 and would disagree with
    the sigmoid + 0.5 rule used at inference time.
    """
    import numpy as np  # local import keeps this function self-contained

    logits = pred.predictions
    # Some models return a tuple of outputs; the logits come first.
    if isinstance(logits, tuple):
        logits = logits[0]
    logits = np.asarray(logits)

    # Numerically stable sigmoid: sigmoid(x) = exp(-log(1 + exp(-x))).
    probs = np.exp(-np.logaddexp(0.0, -logits))
    preds = (probs >= 0.5).astype(int)
    labels = np.asarray(pred.label_ids).astype(int)

    # Exact-match accuracy: a sample counts only if every label is correct.
    accuracy = accuracy_score(labels, preds)
    precision, recall, f1_score_result, _ = precision_recall_fscore_support(labels, preds, average='micro')
    precision_macro, recall_macro, f1_score_result_macro, _ = precision_recall_fscore_support(labels, preds, average='macro')
    precision_weighted, recall_weighted, f1_score_result_weighted, _ = precision_recall_fscore_support(labels, preds, average='weighted')

    # Flat accuracy: every (sample, label) cell is scored independently.
    flat_predictions = preds.reshape(-1)
    flat_labels = labels.reshape(-1)
    flat_accuracy = accuracy_score(flat_labels, flat_predictions)

    return {
        'flat_accuracy': flat_accuracy,
        'accuracy': accuracy,
        'micro_precision': precision,
        'micro_recall': recall,
        'micro_f1': f1_score_result,
        'macro_precision': precision_macro,
        'macro_recall': recall_macro,
        'macro_f1': f1_score_result_macro,
        'weighted_precision': precision_weighted,
        'weighted_recall': recall_weighted,
        'weighted_f1': f1_score_result_weighted,
        }



# 5. Training arguments
training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/multi_label",
    eval_strategy="epoch",          # evaluate after every epoch (replaces the deprecated `evaluation_strategy`)
    num_train_epochs=5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    save_strategy="epoch",          # checkpoint after every epoch, in step with evaluation
    load_best_model_at_end=True,    # reload the best checkpoint once training finishes
    save_total_limit=2,
    metric_for_best_model="accuracy",  # "accuracy" key returned by compute_metrics
    greater_is_better=True,
    report_to="wandb",              # log metrics to Weights & Biases
    logging_dir="/content/drive/MyDrive/multi_label/logs",
)

# 6. Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,  # validation split; keep it separate from the training data
    compute_metrics=compute_metrics,
)

# 7. Train the model
trainer.train()



# Save the trained model
# `model` is already the LoRA-wrapped PEFT model that was just trained. Calling
# get_peft_model() on it a second time would set up the adapter again and throw
# away the trained weights, so the trained object is saved as-is.
peft_model = model

# Absolute path: without the leading slash the files are written relative to the
# current working directory instead of the mounted Google Drive.
save_dir = "/content/drive/MyDrive/multi_label_lora"

# Save the adapter weights/config together with the tokenizer.
peft_model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)



Epoch,Training Loss,Validation Loss,Flat Accuracy,Accuracy,Micro Precision,Micro Recall,Micro F1,Macro Precision,Macro Recall,Macro F1,Weighted Precision,Weighted Recall,Weighted F1
1,0.1472,0.113457,0.963385,0.728923,0.888544,0.833164,0.859963,0.793014,0.842452,0.805989,0.897838,0.833164,0.861823
2,0.125,0.104198,0.965949,0.742308,0.87535,0.871801,0.873572,0.782124,0.873943,0.813842,0.881622,0.871801,0.8754
3,0.1121,0.102755,0.965299,0.735692,0.871515,0.871295,0.871405,0.791021,0.867888,0.815892,0.876881,0.871295,0.87296
4,0.1027,0.101848,0.96588,0.743846,0.875286,0.871295,0.873286,0.785199,0.875867,0.813537,0.881042,0.871295,0.874719
5,0.0945,0.103854,0.965744,0.742,0.873447,0.872561,0.873004,0.774888,0.87878,0.808351,0.880775,0.872561,0.875126


We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.

Cannot access gated repo for url https://huggingface.co/meta-llama/Llama-3.2-1B/resolve/main/config.json.
Access to model meta-llama/Llama-3.2-1B is restricted. You must have access to it and be authenticated to access it. Please log in. - silently ignoring the lookup for the file config.json in meta-llama/Llama-3.2-1B.

Cannot acc

('content/drive/MyDrive/multi_label_lora/tokenizer_config.json',
 'content/drive/MyDrive/multi_label_lora/special_tokens_map.json',
 'content/drive/MyDrive/multi_label_lora/tokenizer.json')

### 나중에 사용할 때 (Reloading the saved model for later use)

In [None]:
#tokenizer = AutoTokenizer.from_pretrained("./lora_model")
#model = AutoModelForSequenceClassification.from_pretrained("./lora_model")
#peft_model = PeftModel.from_pretrained(model, "./lora_model")

In [None]:
from peft import PeftModel, LoraConfig, get_peft_model

#token =userdata.get('Hugging_Face_llama3.2')
#model_path = 'meta-llama/Llama-3.2-1B'

#num_labels = 9 # len(label_names)
#base_model = AutoModelForSequenceClassification.from_pretrained(model_path, num_labels=num_labels, use_auth_token=token)


#tokenizer = AutoTokenizer.from_pretrained(model_path, use_auth_token=token)
#tokenizer.pad_token = tokenizer.eos_token
#base_model.config.pad_token_id = tokenizer.pad_token_id

#model = AutoModelForSequenceClassification.from_pretrained("/content/drive/MyDrive/DataSet_new/lora_model")
#model = PeftModel.from_pretrained(base_model, "/content/drive/MyDrive/DataSet_new/lora_model")

In [None]:
# Google Drive must already be mounted at /content/drive before this cell runs.
test_path = '/content/drive/MyDrive/DataSet_new/rev_tag_test_samples.csv'

df_test = pd.read_csv(test_path)
# Label columns are selected positionally: everything from the 6th column onward.
# NOTE(review): assumes the test CSV has the same column layout as the training CSV — confirm.
label_names = list(df_test.columns[5:])

# Same input format as in training: title and body joined by a space.
# Missing values become empty strings so the combined text is never NaN.
df_test['text'] = df_test['Title'].fillna('') + ' ' + df_test['Body'].fillna('')
text_test = list(df_test['text'])

label_test = list(df_test[label_names].values)

In [None]:
import torch

# Use the GPU when one is available; otherwise fall back to the CPU so this
# cell does not crash on a CPU-only runtime.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Switch the model to evaluation mode and move it to the chosen device.
model.eval()
model.to(device)

# Tokenize the whole test set once; batches are sliced from these tensors below.
test_encodings = tokenizer(text_test, truncation=True, padding=True, max_length=512, return_tensors="pt")

# Run prediction in mini-batches
predictions = []
batch_size = 16  # adjust this based on your available GPU memory
num_samples = len(test_encodings['input_ids'])
num_batches = (num_samples + batch_size - 1) // batch_size  # ceiling division

with torch.no_grad():
    for start_idx in range(0, num_samples, batch_size):
        # Slicing past the end is clamped, so the last batch may be smaller.
        end_idx = start_idx + batch_size

        # Move only the current batch to the device.
        batch_encodings = {k: v[start_idx:end_idx].to(device) for k, v in test_encodings.items()}

        logits = model(**batch_encodings).logits
        probs = torch.sigmoid(logits)  # convert logits to per-label probabilities
        preds = (probs >= 0.5).int()  # a label is predicted when its probability is >= 0.5
        predictions.extend(preds.cpu().numpy())  # collect one prediction row per sample on the CPU

In [None]:
import numpy as np
def compute_metrics_test(preds,labels):
    """Score already-thresholded predictions against the ground-truth labels.

    Args:
        preds: sequence of per-sample 0/1 prediction vectors.
        labels: sequence of per-sample 0/1 label vectors.

    Returns:
        dict with flat (element-wise) accuracy, exact-match accuracy, and
        micro / macro / weighted precision, recall and F1.
    """
    # Stack both inputs into 2-D arrays.
    pred_matrix = np.array(preds)
    label_matrix = np.array(labels)

    results = {
        # Every (sample, label) cell scored independently.
        'flat_accuracy': accuracy_score(label_matrix.reshape(-1), pred_matrix.reshape(-1)),
        # A sample counts only if all of its labels are predicted correctly.
        'accuracy': accuracy_score(label_matrix, pred_matrix),
    }

    # Precision / recall / F1 under each averaging scheme, in a fixed key order.
    for averaging in ('micro', 'macro', 'weighted'):
        prec, rec, f1_value, _ = precision_recall_fscore_support(
            label_matrix, pred_matrix, average=averaging
        )
        results[f'{averaging}_precision'] = prec
        results[f'{averaging}_recall'] = rec
        results[f'{averaging}_f1'] = f1_value

    return results

In [None]:
# Score the collected test-set predictions against the test labels.
test_result =compute_metrics_test(predictions, label_test)

In [None]:
# Show the test-set metrics dictionary.
print(test_result)

{'flat_accuracy': 0.6405555555555555, 'accuracy': 0.020666666666666667, 'micro_precision': 0.19458749873775624, 'micro_recall': 0.5270787746170679, 'micro_f1': 0.2842392506822037, 'macro_precision': 0.14870829183479214, 'macro_recall': 0.40522313821207834, 'macro_f1': 0.1887655428355156, 'weighted_precision': 0.223149128202656, 'weighted_recall': 0.5270787746170679, 'weighted_f1': 0.28591037707864403}
