<a href="https://colab.research.google.com/github/Jeremy-su1/ai-algorithm/blob/main/CLASSIFICATION2_LLMembed_StackOverFlow_multi_lable.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
!pip install torch transformers tqdm datasets



In [2]:
from datasets import load_dataset

# 멀티 레이블 데이터셋 로드
sof_dataset_train = load_dataset(
    'csv',
    data_files='/content/drive/MyDrive/LLMEmbed/MultiLabel/rev_tag_training_samples.csv',
    split='train'
)

print(sof_dataset_train)

# 레이블 이름으로 구성된 리스트
labels = ['Algorithms', 'Backend', 'Data Science', 'Databases', 'Dev Tools', 'Frontend', 'Mobile', 'Systems', 'iOS/macOS']

# 레이블 이름을 키로 하고 카운터를 값으로 하는 딕셔너리 생성
label_counter = {label: 0 for label in labels}

for example in sof_dataset_train:
    # 각 샘플에서 0과 1로 표현된 레이블 상태를 이용해 카운트를 업데이트합니다.
    for label in labels:
        if example[label] == 1:
            label_counter[label] += 1

print(label_counter)

Generating train split: 0 examples [00:00, ? examples/s]

Dataset({
    features: ['Title', 'Body', 'Tags_filtered', 'Tags_list', 'Tags_new', 'Algorithms', 'Backend', 'Data Science', 'Databases', 'Dev Tools', 'Frontend', 'Mobile', 'Systems', 'iOS/macOS'],
    num_rows: 20000
})
{'Algorithms': 4225, 'Backend': 3723, 'Data Science': 3487, 'Databases': 3072, 'Dev Tools': 3655, 'Frontend': 3966, 'Mobile': 2683, 'Systems': 3996, 'iOS/macOS': 2587}


In [3]:
from datasets import load_dataset

# 멀티 레이블 데이터셋 로드
sof_dataset_val = load_dataset(
    'csv',
    data_files='/content/drive/MyDrive/LLMEmbed/MultiLabel/rev_tag_validation_samples.csv',
    split='train'
)

print(sof_dataset_val)

# 레이블 이름으로 구성된 리스트
labels = ['Algorithms', 'Backend', 'Data Science', 'Databases', 'Dev Tools', 'Frontend', 'Mobile', 'Systems', 'iOS/macOS']

# 레이블 이름을 키로 하고 카운터를 값으로 하는 딕셔너리 생성
label_counter = {label: 0 for label in labels}

for example in sof_dataset_val:
    # 각 샘플에서 0과 1로 표현된 레이블 상태를 이용해 카운트를 업데이트합니다.
    for label in labels:
        if example[label] == 1:
            label_counter[label] += 1

print(label_counter)

Generating train split: 0 examples [00:00, ? examples/s]

Dataset({
    features: ['Title', 'Body', 'Tags_filtered', 'Tags_list', 'Tags_new', 'Algorithms', 'Backend', 'Data Science', 'Databases', 'Dev Tools', 'Frontend', 'Mobile', 'Systems', 'iOS/macOS'],
    num_rows: 6500
})
{'Algorithms': 1573, 'Backend': 722, 'Data Science': 1519, 'Databases': 230, 'Dev Tools': 656, 'Frontend': 1821, 'Mobile': 167, 'Systems': 1178, 'iOS/macOS': 28}


In [4]:
from transformers import AutoTokenizer
from datasets import Dataset, DatasetDict

sof_dataset= DatasetDict({
    "train": sof_dataset_train,
    "test": sof_dataset_val
})

# DatasetDict 출력
print(sof_dataset)


DatasetDict({
    train: Dataset({
        features: ['Title', 'Body', 'Tags_filtered', 'Tags_list', 'Tags_new', 'Algorithms', 'Backend', 'Data Science', 'Databases', 'Dev Tools', 'Frontend', 'Mobile', 'Systems', 'iOS/macOS'],
        num_rows: 20000
    })
    test: Dataset({
        features: ['Title', 'Body', 'Tags_filtered', 'Tags_list', 'Tags_new', 'Algorithms', 'Backend', 'Data Science', 'Databases', 'Dev Tools', 'Frontend', 'Mobile', 'Systems', 'iOS/macOS'],
        num_rows: 6500
    })
})


# **1. Llama2 Embedding**

In [6]:
# -*- coding: utf-8 -*-
import os
import torch
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
from tqdm import trange
from datasets import load_dataset
import argparse

def rep_extract(task, mode, device, sents, labels, max_len, step):
    model_id = "daryl149/llama-2-7b-chat-hf"

    tokenizer = AutoTokenizer.from_pretrained(model_id)
    tokenizer.pad_token = "[PAD]"
    tokenizer.padding_side = "right"

    config_kwargs = {
        "trust_remote_code": True,
        "cache_dir": None,
        "revision": 'main',
        "use_auth_token": None,
        "output_hidden_states": True
    }
    model_config = AutoConfig.from_pretrained(model_id, **config_kwargs)
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        trust_remote_code=True,
        config=model_config,
        device_map=device,
        torch_dtype=torch.float16)
    model.eval()

    sents_reps = []
    for idx in trange(0, len(sents), step):
        idx_end = idx + step
        if idx_end > len(sents):
            idx_end = len(sents)
        sents_batch = sents[idx: idx_end]

        sents_batch_encoding = tokenizer(sents_batch, return_tensors='pt', max_length=max_len, padding="max_length", truncation=True)
        sents_batch_encoding = sents_batch_encoding.to(device)

        with torch.no_grad():
            batch_outputs = model(**sents_batch_encoding)
            reps_batch_5L = []
            for layer in range(-1, -6, -1):
                reps_batch_5L.append(torch.mean(batch_outputs.hidden_states[layer], axis=1))
            reps_batch_5L = torch.stack(reps_batch_5L, axis=1)

        sents_reps.append(reps_batch_5L.cpu())
    sents_reps = torch.cat(sents_reps)

    # 멀티 레이블 데이터를 처리하기 위한 레이블 로직 수정
    labels = torch.tensor(labels, dtype=torch.float32)

    print(sents_reps.shape)
    print(labels.shape)
    path = f'/content/drive/MyDrive/LLMEmbed/MultiLabel/llama2/{task}/dataset_tensor/'
    if not os.path.exists(path):
        os.makedirs(path)
    torch.save(sents_reps.to('cpu'), path + f'{mode}_sents.pt')
    torch.save(labels, path + f'{mode}_labels.pt')

if __name__ == '__main__':
    cuda_no = 0 # GPU 번호
    task = 'stackoverflow' # Task 이름
    # device = f'cuda:{cuda_no}'
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # 레이블에 해당하는 컬럼 이름 목록
    label_cols = ['Algorithms', 'Backend', 'Data Science', 'Databases', 'Dev Tools', 'Frontend', 'Mobile', 'Systems', 'iOS/macOS']

    sents = sof_dataset['test']['Body']  # 데이터 프레임에서 텍스트만을 추출하는 부분 수정 필요

    # print(sents)
    labels = []  # 최종 레이블을 저장할 2차원 리스트 초기화

    # sof_dataset['train']의 각 행에 대한 루프
    for i in range(len(sof_dataset['test'])):
        labels_temp = []  # 현재 행의 레이블을 저장할 임시 리스트
        # 각 카테고리에 대해 루프를 돌면서 labels_temp 리스트에 레이블을 추가
        for category in label_cols:
            labels_temp.append(sof_dataset['test'][category][i])  # 현재 행의 해당 카테고리 레이블을 추가
        labels.append(labels_temp)  # 완성된 레이블 리스트를 labels 리스트에 추가


    rep_extract(task, 'test', device, sents, labels, 1024, 10)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/727 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/411 [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama_fast.LlamaTokenizerFast'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 - if you loaded a llama tokenizer from a GGUF file you can ignore this message.


config.json:   0%|          | 0.00/507 [00:00<?, ?B/s]

pytorch_model.bin.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

pytorch_model-00001-of-00002.bin:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

pytorch_model-00002-of-00002.bin:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

  0%|          | 0/650 [00:00<?, ?it/s]We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)
100%|██████████| 650/650 [08:46<00:00,  1.23it/s]


torch.Size([6500, 5, 4096])
torch.Size([6500, 9])


# **2. BERT Embedding**

In [8]:
# -*- coding: utf-8 -*-
import os
import torch
from transformers import BertTokenizer, BertModel
from tqdm import trange
from datasets import load_dataset
import argparse

def rep_extract(task, mode, device, sents, labels):
    model_path = 'google-bert/bert-large-uncased'
    tokenizer = BertTokenizer.from_pretrained(model_path)
    model = BertModel.from_pretrained(model_path).to(device)
    model.eval()

    max_len = 512
    sents_reps = []
    step = 10
    for idx in trange(0, len(sents), step):
        idx_end = idx + step
        if idx_end > len(sents):
            idx_end = len(sents)
        sents_batch = sents[idx: idx_end]

        sents_batch_encoding = tokenizer(sents_batch, return_tensors='pt', max_length=max_len, padding="max_length", truncation=True)
        sents_batch_encoding = sents_batch_encoding.to(device)

        with torch.no_grad():
            batch_outputs = model(**sents_batch_encoding)
            reps_batch = batch_outputs.pooler_output
        sents_reps.append(reps_batch.cpu())
    sents_reps = torch.cat(sents_reps)

    # 멀티 레이블 데이터를 처리하기 위한 레이블 로직 수정
    labels = torch.tensor(labels, dtype=torch.float32)

    print(sents_reps.shape)
    print(labels.shape)
    path = f'/content/drive/MyDrive/LLMEmbed/MultiLabel/bert/{task}/dataset_tensor/'
    if not os.path.exists(path):
        os.makedirs(path)
    torch.save(sents_reps.to('cpu'), path + f'{mode}_sents.pt')
    torch.save(labels, path + f'{mode}_labels.pt')

if __name__ == '__main__':

    cuda_no = 0 # GPU 번호
    task = 'stackoverflow' # Task 이름
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # 레이블에 해당하는 컬럼 이름 목록
    label_cols = ['Algorithms', 'Backend', 'Data Science', 'Databases', 'Dev Tools', 'Frontend', 'Mobile', 'Systems', 'iOS/macOS']

    # 데이터셋 로딩
    # sof_dataset = load_dataset("path_to_your_dataset")  # 실제 데이터셋 경로로 변경 필요

    sents = sof_dataset['train']['Body']  # 데이터 프레임에서 텍스트만을 추출하는 부분 수정 필요

    # print(sents)
    labels = []  # 최종 레이블을 저장할 2차원 리스트 초기화

    # sof_dataset['train']의 각 행에 대한 루프
    for i in range(len(sof_dataset['train'])):
        labels_temp = []  # 현재 행의 레이블을 저장할 임시 리스트
        # 각 카테고리에 대해 루프를 돌면서 labels_temp 리스트에 레이블을 추가
        for category in label_cols:

            labels_temp.append(sof_dataset['train'][category][i])  # 현재 행의 해당 카테고리 레이블을 추가
        labels.append(labels_temp)  # 완성된 레이블 리스트를 labels 리스트에 추가

    rep_extract(task, 'train', device, sents, labels)

    sents = sof_dataset['test']['Body']  # 데이터 프레임에서 텍스트만을 추출하는 부분 수정 필요

    labels = []  # 최종 레이블을 저장할 2차원 리스트 초기화

    # sof_dataset['train']의 각 행에 대한 루프
    for i in range(len(sof_dataset['test'])):
        labels_temp = []  # 현재 행의 레이블을 저장할 임시 리스트
        # 각 카테고리에 대해 루프를 돌면서 labels_temp 리스트에 레이블을 추가
        for category in label_cols:
            labels_temp.append(sof_dataset['test'][category][i])  # 현재 행의 해당 카테고리 레이블을 추가
        labels.append(labels_temp)  # 완성된 레이블 리스트를 labels 리스트에 추가


    rep_extract(task, 'test', device, sents, labels)


100%|██████████| 2000/2000 [16:57<00:00,  1.97it/s]


torch.Size([20000, 1024])
torch.Size([20000, 9])


100%|██████████| 650/650 [05:29<00:00,  1.97it/s]

torch.Size([6500, 1024])
torch.Size([6500, 9])





# **3. Roberta Embedding**

In [9]:
# -*- coding: utf-8 -*-
import os
import torch
from transformers import RobertaTokenizer, RobertaModel
from tqdm import trange
from datasets import load_dataset

def rep_extract(task, mode, device, sents, labels):
    model_path = 'roberta-large'
    tokenizer = RobertaTokenizer.from_pretrained(model_path)
    model = RobertaModel.from_pretrained(model_path).to(device)
    model.eval()

    max_len = 512
    sents_reps = []
    step = 10  # 작은 배치 크기로 설정하는 것이 좋습니다, 메모리 오류를 방지하기 위해
    for idx in trange(0, len(sents), step):
        idx_end = idx + step
        if idx_end > len(sents):
            idx_end = len(sents)
        sents_batch = sents[idx: idx_end]

        sents_batch_encoding = tokenizer(sents_batch, return_tensors='pt', max_length=max_len, padding="max_length", truncation=True)
        sents_batch_encoding = sents_batch_encoding.to(device)

        with torch.no_grad():
            batch_outputs = model(**sents_batch_encoding)
            # 첫 번째 토큰([CLS] 토큰에 해당)의 표현을 추출
            reps_batch = batch_outputs.last_hidden_state[:, 0, :]
        sents_reps.append(reps_batch.cpu())
    sents_reps = torch.cat(sents_reps)

    # 멀티 레이블 데이터를 처리하기 위한 레이블 로직 수정
    labels = torch.tensor(labels, dtype=torch.float32)

    print(sents_reps.shape)
    print(labels.shape)
    path = f'/content/drive/MyDrive/LLMEmbed/MultiLabel/roberta/{task}/dataset_tensor/'
    if not os.path.exists(path):
        os.makedirs(path)
    torch.save(sents_reps.to('cpu'), path + f'{mode}_sents.pt')
    torch.save(labels, path + f'{mode}_labels.pt')

if __name__ == '__main__':
    cuda_no = 0 # GPU 번호
    task = 'stackoverflow' # Task 이름

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # 레이블에 해당하는 컬럼 이름 목록
    label_cols = ['Algorithms', 'Backend', 'Data Science', 'Databases', 'Dev Tools', 'Frontend', 'Mobile', 'Systems', 'iOS/macOS']

    # 데이터셋 로딩
    # sof_dataset = load_dataset("path_to_your_dataset")  # 실제 데이터셋 경로로 변경 필요

    sents = sof_dataset['train']['Body']  # 데이터 프레임에서 텍스트만을 추출하는 부분 수정 필요

    # print(sents)
    labels = []  # 최종 레이블을 저장할 2차원 리스트 초기화

    # sof_dataset['train']의 각 행에 대한 루프
    for i in range(len(sof_dataset['train'])):
        labels_temp = []  # 현재 행의 레이블을 저장할 임시 리스트
        # 각 카테고리에 대해 루프를 돌면서 labels_temp 리스트에 레이블을 추가
        for category in label_cols:

            labels_temp.append(sof_dataset['train'][category][i])  # 현재 행의 해당 카테고리 레이블을 추가
        labels.append(labels_temp)  # 완성된 레이블 리스트를 labels 리스트에 추가

    rep_extract(task, 'train', device, sents, labels)

    sents = sof_dataset['test']['Body']  # 데이터 프레임에서 텍스트만을 추출하는 부분 수정 필요

    labels = []  # 최종 레이블을 저장할 2차원 리스트 초기화

    # sof_dataset['train']의 각 행에 대한 루프
    for i in range(len(sof_dataset['test'])):
        labels_temp = []  # 현재 행의 레이블을 저장할 임시 리스트
        # 각 카테고리에 대해 루프를 돌면서 labels_temp 리스트에 레이블을 추가
        for category in label_cols:
            labels_temp.append(sof_dataset['test'][category][i])  # 현재 행의 해당 카테고리 레이블을 추가
        labels.append(labels_temp)  # 완성된 레이블 리스트를 labels 리스트에 추가


    rep_extract(task, 'test', device, sents, labels)



tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 2000/2000 [18:49<00:00,  1.77it/s]


torch.Size([20000, 1024])
torch.Size([20000, 9])


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 650/650 [06:06<00:00,  1.77it/s]

torch.Size([6500, 1024])
torch.Size([6500, 9])





# **4. Dataset 정의**

In [7]:
import torch
from torch.utils.data import Dataset, DataLoader

class MyDataset(Dataset):

    # 4개의 텐서 파일을 가져옴
    def __init__(self, mode, l_path, b_path, r_path):
        self.l_sents_reps = torch.load(l_path + f'{mode}_sents.pt')
        self.b_sents_reps = torch.load(b_path + f'{mode}_sents.pt')
        self.r_sents_reps = torch.load(r_path + f'{mode}_sents.pt')

        self.labels = torch.load(l_path + f'{mode}_labels.pt')

        self.sample_num = self.labels.shape[0]

    # 인덱스를 받아 해당 샘플의 데이터를 반환하는 메소드
    def __getitem__(self, index):
        return self.l_sents_reps[index], self.b_sents_reps[index], self.r_sents_reps[index], self.labels[index]

    # 데이터셋의 총 샘플 수를 반환하는 메소드
    def __len__(self):
        return self.sample_num

# * Model Operation - 검증셋 테스트

In [2]:
!pip install -q evaluate

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/480.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m [32m471.0/480.6 kB[0m [31m17.0 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m12.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m12.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m179.3/179.3 kB[0m [31m16.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m15.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━

In [4]:
import numpy as np
import wandb
import evaluate
from tqdm import tqdm
import torch

wandb.init(project="huggingface") # Uncomment to use wandb

# Load metrics for multi-label classification
accuracy_metric = evaluate.load("accuracy")
precision_metric = evaluate.load("precision")
recall_metric = evaluate.load("recall")
f1_metric = evaluate.load("f1")



# Function to compute metrics for multi-label classification
def compute_metrics(predictions, references):
    # For multi-label classification, we need to threshold our predictions
    predictions = (torch.sigmoid(predictions) > 0.5).int()
    references = references.int()


    # Convert torch tensors to numpy arrays
    predictions = predictions.cpu().numpy()
    references = references.cpu().numpy()

    # Flatten arrays to compute global metrics, not per class
    flat_predictions = predictions.flatten()
    flat_references = references.flatten()

    # Calculate sample-based accuracy
    sample_accuracy = (predictions == references).all(axis=1).mean()

    metrics = {
        "sample_accuracy": sample_accuracy,
        "flat_accuracy": accuracy_metric.compute(predictions=flat_predictions, references=flat_references)["accuracy"],
        "precision": precision_metric.compute(predictions=flat_predictions, references=flat_references, average="weighted")["precision"],
        "recall": recall_metric.compute(predictions=flat_predictions, references=flat_references, average="weighted")["recall"],
        "f1": f1_metric.compute(predictions=flat_predictions, references=flat_references, average="weighted")["f1"],
    }
    return metrics

# Training and validation function for multi-label classification
def Train_and_Evaluate(dataloader_train, dataloader_val, device, model, loss_fn, optimizer):
    # Training
    model.train()
    total_train_loss = 0
    for batch in tqdm(dataloader_train):
        batch_l, batch_b, batch_r, batch_y = [item.to(device) for item in batch]
        pred = model(batch_l.float(), batch_b.float(), batch_r.float())
        loss = loss_fn(pred, batch_y.float())  # Make sure batch_y is a float tensor
        total_train_loss += loss.item()

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    avg_train_loss = total_train_loss / len(dataloader_train)

    # Validation
    model.eval()
    total_val_loss = 0
    all_predictions = []
    all_references = []
    with torch.no_grad():
        for batch in tqdm(dataloader_val):
            batch_l, batch_b, batch_r, batch_y = [item.to(device) for item in batch]
            pred = model(batch_l.float(), batch_b.float(), batch_r.float())
            loss = loss_fn(pred, batch_y.float())  # Make sure batch_y is a float tensor
            total_val_loss += loss.item()

            # Threshold predictions for multi-label classification
            pred_y = (torch.sigmoid(pred) > 0.5).int()
            all_predictions.append(pred_y)
            all_references.append(batch_y)

    # Concatenate all predictions and references
    all_predictions = torch.cat(all_predictions, dim=0)
    all_references = torch.cat(all_references, dim=0)

    avg_val_loss = total_val_loss / len(dataloader_val)

    # Compute metrics
    metrics = compute_metrics(all_predictions, all_references)
    metrics['train_loss'] = avg_train_loss
    metrics['val_loss'] = avg_val_loss

    # Uncomment the following line to log metrics to wandb
    wandb.log(metrics)

    print({metric: f"{value:.4f}" for metric, value in metrics.items()})

# Example usage
# Train_and_Evaluate(dataloader_train, dataloader_val, device, model, loss_fn, optimizer)

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.55k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.36k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.77k [00:00<?, ?B/s]

# **6. Downstream Model Class**

In [5]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class DownstreamModel(nn.Module):
    def __init__(self, class_num, SIGMA):
        super(DownstreamModel, self).__init__()
        self.SIGMA = SIGMA
        self.compress_layers = nn.ModuleList()
        for _ in range(5):
            layers = []
            layers.append(nn.Linear(4096, 1024))
            layers.append(nn.ReLU())
            layers.append(nn.Dropout(0.5))
            self.compress_layers.append(nn.Sequential(*layers))

        self.fc1 = nn.Linear(4145, 1024)
        self.relu1 = nn.ReLU()
        self.dropout1 = nn.Dropout(0.5)
        self.fc2 = nn.Linear(1024, 256)
        self.relu2 = nn.ReLU()
        self.dropout2 = nn.Dropout(0.5)
        self.fc3 = nn.Linear(256, class_num)
        # 변경된 부분: softmax 대신 sigmoid 활성화 함수 사용
        # self.softmax = nn.Softmax(dim=1)

    def forward(self, input_l, input_b, input_r):
        batch_size = input_l.shape[0]

        # input_l 텐서를 첫 번째 차원을 기준으로 1 크기의 텐서로 분할
        split_tensors = torch.split(input_l, 1, dim=1)
        input = []

        # 분할된 텐서들을 순회
        for i, split_tensor in enumerate(split_tensors):
            # 각 split_tensor를 배치 크기에 맞게 2차원으로 재구성
            split_tensor = split_tensor.reshape(batch_size,-1)
            # 재구성된 텐서를 압축(compress) layer를 거쳐 변환
            input.append(self.compress_layers[i](split_tensor))

        # input_b(bert 임베딩)와 input_r(Roberta 임베딩)을 input에 추가
        input.append(input_b)
        input.append(input_r)
        input = torch.stack(input, dim=1)
        # X * X^T
        input_T = input.transpose(1, 2)
        input_P = torch.matmul(input, input_T)
        input_P = input_P.reshape(batch_size, -1)
        # PN func
        input_P = 2*F.sigmoid(self.SIGMA * input_P) - 1

        a = torch.mean(input_l, dim=1)
        input = torch.cat([input_P, a], dim=1)

        output = self.fc1(input)
        output = self.relu1(output)
        output = self.dropout1(output)
        output = self.fc2(output)
        output = self.relu2(output)
        output = self.dropout2(output)
        output = self.fc3(output)

        # 변경된 부분: 각 레이블의 독립적인 확률을 출력하기 위해 sigmoid 함수를 적용
        # output = torch.sigmoid(output)

        return output


# **7. Fine-tuning**

In [8]:
# 필요한 모듈 import 구문 추가
# from DownstreamModel import DownstreamModel
from torch.utils.data import DataLoader
import torch.nn as nn
import torch.optim as optim
# from model_op import Train, Test
# from model_op_multi import Train_multi, Test_multi
import argparse
import os
import torch
# from MyDataset import MyDataset
import json

if __name__ == '__main__':
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    task = 'stackoverflow'  # 사용 가능한 옵션: 'sst2', 'mr', 'agnews', 'r8', 'r52', 'stackoverflow' 중 하나
    epochs = 20  # 원하는 에폭 수
    SIGMA = 0.1  # SIGMA 값을 설정
    batch_size = 1024  # 배치 크기 설정
    lr = 2e-4  # 학습률 설정

    class_num = {'sst2': 2, 'mr': 2, 'agnews': 5, 'r8': 8, 'r52': 52, 'stackoverflow': 9}
    class_num = class_num[task]

    l_dataset_path = f'/content/drive/MyDrive/LLMEmbed/MultiLabel/llama2/{task}/dataset_tensor/'
    b_dataset_path = f'/content/drive/MyDrive/LLMEmbed/MultiLabel/bert/{task}/dataset_tensor/'
    r_dataset_path = f'/content/drive/MyDrive/LLMEmbed/MultiLabel/roberta/{task}/dataset_tensor/'

    # Train dataset
    train_data = MyDataset('train', l_dataset_path, b_dataset_path, r_dataset_path)
    train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True, num_workers=4, pin_memory=True)

    # Test dataset
    test_data = MyDataset('test', l_dataset_path, b_dataset_path, r_dataset_path)
    test_loader = DataLoader(test_data, batch_size=batch_size, shuffle=False, num_workers=4, pin_memory=True)

    model = DownstreamModel(class_num, SIGMA).to(device)

    loss_fn = nn.BCEWithLogitsLoss().to(device)  # 멀티 레이블 손실 함수
    optimizer = optim.Adam(model.parameters(), lr=lr)

    print('training ...')
    for epoch in range(epochs):
        model = model.to(device)
        print(f'--------------------------- epoch {epoch} ---------------------------')
        Train_and_Evaluate(train_loader, test_loader, device, model, loss_fn, optimizer)

    # 모델 학습 후, 모델의 가중치 저장하기
    model_save_path = f"/content/drive/MyDrive/LLMEmbed/MultiLabel/{task}_model_weights.pth"
    torch.save(model.state_dict(), model_save_path)

  self.l_sents_reps = torch.load(l_path + f'{mode}_sents.pt')
  self.b_sents_reps = torch.load(b_path + f'{mode}_sents.pt')
  self.r_sents_reps = torch.load(r_path + f'{mode}_sents.pt')
  self.labels = torch.load(l_path + f'{mode}_labels.pt')


training ...
--------------------------- epoch 0 ---------------------------


100%|██████████| 20/20 [00:02<00:00,  9.33it/s]
100%|██████████| 7/7 [00:00<00:00, 15.42it/s]


{'sample_accuracy': '0.0003', 'flat_accuracy': '0.8651', 'precision': '0.8833', 'recall': '0.8651', 'f1': '0.8027', 'train_loss': '0.5047', 'val_loss': '0.3639'}
--------------------------- epoch 1 ---------------------------


100%|██████████| 20/20 [00:01<00:00, 17.08it/s]
100%|██████████| 7/7 [00:00<00:00, 15.95it/s]


{'sample_accuracy': '0.2425', 'flat_accuracy': '0.8945', 'precision': '0.8929', 'recall': '0.8945', 'f1': '0.8680', 'train_loss': '0.4155', 'val_loss': '0.2882'}
--------------------------- epoch 2 ---------------------------


100%|██████████| 20/20 [00:01<00:00, 18.80it/s]
100%|██████████| 7/7 [00:00<00:00, 16.06it/s]


{'sample_accuracy': '0.4435', 'flat_accuracy': '0.9182', 'precision': '0.9120', 'recall': '0.9182', 'f1': '0.9114', 'train_loss': '0.3382', 'val_loss': '0.2287'}
--------------------------- epoch 3 ---------------------------


100%|██████████| 20/20 [00:01<00:00, 18.39it/s]
100%|██████████| 7/7 [00:00<00:00, 16.22it/s]


{'sample_accuracy': '0.5214', 'flat_accuracy': '0.9298', 'precision': '0.9257', 'recall': '0.9298', 'f1': '0.9258', 'train_loss': '0.2885', 'val_loss': '0.1983'}
--------------------------- epoch 4 ---------------------------


100%|██████████| 20/20 [00:01<00:00, 16.78it/s]
100%|██████████| 7/7 [00:00<00:00, 16.04it/s]


{'sample_accuracy': '0.5683', 'flat_accuracy': '0.9364', 'precision': '0.9343', 'recall': '0.9364', 'f1': '0.9350', 'train_loss': '0.2584', 'val_loss': '0.1833'}
--------------------------- epoch 5 ---------------------------


100%|██████████| 20/20 [00:01<00:00, 18.41it/s]
100%|██████████| 7/7 [00:00<00:00, 16.66it/s]


{'sample_accuracy': '0.5880', 'flat_accuracy': '0.9406', 'precision': '0.9384', 'recall': '0.9406', 'f1': '0.9390', 'train_loss': '0.2403', 'val_loss': '0.1710'}
--------------------------- epoch 6 ---------------------------


100%|██████████| 20/20 [00:01<00:00, 17.42it/s]
100%|██████████| 7/7 [00:00<00:00, 15.76it/s]


{'sample_accuracy': '0.5954', 'flat_accuracy': '0.9415', 'precision': '0.9408', 'recall': '0.9415', 'f1': '0.9411', 'train_loss': '0.2262', 'val_loss': '0.1653'}
--------------------------- epoch 7 ---------------------------


100%|██████████| 20/20 [00:01<00:00, 19.00it/s]
100%|██████████| 7/7 [00:00<00:00, 15.86it/s]


{'sample_accuracy': '0.6169', 'flat_accuracy': '0.9451', 'precision': '0.9438', 'recall': '0.9451', 'f1': '0.9443', 'train_loss': '0.2158', 'val_loss': '0.1543'}
--------------------------- epoch 8 ---------------------------


100%|██████████| 20/20 [00:01<00:00, 18.21it/s]
100%|██████████| 7/7 [00:00<00:00, 16.98it/s]


{'sample_accuracy': '0.6246', 'flat_accuracy': '0.9455', 'precision': '0.9443', 'recall': '0.9455', 'f1': '0.9448', 'train_loss': '0.2080', 'val_loss': '0.1556'}
--------------------------- epoch 9 ---------------------------


100%|██████████| 20/20 [00:01<00:00, 18.14it/s]
100%|██████████| 7/7 [00:00<00:00, 15.69it/s]


{'sample_accuracy': '0.6186', 'flat_accuracy': '0.9455', 'precision': '0.9451', 'recall': '0.9455', 'f1': '0.9453', 'train_loss': '0.2008', 'val_loss': '0.1517'}
--------------------------- epoch 10 ---------------------------


100%|██████████| 20/20 [00:01<00:00, 18.05it/s]
100%|██████████| 7/7 [00:00<00:00, 15.93it/s]


{'sample_accuracy': '0.6431', 'flat_accuracy': '0.9483', 'precision': '0.9478', 'recall': '0.9483', 'f1': '0.9480', 'train_loss': '0.1954', 'val_loss': '0.1465'}
--------------------------- epoch 11 ---------------------------


100%|██████████| 20/20 [00:01<00:00, 18.14it/s]
100%|██████████| 7/7 [00:00<00:00, 16.62it/s]


{'sample_accuracy': '0.6462', 'flat_accuracy': '0.9492', 'precision': '0.9484', 'recall': '0.9492', 'f1': '0.9487', 'train_loss': '0.1921', 'val_loss': '0.1430'}
--------------------------- epoch 12 ---------------------------


100%|██████████| 20/20 [00:01<00:00, 18.80it/s]
100%|██████████| 7/7 [00:00<00:00, 16.69it/s]


{'sample_accuracy': '0.6405', 'flat_accuracy': '0.9489', 'precision': '0.9486', 'recall': '0.9489', 'f1': '0.9488', 'train_loss': '0.1874', 'val_loss': '0.1438'}
--------------------------- epoch 13 ---------------------------


100%|██████████| 20/20 [00:01<00:00, 17.98it/s]
100%|██████████| 7/7 [00:00<00:00, 16.56it/s]


{'sample_accuracy': '0.6420', 'flat_accuracy': '0.9486', 'precision': '0.9487', 'recall': '0.9486', 'f1': '0.9487', 'train_loss': '0.1831', 'val_loss': '0.1437'}
--------------------------- epoch 14 ---------------------------


100%|██████████| 20/20 [00:01<00:00, 17.27it/s]
100%|██████████| 7/7 [00:00<00:00, 15.55it/s]


{'sample_accuracy': '0.6511', 'flat_accuracy': '0.9499', 'precision': '0.9499', 'recall': '0.9499', 'f1': '0.9499', 'train_loss': '0.1779', 'val_loss': '0.1390'}
--------------------------- epoch 15 ---------------------------


100%|██████████| 20/20 [00:01<00:00, 19.20it/s]
100%|██████████| 7/7 [00:00<00:00, 15.71it/s]


{'sample_accuracy': '0.6532', 'flat_accuracy': '0.9503', 'precision': '0.9500', 'recall': '0.9503', 'f1': '0.9501', 'train_loss': '0.1754', 'val_loss': '0.1359'}
--------------------------- epoch 16 ---------------------------


100%|██████████| 20/20 [00:01<00:00, 18.56it/s]
100%|██████████| 7/7 [00:00<00:00, 15.43it/s]


{'sample_accuracy': '0.6532', 'flat_accuracy': '0.9503', 'precision': '0.9502', 'recall': '0.9503', 'f1': '0.9503', 'train_loss': '0.1720', 'val_loss': '0.1369'}
--------------------------- epoch 17 ---------------------------


100%|██████████| 20/20 [00:01<00:00, 16.50it/s]
100%|██████████| 7/7 [00:00<00:00, 17.09it/s]


{'sample_accuracy': '0.6532', 'flat_accuracy': '0.9503', 'precision': '0.9502', 'recall': '0.9503', 'f1': '0.9502', 'train_loss': '0.1688', 'val_loss': '0.1374'}
--------------------------- epoch 18 ---------------------------


100%|██████████| 20/20 [00:01<00:00, 18.02it/s]
100%|██████████| 7/7 [00:00<00:00, 15.90it/s]


{'sample_accuracy': '0.6565', 'flat_accuracy': '0.9501', 'precision': '0.9506', 'recall': '0.9501', 'f1': '0.9503', 'train_loss': '0.1666', 'val_loss': '0.1360'}
--------------------------- epoch 19 ---------------------------


100%|██████████| 20/20 [00:01<00:00, 17.61it/s]
100%|██████████| 7/7 [00:00<00:00, 15.36it/s]


{'sample_accuracy': '0.6534', 'flat_accuracy': '0.9499', 'precision': '0.9503', 'recall': '0.9499', 'f1': '0.9501', 'train_loss': '0.1636', 'val_loss': '0.1381'}


# **8. 추론 메서드 정의**

In [4]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BertTokenizer, BertModel, RobertaTokenizer, RobertaModel
from transformers import AutoConfig

# Load the tokenizers and models for Llama2, BERT, and Roberta
llama2_tokenizer = AutoTokenizer.from_pretrained("daryl149/llama-2-7b-chat-hf", use_auth_token="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", trust_remote_code=True)
llama2_tokenizer.pad_token = llama2_tokenizer.eos_token  # 패딩 토큰 설정
llama2_config = AutoConfig.from_pretrained("daryl149/llama-2-7b-chat-hf",use_auth_token="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", output_hidden_states=True)
llama2_model = AutoModelForCausalLM.from_pretrained("daryl149/llama-2-7b-chat-hf",use_auth_token="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", config=llama2_config)

bert_tokenizer = BertTokenizer.from_pretrained('google-bert/bert-large-uncased')
bert_model = BertModel.from_pretrained('google-bert/bert-large-uncased')

roberta_tokenizer = RobertaTokenizer.from_pretrained('FacebookAI/roberta-large')
roberta_model = RobertaModel.from_pretrained('FacebookAI/roberta-large')

# Make sure all models are in evaluation mode and moved to the appropriate device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
llama2_model.eval().to(device)
bert_model.eval().to(device)
roberta_model.eval().to(device)

# Initialize the downstream model
class_num = 9  # For example, if you have 8 classes
SIGMA = 0.1  # SIGMA value for your downstream model
downstream_model = DownstreamModel(class_num, SIGMA).to(device)

model_load_path = "/content/drive/MyDrive/LLMEmbed/MultiLabel/stackoverflow_model_weights.pth"

# 가중치 로드
downstream_model.load_state_dict(torch.load(model_load_path, map_location=device))
downstream_model.eval()

def get_llama2_embedding(text, tokenizer, model, device):
    inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True).to(device)
    with torch.no_grad():
        outputs = model(**inputs)
        # Average the last 5 layers
        embedding = torch.stack([torch.mean(outputs.hidden_states[i], dim=1) for i in range(-1, -6, -1)], dim=1)
    return embedding

def get_bert_embedding(text, tokenizer, model, device):
    inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True).to(device)
    with torch.no_grad():
        outputs = model(**inputs)
        # Use pooler_output for BERT embeddings
        embedding = outputs.pooler_output
    return embedding

def get_roberta_embedding(text, tokenizer, model, device):
    inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True).to(device)
    with torch.no_grad():
        outputs = model(**inputs)
        # Use the first token ([CLS] token) representation
        embedding = outputs.last_hidden_state[:, 0, :]
    return embedding

def infer(text, downstream_model, device):
    # 각 모델로부터 임베딩을 추출
    llama2_emb = get_llama2_embedding(text, llama2_tokenizer, llama2_model, device)
    bert_emb = get_bert_embedding(text, bert_tokenizer, bert_model, device)
    roberta_emb = get_roberta_embedding(text, roberta_tokenizer, roberta_model, device)

    # 모든 임베딩을 float 타입으로 변환
    llama2_emb = llama2_emb.float()
    bert_emb = bert_emb.float()
    roberta_emb = roberta_emb.float()

    # Forward pass through the downstream model
    with torch.no_grad():
        prediction = downstream_model(llama2_emb, bert_emb, roberta_emb)
        # Apply sigmoid to obtain probabilities
        prediction = torch.sigmoid(prediction)

    return prediction


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of RobertaModel were not initialized from the model checkpoint at FacebookAI/roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  downstream_model.load_state_dict(torch.load(model_load_path, map_location=device))


In [10]:

# 예측된 클래스를 레이블로 맵핑하여 출력하는 함수
def print_predicted_labels(predicted_classes, labels):
    # 예측된 클래스 중 1인 레이블만 선택
    predicted_labels = [labels[i] for i, pred in enumerate(predicted_classes) if pred == 1]

    # 선택된 레이블 출력
    print("Predicted labels:", predicted_labels)


# 레이블 목록
labels = ['Algorithms', 'Backend', 'Data Science', 'Databases', 'Dev Tools', 'Frontend', 'Mobile', 'Systems', 'iOS/macOS']

# 주어진 텍스트를 토대로 예측을 5번 실행
texts = [
    "working of compareTo() method of Comparable interface            I have one Employee class and the requirement is to sort the objects using comparable interface. The output with this code is :The difference of this id and other id is..** 6  other id**1The difference of this id and other id is..** 3  other id**6The difference of this id and other id is..** 3  other id**6The difference of this id and other id is..** 3  other id**1The difference of this id and other id is..** 11  other id**3The difference of this id and other id is..** 11  other id**6",
]

# 각 텍스트에 대해 예측 실행 및 레이블 출력
for i, text in enumerate(texts):
    prediction = infer(text, downstream_model, device)

    # 예측 결과를 CPU로 이동
    prediction = prediction.cpu()
    predicted_classes = (prediction > 0.5).int().numpy()

    # 예측 결과 출력
    print("Predicted probabilities:", prediction.numpy())
    print("Predicted classes:", predicted_classes)


    print_predicted_labels(predicted_classes[0], labels)

Predicted probabilities: [[9.8818821e-01 4.6332803e-04 1.2619452e-02 1.9442239e-03 2.4366153e-02
  4.7192728e-04 1.1911500e-03 5.0285155e-01 3.7760334e-04]]
Predicted classes: [[1 0 0 0 0 0 0 1 0]]
Predicted labels: ['Algorithms', 'Systems']
