In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import evaluate
import numpy as np
import torch

In [2]:
from datasets import load_dataset

data_down_dir = "/root/storage/nas/JH_server/2025/Synthetic_data/0_dataset"

dataset = load_dataset(f"{data_down_dir}/goemotions")
# 예시 출력
print(dataset)
print(dataset["train"][0])

DatasetDict({
    train: Dataset({
        features: ['text', 'labels', 'id'],
        num_rows: 43410
    })
    validation: Dataset({
        features: ['text', 'labels', 'id'],
        num_rows: 5426
    })
    test: Dataset({
        features: ['text', 'labels', 'id'],
        num_rows: 5427
    })
})
{'text': "My favourite food is anything I didn't have to cook myself.", 'labels': [27], 'id': 'eebbqej'}


In [10]:
from collections import defaultdict

label_counter = defaultdict(int)

for row in dataset['train']:
    label_counter[row['labels'][0]] += 1

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from datasets import load_dataset
import evaluate
import numpy as np
import torch
from torch.nn import BCEWithLogitsLoss
from transformers import TrainerCallback

model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# 예시 전처리 (단일 텍스트)
def preprocess(example):
    return tokenizer(example['text'], truncation=True, padding="max_length", max_length=128)

# multi-hot label로 변환
def to_multihot(example, num_labels):
    vec = [float(0.0)] * num_labels
    for label in example["labels"]:
        vec[label] = float(1.0)
    example["labels"] = vec
    return example


num_labels = 28  # 너의 클래스 수에 맞게 설정

dataset = dataset.map(lambda x: to_multihot(x, num_labels))
dataset = dataset.map(preprocess)

from datasets import Features, Sequence, Value

features = Features({
    "text": Value("string"),
    "labels": Sequence(Value("float32")),  
    "id": Value("string"),
    "input_ids": Sequence(Value("int32")),
    "token_type_ids": Sequence(Value("int32")),
    "attention_mask": Sequence(Value("int8"))
})

dataset["train"] = dataset["train"].cast(features)
dataset["test"] = dataset["test"].cast(features)


model = AutoModelForSequenceClassification.from_pretrained(
    model_name, num_labels=num_labels, problem_type="multi_label_classification"
)                             

In [None]:
from sklearn.metrics import accuracy_score, f1_score


def compute_metrics(p):
    logits, labels = p
    probs = torch.sigmoid(torch.tensor(logits))
    preds = (probs > 0.5).int().numpy()
    labels = np.array(labels)

    return {
        "f1_micro": f1_score(labels, preds, average="micro"),
        "f1_macro": f1_score(labels, preds, average="macro"),
        "subset_accuracy": accuracy_score(labels, preds),  # 전체 레이블이 다 맞아야만 1로 인정
    }
# 학습 인자
args = TrainingArguments(
    eval_strategy="steps",
    eval_steps=100,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    save_strategy="no"
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()


  trainer = Trainer(


Step,Training Loss,Validation Loss,F1 Micro,F1 Macro,Subset Accuracy
100,No log,0.127779,0.327189,0.053331,0.219643
200,No log,0.113544,0.426351,0.126935,0.29777
300,No log,0.103945,0.450938,0.130551,0.324304
400,No log,0.099544,0.484887,0.193883,0.355998
500,0.116000,0.095856,0.476647,0.170451,0.34918
600,0.116000,0.096644,0.496713,0.262328,0.360236
700,0.116000,0.092157,0.506876,0.24133,0.377004
800,0.116000,0.090965,0.481799,0.23075,0.33665
900,0.116000,0.089569,0.55563,0.352417,0.437258
1000,0.094700,0.088417,0.568559,0.344462,0.446656
