### Google Colab 환경에서 진행하였습니다.
Korean Hate Speech Detection의 입력 텍스트에서 감정 벡터를 추출하기 위하여 Sentiment Analysis에서 학습(fine-tuning)한 모델을 활용하였습니다.  
따라서 Sentiment Analysis를 진행한 이후에 Korean Hate Speech Detection을 진행하였습니다.  
드라이브에 마운트하는 코드와 데이터셋을 가져오는 코드에서는 **데이터셋이 있는 디렉토리를 올바르게 지정**하여야 합니다.  
또한 모델이 추론(inference)을 수행하는 코드에서는 **추론을 수행하고자 하는 모델 파일이 있는 디렉토리를 올바르게 지정**하여야 합니다.  
Sentiment Analysis의 Model Training 코드에서는 early stopping 수행 여부에 따라 **두 블록의 학습 코드 중 하나만 실행**해야 합니다.



# Environment settings

### install modules

In [1]:
# 텍스트 이모지 전처리
!pip install --quiet emoji==2.11.1

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/433.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.4/433.8 kB[0m [31m4.1 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m433.8/433.8 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[?25h

### import modules

In [2]:
import numpy as np
import pandas as pd
import os                               # file system에 접근
import re, html, unicodedata, emoji     # text preprocessing의 regular expression
import gc                               # GPU memory cache 청소
import time

from transformers import AutoTokenizer, AutoConfig, AutoModelForSequenceClassification
from transformers.optimization import get_scheduler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, f1_score
from tqdm import tqdm                   # model training/validation/test에서 학습 과정을 시각화
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW

### drive mount

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# directory check
DIRECTORY = "/content/drive/MyDrive/DeepLearning/"
for dirname, _, filenames in os.walk(DIRECTORY):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/content/drive/MyDrive/DeepLearning/test.tsv
/content/drive/MyDrive/DeepLearning/validation.tsv
/content/drive/MyDrive/DeepLearning/train.tsv
/content/drive/MyDrive/DeepLearning/sentiment.csv
/content/drive/MyDrive/DeepLearning/sentiment/snunlp/KR-Medium_20250612-153337/model/config.json
/content/drive/MyDrive/DeepLearning/sentiment/snunlp/KR-Medium_20250612-153337/model/model.safetensors
/content/drive/MyDrive/DeepLearning/sentiment/snunlp/KR-Medium_20250612-153337/tokenizer/tokenizer_config.json
/content/drive/MyDrive/DeepLearning/sentiment/snunlp/KR-Medium_20250612-153337/tokenizer/special_tokens_map.json
/content/drive/MyDrive/DeepLearning/sentiment/snunlp/KR-Medium_20250612-153337/tokenizer/vocab.txt
/content/drive/MyDrive/DeepLearning/sentiment/snunlp/KR-Medium_20250612-153337/tokenizer/tokenizer.json
/content/drive/MyDrive/DeepLearning/sentiment/snunlp/KR-Medium_20250612-154247/model/config.json
/content/drive/MyDrive/DeepLearning/sentiment/snunlp/KR-Medium_20250612-154247/model

### cuda

In [5]:
# 런타임 - 런타임 유형 변경 - T4 GPU 선택 - 저장(저장 필수)
if torch.cuda.is_available():
    device = torch.device("cuda")
    print(f"현재 device: {torch.cuda.get_device_name()}")
else:
    device = torch.device("cpu")
    print(f"현재 device: {device}")
    print("GPU 안 쓰면 학습 못함(진짜임)")

현재 device: Tesla T4


# Pipeline: Sentiment Analysis

In [30]:
# Label dictionary
LABEL_2_ID = {
    "공포": 0,
    "놀람": 1,
    "분노": 2,
    "슬픔": 3,
    "중립": 4,
    "행복": 5,
    "혐오": 6,
}
ID_2_LABEL = {value: key for key, value in LABEL_2_ID.items()}

## Load Dataset

In [31]:
df = pd.read_csv(os.path.join(DIRECTORY, 'sentiment.csv'))

In [32]:
display(df)

Unnamed: 0,Sentence,Emotion
0,언니 동생으로 부르는게 맞는 일인가요..??,공포
1,그냥 내 느낌일뿐겠지?,공포
2,아직너무초기라서 그런거죠?,공포
3,유치원버스 사고 낫다던데,공포
4,근데 원래이런거맞나요,공포
...,...,...
38589,솔직히 예보 제대로 못하는 데 세금이라도 아끼게 그냥 폐지해라..,혐오
38590,재미가 없으니 망하지,혐오
38591,공장 도시락 비우생적임 아르바이트했는데 화장실가성 손도 않씯고 재료 담고 바닥 떨어...,혐오
38592,코딱지 만한 나라에서 지들끼리 피터지게 싸우는 센징 클래스 ㅉㅉㅉ,혐오


## Text Preprocessing

### checking missing data

In [33]:
# checking missing data
mask = (df["Sentence"].fillna('').str.len() == 0) | (~df["Emotion"].isin(LABEL_2_ID))
if mask.any(): print(f"{mask.sum()} missing datas")
else: print("no missing data")

no missing data


In [34]:
emotion_counts_dict = df['Emotion'].value_counts().to_dict()
print(emotion_counts_dict)

{'행복': 6037, '놀람': 5898, '분노': 5665, '공포': 5468, '혐오': 5429, '슬픔': 5267, '중립': 4830}


### outliar processing

In [35]:
URL_PAT = re.compile(r'https?://\S+')
HTML_PAT = re.compile(r'<[^>]+>')
REPEAT_PAT = re.compile(r'(.)\1{2,}')                       # 3회 이상 연속된 글자(ㅋㅋㅋ, ㅠㅠㅠ 등) → 2회로 축약
SPEC_PAT = re.compile(r'[^ㄱ-ㅎ가-힣a-zA-Z0-9\s\.\,\!\?]+') # 허용 문자: 한글, 영문, 숫자, 공백, 주요 punctuation
MULTI_SP = re.compile(r'\s+')

def clean_text(text):
    text = html.unescape(text)                          # &quot; → "
    text = URL_PAT.sub(' URL ', text)                   # URL 토큰화
    text = HTML_PAT.sub(' ', text)                      # HTML tag 제거
    text = emoji.demojize(text, delimiters=(' ', ' '))  # 😀 → :grinning_face:
    text = REPEAT_PAT.sub(r'\1\1', text)                # ㅋㅋㅋㅋ → ㅋㅋ
    text = SPEC_PAT.sub(' ', text)                      # 특수문자 제거
    text = unicodedata.normalize('NFKC', text)          # Unicode 정규화
    text = MULTI_SP.sub(' ', text).strip()              # 다중 공백 정규화
    return text

df["Sentence"] = df["Sentence"].apply(clean_text)

### dataset balancing

In [36]:
df = (
    df.groupby("Emotion")
    .apply(lambda x: x.sample(n=df["Emotion"].value_counts().min(), random_state=42))   # down-sampling
    .reset_index(drop=True)                                                             # multi-index 방지
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

  .apply(lambda x: x.sample(n=df["Emotion"].value_counts().min(), random_state=42))   # down-sampling


### target digitize

In [37]:
# ["공포", "놀람", "분노", ...] => [0, 1, 2, ...]
df["Emotion"] = df["Emotion"].replace(LABEL_2_ID)

  df["Emotion"] = df["Emotion"].replace(LABEL_2_ID)


### dataset slicing

In [38]:
# colab의 GPU 메모리/연산 할당량 제한으로 인해 dataset의 일부만 학습에 사용
df = df.iloc[:30000]

In [39]:
display(df)

Unnamed: 0,Sentence,Emotion
0,이건 잘하는것 같은데 말이다.,0
1,그치만 타이밍이 문제네요..,0
2,대박??,5
3,시급 1600원 로봇 시급이 헐?,1
4,오늘 지진 대박이다,1
...,...,...
29995,진짜 네이트온만해야해?ᄏᄏ,0
29996,뭘해도 집중이안돼..힘들어,0
29997,올해 한국시리즈는 최순실이한테 묻혀서 조용히 끝났네..,6
29998,덕분에 새댁이 헤매지 않을 수 있게 되었어요,5


### dataset split

In [40]:
# feature와 target 분리
texts = df["Sentence"].tolist()
labels = df["Emotion"].tolist()

# train/validation/test split
train_texts, test_texts, train_labels, test_labels = train_test_split(
    texts, labels,
    test_size=0.2, random_state=42, stratify=labels
)
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_texts, train_labels,
    test_size=0.25, random_state=42, stratify=train_labels
)

## Text Tokenize

In [41]:
MODEL_NAME = "UICHEOL-HWANG/kobert"
BATCH_SIZE = 16

### Text Tokenize

In [42]:
class EmotionDataset(Dataset):
    """
    문장과 라벨을 읽어
    Transformer 토크나이저로 encoding
    """
    def __init__(self, texts, labels, tokenizer, max_length=64):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, index):
        text = self.texts[index]
        label = self.labels[index]

        encoding = self.tokenizer(
            text,
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt"
        )

        return {
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "labels": torch.tensor(label, dtype=torch.long),
        }

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)

# Dataset의 역할 (GPT 주)
# abstraction: “샘플 하나”를 가져오는 표준 interface(__len__, __getitem__)를 정의
# encapsulation: tokenization·label 변환·augmentation 등을 on-the-fly로 처리
train_dataset = EmotionDataset(train_texts, train_labels, tokenizer)
val_dataset = EmotionDataset(val_texts, val_labels, tokenizer)
test_dataset = EmotionDataset(test_texts, test_labels, tokenizer)

# DataLoader의 역할 (GPT 주)
# batch·shuffle·parallel I/O·pin-memory 등의 running engine 기능 담당
# GPU pipeline을 병렬화하여 I/O bottleneck 최소화
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=True)

## Model Training

In [43]:
# GPU 메모리 캐시 정리
gc.collect()
torch.cuda.empty_cache()

### Model Hyperparams

In [44]:
MODEL_NAME = "UICHEOL-HWANG/kobert"
NUM_LABELS = 7
BATCH_SIZE = 16
NUM_EPOCHS = 20
LEARNING_RATE = 3e-5
WARMUP_DECAY_RATE = 0.1 # 학습률 증가 => 감소
PATIENCE = 3            # Early stopping을 위한 patience 값

### Pre-Training

In [45]:
config = AutoConfig.from_pretrained(
    MODEL_NAME,
    num_labels=NUM_LABELS,
    id2label=ID_2_LABEL,
    label2id=LABEL_2_ID,
)

model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    config=config,
    ignore_mismatched_sizes=True  # classification head 크기가 달라도 자동 교체
)

model.to(device)  # 모델을 디바이스로 이동

optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)
num_training_steps = NUM_EPOCHS * len(train_loader)
num_warmup_steps = int(WARMUP_DECAY_RATE * num_training_steps)

lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=num_warmup_steps,
    num_training_steps=num_training_steps
)

### Training (fine-tuning)

In [46]:
#--------------------------------------------------------------------------------------------------------------------------------
# !!! 중요 !!!
#
# 이 코드는 모델의 검증 과정에서 early-stopping을 수행합니다.
#
# early-stopping 없이 학습하고자 한다면, 이 코드 블록을 **실행하지 마시고** 바로 아래에 있는 코드 블록을 실행하십시오.
#
#--------------------------------------------------------------------------------------------------------------------------------

# Early Stopping 변수
best_val_f1 = 0
patience_counter = 0

# 전체 데이터를 {NUM_EPOCHS}회 순회하며 학습
for epoch in range(NUM_EPOCHS):

    ### 학습 과정(train)

    # 모델을 학습 모드로 전환
    model.train()

    train_loss = 0
    train_labels = []
    train_preds = []

    # 배치 단위로 학습 수행
    for batch in tqdm(train_loader, desc=f"Epoch {epoch + 1} [Training]"):

        # 1. 입력을 GPU에 전달
        inputs = {
            "input_ids": batch["input_ids"].to(device),
            "attention_mask": batch["attention_mask"].to(device),
            "labels": batch["labels"].to(device)
        }

        # 2. gradient 초기화
        optimizer.zero_grad()

        # 3. forward pass
        outputs = model(**inputs)

        # 4. 예측값 집계
        logits = outputs.logits
        preds = torch.argmax(logits, dim=1)

        # 5. 손실 집계
        loss = outputs.loss
        train_loss += loss.item()
        train_labels.extend(inputs["labels"].cpu().numpy())
        train_preds.extend(preds.cpu().numpy())

        # 6. 모델 파라미터 갱신(backward pass; back propagation)
        loss.backward()
        optimizer.step()
        lr_scheduler.step()

    ### 학습 단계에서의 손실 및 metric 계산
    avg_train_loss = train_loss / len(train_loader)
    train_accuracy = accuracy_score(train_labels, train_preds)
    train_f1 = f1_score(train_labels, train_preds, average='macro')
    print(
        f"Epoch {epoch + 1} "
        f"Training Loss: {avg_train_loss:.4f} | "
        f"Training Accuracy: {train_accuracy:.4f} | "
        f"Training Macro F1: {train_f1:.4f}"
    )

    ### 검증 과정(validation)

    # 모델을 검증/평가/추론 모드로 전환
    model.eval()

    val_loss = 0
    val_labels = []
    val_preds = []

    # 검증 과정에서는 gradient 연산을 생략하여 연산량 및 오버헤드를 줄임
    with torch.no_grad():

        # 배치 단위로 검증 수행
        for batch in tqdm(val_loader, desc=f"Epoch {epoch + 1} [Validation]"):

            # 1. 입력을 GPU에 전달
            inputs = {
                "input_ids": batch["input_ids"].to(device),
                "attention_mask": batch["attention_mask"].to(device),
                "labels": batch["labels"].to(device)
            }

            # 2. forward pass
            outputs = model(**inputs)

            # 3. 예측값 집계
            logits = outputs.logits
            preds = torch.argmax(logits, dim=1)

            # 4. 손실 집계
            loss = outputs.loss
            val_loss += loss.item()
            val_labels.extend(inputs["labels"].cpu().numpy())
            val_preds.extend(preds.cpu().numpy())

    ### 검증 단계에서의 손실 및 metric 계산
    avg_val_loss = val_loss / len(val_loader)
    val_accuracy = accuracy_score(val_labels, val_preds)
    val_f1 = f1_score(val_labels, val_preds, average='macro')
    print(
        f"Epoch {epoch + 1} "
        f"Validation Loss: {avg_val_loss:.4f} | "
        f"Validation Accuracy: {val_accuracy:.4f} | "
        f"Validation Macro F1: {val_f1:.4f}"
    )

    ### Early Stopping 여부 확인
    if val_f1 > best_val_f1:                # 가장 높은 정확도를 기록하면
        best_val_f1 = val_f1                # 정확도 기준을 갱신
        patience_counter = 0                # patience 초기화

        # 모델 및 토크나이저 저장

        ## 경로 설정
        timestamp = time.strftime("%Y%m%d-%H%M%S")
        save_path = os.path.join(DIRECTORY, "sentiment", f"{MODEL_NAME.replace('/', '_')}_{timestamp}")
        model_path = os.path.join(save_path, "model")
        tokenizer_path = os.path.join(save_path, "tokenizer")

        ## 모델 저장
        model.save_pretrained(model_path)

        ## 토크나이저 저장 (KoBertTokenizer는 save_pretrained()를 지원하지 않기에 직접 저장)
        ## 사실 KoBertTokenizer를 그대로 써도 됨...

        ### 디렉토리 생성
        os.makedirs(tokenizer_path, exist_ok=True)

        ### vocab.txt 저장
        tokenizer.save_vocabulary(tokenizer_path)

        ### config.json 저장
        with open(os.path.join(tokenizer_path, "tokenizer_config.json"), "w", encoding="utf-8") as f:
            import json
            json.dump({
                "do_lower_case": False,
                "unk_token": "[UNK]",
                "sep_token": "[SEP]",
                "pad_token": "[PAD]",
                "cls_token": "[CLS]",
                "mask_token": "[MASK]"
            }, f, indent=4)
    else:                                   # 그렇지 못하면
        patience_counter += 1               # patience 값을 1 증가
        print(
            f"Validation accuracy did not improve. "
            f"Patience counter: {patience_counter}/{PATIENCE}"
        )

    if patience_counter >= PATIENCE:        # 일정 epoch을 순회할 때까지 정확도 기준을 갱신하지 못하면
        print("Early stopping triggered.")  #
        break                               # 학습 종료

Epoch 1 [Training]: 100%|██████████| 1125/1125 [03:17<00:00,  5.69it/s]


Epoch 1 Training Loss: 1.0349 | Training Accuracy: 0.6156 | Training Macro F1: 0.6144


Epoch 1 [Validation]: 100%|██████████| 375/375 [00:22<00:00, 16.67it/s]


Epoch 1 Validation Loss: 1.0045 | Validation Accuracy: 0.6283 | Validation Macro F1: 0.6173


Epoch 2 [Training]: 100%|██████████| 1125/1125 [03:19<00:00,  5.64it/s]


Epoch 2 Training Loss: 1.0255 | Training Accuracy: 0.6168 | Training Macro F1: 0.6161


Epoch 2 [Validation]: 100%|██████████| 375/375 [00:20<00:00, 18.27it/s]


Epoch 2 Validation Loss: 1.0861 | Validation Accuracy: 0.5910 | Validation Macro F1: 0.5879
Validation accuracy did not improve. Patience counter: 1/3


Epoch 3 [Training]: 100%|██████████| 1125/1125 [03:16<00:00,  5.74it/s]


Epoch 3 Training Loss: 0.8822 | Training Accuracy: 0.6716 | Training Macro F1: 0.6710


Epoch 3 [Validation]: 100%|██████████| 375/375 [00:20<00:00, 18.34it/s]


Epoch 3 Validation Loss: 1.1819 | Validation Accuracy: 0.5745 | Validation Macro F1: 0.5637
Validation accuracy did not improve. Patience counter: 2/3


Epoch 4 [Training]: 100%|██████████| 1125/1125 [03:16<00:00,  5.74it/s]


Epoch 4 Training Loss: 0.6570 | Training Accuracy: 0.7591 | Training Macro F1: 0.7589


Epoch 4 [Validation]: 100%|██████████| 375/375 [00:20<00:00, 18.05it/s]

Epoch 4 Validation Loss: 1.2498 | Validation Accuracy: 0.5715 | Validation Macro F1: 0.5668
Validation accuracy did not improve. Patience counter: 3/3
Early stopping triggered.





In [None]:
#--------------------------------------------------------------------------------------------------------------------------------
# !!! 중요 !!!
#
# 이 코드는 모델의 검증 과정에서 early-stopping을 수행하지 않습니다.
#
# early-stopping 여부를 확인하며 학습하고자 한다면, 이 코드 블록을 **실행하지 마시고** 바로 위에 있는 코드 블록을 실행하십시오.
#
#--------------------------------------------------------------------------------------------------------------------------------

# 전체 데이터를 {NUM_EPOCHS}회 순회하며 학습
for epoch in range(NUM_EPOCHS):

    ### 학습 과정(train)

    # 모델을 학습 모드로 전환
    model.train()

    train_loss = 0
    train_labels = []
    train_preds = []

    # 배치 단위로 학습 수행
    for batch in tqdm(train_loader, desc=f"Epoch {epoch + 1} [Training]"):

        # 1. 입력을 GPU에 전달
        inputs = {
            "input_ids": batch["input_ids"].to(device),
            "attention_mask": batch["attention_mask"].to(device),
            "labels": batch["labels"].to(device)
        }

        # 2. gradient 초기화
        optimizer.zero_grad()

        # 3. forward pass
        outputs = model(**inputs)

        # 4. 예측값 집계
        logits = outputs.logits
        preds = torch.argmax(logits, dim=1)

        # 5. 손실 집계
        loss = outputs.loss
        train_loss += loss.item()
        train_labels.extend(inputs["labels"].cpu().numpy())
        train_preds.extend(preds.cpu().numpy())

        # 6. 모델 파라미터 갱신(backward pass; back propagation)
        loss.backward()
        optimizer.step()
        lr_scheduler.step()

    ### 학습 단계에서의 손실 및 metric 계산
    avg_train_loss = train_loss / len(train_loader)
    train_accuracy = accuracy_score(train_labels, train_preds)
    train_f1 = f1_score(train_labels, train_preds, average='macro')
    print(
        f"Epoch {epoch + 1} "
        f"Training Loss: {avg_train_loss:.4f} | "
        f"Training Accuracy: {train_accuracy:.4f} | "
        f"Training Macro F1: {train_f1:.4f}"
    )

    ### 검증 과정(validation)

    # 모델을 검증/평가/추론 모드로 전환
    model.eval()

    val_loss = 0
    val_labels = []
    val_preds = []

    # 검증 과정에서는 gradient 연산을 생략하여 연산량 및 오버헤드를 줄임
    with torch.no_grad():

        # 배치 단위로 검증 수행
        for batch in tqdm(val_loader, desc=f"Epoch {epoch + 1} [Validation]"):

            # 1. 입력을 GPU에 전달
            inputs = {
                "input_ids": batch["input_ids"].to(device),
                "attention_mask": batch["attention_mask"].to(device),
                "labels": batch["labels"].to(device)
            }

            # 2. forward pass
            outputs = model(**inputs)

            # 3. 예측값 집계
            logits = outputs.logits
            preds = torch.argmax(logits, dim=1)

            # 4. 손실 집계
            loss = outputs.loss
            val_loss += loss.item()
            val_labels.extend(inputs["labels"].cpu().numpy())
            val_preds.extend(preds.cpu().numpy())

    ### 검증 단계에서의 손실 및 metric 계산
    avg_val_loss = val_loss / len(val_loader)
    val_accuracy = accuracy_score(val_labels, val_preds)
    val_f1 = f1_score(val_labels, val_preds, average='macro')
    print(
        f"Epoch {epoch + 1} "
        f"Validation Loss: {avg_val_loss:.4f} | "
        f"Validation Accuracy: {val_accuracy:.4f} | "
        f"Validation Macro F1: {val_f1:.4f}"
    )

# 모델 및 토크나이저 저장

## 경로 설정
timestamp = time.strftime("%Y%m%d-%H%M%S")
save_path = os.path.join(DIRECTORY, "sentiment", f"{MODEL_NAME.replace('/', '_')}_{timestamp}")
model_path = os.path.join(save_path, "model")
tokenizer_path = os.path.join(save_path, "tokenizer")

## 모델 저장
model.save_pretrained(model_path)

## 토크나이저 저장 (KoBertTokenizer는 save_pretrained()를 지원하지 않기에 직접 저장)
## 사실 KoBertTokenizer를 그대로 써도 됨...

### 디렉토리 생성
os.makedirs(tokenizer_path, exist_ok=True)

### vocab.txt 저장
tokenizer.save_vocabulary(tokenizer_path)

### config.json 저장
with open(os.path.join(tokenizer_path, "tokenizer_config.json"), "w", encoding="utf-8") as f:
    import json
    json.dump({
        "do_lower_case": False,
        "unk_token": "[UNK]",
        "sep_token": "[SEP]",
        "pad_token": "[PAD]",
        "cls_token": "[CLS]",
        "mask_token": "[MASK]"
    }, f, indent=4)

## Evaluation

In [47]:
# Label dictionary
LABEL_2_ID = {
    "공포": 0,
    "놀람": 1,
    "분노": 2,
    "슬픔": 3,
    "중립": 4,
    "행복": 5,
    "혐오": 6,
}
ID_2_LABEL = {value: key for key, value in LABEL_2_ID.items()}

### Load Pre-trained Model

In [48]:
# 기학습 모델의 경로
path = "/content/drive/MyDrive/DeepLearning/sentiment/UICHEOL-HWANG_kobert_20250626-111637"

config = AutoConfig.from_pretrained(
    os.path.join(path, "model")
)

model = AutoModelForSequenceClassification.from_pretrained(
    os.path.join(path, "model"),
    config=config,
    ignore_mismatched_sizes=True
)

model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(8002, 768, padding_idx=1)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-

In [50]:
# baseline model
# model = "UICHEOL-HWANG/kobert"

# config = AutoConfig.from_pretrained(
#     model
# )

# model = AutoModelForSequenceClassification.from_pretrained(
#     model,
#     config=config,
#     ignore_mismatched_sizes=True
# )

# model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(8002, 768, padding_idx=1)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-

### Evaluation

In [51]:
model.eval()  # 모델을 검증/평가/추론 모드로 전환

# 테스트
test_loss = 0.0
test_labels = []
test_preds = []

with torch.no_grad():
    for batch in tqdm(test_loader, desc="[Test]"):

        # 입력을 GPU로 전달
        inputs = {
            "input_ids": batch["input_ids"].to(device),
            "attention_mask": batch["attention_mask"].to(device),
            "labels": batch["labels"].to(device)
        }

        # forward pass
        outputs = model(**inputs)

        # 예측값 집계
        logits = outputs.logits
        preds = torch.argmax(logits, dim=1)

        # 손실 집계
        loss  = outputs.loss
        test_loss += loss.item()
        test_labels.extend(inputs["labels"].cpu().numpy())
        test_preds.extend(preds.cpu().numpy())

# metric 계산
avg_test_loss = test_loss / len(test_loader)
test_accuracy = accuracy_score(test_labels, test_preds)
test_f1 = f1_score(test_labels, test_preds, average="macro")

print(f"Test Loss: {avg_test_loss:.4f} | "
      f"Test Accuracy: {test_accuracy:.4f} | "
      f"Test Macro F1: {test_f1:.4f}")

print(classification_report(test_labels, test_preds, target_names=LABEL_2_ID.keys()))

[Test]: 100%|██████████| 375/375 [00:21<00:00, 17.57it/s]


Test Loss: 0.9792 | Test Accuracy: 0.6440 | Test Macro F1: 0.6383
              precision    recall  f1-score   support

          공포       0.69      0.72      0.70       858
          놀람       0.55      0.76      0.64       859
          분노       0.55      0.63      0.59       855
          슬픔       0.78      0.70      0.73       860
          중립       0.63      0.50      0.55       858
          행복       0.84      0.86      0.85       854
          혐오       0.49      0.34      0.40       856

    accuracy                           0.64      6000
   macro avg       0.64      0.64      0.64      6000
weighted avg       0.64      0.64      0.64      6000



## Inference Test

In [None]:
# Label dictionary
LABEL_2_ID = {
    "공포": 0,
    "놀람": 1,
    "분노": 2,
    "슬픔": 3,
    "중립": 4,
    "행복": 5,
    "혐오": 6,
}
ID_2_LABEL = {value: key for key, value in LABEL_2_ID.items()}

In [None]:
# 텍스트 클리닝 함수 정의
URL_PAT = re.compile(r'https?://\S+')
HTML_PAT = re.compile(r'<[^>]+>')
REPEAT_PAT = re.compile(r'(.)\1{2,}')                       # 3회 이상 연속된 글자(ㅋㅋㅋ, ㅠㅠㅠ 등) → 2회로 축약
SPEC_PAT = re.compile(r'[^ㄱ-ㅎ가-힣a-zA-Z0-9\s\.\,\!\?]+') # 허용 문자: 한글, 영문, 숫자, 공백, 주요 punctuation
MULTI_SP = re.compile(r'\s+')

def clean_text(text):
    text = html.unescape(text)                          # &quot; → "
    text = URL_PAT.sub(' URL ', text)                   # URL 토큰화
    text = HTML_PAT.sub(' ', text)                      # HTML tag 제거
    text = emoji.demojize(text, delimiters=(' ', ' '))  # 😀 → :grinning_face:
    text = REPEAT_PAT.sub(r'\1\1', text)                # ㅋㅋㅋㅋ → ㅋㅋ
    text = SPEC_PAT.sub(' ', text)                      # 특수문자 제거
    text = unicodedata.normalize('NFKC', text)          # Unicode 정규화
    text = MULTI_SP.sub(' ', text).strip()              # 다중 공백 정규화
    return text

In [None]:
# 모델 정의 및 초기화

# 기학습 모델의 경로
path = "/content/drive/MyDrive/DeepLearning/sentiment/UICHEOL-HWANG_kobert_20250621-093152"

config = AutoConfig.from_pretrained(
    os.path.join(path, "model")
)

model = AutoModelForSequenceClassification.from_pretrained(
    os.path.join(path, "model"),
    config=config,
    ignore_mismatched_sizes=True
)

model.to(device)

In [None]:
# 입력 텍스트
text = [
    "Hello Mr. my yesterday",
    "전해주지 않을래",
    "꿈이 이루어지는 그때 꼭 다시 만나자고",
    "미치도록 내달려도 앞이 보이지 않아",
    "덩그러니 홀로 남겨져 길 위에 털썩",
    "주저앉아 애써 눈물을 참으려 했어",
    "초라한 내가 싫어서...",
    "Hello Mr. my yesterday",
    "타임머신을 타고",
    "꿈을 쫓는 어제의 내게 전해야 될 얘기",
    "내 전부를 걸고 맹세할게 삶이 끝난다 해도",
    "꿈이 이뤄질 그때 너를 맞이하러 가겠어"
]

# 텍스트 전처리
preprocessed_text = [clean_text(t) for t in text]

# 토크나이저 정의
MODEL_NAME = "UICHEOL-HWANG/kobert"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)

# 텍스트 토큰화
tokenized_text = tokenizer(
    preprocessed_text,
    padding="max_length",
    truncation=True,
    max_length=128,
    return_tensors="pt"
)

# 모델을 GPU로 이동
model.to(device)

# 모델을 검증/평가/추론 모드로 전환
model.eval()

# 추론
with torch.no_grad():

    # 입력을 GPU에 전달
    inputs = {
        "input_ids": tokenized_text["input_ids"].to(device),
        "attention_mask": tokenized_text["attention_mask"].to(device)
    }

    # 추론
    outputs = model(**inputs)
    logits = outputs.logits
    preds = torch.argmax(logits, dim=1)

for text, logit, pred in zip(text, logits, preds):
    print(f"입력 텍스트: {text}")
    print(f"감정 벡터: {[round(num.item(), 3) for num in logit.cpu().numpy()]}")
    print(f"예측된 감정: {ID_2_LABEL[int(pred)]}")
    print()

# Pipeline: Korean Hate Speech Detection

In [6]:
# Label dictionary
LABEL_2_ID = {
    0: 0,
    1: 1,
}
ID_2_LABEL = {value: key for key, value in LABEL_2_ID.items()}

## Load Datasets

In [7]:
df_train = pd.read_csv(os.path.join(DIRECTORY, 'train.tsv'), sep='\t')
df_validation = pd.read_csv(os.path.join(DIRECTORY, 'validation.tsv'), sep='\t')
df_test = pd.read_csv(os.path.join(DIRECTORY, 'test.tsv'), sep='\t')

In [8]:
# dataset 병합
df = pd.concat([df_train, df_validation, df_test])

In [None]:
display(df)

Unnamed: 0,text,label
0,언니 화면멈췄어,0
1,디피씨 박제요????,0
2,철팽씨가 저를 카트로 암살했어유..(?),0
3,타우러스 몇센치 옆에 맞아놓고,0
4,팀버그가,0
...,...,...
499995,너무 속보이잖아ㅋㅋ,0
499996,아파서 조퇴함,0
499997,<@981538220909133885> 돌아오쇼,0
499998,넌 골을 넣고 미국 혁명이라고 공포했지,0


## Text Preprocessing

### checking missing data

In [9]:
mask = (df["text"].fillna('').str.len() == 0) | (~df["label"].isin(LABEL_2_ID))
if mask.any(): print(f"{mask.sum()} missing datas")
else: print("no missing data")

no missing data


In [None]:
hate_counts_dict = df["label"].value_counts().to_dict()
print(hate_counts_dict)

### dataset balancing

In [10]:
df = (
    df.groupby("label")
    .apply(lambda x: x.sample(n=df["label"].value_counts().min(), random_state=42)) # down-sampling
    .reset_index(drop=True)                                                         # multi-index 방지
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

  .apply(lambda x: x.sample(n=df["label"].value_counts().min(), random_state=42)) # down-sampling


### outliar processing

In [11]:
# Sentiment Analysis의 outliar processing과 동일한 필터링 함수를 가짐
URL_PAT = re.compile(r'https?://\S+')
HTML_PAT = re.compile(r'<[^>]+>')
REPEAT_PAT = re.compile(r'(.)\1{2,}')                       # 3회 이상 연속된 글자(ㅋㅋㅋ, ㅠㅠㅠ 등) → 2회로 축약
SPEC_PAT = re.compile(r'[^ㄱ-ㅎ가-힣a-zA-Z0-9\s\.\,\!\?]+') # 허용 문자: 한글, 영문, 숫자, 공백, 주요 punctuation
MULTI_SP = re.compile(r'\s+')

def clean_text(text):
    text = html.unescape(text)                          # &quot; → "
    text = URL_PAT.sub(' URL ', text)                   # URL 토큰화
    text = HTML_PAT.sub(' ', text)                      # HTML tag 제거
    text = emoji.demojize(text, delimiters=(' ', ' '))  # 😀 → :grinning_face:
    text = REPEAT_PAT.sub(r'\1\1', text)                # ㅋㅋㅋㅋ → ㅋㅋ
    text = SPEC_PAT.sub(' ', text)                      # 특수문자 제거
    text = unicodedata.normalize('NFKC', text)          # Unicode 정규화
    text = MULTI_SP.sub(' ', text).strip()              # 다중 공백 정규화
    return text

df["text"] = df["text"].apply(clean_text)

### dataset slicing

In [12]:
# colab의 GPU 메모리 제한으로 인해 dataset의 일부만 학습에 사용
df = df.iloc[:30000]

### dataset split

In [13]:
# feature와 target 분리
texts = df["text"].tolist()
labels = df["label"].tolist()

# train/validation/test split
train_texts, test_texts, train_labels, test_labels = train_test_split(
    texts, labels,
    test_size=0.2, random_state=42, stratify=labels
)
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_texts, train_labels,
    test_size=0.25, random_state=42, stratify=train_labels
)

## Text Tokenize

In [14]:
MODEL_NAME = "UICHEOL-HWANG/kobert"
BATCH_SIZE = 16

In [15]:
# 이전에 정의한 클래스, 생략 가능
class EmotionDataset(Dataset):
    """
    문장과 라벨을 읽어
    Transformer 토크나이저로 encoding
    """
    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, index):
        text = self.texts[index]
        label = self.labels[index]

        encoding = self.tokenizer(
            text,
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt"
        )

        return {
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "labels": torch.tensor(label, dtype=torch.long),
        }

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)

train_dataset = EmotionDataset(train_texts, train_labels, tokenizer)
val_dataset = EmotionDataset(val_texts, val_labels, tokenizer)
test_dataset = EmotionDataset(test_texts, test_labels, tokenizer)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenization_kobert.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/monologg/kobert:
- tokenization_kobert.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


tokenizer_78b3253a26.model:   0%|          | 0.00/371k [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

## Text Vectorize - Sentiment Model Inference

In [16]:
# GPU 메모리 캐시 정리
gc.collect()
torch.cuda.empty_cache()

### Load Pre-trained Model

In [17]:
# 기학습 모델의 경로
path = "/content/drive/MyDrive/DeepLearning/sentiment/UICHEOL-HWANG_kobert_20250626-111637"

config = AutoConfig.from_pretrained(
    os.path.join(path, "model")
)

model = AutoModelForSequenceClassification.from_pretrained(
    os.path.join(path, "model"),
    config=config,
    ignore_mismatched_sizes=True
)

model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(8002, 768, padding_idx=1)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-

### Text Vectorize

In [18]:
train_vectors = []
val_vectors = []
test_vectors = []

train_labels = []
val_labels = []
test_labels = []

# 모델을 검증/평가/추론 모드로 전환
model.eval()

with torch.no_grad():

    for batch in tqdm(train_loader, desc=f"[Inference - Training Data]"):

        # 입력을 GPU에 전달
        inputs = {
            "input_ids": batch["input_ids"].to(device),
            "attention_mask": batch["attention_mask"].to(device),
            "labels": batch["labels"].to(device)
        }

        # logit 구하기
        outputs = model(**inputs)
        logits = outputs.logits

        train_labels.extend(inputs["labels"].cpu().numpy())
        train_vectors.extend(logits.cpu().numpy())

    for batch in tqdm(val_loader, desc=f"[Inference - Validation Data]"):

        # 입력을 GPU에 전달
        inputs = {
            "input_ids": batch["input_ids"].to(device),
            "attention_mask": batch["attention_mask"].to(device),
            "labels": batch["labels"].to(device)
        }

        # logit 구하기
        outputs = model(**inputs)
        logits = outputs.logits

        val_labels.extend(inputs["labels"].cpu().numpy())
        val_vectors.extend(logits.cpu().numpy())

    for batch in tqdm(test_loader, desc=f"[Inference - Test Data]"):

        # 입력을 GPU에 전달
        inputs = {
            "input_ids": batch["input_ids"].to(device),
            "attention_mask": batch["attention_mask"].to(device),
            "labels": batch["labels"].to(device)
        }

        # logit 구하기
        outputs = model(**inputs)
        logits = outputs.logits

        test_labels.extend(inputs["labels"].cpu().numpy())
        test_vectors.extend(logits.cpu().numpy())

[Inference - Training Data]: 100%|██████████| 1125/1125 [02:09<00:00,  8.68it/s]
[Inference - Validation Data]: 100%|██████████| 375/375 [00:44<00:00,  8.48it/s]
[Inference - Test Data]: 100%|██████████| 375/375 [00:44<00:00,  8.41it/s]


In [19]:
class BinaryTargetDataset(Dataset):
    """
    학습에 맞게 데이터를 전처리
    """
    def __init__(self, features, targets):
        self.features = torch.tensor(features, dtype=torch.float32)
        self.targets = torch.tensor(targets, dtype=torch.long)

    def __len__(self):
        return len(self.targets)

    def __getitem__(self, index):
        return {
            'input': self.features[index],
            'label': self.targets[index]
        }

train_dataset = BinaryTargetDataset(train_vectors, train_labels)
val_dataset = BinaryTargetDataset(val_vectors, val_labels)
test_dataset = BinaryTargetDataset(test_vectors, test_labels)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=True)

  self.features = torch.tensor(features, dtype=torch.float32)


## Model Training

### Model Hyperparams

In [20]:
INPUT_SIZE = len(train_vectors[0])
NUM_LABELS = 2
HIDDEN_SIZE = 64
BATCH_SIZE = 16
NUM_EPOCHS = 20
LEARNING_RATE = 1e-3
WARMUP_DECAY_RATE = 0.1 # 학습률 증가 => 감소

### Pre-Training

In [21]:
# MLP 모델 정의
class MLPClassifier(nn.Module):
    def __init__(self, input_size, hidden_size, num_classes):
        super(MLPClassifier, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        return x  # 로짓 출력

input_size = INPUT_SIZE
hidden_size = HIDDEN_SIZE
num_classes = NUM_LABELS

# 모델 초기화
model = MLPClassifier(input_size, hidden_size, num_classes)

# 모델을 디바이스로 이동
model.to(device)

# 손실 함수와 옵티마이저
criterion = nn.CrossEntropyLoss()
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)
num_training_steps = NUM_EPOCHS * len(train_loader)
num_warmup_steps = int(WARMUP_DECAY_RATE * num_training_steps)

lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=num_warmup_steps,
    num_training_steps=num_training_steps
)

### Model Training

In [22]:
# 전체 데이터를 {NUM_EPOCHS}회 순회하며 학습
for epoch in range(NUM_EPOCHS):

    ### 학습 과정(train)

    # 모델을 학습 모드로 전환
    model.train()

    train_loss = 0
    train_labels = []
    train_preds = []

    # 배치 단위로 학습 수행
    for batch in tqdm(train_loader, desc=f"Epoch {epoch + 1} [Training]"):

        # 1. 입력을 GPU에 전달
        inputs = batch['input'].to(device)
        labels = batch['label'].to(device)

        # 2. gradient 초기화
        optimizer.zero_grad()

        # 3. forward pass
        outputs = model(inputs)

        # 4. 예측값 집계
        preds = torch.argmax(outputs, dim=1)

        # 5. 손실 집계
        loss = criterion(outputs, labels)
        train_loss += loss.item()
        train_labels.extend(labels.cpu().numpy())
        train_preds.extend(preds.cpu().numpy())

        # 6. 모델 파라미터 갱신(backward pass; back propagation)
        loss.backward()
        optimizer.step()
        lr_scheduler.step()

    ### 학습 단계에서의 손실 및 metric 계산
    avg_train_loss = train_loss / len(train_loader)
    train_accuracy = accuracy_score(train_labels, train_preds)
    train_f1 = f1_score(train_labels, train_preds, average='macro')
    print(
        f"Epoch {epoch + 1} "
        f"Training Loss: {avg_train_loss:.4f} | "
        f"Training Accuracy: {train_accuracy:.4f} | "
        f"Training Macro F1: {train_f1:.4f}"
    )

    ### 검증 과정(validation)

    # 모델을 검증/평가/추론 모드로 전환
    model.eval()

    val_loss = 0
    val_labels = []
    val_preds = []

    with torch.no_grad():       # 검증 과정에서는 gradient 연산을 생략(torch.no_grad())하여 연산량 및 오버헤드를 줄임

        # 배치 단위로 검증 수행
        for batch in tqdm(val_loader, desc=f"Epoch {epoch + 1} [Validation]"):

            # 1. 입력을 GPU에 전달
            inputs = batch['input'].to(device)
            labels = batch['label'].to(device)

            # 2. forward pass
            outputs = model(inputs)

            # 3. 예측값 집계
            preds = torch.argmax(outputs, dim=1)

            # 4. 손실 집계
            loss = criterion(outputs, labels)
            val_loss += loss.item()
            val_labels.extend(labels.cpu().numpy())
            val_preds.extend(preds.cpu().numpy())

    ### 검증 단계에서의 손실 및 metric 계산
    avg_val_loss = val_loss / len(val_loader)
    val_accuracy = accuracy_score(val_labels, val_preds)
    val_f1 = f1_score(val_labels, val_preds, average='macro')
    print(
        f"Epoch {epoch + 1} "
        f"Validation Loss: {avg_val_loss:.4f} | "
        f"Validation Accuracy: {val_accuracy:.4f} | "
        f"Validation Macro F1: {val_f1:.4f}"
    )

# 모델 및 토크나이저 저장

## 경로 설정
timestamp = time.strftime("%Y%m%d-%H%M%S")
save_path = os.path.join(DIRECTORY, "hate_speech", f"my_model_{timestamp}")
model_path = os.path.join(save_path, "model")
tokenizer_path = os.path.join(save_path, "tokenizer")

## 모델 저장

### 디렉토리 생성
os.makedirs(model_path, exist_ok=True)

### 모델 저장
torch.save(model.state_dict(), os.path.join(model_path, "pytorch_model.bin"))

## 토크나이저 저장 (KoBertTokenizer는 save_pretrained()를 지원하지 않기에 직접 저장)
## 사실 KoBertTokenizer를 그대로 써도 됨...

### 디렉토리 생성
os.makedirs(tokenizer_path, exist_ok=True)

### vocab.txt 저장
tokenizer.save_vocabulary(tokenizer_path)

### config.json 저장
with open(os.path.join(tokenizer_path, "tokenizer_config.json"), "w", encoding="utf-8") as f:
    import json
    json.dump({
        "do_lower_case": False,
        "unk_token": "[UNK]",
        "sep_token": "[SEP]",
        "pad_token": "[PAD]",
        "cls_token": "[CLS]",
        "mask_token": "[MASK]"
    }, f, indent=4)

Epoch 1 [Training]: 100%|██████████| 1125/1125 [00:02<00:00, 532.46it/s]


Epoch 1 Training Loss: 0.5801 | Training Accuracy: 0.6749 | Training Macro F1: 0.6700


Epoch 1 [Validation]: 100%|██████████| 375/375 [00:00<00:00, 1745.10it/s]


Epoch 1 Validation Loss: 0.5022 | Validation Accuracy: 0.7558 | Validation Macro F1: 0.7548


Epoch 2 [Training]: 100%|██████████| 1125/1125 [00:01<00:00, 631.53it/s]


Epoch 2 Training Loss: 0.5007 | Training Accuracy: 0.7546 | Training Macro F1: 0.7536


Epoch 2 [Validation]: 100%|██████████| 375/375 [00:00<00:00, 1741.85it/s]


Epoch 2 Validation Loss: 0.4899 | Validation Accuracy: 0.7632 | Validation Macro F1: 0.7622


Epoch 3 [Training]: 100%|██████████| 1125/1125 [00:01<00:00, 619.25it/s]


Epoch 3 Training Loss: 0.4951 | Training Accuracy: 0.7554 | Training Macro F1: 0.7547


Epoch 3 [Validation]: 100%|██████████| 375/375 [00:00<00:00, 1826.24it/s]


Epoch 3 Validation Loss: 0.4854 | Validation Accuracy: 0.7642 | Validation Macro F1: 0.7626


Epoch 4 [Training]: 100%|██████████| 1125/1125 [00:01<00:00, 632.25it/s]


Epoch 4 Training Loss: 0.4906 | Training Accuracy: 0.7613 | Training Macro F1: 0.7607


Epoch 4 [Validation]: 100%|██████████| 375/375 [00:00<00:00, 1753.14it/s]


Epoch 4 Validation Loss: 0.4857 | Validation Accuracy: 0.7635 | Validation Macro F1: 0.7635


Epoch 5 [Training]: 100%|██████████| 1125/1125 [00:02<00:00, 532.54it/s]


Epoch 5 Training Loss: 0.4891 | Training Accuracy: 0.7594 | Training Macro F1: 0.7589


Epoch 5 [Validation]: 100%|██████████| 375/375 [00:00<00:00, 1273.87it/s]


Epoch 5 Validation Loss: 0.4823 | Validation Accuracy: 0.7700 | Validation Macro F1: 0.7695


Epoch 6 [Training]: 100%|██████████| 1125/1125 [00:02<00:00, 546.53it/s]


Epoch 6 Training Loss: 0.4874 | Training Accuracy: 0.7598 | Training Macro F1: 0.7592


Epoch 6 [Validation]: 100%|██████████| 375/375 [00:00<00:00, 1756.69it/s]


Epoch 6 Validation Loss: 0.4832 | Validation Accuracy: 0.7665 | Validation Macro F1: 0.7665


Epoch 7 [Training]: 100%|██████████| 1125/1125 [00:01<00:00, 614.72it/s]


Epoch 7 Training Loss: 0.4860 | Training Accuracy: 0.7612 | Training Macro F1: 0.7608


Epoch 7 [Validation]: 100%|██████████| 375/375 [00:00<00:00, 1802.73it/s]


Epoch 7 Validation Loss: 0.4798 | Validation Accuracy: 0.7678 | Validation Macro F1: 0.7673


Epoch 8 [Training]: 100%|██████████| 1125/1125 [00:01<00:00, 609.87it/s]


Epoch 8 Training Loss: 0.4848 | Training Accuracy: 0.7631 | Training Macro F1: 0.7625


Epoch 8 [Validation]: 100%|██████████| 375/375 [00:00<00:00, 1801.76it/s]


Epoch 8 Validation Loss: 0.4832 | Validation Accuracy: 0.7667 | Validation Macro F1: 0.7666


Epoch 9 [Training]: 100%|██████████| 1125/1125 [00:01<00:00, 601.46it/s]


Epoch 9 Training Loss: 0.4843 | Training Accuracy: 0.7631 | Training Macro F1: 0.7627


Epoch 9 [Validation]: 100%|██████████| 375/375 [00:00<00:00, 1783.07it/s]


Epoch 9 Validation Loss: 0.4785 | Validation Accuracy: 0.7715 | Validation Macro F1: 0.7708


Epoch 10 [Training]: 100%|██████████| 1125/1125 [00:01<00:00, 618.36it/s]


Epoch 10 Training Loss: 0.4835 | Training Accuracy: 0.7612 | Training Macro F1: 0.7607


Epoch 10 [Validation]: 100%|██████████| 375/375 [00:00<00:00, 1715.49it/s]


Epoch 10 Validation Loss: 0.4810 | Validation Accuracy: 0.7693 | Validation Macro F1: 0.7691


Epoch 11 [Training]: 100%|██████████| 1125/1125 [00:02<00:00, 523.95it/s]


Epoch 11 Training Loss: 0.4831 | Training Accuracy: 0.7641 | Training Macro F1: 0.7636


Epoch 11 [Validation]: 100%|██████████| 375/375 [00:00<00:00, 1294.75it/s]


Epoch 11 Validation Loss: 0.4801 | Validation Accuracy: 0.7697 | Validation Macro F1: 0.7692


Epoch 12 [Training]: 100%|██████████| 1125/1125 [00:02<00:00, 556.37it/s]


Epoch 12 Training Loss: 0.4823 | Training Accuracy: 0.7623 | Training Macro F1: 0.7618


Epoch 12 [Validation]: 100%|██████████| 375/375 [00:00<00:00, 1660.23it/s]


Epoch 12 Validation Loss: 0.4804 | Validation Accuracy: 0.7667 | Validation Macro F1: 0.7659


Epoch 13 [Training]: 100%|██████████| 1125/1125 [00:01<00:00, 627.05it/s]


Epoch 13 Training Loss: 0.4815 | Training Accuracy: 0.7648 | Training Macro F1: 0.7643


Epoch 13 [Validation]: 100%|██████████| 375/375 [00:00<00:00, 1683.06it/s]


Epoch 13 Validation Loss: 0.4789 | Validation Accuracy: 0.7708 | Validation Macro F1: 0.7707


Epoch 14 [Training]: 100%|██████████| 1125/1125 [00:01<00:00, 628.40it/s]


Epoch 14 Training Loss: 0.4809 | Training Accuracy: 0.7636 | Training Macro F1: 0.7631


Epoch 14 [Validation]: 100%|██████████| 375/375 [00:00<00:00, 1651.05it/s]


Epoch 14 Validation Loss: 0.4774 | Validation Accuracy: 0.7715 | Validation Macro F1: 0.7712


Epoch 15 [Training]: 100%|██████████| 1125/1125 [00:01<00:00, 626.90it/s]


Epoch 15 Training Loss: 0.4809 | Training Accuracy: 0.7642 | Training Macro F1: 0.7637


Epoch 15 [Validation]: 100%|██████████| 375/375 [00:00<00:00, 1603.90it/s]


Epoch 15 Validation Loss: 0.4768 | Validation Accuracy: 0.7713 | Validation Macro F1: 0.7710


Epoch 16 [Training]: 100%|██████████| 1125/1125 [00:01<00:00, 630.32it/s]


Epoch 16 Training Loss: 0.4802 | Training Accuracy: 0.7645 | Training Macro F1: 0.7640


Epoch 16 [Validation]: 100%|██████████| 375/375 [00:00<00:00, 1702.25it/s]


Epoch 16 Validation Loss: 0.4768 | Validation Accuracy: 0.7695 | Validation Macro F1: 0.7692


Epoch 17 [Training]: 100%|██████████| 1125/1125 [00:02<00:00, 517.72it/s]


Epoch 17 Training Loss: 0.4797 | Training Accuracy: 0.7644 | Training Macro F1: 0.7639


Epoch 17 [Validation]: 100%|██████████| 375/375 [00:00<00:00, 1274.86it/s]


Epoch 17 Validation Loss: 0.4764 | Validation Accuracy: 0.7692 | Validation Macro F1: 0.7688


Epoch 18 [Training]: 100%|██████████| 1125/1125 [00:02<00:00, 556.96it/s]


Epoch 18 Training Loss: 0.4794 | Training Accuracy: 0.7663 | Training Macro F1: 0.7658


Epoch 18 [Validation]: 100%|██████████| 375/375 [00:00<00:00, 1794.55it/s]


Epoch 18 Validation Loss: 0.4763 | Validation Accuracy: 0.7692 | Validation Macro F1: 0.7688


Epoch 19 [Training]: 100%|██████████| 1125/1125 [00:01<00:00, 619.08it/s]


Epoch 19 Training Loss: 0.4790 | Training Accuracy: 0.7658 | Training Macro F1: 0.7653


Epoch 19 [Validation]: 100%|██████████| 375/375 [00:00<00:00, 1744.79it/s]


Epoch 19 Validation Loss: 0.4763 | Validation Accuracy: 0.7697 | Validation Macro F1: 0.7692


Epoch 20 [Training]: 100%|██████████| 1125/1125 [00:01<00:00, 630.31it/s]


Epoch 20 Training Loss: 0.4787 | Training Accuracy: 0.7654 | Training Macro F1: 0.7649


Epoch 20 [Validation]: 100%|██████████| 375/375 [00:00<00:00, 1753.46it/s]

Epoch 20 Validation Loss: 0.4763 | Validation Accuracy: 0.7702 | Validation Macro F1: 0.7697





## Evaluation

In [23]:
# Label dictionary
LABEL_2_ID = {
    0: 0,
    1: 1,
}
ID_2_LABEL = {value: key for key, value in LABEL_2_ID.items()}

### Load Pre-trained Model

In [34]:
# 기학습 모델의 경로
# 모델 파일을 지정
path = "/content/drive/MyDrive/DeepLearning/hate_speech/my_model_20250626-122126/model/pytorch_model.bin"

input_size = INPUT_SIZE
hidden_size = HIDDEN_SIZE
num_classes = NUM_LABELS

model = MLPClassifier(input_size, hidden_size, num_classes)
model.load_state_dict(torch.load(path))

model.to(device)

MLPClassifier(
  (fc1): Linear(in_features=7, out_features=64, bias=True)
  (relu): ReLU()
  (fc2): Linear(in_features=64, out_features=2, bias=True)
)

In [32]:
# baseline model
# class DummyClassifier(nn.Module):
#     def __init__(self, input_size, num_classes):
#         super(DummyClassifier, self).__init__()
#         self.input_size = input_size
#         self.num_classes = num_classes

#     def forward(self, x):
#         batch_size = x.size(0)                                              # 입력 텐서의 batch size 확인
#         device = x.device
#         logits = torch.randn(batch_size, self.num_classes, device=device)   # 무작위 로짓 생성
#         return logits

# input_size = INPUT_SIZE
# num_classes = NUM_LABELS

# model = DummyClassifier(input_size, num_classes)

# model.to(device)

DummyClassifier()

### Evaluation

In [35]:
model.eval()  # 모델을 검증/평가/추론 모드로 전환

# 테스트
test_loss = 0.0
test_labels = []
test_preds = []

with torch.no_grad():  # gradient 계산 비활성화
    for batch in tqdm(test_loader, desc="[Test]"):

        # 입력을 GPU로 전달
        inputs = batch['input'].to(device)
        labels = batch['label'].to(device)

        # forward pass
        outputs = model(inputs)

        # 예측값 집계
        preds = torch.argmax(outputs, dim=1)

        # 손실 집계
        loss = criterion(outputs, labels)
        test_loss += loss.item()
        test_labels.extend(labels.cpu().numpy())
        test_preds.extend(preds.cpu().numpy())

# metric 계산
avg_test_loss = test_loss / len(test_loader)
test_accuracy = accuracy_score(test_labels, test_preds)
test_f1 = f1_score(test_labels, test_preds, average="macro")

print(f"Test Loss: {avg_test_loss:.4f} | "
      f"Test Accuracy: {test_accuracy:.4f} | "
      f"Test Macro F1: {test_f1:.4f}")

print(classification_report(test_labels, test_preds))

[Test]: 100%|██████████| 375/375 [00:00<00:00, 1588.60it/s]

Test Loss: 0.4873 | Test Accuracy: 0.7555 | Test Macro F1: 0.7549
              precision    recall  f1-score   support

           0       0.73      0.81      0.77      2982
           1       0.79      0.70      0.74      3018

    accuracy                           0.76      6000
   macro avg       0.76      0.76      0.75      6000
weighted avg       0.76      0.76      0.75      6000






## Inference Test

In [None]:
# 텍스트 클리닝 함수 정의
URL_PAT = re.compile(r'https?://\S+')
HTML_PAT = re.compile(r'<[^>]+>')
REPEAT_PAT = re.compile(r'(.)\1{2,}')                       # 3회 이상 연속된 글자(ㅋㅋㅋ, ㅠㅠㅠ 등) → 2회로 축약
SPEC_PAT = re.compile(r'[^ㄱ-ㅎ가-힣a-zA-Z0-9\s\.\,\!\?]+') # 허용 문자: 한글, 영문, 숫자, 공백, 주요 punctuation
MULTI_SP = re.compile(r'\s+')

def clean_text(text):
    text = html.unescape(text)                          # &quot; → "
    text = URL_PAT.sub(' URL ', text)                   # URL 토큰화
    text = HTML_PAT.sub(' ', text)                      # HTML tag 제거
    text = emoji.demojize(text, delimiters=(' ', ' '))  # 😀 → :grinning_face:
    text = REPEAT_PAT.sub(r'\1\1', text)                # ㅋㅋㅋㅋ → ㅋㅋ
    text = SPEC_PAT.sub(' ', text)                      # 특수문자 제거
    text = unicodedata.normalize('NFKC', text)          # Unicode 정규화
    text = MULTI_SP.sub(' ', text).strip()              # 다중 공백 정규화
    return text

In [None]:
# MLP 모델 정의
class MLPClassifier(nn.Module):
    def __init__(self, input_size, hidden_size, num_classes):
        super(MLPClassifier, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        return x  # 로짓 출력

In [None]:
### Sentiment Analysis

# 입력 텍스트
text = [
    "좋은 아침이에요!",             # 혐오 표현 아님
    "검둥이들은 다 죽여버려야해",   # 혐오 표현
    "불 좀 꺼줄래?",                # "신"
    "야 이 반란군노무 쉐키야"       # 혐오 표현
]

# 텍스트 전처리
preprocessed_text = [clean_text(t) for t in text]

# 토크나이저 정의
MODEL_NAME = "UICHEOL-HWANG/kobert"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)

# 텍스트 토큰화
tokenized_text = tokenizer(
    preprocessed_text,
    padding="max_length",
    truncation=True,
    max_length=128,
    return_tensors="pt"
)

# 모델 정의 및 초기화
path = "/content/drive/MyDrive/DeepLearning/sentiment/UICHEOL-HWANG_kobert_20250621-093152"

config = AutoConfig.from_pretrained(
    os.path.join(path, "model")
)

model = AutoModelForSequenceClassification.from_pretrained(
    os.path.join(path, "model"),
    config=config,
    ignore_mismatched_sizes=True
)

# 모델을 GPU에 전달
model.to(device)

# 모델을 검증/평가/추론 모드로 전환
model.eval()

# 추론
with torch.no_grad():

    # 입력을 GPU에 전달
    inputs = {
        "input_ids": tokenized_text["input_ids"].to(device),
        "attention_mask": tokenized_text["attention_mask"].to(device)
    }

    # 추론
    outputs = model(**inputs)

    # 감정 벡터 반환
    logits = outputs.logits

### Hate Speech Detection

# 모델 정의 및 초기화
INPUT_SIZE = logits.size(1)
NUM_LABELS = 2
HIDDEN_SIZE = 64

input_size = INPUT_SIZE
hidden_size = HIDDEN_SIZE
num_classes = NUM_LABELS
path = "/content/drive/MyDrive/DeepLearning/hate_speech/UICHEOL-HWANG_kobert_20250621-155603/model/pytorch_model.bin"

model = MLPClassifier(input_size, hidden_size, num_classes)
model.load_state_dict(torch.load(path))

# 모델을 GPU에 전달
model.to(device)

# 모델을 검증/평가/추론 모드로 전환
model.eval()

with torch.no_grad():

    # 입력을 GPU에 전달
    inputs = logits.to(device)

    # 추론
    outputs = model(inputs)

    # 예측값 집계
    preds = torch.argmax(outputs, dim=1)

for text, logit, output, label in zip(text, logits, outputs, preds):
    print(f"입력 텍스트: {text}")
    print(f"감정 벡터: {[round(num.item(), 3) for num in logit.cpu().numpy()]}")
    print(f"혐오 표현 여부 벡터: {[round(num.item(), 3) for num in output.cpu().numpy()]}")
    if label == 1: print("혐오 표현입니다.")
    else: print("혐오 표현이 아닙니다.")
    print()

입력 텍스트: 좋은 아침이에요!
감정 벡터: [-1.571, -0.331, -1.078, -0.146, 0.735, 4.941, -1.237]
혐오 표현 여부 벡터: [0.284, -0.859]
혐오 표현이 아닙니다.

입력 텍스트: 검둥이들은 다 죽여버려야해
감정 벡터: [-0.488, -0.683, 3.154, -0.448, 0.485, -1.693, -0.102]
혐오 표현 여부 벡터: [-0.918, 0.915]
혐오 표현입니다.

입력 텍스트: 불 좀 꺼줄래?
감정 벡터: [0.83, 0.569, 0.865, -0.164, 1.361, -2.503, -0.959]
혐오 표현 여부 벡터: [-0.125, -0.199]
혐오 표현이 아닙니다.

입력 텍스트: 야 이 반란군노무 쉐키야
감정 벡터: [-1.191, 0.922, 1.423, -1.636, 1.541, -0.552, -0.069]
혐오 표현 여부 벡터: [-0.713, 0.967]
혐오 표현입니다.



In [None]:
print("$FINISH")

$FINISH


# Legacy

In [None]:
# UICHEOL-HWANG/kobert sample code
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
tokenizer = AutoTokenizer.from_pretrained("monologg/kobert")
model = AutoModelForSequenceClassification.from_pretrained("UICHEOL-HWANG/kobert")

model.to(device)  # 모델을 디바이스로 이동
model.eval()

id2label = {
    0: "공포",   # Fear
    1: "놀람",   # Surprise
    2: "분노",   # Anger
    3: "슬픔",   # Sadness
    4: "중립",   # Neutral
    5: "행복",   # Joy
    6: "혐오"    # Disgust
}

texts = [
    "오늘 너무 행복해!",
    "진짜 짜증나고 화난다.",
    "이게 무서워서 못하겠어.",
    "별 감정이 없어요.",
    "충격적인 뉴스였어."
]


def preprocess_and_tokenize(texts, tokenizer, max_length=128):
    inputs = tokenizer(
        texts,
        padding=True,  # 배치 크기에 맞게 패딩
        truncation=True,  # 최대 길이 초과 시 자름
        max_length=max_length,
        return_tensors="pt",  # PyTorch 텐서 반환
    )
    return inputs

inputs = preprocess_and_tokenize(texts, tokenizer)

# 입력 데이터를 GPU로 이동
inputs = {key: val.to(device) for key, val in inputs.items()}

# 4. 인퍼런스 수행
with torch.no_grad():
    outputs = model(**inputs)
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1).cpu().numpy()


# 예측 결과 매핑
predicted_labels = [id2label[pred] for pred in predictions]

# 6. 결과 출력
for text, label in zip(texts, predicted_labels):
    print(f"Input: {text}")
    print(f"Predicted Label: {label}\n")

The repository `monologg/kobert` contains custom code which must be executed to correctly load the model. You can inspect the repository content at https://hf.co/monologg/kobert.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y
Input: 오늘 너무 행복해!
Predicted Label: 기쁨

Input: 진짜 짜증나고 화난다.
Predicted Label: 분노

Input: 이게 무서워서 못하겠어.
Predicted Label: 공포

Input: 별 감정이 없어요.
Predicted Label: 슬픔

Input: 충격적인 뉴스였어.
Predicted Label: 놀람

