In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score, accuracy_score
from sklearn.utils.class_weight import compute_class_weight
import joblib
from tqdm import tqdm
import os

In [None]:
# 하이퍼파라미터
EPOCHS = 10
BATCH_SIZE = 8
MAX_LEN = 128
LR = 2e-5
PATIENCE = 2
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
BEST_MODEL_PATH = "/content/drive/MyDrive/금융공모전/best_model"

# 1. 데이터 로드
df = pd.read_excel("/content/drive/MyDrive/금융공모전/merged_emotion_dataset.xlsx")
df = df.reset_index(drop=True)

# 증강
TARGET_SIZE = 10000
augmented_data = []

for emotion, group in df.groupby("Emotion"):
    count = len(group)

    if count > TARGET_SIZE:
        sampled = group.sample(n=TARGET_SIZE, random_state=42)
        augmented_data.append(sampled)

    elif count == TARGET_SIZE:
        augmented_data.append(group)

    else:
        needed = TARGET_SIZE - count
        # 🔁 부족한 수만큼 반복해서 복사
        repeated = group.sample(n=needed, replace=True, random_state=42)
        combined = pd.concat([group, repeated], ignore_index=True)
        augmented_data.append(combined)

df = pd.concat(augmented_data, ignore_index=True).sample(frac=1.0, random_state=42)

# # 2. 다운샘플링: 감정당 최대 3000개
# df = (
#     df.groupby("Emotion")
#     .apply(lambda x: x.sample(n=8000, random_state=42) if len(x) > 8000 else x)
#     .reset_index(drop=True)
# )
print(df["Emotion"].value_counts())
# 3. 라벨 인코딩

le = LabelEncoder()
df["label_id"] = le.fit_transform(df["Emotion"])

# 4. train/val/test split
train_val, test = train_test_split(df, test_size=0.1, stratify=df["label_id"], random_state=42)
train, val = train_test_split(train_val, test_size=0.1, stratify=train_val["label_id"], random_state=42)

# 5. 토크나이저
model_name = "monologg/koelectra-base-v3-discriminator"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# 6. Dataset 클래스 정의
class EmotionDataset(Dataset):
    def __init__(self, texts, labels):
        self.encodings = tokenizer(list(texts), truncation=True, padding="max_length", max_length=MAX_LEN)
        self.labels = labels.reset_index(drop=True)

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

# 7. DataLoader
train_dataset = EmotionDataset(train["Sentence"], train["label_id"])
val_dataset = EmotionDataset(val["Sentence"], val["label_id"])
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)

# 8. 클래스 가중치 계산
class_weights = compute_class_weight("balanced", classes=np.unique(df["label_id"]), y=df["label_id"])
class_weights = torch.tensor(class_weights, dtype=torch.float).to(DEVICE)

# 9. 모델 정의
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=len(le.classes_)).to(DEVICE)
optimizer = torch.optim.AdamW(model.parameters(), lr=LR)
loss_fn = torch.nn.CrossEntropyLoss(weight=class_weights)

# 10. EarlyStopping 클래스
class EarlyStopping:
    def __init__(self, patience=2):
        self.patience = patience
        self.counter = 0
        self.best_f1 = 0
        self.early_stop = False

    def __call__(self, f1):
        if f1 > self.best_f1:
            self.best_f1 = f1
            self.counter = 0
        else:
            self.counter += 1
            if self.counter >= self.patience:
                self.early_stop = True

early_stopping = EarlyStopping(patience=PATIENCE)
best_f1 = 0

# 🔁 학습 루프
for epoch in range(EPOCHS):
    model.train()
    train_preds, train_labels = [], []

    for batch in tqdm(train_loader, desc=f"Epoch {epoch+1} Training"):
        optimizer.zero_grad()
        input_ids = batch["input_ids"].to(DEVICE)
        attention_mask = batch["attention_mask"].to(DEVICE)
        labels = batch["labels"].to(DEVICE)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = loss_fn(outputs.logits, labels)
        loss.backward()
        optimizer.step()

        preds = torch.argmax(outputs.logits, dim=1)
        train_preds.extend(preds.cpu().numpy())
        train_labels.extend(labels.cpu().numpy())

    train_acc = accuracy_score(train_labels, train_preds)

    # 🔍 검증
    model.eval()
    val_preds, val_labels = [], []
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch["input_ids"].to(DEVICE)
            attention_mask = batch["attention_mask"].to(DEVICE)
            labels = batch["labels"].to(DEVICE)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            preds = torch.argmax(outputs.logits, dim=1)
            val_preds.extend(preds.cpu().numpy())
            val_labels.extend(labels.cpu().numpy())

    val_acc = accuracy_score(val_labels, val_preds)
    val_f1 = f1_score(val_labels, val_preds, average="weighted")

    print(f"Epoch {epoch+1} | Train Acc: {train_acc:.4f} | Val Acc: {val_acc:.4f} | Val F1: {val_f1:.4f}")

    if val_f1 > best_f1:
        best_f1 = val_f1
        model.save_pretrained(BEST_MODEL_PATH)
        tokenizer.save_pretrained(BEST_MODEL_PATH)
        joblib.dump(le, os.path.join(BEST_MODEL_PATH, "label0806_encoder.pkl"))
        print("✅ Best model saved.")

    early_stopping(val_f1)
    if early_stopping.early_stop:
        print("🛑 Early stopping triggered!")
        break


Emotion
상처    10000
불안    10000
당황    10000
분노    10000
기쁨    10000
슬픔    10000
중립    10000
Name: count, dtype: int64


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/61.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/467 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/452M [00:00<?, ?B/s]

Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at monologg/koelectra-base-v3-discriminator and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  return forward_call(*args, **kwargs)


model.safetensors:   0%|          | 0.00/452M [00:00<?, ?B/s]

Epoch 1 Training: 100%|██████████| 7088/7088 [23:56<00:00,  4.93it/s]
  return forward_call(*args, **kwargs)


Epoch 1 | Train Acc: 0.6389 | Val Acc: 0.7029 | Val F1: 0.7023
✅ Best model saved.


  return forward_call(*args, **kwargs)
Epoch 2 Training: 100%|██████████| 7088/7088 [23:57<00:00,  4.93it/s]
  return forward_call(*args, **kwargs)


Epoch 2 | Train Acc: 0.7309 | Val Acc: 0.7181 | Val F1: 0.7165
✅ Best model saved.


  return forward_call(*args, **kwargs)
Epoch 3 Training: 100%|██████████| 7088/7088 [23:56<00:00,  4.93it/s]
  return forward_call(*args, **kwargs)


Epoch 3 | Train Acc: 0.3291 | Val Acc: 0.1463 | Val F1: 0.0429


  return forward_call(*args, **kwargs)
Epoch 4 Training: 100%|██████████| 7088/7088 [23:55<00:00,  4.94it/s]
  return forward_call(*args, **kwargs)


Epoch 4 | Train Acc: 0.3074 | Val Acc: 0.2216 | Val F1: 0.1169
🛑 Early stopping triggered!


In [None]:
import os
import torch
import joblib
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoConfig
from sklearn.metrics import classification_report, accuracy_score, f1_score

# ✅ 1. 디바이스 설정
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# ✅ 2. 모델 경로 (영문 경로 사용 필수)
BEST_MODEL_PATH = "/content/drive/MyDrive/finance_project/best_model"

# ✅ 3. EmotionDataset 클래스 정의
class EmotionDataset(Dataset):
    def __init__(self, sentences, labels, tokenizer, max_len=128):
        self.sentences = sentences
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        text = str(self.sentences[idx])
        label = self.labels[idx]
        encoding = self.tokenizer(
            text,
            padding="max_length",
            truncation=True,
            max_length=self.max_len,
            return_tensors="pt",
        )
        return {
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "labels": torch.tensor(label, dtype=torch.long),
        }

# ✅ 4. 모델 구성 불러오기 (safetensors 대응 포함)
config = AutoConfig.from_pretrained(BEST_MODEL_PATH, local_files_only=True)
tokenizer = AutoTokenizer.from_pretrained(BEST_MODEL_PATH, local_files_only=True)
model = AutoModelForSequenceClassification.from_pretrained(
    BEST_MODEL_PATH,
    config=config,
    local_files_only=True,
    trust_remote_code=True  # safetensors 지원을 위해 필요할 수 있음
).to(DEVICE)

# ✅ 5. 라벨 인코더 불러오기
le = joblib.load(os.path.join(BEST_MODEL_PATH, "labeg2_encoder.pkl"))

# ✅ 6. 테스트 데이터 불러오기 (예시용. 이미 test가 있다면 이 부분은 생략하세요)
# test = pd.read_csv("...") 또는 미리 정의된 test 사용
# 여기선 가정: test["Sentence"], test["label_id"] 존재

# ✅ 7. 테스트셋 구성
test_dataset = EmotionDataset(test["Sentence"], test["label_id"], tokenizer)
test_loader = DataLoader(test_dataset, batch_size=32)

# ✅ 8. 추론 및 평가
model.eval()
all_preds, all_labels = [], []

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch["input_ids"].to(DEVICE)
        attention_mask = batch["attention_mask"].to(DEVICE)
        labels = batch["labels"].to(DEVICE)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        preds = torch.argmax(outputs.logits, dim=1)

        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

# ✅ 9. 결과 출력
print("✅ Test Accuracy:", accuracy_score(all_labels, all_preds))
print("✅ Test F1 Score:", f1_score(all_labels, all_preds, average="weighted"))
print("\n✅ Classification Report:\n", classification_report(all_labels, all_preds, target_names=le.classes_))


OSError: Can't load the configuration of '/content/drive/MyDrive/finance_project/best_model'. If you were trying to load it from 'https://huggingface.co/models', make sure you don't have a local directory with the same name. Otherwise, make sure '/content/drive/MyDrive/finance_project/best_model' is the correct path to a directory containing a config.json file

In [None]:
import os
BEST_MODEL_PATH = "/content/drive/MyDrive/금융공모전/best_model"
print(os.listdir(BEST_MODEL_PATH))


['label_encoder.pkl', 'labegl_encoder.pkl', 'config.json', 'model.safetensors', 'special_tokens_map.json', 'vocab.txt', 'tokenizer_config.json', 'tokenizer.json', 'labeg2_encoder.pkl']


In [None]:
from sklearn.metrics import classification_report, accuracy_score, f1_score
import torch
import joblib
import torch.nn.functional as F

In [None]:
import torch.nn.functional as F

# 모델 경로 (학습 시 저장한 경로)
MODEL_PATH = "/content/drive/MyDrive/금융공모전/best_model"

# 모델, 토크나이저, 라벨 로드
model = AutoModelForSequenceClassification.from_pretrained(MODEL_PATH).to(DEVICE)
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
label_encoder = joblib.load(MODEL_PATH + "/labeg2_encoder.pkl")

# 추론 함수
def predict_emotion(text):
    model.eval()
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding="max_length", max_length=128).to(DEVICE)
    with torch.no_grad():
        outputs = model(**inputs)
        probs = F.softmax(outputs.logits, dim=1).squeeze().cpu().numpy()

    # 결과 매핑 및 출력
    label_names = label_encoder.classes_
    result = {label: round(prob * 100, 2) for label, prob in zip(label_names, probs)}
    return result

# 🔍 테스트 문장
sentence = "안녕?"
result = predict_emotion(sentence)

# 📊 결과 출력
import pprint
pprint.pprint(result)


  return forward_call(*args, **kwargs)


{'기쁨': np.float32(2.77),
 '당황': np.float32(0.01),
 '분노': np.float32(1.36),
 '불안': np.float32(0.01),
 '상처': np.float32(0.01),
 '슬픔': np.float32(1.21),
 '중립': np.float32(94.64)}
