In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from transformers import ElectraTokenizer
import torch
import torch.nn as nn
from transformers import ElectraModel
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import copy
import random
from sklearn.model_selection import train_test_split
from torch.nn.utils.rnn import pad_sequence
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix
import torch.nn.functional as F

In [None]:
df = pd.read_csv("/content/drive/MyDrive/SBERT_result1.csv")

### 데이터 전처리 및 학습 데이터 준비

In [None]:
df.head()

In [None]:
df.drop(columns=['Unnamed: 0'], inplace=True)

In [6]:
len(df)

55313

In [7]:
df.dropna(inplace=True)

In [8]:
len(df)

55313

In [9]:
import re

pattern = r"\b\d+\."

df = df[~df["text"].str.contains(pattern, regex=True)]
df = df[~df["text"].str.fullmatch(r"\s*\d+\s*")]

df = df.reset_index(drop=True)

In [10]:
len(df)

54374

In [12]:
#라벨 인코딩 진행
le = LabelEncoder()
df['label'] = le.fit_transform(df['depression_level'])

In [13]:
df['label'].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
0,52832
1,831
2,567
3,144


In [14]:
# #발화간 시간 차이에 대한 피쳐를 추가함
# df['uttered_at'] = pd.to_datetime(df['uttered_at'], errors='coerce')
# df = df.sort_values(by=['doll_id', 'uttered_at'])
# df['time_gap'] = df.groupby('doll_id')['uttered_at'].diff().dt.total_seconds().fillna(0) / 60.0

In [15]:
#세션 분할 -> 10개의 row를 하나의 세션으로 묶음
session_id = []
for pid, group in df.groupby("id"):
    sid = 1
    count = 0
    for _ in range(len(group)):
        session_id.append(f"{pid}_{sid}")
        count += 1
        if count == 10:   # 10개 발화마다 새로운 session 시작
            sid += 1
            count = 0

df["session_id"] = session_id

In [16]:
#세션단위 인코딩
tokenizer = ElectraTokenizer.from_pretrained("monologg/koelectra-small-discriminator")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/60.0 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/466 [00:00<?, ?B/s]

In [17]:
def encode_session(tokenizer, texts, max_len=128):
    input_ids_list, attention_masks_list = [], []
    for text in texts:
        enc = tokenizer(
            text,
            padding="max_length",
            truncation=True,
            max_length=max_len,
            return_tensors="pt",
        )
        input_ids_list.append(enc["input_ids"])
        attention_masks_list.append(enc["attention_mask"])

    input_ids = torch.cat(input_ids_list, dim=0)            # (U, L)
    attention_mask = torch.cat(attention_masks_list, dim=0) # (U, L)

    return {"input_ids": input_ids, "attention_mask": attention_mask}

In [18]:
class ConversationDataset(Dataset):
    def __init__(self, df, tokenizer, max_len=128):
        self.df = df.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.sessions = list(self.df.groupby("session_id"))  # (session_id, session_df) 리스트

    def __len__(self):
        return len(self.sessions)

    def __getitem__(self, idx):
        session_id, session_df = self.sessions[idx]
        texts = session_df['text'].tolist()
        labels = session_df['label'].tolist()
        row_indices = session_df.index.tolist()

        examples = []
        for i in range(len(texts)):
            # time_gap 없이 encode_session 호출
            enc = encode_session(self.tokenizer, texts[:i+1], max_len=self.max_len)

            label = labels[i]
            row_idx = row_indices[i]
            current_text = texts[i]  # 현재 시점 발화

            examples.append((enc, label, row_idx, current_text))

        return examples, session_id

In [19]:
def collate_fn(batch):
    input_ids, attention_mask = [], []
    labels, texts, indices = [], [], []

    for examples, _ in batch:
        for enc, label, idx, current_text in examples:
            input_ids.append(enc["input_ids"])           # (U, L) tensor
            attention_mask.append(enc["attention_mask"]) # (U, L) tensor
            labels.append(label)
            texts.append(current_text)                   # 현재 발화
            indices.append(idx)

    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": torch.tensor(labels, dtype=torch.long),
        "texts": texts,
        "indices": torch.tensor(indices, dtype=torch.long),
    }

In [20]:
# session_id 단위로 train/test 나눠야함
session_ids = df['session_id'].unique()

# 먼저 train+val / test 분리
trainval_sess, test_sess = train_test_split(
    session_ids, test_size=0.2, random_state=42
)

# 다시 train / val 분리
train_sess, val_sess = train_test_split(
    trainval_sess, test_size=0.25, random_state=42  # 전체의 20% val, 60% train
)

train_df = df[df['session_id'].isin(train_sess)].reset_index(drop=True)
val_df   = df[df['session_id'].isin(val_sess)].reset_index(drop=True)
test_df  = df[df['session_id'].isin(test_sess)].reset_index(drop=True)

In [21]:
train_df['label'].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
0,31649
1,516
2,338
3,93


In [22]:
# Dataset 생성
train_dataset = ConversationDataset(train_df, tokenizer, max_len=128)
val_dataset   = ConversationDataset(val_df, tokenizer, max_len=128)
test_dataset  = ConversationDataset(test_df, tokenizer, max_len=128)

# DataLoader 생성
# batch size 크기도 실험 대상
train_loader = DataLoader(
    train_dataset,
    batch_size=4,
    shuffle=True,
    collate_fn=collate_fn
)

val_loader = DataLoader(
    val_dataset,
    batch_size=4,
    shuffle=False,
    collate_fn=collate_fn
)

test_loader = DataLoader(
    test_dataset,
    batch_size=4,
    shuffle=False,
    collate_fn=collate_fn
)

### 모델 정의

In [23]:
class RiskClassifier(nn.Module):
    def __init__(self, base_model="monologg/koelectra-small-discriminator",
                 hidden_size=256, num_classes=4, finetune_layers=2):
        super().__init__()
        # Local Encoder
        self.local_encoder = ElectraModel.from_pretrained(base_model)
        local_dim = self.local_encoder.config.hidden_size

        for param in self.local_encoder.parameters():
          param.requires_grad = False

        # tuning할 layer
        if finetune_layers > 0:
            for layer in self.local_encoder.encoder.layer[-finetune_layers:]:
                for param in layer.parameters():
                    param.requires_grad = True

        # Global Encoder -> BiLSTM
        # 일단 layer 하나로 둠
        self.global_encoder = nn.LSTM(
            input_size=local_dim,
            hidden_size=hidden_size,
            num_layers=1,
            batch_first=True,
            bidirectional=True
        )
        global_dim = hidden_size * 2

        # Attention Layer (발화 중요도 계산)
        self.attn = nn.Linear(global_dim, 1)

        # 최종 분류기: [세션 어텐션 요약 + 마지막 발화]
        self.fc = nn.Linear(global_dim * 2, num_classes)

    def forward(self, input_ids, attention_mask):
        """
        input_ids: (B, U, L)
        attention_mask: (B, U, L)
        """
        B, U, L = input_ids.shape

        # ---- Local Encoder ----
        input_ids = input_ids.view(-1, L)             # (B*U, L)
        attention_mask = attention_mask.view(-1, L)   # (B*U, L)
        outputs = self.local_encoder(input_ids, attention_mask=attention_mask)
        utter_embeds = outputs.last_hidden_state[:, 0, :]  # CLS (B*U, local_dim)

        # reshape back to (B, U, local_dim)
        utter_embeds = utter_embeds.view(B, U, -1)

        # ---- Global Encoder ----
        global_out, _ = self.global_encoder(utter_embeds)  # (B, U, 2*hidden_size)

        # ---- Attention Pooling ----
        attn_scores = self.attn(global_out).squeeze(-1)    # (B, U)
        attn_weights = torch.softmax(attn_scores, dim=1)   # (B, U)
        session_repr = torch.bmm(attn_weights.unsqueeze(1), global_out).squeeze(1)
        # (B, 2*hidden_size)

        # ---- 마지막 발화 ----
        last_repr = global_out[:, -1, :]  # (B, 2*hidden_size)

        # ---- 결합 ----
        final_repr = torch.cat([last_repr, session_repr], dim=-1)  # (B, 4*hidden_size)

        # ---- 분류 ----
        logits = self.fc(final_repr)
        return logits

### 학습 코드

In [24]:
from torch.cuda.amp import autocast, GradScaler

In [25]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [26]:
#데이터 불균형 문제 때문에 잘 맞춘 샘플은 loss 기여도를 줄이고, 어려운 샘플에 집중하도록 하기 위해 Focal Loss 사용(pytorch의 경우에는 loss함수 구현해야함)
class FocalLoss(nn.Module):
    def __init__(self, alpha=None, gamma=2.0, reduction="mean"):
        """
        alpha: 클래스 가중치 Tensor (shape [num_classes])
        gamma: focusing parameter
        """
        super().__init__()
        self.alpha = alpha
        self.gamma = gamma
        self.reduction = reduction

    def forward(self, inputs, targets):
        # inputs: (B, C) - raw logits
        # targets: (B,)
        log_probs = F.log_softmax(inputs, dim=-1)   # (B,C)
        probs = torch.exp(log_probs)                # (B,C)

        # one-hot으로 정답 위치만 추출
        targets_one_hot = F.one_hot(targets, num_classes=inputs.size(1)).float()
        pt = (probs * targets_one_hot).sum(dim=-1)      # (B,)
        log_pt = (log_probs * targets_one_hot).sum(dim=-1)  # (B,)

        # focal loss
        if self.alpha is not None:
            at = self.alpha.to(inputs.device)[targets]  # (B,)
        else:
            at = 1.0

        focal_loss = -at * (1 - pt) ** self.gamma * log_pt

        if self.reduction == "mean":
            return focal_loss.mean()
        elif self.reduction == "sum":
            return focal_loss.sum()
        else:
            return focal_loss

In [38]:
class_counts = train_df["label"].value_counts().sort_index().values
weights = 1.0 / (torch.tensor(class_counts, dtype=torch.float) ** 0.7)
weights = (len(weights) * weights) / weights.sum()

# 모델, 옵티마이저, 로스 정의
# 파인 튜닝 할 layer 수(finetune_layers)도 하이퍼파라미터 값이므로 학습시킬 때 이 값 조정하면서 실험해봐야함!
model = RiskClassifier(num_classes=len(le.classes_), finetune_layers=4).to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, weight_decay=0.01)
# criterion = nn.CrossEntropyLoss(weight=weights.to(device))
criterion = FocalLoss(alpha=weights.to(device), gamma=2.5)
scaler = GradScaler()

  scaler = GradScaler()


In [39]:
num_epochs = 25

In [40]:
for epoch in range(num_epochs):
    # ===== Train =====
    model.train()
    total_loss = 0.0

    for batch in train_loader:
        labels = batch["labels"].to(device)  # (B,)

        # Pad
        input_ids = pad_sequence(batch["input_ids"], batch_first=True).to(device)
        attention_mask = pad_sequence(batch["attention_mask"], batch_first=True).to(device)

        # Forward
        with torch.amp.autocast("cuda"):
            logits = model(input_ids, attention_mask)  # (B, C)
            loss = criterion(logits, labels)

        # Backward
        optimizer.zero_grad(set_to_none=True)
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        total_loss += loss.item()

    avg_train_loss = total_loss / len(train_loader)

    # ===== Validation =====
    model.eval()
    val_loss, preds_all, labels_all = 0.0, [], []

    with torch.no_grad():
        for batch in val_loader:
            labels = batch["labels"].to(device)
            input_ids = pad_sequence(batch["input_ids"], batch_first=True).to(device)
            attention_mask = pad_sequence(batch["attention_mask"], batch_first=True).to(device)

            with torch.amp.autocast("cuda"):
                logits = model(input_ids, attention_mask)
                loss = criterion(logits, labels)

            val_loss += loss.item()

            preds = torch.argmax(logits, dim=-1)
            preds_all.extend(preds.cpu().tolist())
            labels_all.extend(labels.cpu().tolist())

    avg_val_loss = val_loss / len(val_loader)
    val_acc = accuracy_score(labels_all, preds_all)
    val_f1 = f1_score(labels_all, preds_all, average="macro")  # 불균형 데이터이므로 macro-F1 사용

    print(
        f"Epoch {epoch+1} | "
        f"Train Loss={avg_train_loss:.4f} | "
        f"Val Loss={avg_val_loss:.4f} | "
        f"Val Acc={val_acc:.4f} | "
        f"Val Macro-F1={val_f1:.4f}"
    )

Epoch 1 | Train Loss=0.0350 | Val Loss=0.0295 | Val Acc=0.9756 | Val Macro-F1=0.2469
Epoch 2 | Train Loss=0.0341 | Val Loss=0.0293 | Val Acc=0.9701 | Val Macro-F1=0.2558
Epoch 3 | Train Loss=0.0337 | Val Loss=0.0290 | Val Acc=0.9647 | Val Macro-F1=0.2560
Epoch 4 | Train Loss=0.0324 | Val Loss=0.0274 | Val Acc=0.9584 | Val Macro-F1=0.2624
Epoch 5 | Train Loss=0.0304 | Val Loss=0.0263 | Val Acc=0.9233 | Val Macro-F1=0.2882
Epoch 6 | Train Loss=0.0286 | Val Loss=0.0259 | Val Acc=0.9023 | Val Macro-F1=0.2998
Epoch 7 | Train Loss=0.0263 | Val Loss=0.0282 | Val Acc=0.9060 | Val Macro-F1=0.2848
Epoch 8 | Train Loss=0.0250 | Val Loss=0.0275 | Val Acc=0.8332 | Val Macro-F1=0.2701
Epoch 9 | Train Loss=0.0225 | Val Loss=0.0299 | Val Acc=0.9225 | Val Macro-F1=0.2959
Epoch 10 | Train Loss=0.0208 | Val Loss=0.0300 | Val Acc=0.9121 | Val Macro-F1=0.2969


### teat data를 이용한 성능 평가

In [41]:
model.eval()
test_loss, preds_all, labels_all = 0.0, [], []

with torch.no_grad():
    for batch in test_loader:
        labels = batch["labels"].to(device)
        input_ids = pad_sequence(batch["input_ids"], batch_first=True).to(device)
        attention_mask = pad_sequence(batch["attention_mask"], batch_first=True).to(device)

        # Mixed precision inference
        with torch.amp.autocast('cuda'):
            logits = model(input_ids, attention_mask)
            loss = criterion(logits, labels)

        test_loss += loss.item()

        preds = torch.argmax(logits, dim=-1)
        preds_all.extend(preds.cpu().tolist())
        labels_all.extend(labels.cpu().tolist())

avg_test_loss = test_loss / len(test_loader)

In [None]:
test_acc = accuracy_score(labels_all, preds_all)
test_macro_f1 = f1_score(labels_all, preds_all, average="macro")

print("=== Test Performance ===")
print(f"Test Loss: {avg_test_loss:.4f}")
print(f"Test Acc: {test_acc:.4f}")
print(f"Test Macro-F1: {test_macro_f1:.4f}")

print("\nClassification Report:")
print(classification_report(labels_all, preds_all, digits=4))

print("Confusion Matrix:")
print(confusion_matrix(labels_all, preds_all))

=== Test Performance ===
Test Loss: 0.0300
Test Acc: 0.6381
Test Macro-F1: 0.2399
Test Weighted-F1: 0.7597

Classification Report:
              precision    recall  f1-score   support

           0     0.9899    0.6460    0.7818     10728
           1     0.0455    0.3298    0.0800       188
           2     0.0406    0.5179    0.0754       112
           3     0.0117    0.3409    0.0226        44

    accuracy                         0.6381     11072
   macro avg     0.2719    0.4586    0.2399     11072
weighted avg     0.9603    0.6381    0.7597     11072

Confusion Matrix:
[[6930 1278 1302 1218]
 [  47   62   52   27]
 [  14   18   58   22]
 [  10    4   15   15]]
