In [None]:
# Colab utilities: `drive` is used right below to mount Google Drive; `files` is imported for later use.
from google.colab import drive, files
# Mount Google Drive under /content/drive.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# **klue/bert-base 모델**

In [None]:
import ast
import os

import pandas as pd
import torch
from google.colab import files
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
from transformers import BertTokenizer, BertForSequenceClassification, AdamW

# Select the compute device: CUDA when it is available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Report whether a CUDA device was detected.
print("GPU available:", torch.cuda.is_available())

GPU available: True


In [None]:
# 데이터셋 클래스 정의
class LegalDataset(Dataset):
    """Dataset over a DataFrame of pre-tokenized samples.

    Each row must provide a stringified list of token ids in ``token_column``
    and a class id in ``label``.

    Args:
        data: DataFrame holding the samples.
        token_column: Name of the column with the stringified token-id list.
    """

    def __init__(self, data, token_column='tokens_klue'):
        self.data = data
        self.token_column = token_column

    def __len__(self):
        """Return the number of rows in the underlying DataFrame."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(tokens, label)`` for positional row ``idx`` as long tensors."""
        row = self.data.iloc[idx]
        # literal_eval only parses Python literals, so a malformed or malicious
        # cell cannot execute code the way eval() would.
        tokens = torch.tensor(ast.literal_eval(row[self.token_column]), dtype=torch.long)
        label = torch.tensor(row['label'], dtype=torch.long)
        return tokens, label

def collate_fn(batch):
    """Pad a batch of ``(tokens, label)`` pairs to the longest sequence in the batch.

    Reads the padding id from the module-level ``tokenizer``.

    Args:
        batch: Iterable of ``(tokens, label)`` pairs, where ``tokens`` is a 1-D
            sequence of token ids and ``label`` is a scalar class id.

    Returns:
        Tuple ``(tokens_padded, attention_mask, labels)``. ``attention_mask`` is 0
        wherever ``tokens_padded`` equals the pad id and 1 elsewhere.
    """
    tokens, labels = zip(*batch)
    pad_id = tokenizer.pad_token_id

    # as_tensor reuses tensors that are already long tensors; wrapping them in
    # torch.tensor() again emits a copy-construct UserWarning on every batch.
    sequences = [torch.as_tensor(token, dtype=torch.long) for token in tokens]
    tokens_padded = pad_sequence(sequences, batch_first=True, padding_value=pad_id)

    attention_mask = (tokens_padded != pad_id).long()

    labels = torch.as_tensor([int(label) for label in labels], dtype=torch.long)
    return tokens_padded, attention_mask, labels

In [None]:
# Class ids for the '판결유형' (ruling type) column; the classifier head is sized from this map.
label_map = {'민사_승소': 0, '민사_패소': 1, '민사_기각': 2, '징역': 3, '무혐의': 4, '벌금': 5, '형사_기각': 6, '가사_승소': 7, '가사_패소': 8, '가사_기각': 9, '세무_승소': 10, '세무_패소': 11, '세무_기각': 12}

# Load the tokenizer and the sequence-classification model, then move the model to the device.
model_name = 'klue/bert-base'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=len(label_map))
model.to(device)

# Load the pre-tokenized dataset.
df = pd.read_csv('/content/drive/MyDrive/df_klue.csv')

# Keep only rows whose token list is non-empty. The explicit copy makes the result an
# independent frame, so the column assignment below does not raise SettingWithCopyWarning.
# literal_eval parses the stored list without executing arbitrary code.
has_tokens = df['tokens_klue'].apply(lambda x: len(ast.literal_eval(x)) > 0)
df = df[has_tokens].copy()

# Map the ruling type to its class id.
df['label'] = df['판결유형'].map(label_map)

# Values missing from label_map become NaN; fail here with a clear message
# instead of producing invalid label tensors during training.
unmapped = df.loc[df['label'].isna(), '판결유형'].unique()
if len(unmapped) > 0:
    raise ValueError(f"Unmapped values in '판결유형': {list(unmapped)}")
df['label'] = df['label'].astype('int64')

print("데이터 불러오기 및 라벨 변환 완료")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at klue/bert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


데이터 불러오기 및 라벨 변환 완료


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['label'] = df['판결유형'].map(label_map)


In [None]:
# Stratified 70/30 train/validation split on 'label'.
train_data, valid_data = train_test_split(df, test_size=0.3, random_state=42, stratify=df['label'])

# Balanced class weights from the training labels. `classes` is sorted so that
# class_weights[i] belongs to classes[i]; Series.unique() alone returns labels in order
# of appearance, which would misalign the weights with the label ids.
classes = train_data['label'].sort_values().unique()
class_weights = compute_class_weight('balanced', classes=classes, y=train_data['label'])

# Scatter the weights into a vector indexed by label id, the layout expected by
# CrossEntropyLoss(weight=...). Label ids absent from the training split keep weight 1.
weights = torch.ones(model.config.num_labels, dtype=torch.float)
weights[torch.as_tensor(classes, dtype=torch.long)] = torch.tensor(class_weights, dtype=torch.float)
weights = weights.to(device)

print("데이터 분리 및 가중치 설정 완료")

데이터 분리 및 가중치 설정 완료


In [None]:
# Wrap both splits in datasets.
train_dataset, valid_dataset = LegalDataset(train_data), LegalDataset(valid_data)

# Batch both splits with the padding collate function; only training batches are shuffled.
train_loader = DataLoader(train_dataset, collate_fn=collate_fn, shuffle=True, batch_size=8)
valid_loader = DataLoader(valid_dataset, collate_fn=collate_fn, shuffle=False, batch_size=8)

# Optimizer over all model parameters.
optimizer = AdamW(model.parameters(), lr=1e-5)

# Cross-entropy loss weighted per class.
# NOTE(review): train()/validate() in this notebook take the loss from the model output
# (outputs.loss), so this criterion only has an effect if it is applied explicitly.
criterion = torch.nn.CrossEntropyLoss(weight=weights)

print("데이터로더 및 Optimizer 설정 완료")

데이터로더 및 Optimizer 설정 완료




In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score

# 성능 지표 계산 함수 추가
def calculate_metrics(predictions, labels):
    """Compute weighted precision, recall and F1 from class scores and true labels.

    Args:
        predictions: Tensor of per-class scores; the arg-max over dim 1 is taken
            as the predicted class.
        labels: Tensor of true class ids.

    Returns:
        Tuple ``(precision, recall, f1)``, each computed with
        ``average='weighted'`` and ``zero_division=1``.
    """
    y_pred = predictions.argmax(dim=1).cpu().numpy()
    y_true = labels.cpu().numpy()

    # Same scorer arguments for all three metrics.
    precision, recall, f1 = (
        scorer(y_true, y_pred, average='weighted', zero_division=1)
        for scorer in (precision_score, recall_score, f1_score)
    )
    return precision, recall, f1

# 학습 함수에서 precision, recall, f1 추가
def train(model, loader, optimizer, epoch, criterion=None):
    """Run one training epoch.

    Args:
        model: Model called as ``model(input_ids=..., attention_mask=..., labels=...)``
            whose output exposes ``.loss`` and ``.logits``.
        loader: DataLoader yielding ``(tokens, attention_mask, labels)`` batches.
        optimizer: Optimizer that updates the model parameters.
        epoch: Epoch index, used only in the progress-bar label.
        criterion: Optional loss called as ``criterion(logits, labels)``. When
            omitted, the loss returned by the model (``outputs.loss``) is used.

    Returns:
        Tuple ``(avg_loss, accuracy, precision, recall, f1)``.
    """
    model.train()
    total_loss, total_correct, seen = 0.0, 0, 0
    all_logits, all_labels = [], []

    total_batches = len(loader)
    # Refresh the bar roughly every 25%. max(1, ...) avoids a modulo-by-zero for
    # loaders with fewer than four batches.
    update_every = max(1, total_batches // 4)
    pending = 0
    progress_bar = tqdm(total=total_batches, desc=f"Training Epoch {epoch}", unit='batch', dynamic_ncols=True, mininterval=1)

    for batch_idx, (tokens, attention_mask, labels) in enumerate(loader):
        tokens, attention_mask, labels = tokens.to(device), attention_mask.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(input_ids=tokens, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss if criterion is None else criterion(outputs.logits, labels)
        loss.backward()
        optimizer.step()

        logits = outputs.logits.detach()
        total_loss += loss.item()
        total_correct += (logits.argmax(dim=1) == labels).sum().item()
        seen += labels.size(0)

        # Store detached CPU copies so the epoch-long history does not hold on to
        # autograd state or GPU memory.
        all_logits.append(logits.cpu())
        all_labels.append(labels.cpu())

        pending += 1
        # Also flush on the last batch so the bar reaches 100%.
        if pending == update_every or batch_idx + 1 == total_batches:
            progress_bar.set_postfix(loss=total_loss / (batch_idx + 1), accuracy=total_correct / seen, refresh=False)
            progress_bar.update(pending)
            pending = 0

    progress_bar.close()

    # Precision, recall and F1 over the whole epoch.
    precision, recall, f1 = calculate_metrics(torch.cat(all_logits), torch.cat(all_labels))

    avg_loss = total_loss / total_batches
    accuracy = total_correct / len(loader.dataset)
    return avg_loss, accuracy, precision, recall, f1

# 검증 함수에서 precision, recall, f1 추가
def validate(model, loader, epoch, criterion=None):
    """Evaluate the model on ``loader`` without updating parameters.

    Args:
        model: Model called as ``model(input_ids=..., attention_mask=..., labels=...)``
            whose output exposes ``.loss`` and ``.logits``.
        loader: DataLoader yielding ``(tokens, attention_mask, labels)`` batches.
        epoch: Epoch index, used only in the progress-bar label.
        criterion: Optional loss called as ``criterion(logits, labels)``. When
            omitted, the loss returned by the model (``outputs.loss``) is used.

    Returns:
        Tuple ``(avg_loss, accuracy, precision, recall, f1)``.
    """
    model.eval()
    total_loss, total_correct, seen = 0.0, 0, 0
    all_logits, all_labels = [], []

    total_batches = len(loader)
    # Refresh the bar roughly every 50%. max(1, ...) avoids a modulo-by-zero for
    # a single-batch loader.
    update_every = max(1, total_batches // 2)
    pending = 0
    progress_bar = tqdm(total=total_batches, desc=f"Validation Epoch {epoch}", unit='batch', dynamic_ncols=True)

    with torch.no_grad():
        for batch_idx, (tokens, attention_mask, labels) in enumerate(loader):
            tokens, attention_mask, labels = tokens.to(device), attention_mask.to(device), labels.to(device)
            outputs = model(input_ids=tokens, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss if criterion is None else criterion(outputs.logits, labels)

            total_loss += loss.item()
            total_correct += (outputs.logits.argmax(dim=1) == labels).sum().item()
            seen += labels.size(0)

            # Keep the history on the CPU rather than accumulating it on the GPU.
            all_logits.append(outputs.logits.cpu())
            all_labels.append(labels.cpu())

            pending += 1
            # Also flush on the last batch so the bar reaches 100%.
            if pending == update_every or batch_idx + 1 == total_batches:
                progress_bar.set_postfix(loss=total_loss / (batch_idx + 1), accuracy=total_correct / seen, refresh=False)
                progress_bar.update(pending)
                pending = 0

    progress_bar.close()

    # Precision, recall and F1 over the whole validation set.
    precision, recall, f1 = calculate_metrics(torch.cat(all_logits), torch.cat(all_labels))

    avg_loss = total_loss / total_batches
    accuracy = total_correct / len(loader.dataset)
    return avg_loss, accuracy, precision, recall, f1

In [None]:
# Best validation loss seen so far.
best_val_loss = float('inf')

# Per-epoch history of training/validation results.
train_accuracies, val_accuracies = [], []
train_losses, val_losses = [], []
precisions, recalls, f1_scores = [], [], []

# Directory for the best checkpoint; exist_ok makes creation idempotent and avoids
# the check-then-create pattern.
os.makedirs(model_name, exist_ok=True)

for epoch in range(20):
    # NOTE(review): the class-weighted `criterion` built in an earlier cell is not
    # passed here, so the loss is the model's own outputs.loss.
    train_loss, train_acc, train_precision, train_recall, train_f1 = train(model, train_loader, optimizer, epoch)
    val_loss, val_acc, val_precision, val_recall, val_f1 = validate(model, valid_loader, epoch)

    # Record this epoch's results (precision/recall/F1 are the validation values).
    train_losses.append(train_loss)
    val_losses.append(val_loss)
    train_accuracies.append(train_acc)
    val_accuracies.append(val_acc)
    precisions.append(val_precision)
    recalls.append(val_recall)
    f1_scores.append(val_f1)

    # Save a checkpoint whenever the validation loss improves. There is no early
    # stopping: all 20 epochs always run.
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        torch.save(model.state_dict(), f'{model_name}/best_model.pth')

print("학습 및 검증 완료")


  tokens_padded = pad_sequence([torch.tensor(token) for token in tokens], batch_first=True, padding_value=tokenizer.pad_token_id)

Training Epoch 0:  25%|██▍       | 922/3689 [02:36<07:50,  5.89batch/s][A
Training Epoch 0:  25%|██▍       | 922/3689 [02:36<07:50,  5.89batch/s, accuracy=0.866, loss=0.498][A
Training Epoch 0:  25%|██▍       | 922/3689 [02:49<07:50,  5.89batch/s, accuracy=0.866, loss=0.498][A
Training Epoch 0:  50%|████▉     | 1844/3689 [05:12<05:13,  5.89batch/s, accuracy=0.866, loss=0.498][A
Training Epoch 0:  50%|████▉     | 1844/3689 [05:12<05:13,  5.89batch/s, accuracy=0.905, loss=0.353][A
Training Epoch 0:  50%|████▉     | 1844/3689 [05:25<05:13,  5.89batch/s, accuracy=0.905, loss=0.353][A
Training Epoch 0:  75%|███████▍  | 2766/3689 [07:49<02:36,  5.90batch/s, accuracy=0.905, loss=0.353][A
Training Epoch 0:  75%|███████▍  | 2766/3689 [07:49<02:36,  5.90batch/s, accuracy=0.918, loss=0.295][A
Training Epoch 0:  75%|███████▍  | 2766/3689 [07:59<02:36,  5.90bat

학습 및 검증 완료





In [None]:
# Gather the per-epoch histories into one record keyed by metric name.
metrics = dict(
    model_name=model_name,
    train_accuracy=train_accuracies,
    val_accuracy=val_accuracies,
    train_loss=train_losses,
    val_loss=val_losses,
    precision=precisions,
    recall=recalls,
    f1_score=f1_scores,
)

# One-row frame (each history list sits in a single cell), written to CSV
# and then offered as a browser download.
metrics_df = pd.DataFrame(data=[metrics])
metrics_df.to_csv('klue_metrics.csv', index=False)
print("klue_metrics saved.")
files.download('klue_metrics.csv')

klue_metrics saved.


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# **koelectra 모델 학습**

In [None]:
import torch
from transformers import BertTokenizer, ElectraForSequenceClassification, AdamW
import pandas as pd
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
from google.colab import files

# Select the compute device: CUDA when it is available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [None]:
# 데이터셋 클래스 정의
class LegalDataset(Dataset):
    """Dataset over a DataFrame of pre-tokenized samples.

    Each row must provide a stringified list of token ids in ``token_column``
    and a class id in ``label``.

    Args:
        data: DataFrame holding the samples.
        token_column: Name of the column with the stringified token-id list.
    """

    def __init__(self, data, token_column='tokens_koelectra'):
        self.data = data
        self.token_column = token_column

    def __len__(self):
        """Return the number of rows in the underlying DataFrame."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(tokens, label)`` for positional row ``idx`` as long tensors."""
        row = self.data.iloc[idx]
        # literal_eval only parses Python literals, so a malformed or malicious
        # cell cannot execute code the way eval() would.
        tokens = torch.tensor(ast.literal_eval(row[self.token_column]), dtype=torch.long)
        label = torch.tensor(row['label'], dtype=torch.long)
        return tokens, label

# Load the tokenizer and the KoELECTRA discriminator with a 13-way classification head.
# The checkpoint declares 'ElectraTokenizer' as its tokenizer class; loading it through
# BertTokenizer makes transformers warn about a class mismatch and possible
# unexpected tokenization.
from transformers import ElectraTokenizer

model_name = 'monologg/koelectra-base-v3-discriminator'
tokenizer = ElectraTokenizer.from_pretrained(model_name)
model = ElectraForSequenceClassification.from_pretrained(model_name, num_labels=13)
model.to(device)  # move the model to the selected device

tokenizer_config.json:   0%|          | 0.00/61.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/263k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/467 [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'ElectraTokenizer'. 
The class this function is called from is 'BertTokenizer'.


pytorch_model.bin:   0%|          | 0.00/452M [00:00<?, ?B/s]

Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at monologg/koelectra-base-v3-discriminator and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


ElectraForSequenceClassification(
  (electra): ElectraModel(
    (embeddings): ElectraEmbeddings(
      (word_embeddings): Embedding(35000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): ElectraEncoder(
      (layer): ModuleList(
        (0-11): 12 x ElectraLayer(
          (attention): ElectraAttention(
            (self): ElectraSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): ElectraSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): L

In [None]:
# Load the pre-tokenized dataset.
df = pd.read_csv('/content/drive/MyDrive/df_koelectra.csv')

# Keep only rows whose token list is non-empty. The explicit copy makes the result an
# independent frame, so the column assignment below does not raise SettingWithCopyWarning.
# literal_eval parses the stored list without executing arbitrary code.
has_tokens = df['tokens_koelectra'].apply(lambda x: len(ast.literal_eval(x)) > 0)
df = df[has_tokens].copy()

# Map the '판결유형' (ruling type) column to its class id.
label_map = {'민사_승소': 0, '민사_패소': 1, '민사_기각': 2, '징역': 3, '무혐의': 4, '벌금': 5, '형사_기각': 6, '가사_승소': 7, '가사_패소': 8, '가사_기각': 9, '세무_승소': 10, '세무_패소': 11, '세무_기각': 12}
df['label'] = df['판결유형'].map(label_map)

# Values missing from label_map become NaN; fail here with a clear message
# instead of producing invalid label tensors during training.
unmapped = df.loc[df['label'].isna(), '판결유형'].unique()
if len(unmapped) > 0:
    raise ValueError(f"Unmapped values in '판결유형': {list(unmapped)}")
df['label'] = df['label'].astype('int64')

print("빈 리스트 제거 및 데이터 불러오기 완료")

빈 리스트 제거 및 데이터 불러오기 완료


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['label'] = df['판결유형'].map(label_map)


In [None]:
# Random 70/30 split with a fixed seed: sample the training rows, the rest is validation.
# NOTE(review): this split is not stratified, so class proportions may differ between
# the two parts.
train_data = df.sample(frac=0.7, random_state=42)
valid_data = df.drop(train_data.index)

# Wrap both splits in datasets.
train_dataset = LegalDataset(train_data)
valid_dataset = LegalDataset(valid_data)

# This section's collate_fn is defined in a later cell. Passing the name directly would
# bind whichever collate_fn exists when this cell runs (or fail if none does). The lambda
# looks the name up each time a batch is built, so the definition in effect at training
# time is the one used.
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True, collate_fn=lambda batch: collate_fn(batch))
valid_loader = DataLoader(valid_dataset, batch_size=8, shuffle=False, collate_fn=lambda batch: collate_fn(batch))

# Optimizer over all model parameters.
optimizer = AdamW(model.parameters(), lr=2e-5)



In [None]:
def collate_fn(batch, max_length=512):
    """Pad or truncate every sequence in the batch to exactly ``max_length`` tokens.

    Reads the padding id from the module-level ``tokenizer``.

    Args:
        batch: Iterable of ``(tokens, label)`` pairs, where ``tokens`` is a 1-D
            sequence of token ids and ``label`` is a scalar class id.
        max_length: Fixed output sequence length.

    Returns:
        Tuple ``(tokens_padded, attention_mask, labels)``. ``tokens_padded`` has
        shape ``(len(batch), max_length)``; ``attention_mask`` is 0 wherever
        ``tokens_padded`` equals the pad id and 1 elsewhere.
    """
    tokens, labels = zip(*batch)
    pad_id = tokenizer.pad_token_id

    # Start from an all-padding matrix and copy each (truncated) sequence into its row.
    # as_tensor avoids the copy-construct warning raised by torch.tensor(<tensor>).
    tokens_padded = torch.full((len(tokens), max_length), pad_id, dtype=torch.long)
    for row, token in enumerate(tokens):
        token = torch.as_tensor(token, dtype=torch.long)[:max_length]
        tokens_padded[row, :token.numel()] = token

    attention_mask = (tokens_padded != pad_id).long()

    labels = torch.as_tensor([int(label) for label in labels], dtype=torch.long)

    return tokens_padded, attention_mask, labels

In [None]:
def train(model, loader, optimizer, epoch):
    """Run one training epoch.

    Args:
        model: Model called as ``model(input_ids=..., attention_mask=..., labels=...)``
            whose output exposes ``.loss`` and ``.logits``.
        loader: DataLoader yielding ``(tokens, attention_mask, labels)`` batches.
        optimizer: Optimizer that updates the model parameters.
        epoch: Epoch index, used only in the progress-bar label.

    Returns:
        Tuple ``(avg_loss, accuracy, precision, recall, f1)``; the last three are
        weighted averages over the classes.
    """
    model.train()
    total_loss, total_correct, seen = 0.0, 0, 0
    total_batches = len(loader)

    all_preds, all_labels = [], []

    # Refresh the bar roughly every 25%. max(1, ...) avoids a modulo-by-zero for
    # loaders with fewer than four batches.
    update_every = max(1, total_batches // 4)
    pending = 0
    progress_bar = tqdm(total=total_batches, desc=f"Training Epoch {epoch}", unit='batch', dynamic_ncols=True)

    for batch_idx, (tokens, attention_mask, labels) in enumerate(loader):
        tokens, attention_mask, labels = tokens.to(device), attention_mask.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(input_ids=tokens, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        preds = outputs.logits.detach().argmax(dim=1)
        total_loss += loss.item()
        total_correct += (preds == labels).sum().item()
        seen += labels.size(0)

        # Keep predictions and true labels for the epoch-level metrics.
        all_preds.extend(preds.cpu().tolist())
        all_labels.extend(labels.cpu().tolist())

        pending += 1
        # Also flush on the last batch so the bar reaches 100%.
        if pending == update_every or batch_idx + 1 == total_batches:
            progress_bar.set_postfix(loss=total_loss / (batch_idx + 1), accuracy=total_correct / seen, refresh=False)
            progress_bar.update(pending)
            pending = 0

    progress_bar.close()

    avg_loss = total_loss / total_batches
    accuracy = total_correct / len(loader.dataset)

    # zero_division=0 yields the same value scikit-learn uses by default for undefined
    # metrics, without emitting a warning on every call.
    precision = precision_score(all_labels, all_preds, average='weighted', zero_division=0)
    recall = recall_score(all_labels, all_preds, average='weighted', zero_division=0)
    f1 = f1_score(all_labels, all_preds, average='weighted', zero_division=0)

    return avg_loss, accuracy, precision, recall, f1

def validate(model, loader, epoch):
    """Evaluate the model on ``loader`` without updating parameters.

    Args:
        model: Model called as ``model(input_ids=..., attention_mask=..., labels=...)``
            whose output exposes ``.loss`` and ``.logits``.
        loader: DataLoader yielding ``(tokens, attention_mask, labels)`` batches.
        epoch: Epoch index, used only in the progress-bar label.

    Returns:
        Tuple ``(avg_loss, accuracy, precision, recall, f1)``; the last three are
        weighted averages over the classes.
    """
    model.eval()
    total_loss, total_correct, seen = 0.0, 0, 0
    total_batches = len(loader)

    all_preds, all_labels = [], []

    # Refresh the bar roughly every 50%. max(1, ...) avoids a modulo-by-zero for
    # a single-batch loader.
    update_every = max(1, total_batches // 2)
    pending = 0
    progress_bar = tqdm(total=total_batches, desc=f"Validation Epoch {epoch}", unit='batch', dynamic_ncols=True)

    with torch.no_grad():
        for batch_idx, (tokens, attention_mask, labels) in enumerate(loader):
            tokens, attention_mask, labels = tokens.to(device), attention_mask.to(device), labels.to(device)
            outputs = model(input_ids=tokens, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss

            preds = outputs.logits.argmax(dim=1)
            total_loss += loss.item()
            total_correct += (preds == labels).sum().item()
            seen += labels.size(0)

            # Keep predictions and true labels for the set-level metrics.
            all_preds.extend(preds.cpu().tolist())
            all_labels.extend(labels.cpu().tolist())

            pending += 1
            # Also flush on the last batch so the bar reaches 100%.
            if pending == update_every or batch_idx + 1 == total_batches:
                progress_bar.set_postfix(loss=total_loss / (batch_idx + 1), accuracy=total_correct / seen, refresh=False)
                progress_bar.update(pending)
                pending = 0

    progress_bar.close()

    avg_loss = total_loss / total_batches
    accuracy = total_correct / len(loader.dataset)

    # zero_division=0 yields the same value scikit-learn uses by default for undefined
    # metrics, without emitting a warning on every call.
    precision = precision_score(all_labels, all_preds, average='weighted', zero_division=0)
    recall = recall_score(all_labels, all_preds, average='weighted', zero_division=0)
    f1 = f1_score(all_labels, all_preds, average='weighted', zero_division=0)

    return avg_loss, accuracy, precision, recall, f1

In [None]:
# Best validation loss seen so far, plus the per-epoch history of results.
best_val_loss = float('inf')
train_accuracies, val_accuracies = [], []
train_losses, val_losses = [], []
precisions, recalls, f1_scores = [], [], []

# Directory for the best checkpoint; exist_ok makes creation idempotent and avoids
# the check-then-create pattern.
os.makedirs(model_name, exist_ok=True)

for epoch in range(20):
    train_loss, train_acc, train_precision, train_recall, train_f1 = train(model, train_loader, optimizer, epoch)
    val_loss, val_acc, val_precision, val_recall, val_f1 = validate(model, valid_loader, epoch)

    # Record this epoch's results (precision/recall/F1 are the validation values).
    train_losses.append(train_loss)
    val_losses.append(val_loss)
    train_accuracies.append(train_acc)
    val_accuracies.append(val_acc)
    precisions.append(val_precision)
    recalls.append(val_recall)
    f1_scores.append(val_f1)

    # Save a checkpoint whenever the validation loss improves. There is no early
    # stopping: all 20 epochs always run.
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        torch.save(model.state_dict(), f'{model_name}/best_model.pth')


  tokens_padded = pad_sequence([torch.tensor(token) for token in tokens], batch_first=True, padding_value=tokenizer.pad_token_id)

Training Epoch 0:  25%|██▍       | 922/3689 [02:58<08:56,  5.16batch/s][A
Training Epoch 0:  25%|██▍       | 922/3689 [02:58<08:56,  5.16batch/s, accuracy=0.69, loss=1.15][A
Training Epoch 0:  50%|████▉     | 1844/3689 [05:57<05:57,  5.16batch/s, accuracy=0.69, loss=1.15][A
Training Epoch 0:  50%|████▉     | 1844/3689 [05:57<05:57,  5.16batch/s, accuracy=0.63, loss=1.31][A
Training Epoch 0:  75%|███████▍  | 2766/3689 [08:56<02:58,  5.16batch/s, accuracy=0.63, loss=1.31][A
Training Epoch 0:  75%|███████▍  | 2766/3689 [08:56<02:58,  5.16batch/s, accuracy=0.612, loss=1.36][A
Training Epoch 0: 100%|█████████▉| 3688/3689 [11:54<00:00,  5.16batch/s, accuracy=0.612, loss=1.36][A
Training Epoch 0: 100%|█████████▉| 3688/3689 [11:54<00:00,  5.16batch/s, accuracy=0.601, loss=1.39]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))

  toke

In [None]:
# Gather the per-epoch histories into one record keyed by metric name.
metrics_2 = dict(
    model_name=model_name,
    train_accuracy=train_accuracies,
    val_accuracy=val_accuracies,
    train_loss=train_losses,
    val_loss=val_losses,
    precision=precisions,
    recall=recalls,
    f1_score=f1_scores,
)

# One-row frame (each history list sits in a single cell), written to CSV
# and then offered as a browser download.
metrics_df = pd.DataFrame(data=[metrics_2])
metrics_df.to_csv('koelectra_metrics.csv', index=False)
print("koelectra_metrics saved.")
files.download("koelectra_metrics.csv")

koelectra_metrics saved.


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# **KcBERT 모델 학습**

In [None]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
import pandas as pd
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
from torch.nn.utils.rnn import pad_sequence
from sklearn.metrics import precision_score, recall_score, f1_score

# Select the compute device: CUDA when it is available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [None]:
# 데이터셋 클래스 정의
class LegalDataset(Dataset):
    """Dataset over a DataFrame of pre-tokenized samples.

    Each row must provide a stringified list of token ids in ``token_column``
    and a class id in ``label``.

    Args:
        data: DataFrame holding the samples.
        token_column: Name of the column with the stringified token-id list.
    """

    def __init__(self, data, token_column='tokens_KcBERT'):
        self.data = data
        self.token_column = token_column

    def __len__(self):
        """Return the number of rows in the underlying DataFrame."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(tokens, label)`` for positional row ``idx`` as long tensors."""
        row = self.data.iloc[idx]
        # literal_eval only parses Python literals, so a malformed or malicious
        # cell cannot execute code the way eval() would.
        tokens = torch.tensor(ast.literal_eval(row[self.token_column]), dtype=torch.long)
        label = torch.tensor(row['label'], dtype=torch.long)
        return tokens, label

# 패딩 함수 정의
def collate_fn(batch, max_length=300):
    """Pad or truncate every sequence in the batch to exactly ``max_length`` tokens.

    Reads the padding id from the module-level ``tokenizer``.

    Args:
        batch: Iterable of ``(tokens, label)`` pairs, where ``tokens`` is a 1-D
            sequence of token ids and ``label`` is a scalar class id.
        max_length: Fixed output sequence length.

    Returns:
        Tuple ``(tokens_padded, attention_mask, labels)``. ``tokens_padded`` has
        shape ``(len(batch), max_length)``; ``attention_mask`` is 0 wherever
        ``tokens_padded`` equals the pad id and 1 elsewhere.
    """
    tokens, labels = zip(*batch)
    pad_id = tokenizer.pad_token_id

    # Start from an all-padding matrix and copy each (truncated) sequence into its row.
    # as_tensor avoids the copy-construct warning raised by torch.tensor(<tensor>).
    tokens_padded = torch.full((len(tokens), max_length), pad_id, dtype=torch.long)
    for row, token in enumerate(tokens):
        token = torch.as_tensor(token, dtype=torch.long)[:max_length]
        tokens_padded[row, :token.numel()] = token

    attention_mask = (tokens_padded != pad_id).long()

    labels = torch.as_tensor([int(label) for label in labels], dtype=torch.long)

    return tokens_padded, attention_mask, labels

In [None]:
# Class ids for the '판결유형' (ruling type) column; the classifier head is sized from this map.
label_map = {'민사_승소': 0, '민사_패소': 1, '민사_기각': 2, '징역': 3, '무혐의': 4, '벌금': 5, '형사_기각': 6, '가사_승소': 7, '가사_패소': 8, '가사_기각': 9, '세무_승소': 10, '세무_패소': 11, '세무_기각': 12}

# Load the tokenizer and the sequence-classification model, then move the model to the device.
model_name = 'beomi/KcBERT-base'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=len(label_map))
model.to(device)

# Load the pre-tokenized dataset.
df = pd.read_csv('/content/drive/MyDrive/df_KcBERT.csv')

# Keep only rows whose token list is non-empty. The explicit copy makes the result an
# independent frame, so the column assignment below does not raise SettingWithCopyWarning.
# literal_eval parses the stored list without executing arbitrary code.
has_tokens = df['tokens_KcBERT'].apply(lambda x: len(ast.literal_eval(x)) > 0)
df = df[has_tokens].copy()

# Map the ruling type to its class id.
df['label'] = df['판결유형'].map(label_map)

# Values missing from label_map become NaN; fail here with a clear message
# instead of producing invalid label tensors during training.
unmapped = df.loc[df['label'].isna(), '판결유형'].unique()
if len(unmapped) > 0:
    raise ValueError(f"Unmapped values in '판결유형': {list(unmapped)}")
df['label'] = df['label'].astype('int64')

print("데이터 불러오기 및 라벨 변환 완료")

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/250k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/619 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at beomi/KcBERT-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


데이터 불러오기 및 라벨 변환 완료


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['label'] = df['판결유형'].map(label_map)


In [None]:
# Random 70/30 split with a fixed seed: sample the training rows, the rest is validation.
# NOTE(review): this split is not stratified, so class proportions may differ between
# the two parts.
train_data = df.sample(frac=0.7, random_state=42)
valid_data = df.drop(index=train_data.index)

# Wrap both splits in datasets.
train_dataset, valid_dataset = LegalDataset(train_data), LegalDataset(valid_data)

# Batch both splits with the fixed-length collate function; only training batches are shuffled.
train_loader = DataLoader(train_dataset, collate_fn=collate_fn, shuffle=True, batch_size=8)
valid_loader = DataLoader(valid_dataset, collate_fn=collate_fn, shuffle=False, batch_size=8)

# Optimizer over all model parameters.
optimizer = AdamW(model.parameters(), lr=2e-5)



In [None]:
# 학습 함수 정의
def train(model, loader, optimizer, epoch):
    """Run one training epoch.

    Args:
        model: Model called as ``model(input_ids=..., attention_mask=..., labels=...)``
            whose output exposes ``.loss`` and ``.logits``.
        loader: DataLoader yielding ``(tokens, attention_mask, labels)`` batches.
        optimizer: Optimizer that updates the model parameters.
        epoch: Epoch index, used only in the progress-bar label.

    Returns:
        Tuple ``(avg_loss, accuracy, precision, recall, f1)``; the last three are
        macro averages over the classes with ``zero_division=0``.
    """
    model.train()
    total_loss, total_correct, seen = 0.0, 0, 0
    total_batches = len(loader)
    all_preds, all_labels = [], []

    # Refresh the bar roughly every 25%. max(1, ...) avoids a modulo-by-zero for
    # loaders with fewer than four batches.
    update_every = max(1, total_batches // 4)
    pending = 0
    progress_bar = tqdm(total=total_batches, desc=f"Training Epoch {epoch}", unit='batch', dynamic_ncols=True, mininterval=1)

    for batch_idx, (tokens, attention_mask, labels) in enumerate(loader):
        tokens, attention_mask, labels = tokens.to(device), attention_mask.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(input_ids=tokens, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        preds = outputs.logits.detach().argmax(dim=1)
        total_correct += (preds == labels).sum().item()
        seen += labels.size(0)

        # Keep predictions and true labels for the epoch-level metrics.
        all_preds.extend(preds.cpu().tolist())
        all_labels.extend(labels.cpu().tolist())

        pending += 1
        # Also flush on the last batch so the bar reaches 100%.
        if pending == update_every or batch_idx + 1 == total_batches:
            progress_bar.set_postfix(loss=total_loss / (batch_idx + 1), accuracy=total_correct / seen, refresh=False)
            progress_bar.update(pending)
            pending = 0

    progress_bar.close()
    avg_loss = total_loss / total_batches
    accuracy = total_correct / len(loader.dataset)

    # Macro-averaged precision, recall and F1 over the epoch.
    precision = precision_score(all_labels, all_preds, average='macro', zero_division=0)
    recall = recall_score(all_labels, all_preds, average='macro', zero_division=0)
    f1 = f1_score(all_labels, all_preds, average='macro', zero_division=0)

    return avg_loss, accuracy, precision, recall, f1

# 검증 함수 정의
def validate(model, loader, epoch):
    """Evaluate the model on ``loader`` without updating parameters.

    Args:
        model: Model called as ``model(input_ids=..., attention_mask=..., labels=...)``
            whose output exposes ``.loss`` and ``.logits``.
        loader: DataLoader yielding ``(tokens, attention_mask, labels)`` batches.
        epoch: Epoch index, used only in the progress-bar label.

    Returns:
        Tuple ``(avg_loss, accuracy, precision, recall, f1)``; the last three are
        macro averages over the classes with ``zero_division=0``.
    """
    model.eval()
    total_loss, total_correct, seen = 0.0, 0, 0
    total_batches = len(loader)
    all_preds, all_labels = [], []

    # Refresh the bar roughly every 50%. max(1, ...) avoids a modulo-by-zero for
    # a single-batch loader.
    update_every = max(1, total_batches // 2)
    pending = 0
    progress_bar = tqdm(total=total_batches, desc=f"Validation Epoch {epoch}", unit='batch', dynamic_ncols=True)

    with torch.no_grad():
        for batch_idx, (tokens, attention_mask, labels) in enumerate(loader):
            tokens, attention_mask, labels = tokens.to(device), attention_mask.to(device), labels.to(device)
            outputs = model(input_ids=tokens, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss

            total_loss += loss.item()
            preds = outputs.logits.argmax(dim=1)
            total_correct += (preds == labels).sum().item()
            seen += labels.size(0)

            # Keep predictions and true labels for the set-level metrics.
            all_preds.extend(preds.cpu().tolist())
            all_labels.extend(labels.cpu().tolist())

            pending += 1
            # Also flush on the last batch so the bar reaches 100%.
            if pending == update_every or batch_idx + 1 == total_batches:
                progress_bar.set_postfix(loss=total_loss / (batch_idx + 1), accuracy=total_correct / seen, refresh=False)
                progress_bar.update(pending)
                pending = 0

    progress_bar.close()
    avg_loss = total_loss / total_batches
    accuracy = total_correct / len(loader.dataset)

    # Macro-averaged precision, recall and F1 over the validation set.
    precision = precision_score(all_labels, all_preds, average='macro', zero_division=0)
    recall = recall_score(all_labels, all_preds, average='macro', zero_division=0)
    f1 = f1_score(all_labels, all_preds, average='macro', zero_division=0)

    return avg_loss, accuracy, precision, recall, f1

In [None]:
# Best validation loss seen so far, plus the per-epoch history of results.
best_val_loss = float('inf')
train_accuracies, val_accuracies = [], []
train_losses, val_losses = [], []
precisions, recalls, f1_scores = [], [], []

# Directory for the best checkpoint; exist_ok makes creation idempotent and avoids
# the check-then-create pattern.
os.makedirs(model_name, exist_ok=True)

for epoch in range(20):
    train_loss, train_acc, train_precision, train_recall, train_f1 = train(model, train_loader, optimizer, epoch)
    val_loss, val_acc, val_precision, val_recall, val_f1 = validate(model, valid_loader, epoch)

    # Record this epoch's results (precision/recall/F1 are the validation values).
    train_losses.append(train_loss)
    val_losses.append(val_loss)
    train_accuracies.append(train_acc)
    val_accuracies.append(val_acc)
    precisions.append(val_precision)
    recalls.append(val_recall)
    f1_scores.append(val_f1)

    # Save a checkpoint whenever the validation loss improves. There is no early
    # stopping: all 20 epochs always run.
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        torch.save(model.state_dict(), f'{model_name}/best_model.pth')


  tokens_padded = torch.stack([torch.cat([torch.tensor(token[:max_length], dtype=torch.long), torch.tensor([tokenizer.pad_token_id] * (max_length - len(token)), dtype=torch.long)]) if len(token) < max_length else torch.tensor(token[:max_length], dtype=torch.long) for token in tokens])

Training Epoch 0:  25%|██▍       | 922/3689 [01:39<04:58,  9.27batch/s][A
Training Epoch 0:  25%|██▍       | 922/3689 [01:39<04:58,  9.27batch/s, accuracy=0.886, loss=0.417][A
Training Epoch 0:  25%|██▍       | 922/3689 [01:55<04:58,  9.27batch/s, accuracy=0.886, loss=0.417][A
Training Epoch 0:  50%|████▉     | 1844/3689 [03:18<03:18,  9.28batch/s, accuracy=0.886, loss=0.417][A
Training Epoch 0:  50%|████▉     | 1844/3689 [03:18<03:18,  9.28batch/s, accuracy=0.91, loss=0.315] [A
Training Epoch 0:  50%|████▉     | 1844/3689 [03:29<03:18,  9.28batch/s, accuracy=0.91, loss=0.315][A
Training Epoch 0:  75%|███████▍  | 2766/3689 [04:57<01:39,  9.29batch/s, accuracy=0.91, loss=0.315][A
Training Epoch 0:

In [None]:
# Gather the per-epoch histories into one record keyed by metric name.
metrics_3 = dict(
    model_name=model_name,
    train_accuracy=train_accuracies,
    val_accuracy=val_accuracies,
    train_loss=train_losses,
    val_loss=val_losses,
    precision=precisions,
    recall=recalls,
    f1_score=f1_scores,
)

# One-row frame (each history list sits in a single cell), written to CSV
# and then offered as a browser download.
metrics_df = pd.DataFrame(data=[metrics_3])
metrics_df.to_csv('KcBert_metrics.csv', index=False)
print("KcBERT metrics saved.")
files.download('KcBert_metrics.csv')

KcBERT metrics saved.


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>