In [None]:
import torch
import torch.nn as nn
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import Dataset, DataLoader
from torch.optim import Adam
from tqdm import tqdm
from sklearn.metrics import precision_score, recall_score, f1_score
import pandas as pd

In [None]:
# Colab-only helpers: `drive` mounts Google Drive, `files` triggers browser downloads.
from google.colab import drive, files
# Mount Google Drive under /content/drive.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Read the source spreadsheet from the mounted Drive into a DataFrame.
df = pd.read_excel('/content/drive/MyDrive/df_final_real.xlsx')

In [None]:
# Keep an independent snapshot of the raw frame. A plain `df_save = df` only
# creates a second name for the same object, so any in-place change to `df`
# would also change the "original" data that the comparison cells start from.
df_save = df.copy()

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load the BERT tokenizer and classifier from one checkpoint name so the two
# cannot drift apart. num_labels=13 matches the 13 entries of label_map.
BASE_CHECKPOINT = 'bert-base-multilingual-cased'
tokenizer = BertTokenizer.from_pretrained(BASE_CHECKPOINT)
model = BertForSequenceClassification.from_pretrained(BASE_CHECKPOINT, num_labels=13)
# The trailing semicolon stops the notebook from echoing the full module repr.
model.to(device);

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1

In [None]:
# Verdict label (values of the '판결유형' column) -> integer class id.
# Ids are assigned in listing order, starting at 0.
label_map = {
    name: class_id
    for class_id, name in enumerate([
        '민사_승소', '민사_패소', '민사_기각',
        '형사_기각', '징역', '벌금', '무혐의',
        '가사_승소', '가사_패소', '가사_기각',
        '세무_승소', '세무_패소', '세무_기각',
    ])
}

In [None]:
# Keep only the rows whose verdict label has a class id in label_map.
# `.copy()` makes the result an independent frame, so adding columns to it
# later does not go through pandas' chained-assignment (SettingWithCopy) path.
df = df[df['판결유형'].isin(label_map.keys())].copy()

In [None]:
# Text clean-up and tokenization helper (with a tqdm progress bar)
def preprocess_and_tokenize(data, column_name, max_length=512):
    """Tokenize one text column of a DataFrame with the module-level ``tokenizer``.

    Args:
        data: DataFrame that holds the text column.
        column_name: Name of the column to tokenize.
        max_length: Upper bound on token ids per row; longer texts are truncated.

    Returns:
        list: One list of token ids per row, in row order.
    """
    tokenized_data = []
    for raw in tqdm(data[column_name], total=len(data), desc="Tokenizing"):
        # Missing cells become an empty string; str(NaN) would otherwise be
        # tokenized as the literal word "nan".
        missing = pd.api.types.is_scalar(raw) and pd.isna(raw)
        text = "" if missing else str(raw)
        # Collapse runs of whitespace into single spaces.
        text = " ".join(text.split())
        tokens = tokenizer.encode(text, add_special_tokens=True, max_length=max_length, truncation=True)
        tokenized_data.append(tokens)
    return tokenized_data

In [None]:
# Tokenize the '판례내용' column and store the token-id lists in a new column.
df['판례내용_tokens'] = preprocess_and_tokenize(df, '판례내용')

Tokenizing: 100%|██████████| 42222/42222 [12:10<00:00, 57.76it/s]


In [None]:
# Write the tokenized frame to an Excel file in the working directory,
# then hand it to the browser for download.
df.to_excel('df_token.xlsx')
files.download('df_token.xlsx')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# df = pd.read_excel('/content/drive/MyDrive/df_token.xlsx')

In [None]:
# Second name for the same DataFrame object (no copy is made).
df_2 = df

In [None]:
# Show the column labels of the frame used for training.
print(df_2.columns)

Index(['사건명', '사건종류명', '판결유형', '판시사항', '판결요지', '판례내용', '판례내용_tokens'], dtype='object')


In [None]:
# Dataset wrapper around the tokenized DataFrame
class LegalDataset(Dataset):
    """Serve ``(token ids, class id)`` tensor pairs from a DataFrame.

    Each row must provide '판례내용_tokens' (a sequence of token ids) and
    '판결유형' (a key of the module-level ``label_map``).
    """

    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        # Fetch the row once and read both fields from it.
        record = self.data.iloc[idx]
        token_ids = torch.tensor(record['판례내용_tokens'], dtype=torch.long)
        class_id = label_map[record['판결유형']]
        return token_ids, torch.tensor(class_id, dtype=torch.long)

In [None]:
# Collate function: pads a batch to a common length
def collate_fn(batch):
    """Merge ``(token ids, label)`` samples into one padded batch.

    Sequences are right-padded with the module-level tokenizer's pad id.

    Returns:
        tuple: ``(padded token ids of shape (batch, longest), stacked labels)``.
    """
    sequences, targets = map(list, zip(*batch))
    padded = nn.utils.rnn.pad_sequence(
        sequences,
        batch_first=True,
        padding_value=tokenizer.pad_token_id,
    )
    return padded, torch.stack(targets)

In [None]:
# Split the data into training and validation sets (70:30).
# Rows are shuffled with a fixed seed before slicing: cutting the frame in file
# order gives the two sets different class mixes whenever the file is sorted,
# and the comparison models further down also use a seeded random split.
train_size = int(0.7 * len(df_2))
valid_size = len(df_2) - train_size
df_shuffled = df_2.sample(frac=1.0, random_state=42)
train_data = df_shuffled.iloc[:train_size]
valid_data = df_shuffled.iloc[train_size:]

train_dataset = LegalDataset(train_data)
valid_dataset = LegalDataset(valid_data)

# Only the training loader reshuffles each epoch; validation order stays fixed.
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True, collate_fn=collate_fn)
valid_loader = DataLoader(valid_dataset, batch_size=8, shuffle=False, collate_fn=collate_fn)

In [None]:
# Optimizer and its hyperparameter (learning rate)
optimizer = Adam(model.parameters(), lr=2e-5)

In [None]:
# Inverse-frequency class weights for the loss.
# CrossEntropyLoss expects exactly one weight per class, ordered by class id, so
# the counts are taken over the label column ('판결유형') and laid out in
# label_map order. Classes absent from the training split keep a weight of 1.0.
class_counts = train_data['판결유형'].value_counts().to_dict()
total_count = sum(class_counts.values())
class_weights = {label: total_count / count for label, count in class_counts.items()}

weights = torch.tensor(
    [class_weights.get(label, 1.0) for label in sorted(label_map, key=label_map.get)],
    dtype=torch.float,
).to(device)
criterion = nn.CrossEntropyLoss(weight=weights)

In [None]:
# Per-epoch metric histories, filled by the training loop
train_accuracies, val_accuracies = [], []
train_losses, val_losses = [], []
precisions, recalls, f1_scores = [], [], []

In [None]:
# Training function
def train(model, loader, optimizer, epoch, log_interval=10):
    """Run one training epoch and report loss, accuracy and macro metrics.

    Args:
        model: Classifier called as ``model(input_ids, attention_mask=..., labels=...)``
            whose output exposes ``.loss`` and ``.logits``.
        loader: DataLoader yielding ``(token_ids, labels)`` batches.
        optimizer: Optimizer that updates the model parameters.
        epoch: Epoch index, used only in the progress-bar caption.
        log_interval: Currently unused; accepted so existing calls keep working.

    Returns:
        tuple: ``(mean batch loss, accuracy, macro precision, macro recall, macro F1)``.
    """
    model.train()
    total_loss, total_correct, total_seen = 0.0, 0, 0
    total_batches = len(loader)
    all_labels = []
    all_preds = []

    # Id used to recognise padding when building the attention mask; None turns
    # masking off. Assumes model.config.pad_token_id matches the id the loader
    # pads with — TODO confirm for non-BERT checkpoints.
    pad_id = getattr(getattr(model, 'config', None), 'pad_token_id', None)

    # Refresh the bar about ten times per epoch. The max() keeps the step
    # non-zero for loaders with fewer than ten batches.
    refresh_every = max(1, total_batches // 10)
    shown = 0

    progress_bar = tqdm(total=total_batches, desc=f"Training Epoch {epoch}", unit='batch')

    for batch_idx, (tokens, labels) in enumerate(loader):
        tokens, labels = tokens.to(device), labels.to(device)
        # Mask out padded positions so they are not attended to.
        attention_mask = (tokens != pad_id).long() if pad_id is not None else None

        optimizer.zero_grad()
        outputs = model(tokens, attention_mask=attention_mask, labels=labels)
        loss, logits = outputs.loss, outputs.logits
        loss.backward()
        optimizer.step()

        preds = logits.argmax(dim=1)
        total_loss += loss.item()
        total_correct += (preds == labels).sum().item()
        # Count real samples: the last batch may be smaller than batch_size.
        total_seen += labels.size(0)

        all_labels.extend(labels.cpu().tolist())
        all_preds.extend(preds.cpu().tolist())

        if (batch_idx + 1) % refresh_every == 0:
            progress_bar.update(refresh_every)
            shown += refresh_every
            progress_bar.set_postfix(loss=total_loss / (batch_idx + 1), accuracy=total_correct / total_seen)

    # Account for the batches left over after the last periodic refresh.
    if shown < total_batches:
        progress_bar.update(total_batches - shown)
    progress_bar.close()

    # Macro-averaged precision, recall and F1 over the whole epoch
    precision = precision_score(all_labels, all_preds, average='macro')
    recall = recall_score(all_labels, all_preds, average='macro')
    f1 = f1_score(all_labels, all_preds, average='macro')

    return total_loss / max(total_batches, 1), total_correct / max(total_seen, 1), precision, recall, f1

# Validation function
def validate(model, loader, epoch):
    """Evaluate the model on ``loader`` without updating any weights.

    Args:
        model: Classifier called as ``model(input_ids, attention_mask=..., labels=...)``
            whose output exposes ``.loss`` and ``.logits``.
        loader: DataLoader yielding ``(token_ids, labels)`` batches.
        epoch: Epoch index, used only in the progress-bar caption.

    Returns:
        tuple: ``(mean batch loss, accuracy, macro precision, macro recall, macro F1)``.
    """
    model.eval()
    total_loss, total_correct, total_seen = 0.0, 0, 0
    total_batches = len(loader)
    all_labels = []
    all_preds = []

    # Id used to recognise padding when building the attention mask; None turns
    # masking off. Assumes model.config.pad_token_id matches the id the loader
    # pads with — TODO confirm for non-BERT checkpoints.
    pad_id = getattr(getattr(model, 'config', None), 'pad_token_id', None)

    # Refresh the bar twice per pass. The max() keeps the step non-zero when
    # the loader has a single batch.
    refresh_every = max(1, total_batches // 2)
    shown = 0

    progress_bar = tqdm(total=total_batches, desc=f"Validation Epoch {epoch}", unit='batch')

    with torch.no_grad():
        for batch_idx, (tokens, labels) in enumerate(loader):
            tokens, labels = tokens.to(device), labels.to(device)
            # Mask out padded positions so they are not attended to.
            attention_mask = (tokens != pad_id).long() if pad_id is not None else None
            outputs = model(tokens, attention_mask=attention_mask, labels=labels)
            loss, logits = outputs.loss, outputs.logits

            preds = logits.argmax(dim=1)
            total_loss += loss.item()
            total_correct += (preds == labels).sum().item()
            # Count real samples: the last batch may be smaller than batch_size.
            total_seen += labels.size(0)

            all_labels.extend(labels.cpu().tolist())
            all_preds.extend(preds.cpu().tolist())

            if (batch_idx + 1) % refresh_every == 0:
                progress_bar.update(refresh_every)
                shown += refresh_every
                progress_bar.set_postfix(loss=total_loss / (batch_idx + 1), accuracy=total_correct / total_seen)

    # Account for the batches left over after the last periodic refresh.
    if shown < total_batches:
        progress_bar.update(total_batches - shown)
    progress_bar.close()

    # Macro-averaged precision, recall and F1 over the whole pass
    precision = precision_score(all_labels, all_preds, average='macro')
    recall = recall_score(all_labels, all_preds, average='macro')
    f1 = f1_score(all_labels, all_preds, average='macro')

    return total_loss / max(total_batches, 1), total_correct / max(total_seen, 1), precision, recall, f1


In [None]:
# Train the model, checkpoint the best weights and stop early when validation loss stalls
best_val_loss = float('inf')
early_stopping_patience = 3
early_stopping_counter = 0
min_epochs = 4  # epochs are 0-indexed, so early stopping is allowed from the 5th epoch on

with tqdm(range(20), desc="Training Epochs") as epochs:
    for epoch in epochs:  # at most 20 epochs
        train_loss, train_acc, train_prec, train_rec, train_f1 = train(model, train_loader, optimizer, epoch)
        val_loss, val_acc, val_prec, val_rec, val_f1 = validate(model, valid_loader, epoch)

        # Record this epoch's metrics (precision/recall/F1 are the validation values)
        train_losses.append(train_loss)
        val_losses.append(val_loss)
        train_accuracies.append(train_acc)
        val_accuracies.append(val_acc)
        precisions.append(val_prec)
        recalls.append(val_rec)
        f1_scores.append(val_f1)

        # Checkpoint on improvement; otherwise count towards early stopping
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            early_stopping_counter = 0
            torch.save(model.state_dict(), 'model_weight_4.pth')  # save the best weights so far
            print(f"Model saved at epoch {epoch}")
        else:
            early_stopping_counter += 1

        if early_stopping_counter >= early_stopping_patience and epoch >= min_epochs:
            print(f"Early stopping triggered at epoch {epoch}")
            break


Training Epochs:   0%|          | 0/20 [00:00<?, ?it/s]

Training Epoch 0:   0%|          | 0/3695 [00:00<?, ?batch/s][A[A

Training Epoch 0:  10%|▉         | 369/3695 [01:01<09:12,  6.02batch/s][A[A

Training Epoch 0:  10%|▉         | 369/3695 [01:01<09:12,  6.02batch/s, accuracy=0.68, loss=1.1][A[A

Training Epoch 0:  20%|█▉        | 738/3695 [02:02<08:11,  6.02batch/s, accuracy=0.68, loss=1.1][A[A

Training Epoch 0:  20%|█▉        | 738/3695 [02:02<08:11,  6.02batch/s, accuracy=0.772, loss=0.813][A[A

Training Epoch 0:  30%|██▉       | 1107/3695 [03:03<07:09,  6.02batch/s, accuracy=0.772, loss=0.813][A[A

Training Epoch 0:  30%|██▉       | 1107/3695 [03:03<07:09,  6.02batch/s, accuracy=0.809, loss=0.684][A[A

Training Epoch 0:  40%|███▉      | 1476/3695 [04:05<06:08,  6.02batch/s, accuracy=0.809, loss=0.684][A[A

Training Epoch 0:  40%|███▉      | 1476/3695 [04:05<06:08,  6.02batch/s, accuracy=0.833, loss=0.599][A[A

Training Epoch 0:  50%|████▉     | 1845/3695 [0

Model saved at epoch 0




Training Epoch 1:   0%|          | 0/3695 [00:00<?, ?batch/s][A[A

Training Epoch 1:  10%|▉         | 369/3695 [01:01<09:11,  6.03batch/s][A[A

Training Epoch 1:  10%|▉         | 369/3695 [01:01<09:11,  6.03batch/s, accuracy=0.947, loss=0.17][A[A

Training Epoch 1:  20%|█▉        | 738/3695 [02:02<08:10,  6.03batch/s, accuracy=0.947, loss=0.17][A[A

Training Epoch 1:  20%|█▉        | 738/3695 [02:02<08:10,  6.03batch/s, accuracy=0.945, loss=0.174][A[A

Training Epoch 1:  30%|██▉       | 1107/3695 [03:03<07:09,  6.03batch/s, accuracy=0.945, loss=0.174][A[A

Training Epoch 1:  30%|██▉       | 1107/3695 [03:03<07:09,  6.03batch/s, accuracy=0.948, loss=0.165][A[A

Training Epoch 1:  40%|███▉      | 1476/3695 [04:04<06:08,  6.02batch/s, accuracy=0.948, loss=0.165][A[A

Training Epoch 1:  40%|███▉      | 1476/3695 [04:04<06:08,  6.02batch/s, accuracy=0.946, loss=0.17] [A[A

Training Epoch 1:  50%|████▉     | 1845/3695 [05:06<05:07,  6.02batch/s, accuracy=0.946, loss=0.17]

Model saved at epoch 1




Training Epoch 2:   0%|          | 0/3695 [00:00<?, ?batch/s][A[A

Training Epoch 2:  10%|▉         | 369/3695 [01:01<09:12,  6.02batch/s][A[A

Training Epoch 2:  10%|▉         | 369/3695 [01:01<09:12,  6.02batch/s, accuracy=0.952, loss=0.149][A[A

Training Epoch 2:  20%|█▉        | 738/3695 [02:02<08:11,  6.02batch/s, accuracy=0.952, loss=0.149][A[A

Training Epoch 2:  20%|█▉        | 738/3695 [02:02<08:11,  6.02batch/s, accuracy=0.95, loss=0.153] [A[A

Training Epoch 2:  30%|██▉       | 1107/3695 [03:03<07:09,  6.02batch/s, accuracy=0.95, loss=0.153][A[A

Training Epoch 2:  30%|██▉       | 1107/3695 [03:03<07:09,  6.02batch/s, accuracy=0.95, loss=0.155][A[A

Training Epoch 2:  40%|███▉      | 1476/3695 [04:05<06:08,  6.02batch/s, accuracy=0.95, loss=0.155][A[A

Training Epoch 2:  40%|███▉      | 1476/3695 [04:05<06:08,  6.02batch/s, accuracy=0.95, loss=0.154][A[A

Training Epoch 2:  50%|████▉     | 1845/3695 [05:06<05:07,  6.02batch/s, accuracy=0.95, loss=0.154][

Model saved at epoch 2




Training Epoch 3:   0%|          | 0/3695 [00:00<?, ?batch/s][A[A

Training Epoch 3:  10%|▉         | 369/3695 [01:01<09:12,  6.02batch/s][A[A

Training Epoch 3:  10%|▉         | 369/3695 [01:01<09:12,  6.02batch/s, accuracy=0.961, loss=0.122][A[A

Training Epoch 3:  20%|█▉        | 738/3695 [02:02<08:10,  6.02batch/s, accuracy=0.961, loss=0.122][A[A

Training Epoch 3:  20%|█▉        | 738/3695 [02:02<08:10,  6.02batch/s, accuracy=0.96, loss=0.129] [A[A

Training Epoch 3:  30%|██▉       | 1107/3695 [03:03<07:09,  6.03batch/s, accuracy=0.96, loss=0.129][A[A

Training Epoch 3:  30%|██▉       | 1107/3695 [03:03<07:09,  6.03batch/s, accuracy=0.957, loss=0.138][A[A

Training Epoch 3:  40%|███▉      | 1476/3695 [04:04<06:08,  6.03batch/s, accuracy=0.957, loss=0.138][A[A

Training Epoch 3:  40%|███▉      | 1476/3695 [04:04<06:08,  6.03batch/s, accuracy=0.956, loss=0.138][A[A

Training Epoch 3:  50%|████▉     | 1845/3695 [05:06<05:06,  6.03batch/s, accuracy=0.956, loss=0.13

Model saved at epoch 3




Training Epoch 4:   0%|          | 0/3695 [00:00<?, ?batch/s][A[A

Training Epoch 4:  10%|▉         | 369/3695 [01:01<09:12,  6.02batch/s][A[A

Training Epoch 4:  10%|▉         | 369/3695 [01:01<09:12,  6.02batch/s, accuracy=0.957, loss=0.134][A[A

Training Epoch 4:  20%|█▉        | 738/3695 [02:02<08:10,  6.02batch/s, accuracy=0.957, loss=0.134][A[A

Training Epoch 4:  20%|█▉        | 738/3695 [02:02<08:10,  6.02batch/s, accuracy=0.956, loss=0.137][A[A

Training Epoch 4:  30%|██▉       | 1107/3695 [03:03<07:09,  6.03batch/s, accuracy=0.956, loss=0.137][A[A

Training Epoch 4:  30%|██▉       | 1107/3695 [03:03<07:09,  6.03batch/s, accuracy=0.958, loss=0.129][A[A

Training Epoch 4:  40%|███▉      | 1476/3695 [04:04<06:08,  6.03batch/s, accuracy=0.958, loss=0.129][A[A

Training Epoch 4:  40%|███▉      | 1476/3695 [04:04<06:08,  6.03batch/s, accuracy=0.955, loss=0.139][A[A

Training Epoch 4:  50%|████▉     | 1845/3695 [05:06<05:07,  6.02batch/s, accuracy=0.955, loss=0.1

Model saved at epoch 4




Training Epoch 5:   0%|          | 0/3695 [00:00<?, ?batch/s][A[A

Training Epoch 5:  10%|▉         | 369/3695 [01:01<09:12,  6.02batch/s][A[A

Training Epoch 5:  10%|▉         | 369/3695 [01:01<09:12,  6.02batch/s, accuracy=0.966, loss=0.114][A[A

Training Epoch 5:  20%|█▉        | 738/3695 [02:02<08:10,  6.03batch/s, accuracy=0.966, loss=0.114][A[A

Training Epoch 5:  20%|█▉        | 738/3695 [02:02<08:10,  6.03batch/s, accuracy=0.967, loss=0.113][A[A

Training Epoch 5:  30%|██▉       | 1107/3695 [03:03<07:09,  6.03batch/s, accuracy=0.967, loss=0.113][A[A

Training Epoch 5:  30%|██▉       | 1107/3695 [03:03<07:09,  6.03batch/s, accuracy=0.964, loss=0.117][A[A

Training Epoch 5:  40%|███▉      | 1476/3695 [04:04<06:08,  6.03batch/s, accuracy=0.964, loss=0.117][A[A

Training Epoch 5:  40%|███▉      | 1476/3695 [04:04<06:08,  6.03batch/s, accuracy=0.964, loss=0.117][A[A

Training Epoch 5:  50%|████▉     | 1845/3695 [05:06<05:06,  6.03batch/s, accuracy=0.964, loss=0.1

Model saved at epoch 5




Training Epoch 6:   0%|          | 0/3695 [00:00<?, ?batch/s][A[A

Training Epoch 6:  10%|▉         | 369/3695 [01:01<09:11,  6.03batch/s][A[A

Training Epoch 6:  10%|▉         | 369/3695 [01:01<09:11,  6.03batch/s, accuracy=0.967, loss=0.0983][A[A

Training Epoch 6:  20%|█▉        | 738/3695 [02:02<08:10,  6.03batch/s, accuracy=0.967, loss=0.0983][A[A

Training Epoch 6:  20%|█▉        | 738/3695 [02:02<08:10,  6.03batch/s, accuracy=0.966, loss=0.102] [A[A

Training Epoch 6:  30%|██▉       | 1107/3695 [03:03<07:09,  6.03batch/s, accuracy=0.966, loss=0.102][A[A

Training Epoch 6:  30%|██▉       | 1107/3695 [03:03<07:09,  6.03batch/s, accuracy=0.964, loss=0.107][A[A

Training Epoch 6:  40%|███▉      | 1476/3695 [04:04<06:08,  6.03batch/s, accuracy=0.964, loss=0.107][A[A

Training Epoch 6:  40%|███▉      | 1476/3695 [04:04<06:08,  6.03batch/s, accuracy=0.962, loss=0.115][A[A

Training Epoch 6:  50%|████▉     | 1845/3695 [05:06<05:06,  6.03batch/s, accuracy=0.962, loss=

Early stopping triggered at epoch 8





In [None]:
# Collect the per-epoch metrics into a DataFrame (one row per epoch) and save it as CSV
metrics_df = pd.DataFrame({
    'train_accuracy': train_accuracies,
    'train_loss': train_losses,
    'val_accuracy': val_accuracies,
    'val_loss': val_losses,
    'precision': precisions,
    'recall': recalls,
    'f1_score': f1_scores
})

metrics_df.to_csv('training_metrics_full.csv', index=False)

In [None]:
# Download the metrics CSV through the browser.
files.download('training_metrics_full.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Download the checkpoint written by the training loop.
files.download('model_weight_4.pth')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Show the metrics of the first five epochs.
print(metrics_df.head())

   train_accuracy  train_loss  val_accuracy  val_loss  precision    recall  \
0        0.890934    0.382760      0.917377  0.258922   0.495141  0.567165   
1        0.946786    0.169810      0.948706  0.190249   0.594771  0.711912   
2        0.951996    0.149592      0.939315  0.187275   0.615608  0.668698   
3        0.956360    0.137212      0.943971  0.173937   0.612835  0.680025   
4        0.957409    0.130949      0.958886  0.150641   0.612141  0.666834   

   f1_score  
0  0.516558  
1  0.629325  
2  0.610407  
3  0.620393  
4  0.626023  


# **`여기서부터 비교모델`**

In [None]:
import torch
# AdamW comes from PyTorch: the copy in `transformers` is deprecated in favour of
# torch.optim.AdamW and is no longer exported by recent transformers releases.
# Note that torch's default weight_decay is 0.01 (the transformers copy used 0.0).
from torch.optim import AdamW
from transformers import BertTokenizer, BertForSequenceClassification, ElectraForSequenceClassification
from torch.utils.data import DataLoader
from sklearn.metrics import precision_recall_fscore_support
import pandas as pd

# Force CPU execution (for sessions without a GPU)
device = torch.device('cpu')

# Checkpoints of the comparison models
model_names = ['klue/bert-base', 'monologg/koelectra-base-v3-discriminator', 'beomi/KcBERT-base']

In [None]:
# List that collects each model's results
all_metrics = []

# Prepare the data (df_save is expected to be the original dataset)
# NOTE(review): df_save was captured before the label_map filter was applied to
# df, so rows with other '판결유형' values are still present — confirm intended.
train_data = df_save.sample(frac=0.7, random_state=42)  # 70% training data
valid_data = df_save.drop(train_data.index)  # remaining 30% validation data

# Batch size
batch_size = 8

In [None]:
# Tokenization helper for the comparison models
def preprocess_data(data, column_name, tokenizer):
    """Encode every entry of ``data[column_name]`` with the given tokenizer.

    Texts are truncated to 512 token ids and special tokens are added.

    Returns:
        list: One list of token ids per entry, in the original order.
    """
    return [
        tokenizer.encode(entry, add_special_tokens=True, max_length=512, truncation=True)
        for entry in data[column_name]
    ]

In [None]:
# Dataset wrapper for frames with ready-made 'tokens' and 'label' columns
class LegalDataset(Dataset):
    """Serve ``(token ids, label)`` tensor pairs from a DataFrame.

    Each row must provide 'tokens' (a sequence of token ids) and 'label'
    (an integer class id).
    """

    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        # Fetch the row once and read both fields from it.
        record = self.data.iloc[idx]
        token_ids = torch.tensor(record['tokens'], dtype=torch.long)
        class_id = torch.tensor(record['label'], dtype=torch.long)
        return token_ids, class_id

In [None]:
# Training and validation functions for the comparison models
def train(model, loader, optimizer, epoch):
    """Run one training epoch.

    Args:
        model: Classifier called as ``model(input_ids, attention_mask=..., labels=...)``
            whose output exposes ``.loss`` and ``.logits``.
        loader: DataLoader yielding ``(token_ids, labels)`` batches.
        optimizer: Optimizer that updates the model parameters.
        epoch: Unused; kept so existing calls keep working.

    Returns:
        tuple: ``(mean batch loss, accuracy over loader.dataset)``.
    """
    model.train()
    total_loss, total_correct = 0, 0
    total_batches = len(loader)

    # Id used to recognise padding when building the attention mask; None turns
    # masking off. Assumes model.config.pad_token_id matches the id the loader
    # pads with — TODO confirm per checkpoint.
    pad_id = getattr(getattr(model, 'config', None), 'pad_token_id', None)

    for tokens, labels in loader:
        tokens, labels = tokens.to(device), labels.to(device)
        # Mask out padded positions so they are not attended to.
        attention_mask = (tokens != pad_id).long() if pad_id is not None else None

        optimizer.zero_grad()
        outputs = model(tokens, attention_mask=attention_mask, labels=labels)
        loss, logits = outputs.loss, outputs.logits
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        total_correct += (logits.argmax(dim=1) == labels).sum().item()

    # max(..., 1) avoids a division by zero on an empty loader.
    return total_loss / max(total_batches, 1), total_correct / max(len(loader.dataset), 1)

def validate(model, loader, epoch):
    """Evaluate the model on ``loader`` without updating any weights.

    Args:
        model: Classifier called as ``model(input_ids, attention_mask=..., labels=...)``
            whose output exposes ``.loss`` and ``.logits``.
        loader: DataLoader yielding ``(token_ids, labels)`` batches.
        epoch: Unused; kept so existing calls keep working.

    Returns:
        tuple: ``(mean batch loss, accuracy over loader.dataset)``.
    """
    model.eval()
    total_loss, total_correct = 0, 0
    total_batches = len(loader)

    # Id used to recognise padding when building the attention mask; None turns
    # masking off. Assumes model.config.pad_token_id matches the id the loader
    # pads with — TODO confirm per checkpoint.
    pad_id = getattr(getattr(model, 'config', None), 'pad_token_id', None)

    with torch.no_grad():
        for tokens, labels in loader:
            tokens, labels = tokens.to(device), labels.to(device)
            # Mask out padded positions so they are not attended to.
            attention_mask = (tokens != pad_id).long() if pad_id is not None else None
            outputs = model(tokens, attention_mask=attention_mask, labels=labels)
            loss, logits = outputs.loss, outputs.logits
            total_loss += loss.item()
            total_correct += (logits.argmax(dim=1) == labels).sum().item()

    # max(..., 1) avoids a division by zero on an empty loader.
    return total_loss / max(total_batches, 1), total_correct / max(len(loader.dataset), 1)

In [None]:
# Replace missing '판례내용' values with empty strings.
# The frames are rebound to the filled result instead of calling
# fillna(..., inplace=True) on a selected column, which pandas warns will stop
# working in 3.0 because the selection behaves as a copy.
train_data = train_data.fillna({'판례내용': ''})
valid_data = valid_data.fillna({'판례내용': ''})

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_data['판례내용'].fillna('', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  valid_data['판례내용'].fillna('', inplace=True)


klue/bert-base 모델

In [None]:
import torch
from transformers import BertTokenizer
import pandas as pd

# Load the tokenizer of this checkpoint
model_name = 'klue/bert-base'
tokenizer = BertTokenizer.from_pretrained(model_name)

# Work on a copy of the original frame (df_save)
df = df_save.copy()

# Tokenize every document. Missing text is encoded as an empty string, so each
# row still gets the special tokens instead of an empty id list.
df['tokens_klue'] = df['판례내용'].apply(lambda x: tokenizer.encode(x if pd.notna(x) else '', add_special_tokens=True, max_length=512, truncation=True))

# Save to CSV (token lists are written in their string form) and download
df.to_csv('df_klue.csv', index=False)
print("File saved: df_klue.csv")
files.download('df_klue.csv')



File saved: df_klue.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

monologg/koelectra-base-v3-discriminator 모델

In [None]:
import torch
from transformers import BertTokenizer, ElectraTokenizer
import pandas as pd

# Load the tokenizer of this checkpoint. The checkpoint declares
# 'ElectraTokenizer' as its tokenizer class, so that class is used here;
# loading it through BertTokenizer triggers a class-mismatch warning.
model_name = 'monologg/koelectra-base-v3-discriminator'
tokenizer = ElectraTokenizer.from_pretrained(model_name)

# Work on a copy of the original frame (df_save)
df = df_save.copy()

# Tokenize every document. Missing text is encoded as an empty string, so each
# row still gets the special tokens instead of an empty id list.
df['tokens_koelectra'] = df['판례내용'].apply(lambda x: tokenizer.encode(x if pd.notna(x) else '', add_special_tokens=True, max_length=512, truncation=True))

# Save to CSV (token lists are written in their string form)
df.to_csv('df_koelectra.csv', index=False)
print("File saved: df_koelectra.csv")


tokenizer_config.json:   0%|          | 0.00/61.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/263k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/467 [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'ElectraTokenizer'. 
The class this function is called from is 'BertTokenizer'.


File saved: df_koelectra.csv


In [None]:
# Download the KoELECTRA-tokenized CSV through the browser.
files.download('df_koelectra.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

beomi/KcBERT-base 모델

In [None]:
import torch
from transformers import BertTokenizer
import pandas as pd

# Load the tokenizer of this checkpoint
model_name = 'beomi/KcBERT-base'
tokenizer = BertTokenizer.from_pretrained(model_name)

# Work on a copy of the original frame (df_save)
df = df_save.copy()

# Tokenize every document. Missing text is encoded as an empty string, so each
# row still gets the special tokens instead of an empty id list.
df['tokens_KcBERT'] = df['판례내용'].apply(lambda x: tokenizer.encode(x if pd.notna(x) else '', add_special_tokens=True, max_length=512, truncation=True))

# Save to CSV (token lists are written in their string form)
df.to_csv('df_KcBERT.csv', index=False)
print("File saved: df_KcBERT.csv")

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/250k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/619 [00:00<?, ?B/s]



File saved: df_KcBERT.csv


In [None]:
# Download the KcBERT-tokenized CSV through the browser.
files.download('df_KcBERT.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

klue/bert-base 모델

In [None]:
import ast

import torch
# torch's AdamW replaces the deprecated copy that used to ship with transformers.
from torch.optim import AdamW
from transformers import BertTokenizer, BertForSequenceClassification
import pandas as pd
from torch.utils.data import DataLoader, Dataset

# Run on CPU
device = torch.device('cpu')

# Dataset over the frame tokenized for klue/bert-base
class LegalDataset(Dataset):
    """Serve ``(token ids, label)`` tensor pairs from the 'tokens_klue' and 'label' columns."""

    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        raw_tokens = row['tokens_klue']
        # Token lists read back from CSV are strings such as "[2, 31, 3]".
        # literal_eval parses them without executing arbitrary code, unlike eval.
        tokens = ast.literal_eval(raw_tokens) if isinstance(raw_tokens, str) else raw_tokens
        tokens = torch.tensor(tokens, dtype=torch.long)
        label = torch.tensor(row['label'], dtype=torch.long)
        return tokens, label

# Load the tokenizer and the model
model_name = 'klue/bert-base'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=13)
model.to(device)

# Load the pre-tokenized data written by the klue tokenization cell
df = pd.read_csv('df_klue.csv')
# Keep only rows with a known verdict label and attach the integer class id
# that the dataset reads from the 'label' column.
df = df[df['판결유형'].isin(list(label_map))].copy()
df['label'] = df['판결유형'].map(label_map)

# 70:30 train/validation split
train_data = df.sample(frac=0.7, random_state=42)
valid_data = df.drop(train_data.index)

# Datasets and loaders. Documents differ in length, so batches are padded by
# collate_fn (it pads with the pad id of the current `tokenizer`).
train_dataset = LegalDataset(train_data)
valid_dataset = LegalDataset(valid_data)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True, collate_fn=collate_fn)
valid_loader = DataLoader(valid_dataset, batch_size=8, shuffle=False, collate_fn=collate_fn)

# Optimizer. weight_decay=0.0 keeps the default of the former transformers AdamW.
optimizer = AdamW(model.parameters(), lr=2e-5, weight_decay=0.0)

# Training state
best_val_loss = float('inf')
train_accuracies, val_accuracies = [], []
train_losses, val_losses = [], []

# Checkpoint names contain '/', which would point into a non-existent
# directory when used as a file name, so it is replaced.
safe_name = model_name.replace('/', '_')

# Train and validate (uses the train/validate functions defined above)
for epoch in range(20):
    train_loss, train_acc = train(model, train_loader, optimizer, epoch)
    val_loss, val_acc = validate(model, valid_loader, epoch)

    # Record this epoch's results
    train_losses.append(train_loss)
    val_losses.append(val_loss)
    train_accuracies.append(train_acc)
    val_accuracies.append(val_acc)

    # Keep the weights with the lowest validation loss (no early stopping here)
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        torch.save(model.state_dict(), f'{safe_name}_best_model.pth')

# Save the metrics, one row per epoch
metrics = {
    'model_name': model_name,
    'train_accuracy': train_accuracies,
    'val_accuracy': val_accuracies,
    'train_loss': train_losses,
    'val_loss': val_losses,
}
metrics_df = pd.DataFrame(metrics)
metrics_df.to_csv(f'{safe_name}_metrics.csv', index=False)
print(f"{model_name} metrics saved.")

monologg/koelectra-base-v3-discriminator 모델

In [None]:
import ast

import torch
# torch's AdamW replaces the deprecated copy that used to ship with transformers.
from torch.optim import AdamW
from transformers import BertTokenizer, ElectraTokenizer, ElectraForSequenceClassification
import pandas as pd
from torch.utils.data import DataLoader, Dataset

# Run on CPU
device = torch.device('cpu')

# Dataset over the frame tokenized for KoELECTRA
class LegalDataset(Dataset):
    """Serve ``(token ids, label)`` tensor pairs from the 'tokens_koelectra' and 'label' columns."""

    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        raw_tokens = row['tokens_koelectra']
        # Token lists read back from CSV are strings such as "[2, 31, 3]".
        # literal_eval parses them without executing arbitrary code, unlike eval.
        tokens = ast.literal_eval(raw_tokens) if isinstance(raw_tokens, str) else raw_tokens
        tokens = torch.tensor(tokens, dtype=torch.long)
        label = torch.tensor(row['label'], dtype=torch.long)
        return tokens, label

# Load the tokenizer and the model. The checkpoint declares 'ElectraTokenizer'
# as its tokenizer class, so that class is used rather than BertTokenizer.
model_name = 'monologg/koelectra-base-v3-discriminator'
tokenizer = ElectraTokenizer.from_pretrained(model_name)
model = ElectraForSequenceClassification.from_pretrained(model_name, num_labels=13)
model.to(device)

# Load the pre-tokenized data written by the KoELECTRA tokenization cell
df = pd.read_csv('df_koelectra.csv')
# Keep only rows with a known verdict label and attach the integer class id
# that the dataset reads from the 'label' column.
df = df[df['판결유형'].isin(list(label_map))].copy()
df['label'] = df['판결유형'].map(label_map)

# 70:30 train/validation split
train_data = df.sample(frac=0.7, random_state=42)
valid_data = df.drop(train_data.index)

# Datasets and loaders. Documents differ in length, so batches are padded by
# collate_fn (it pads with the pad id of the current `tokenizer`).
train_dataset = LegalDataset(train_data)
valid_dataset = LegalDataset(valid_data)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True, collate_fn=collate_fn)
valid_loader = DataLoader(valid_dataset, batch_size=8, shuffle=False, collate_fn=collate_fn)

# Optimizer. weight_decay=0.0 keeps the default of the former transformers AdamW.
optimizer = AdamW(model.parameters(), lr=2e-5, weight_decay=0.0)

# Training state
best_val_loss = float('inf')
train_accuracies, val_accuracies = [], []
train_losses, val_losses = [], []

# Checkpoint names contain '/', which would point into a non-existent
# directory when used as a file name, so it is replaced.
safe_name = model_name.replace('/', '_')

# Train and validate (uses the train/validate functions defined above)
for epoch in range(20):
    train_loss, train_acc = train(model, train_loader, optimizer, epoch)
    val_loss, val_acc = validate(model, valid_loader, epoch)

    # Record this epoch's results
    train_losses.append(train_loss)
    val_losses.append(val_loss)
    train_accuracies.append(train_acc)
    val_accuracies.append(val_acc)

    # Keep the weights with the lowest validation loss (no early stopping here)
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        torch.save(model.state_dict(), f'{safe_name}_best_model.pth')

# Save the metrics, one row per epoch
metrics = {
    'model_name': model_name,
    'train_accuracy': train_accuracies,
    'val_accuracy': val_accuracies,
    'train_loss': train_losses,
    'val_loss': val_losses,
}
metrics_df = pd.DataFrame(metrics)
metrics_df.to_csv(f'{safe_name}_metrics.csv', index=False)
print(f"{model_name} metrics saved.")


beomi/KcBERT-base 모델

In [None]:
import ast

# torch's AdamW replaces the deprecated copy that used to ship with transformers.
from torch.optim import AdamW

# Dataset over the frame tokenized for KcBERT. It is defined here because the
# LegalDataset left over from the previous cell reads a different token column.
class LegalDataset(Dataset):
    """Serve ``(token ids, label)`` tensor pairs from the 'tokens_KcBERT' and 'label' columns."""

    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        raw_tokens = row['tokens_KcBERT']
        # Token lists read back from CSV are strings such as "[2, 31, 3]".
        # literal_eval parses them without executing arbitrary code.
        tokens = ast.literal_eval(raw_tokens) if isinstance(raw_tokens, str) else raw_tokens
        tokens = torch.tensor(tokens, dtype=torch.long)
        label = torch.tensor(row['label'], dtype=torch.long)
        return tokens, label

# Load the tokenizer and the model
model_name = 'beomi/KcBERT-base'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=13)
model.to(device)

# Load the pre-tokenized data written by the KcBERT tokenization cell
df = pd.read_csv('df_KcBERT.csv')
# Keep only rows with a known verdict label and attach the integer class id
# that the dataset reads from the 'label' column.
df = df[df['판결유형'].isin(list(label_map))].copy()
df['label'] = df['판결유형'].map(label_map)

# 70:30 train/validation split. Validating on the rows used for training
# would report inflated scores, so the two sets are kept disjoint.
train_data = df.sample(frac=0.7, random_state=42)
valid_data = df.drop(train_data.index)

# Datasets and loaders. Documents differ in length, so batches are padded by
# collate_fn (it pads with the pad id of the current `tokenizer`).
train_dataset = LegalDataset(train_data)
valid_dataset = LegalDataset(valid_data)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True, collate_fn=collate_fn)
valid_loader = DataLoader(valid_dataset, batch_size=8, shuffle=False, collate_fn=collate_fn)

# Optimizer. weight_decay=0.0 keeps the default of the former transformers AdamW.
optimizer = AdamW(model.parameters(), lr=2e-5, weight_decay=0.0)

# Training state
best_val_loss = float('inf')
train_accuracies, val_accuracies = [], []
train_losses, val_losses = [], []

# Checkpoint names contain '/', which would point into a non-existent
# directory when used as a file name, so it is replaced.
safe_name = model_name.replace('/', '_')

for epoch in range(20):
    train_loss, train_acc = train(model, train_loader, optimizer, epoch)
    val_loss, val_acc = validate(model, valid_loader, epoch)

    # Record this epoch's results
    train_losses.append(train_loss)
    val_losses.append(val_loss)
    train_accuracies.append(train_acc)
    val_accuracies.append(val_acc)

    # Keep the weights with the lowest validation loss (no early stopping here)
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        torch.save(model.state_dict(), f'{safe_name}_best_model.pth')

# Save the metrics, one row per epoch
metrics = {
    'model_name': model_name,
    'train_accuracy': train_accuracies,
    'val_accuracy': val_accuracies,
    'train_loss': train_losses,
    'val_loss': val_losses,
}
metrics_df = pd.DataFrame(metrics)
metrics_df.to_csv(f'{safe_name}_metrics.csv', index=False)
print(f"{model_name} metrics saved.")