In [24]:
!pip install transformers



In [25]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, ElectraTokenizer, BertForSequenceClassification, ElectraForSequenceClassification
from tqdm.notebook import tqdm
import re
import torch.nn.functional as F
import numpy as np

In [26]:
def clean_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove special characters and numbers
    text = re.sub(r'[^가-힣a-z\s]', ' ', text)
    # Remove extra spaces
    text = ' '.join(text.split())
    # 한국어 불용어 리스트
    stopwords = [
        '이', '있', '하', '것', '들', '그', '되', '수', '이', '보', '않', '없', '나', '사람', '주', '아니',
        '등', '같', '우리', '때', '년', '가', '한', '지', '대하', '오', '말', '일', '그렇', '위하',
        '때문', '그것', '두', '말하', '알', '그러나', '받', '못하', '일', '그런', '또', '문제', '더', '사회',
        '많', '그리고', '좋', '크', '따르', '중', '나오', '가지', '씨', '시키', '만들', '지금', '생각하',
        '그러', '속', '하나', '집', '살', '모르', '적', '월', '데', '자신', '안', '어떤', '내', '내', '경우',
        '명', '생각', '시간', '그녀', '다시', '이런', '앞', '보이', '번', '나', '다른', '어떻', '여자', '개',
        '전', '들', '사실', '이렇', '점', '싶', '말', '정도', '좀', '원', '잘', '통하', '소리', '놓'
    ]
    # 불용어 제거
    text = ' '.join(word for word in text.split() if word not in stopwords)
    return text

In [27]:
# 데이터 로드
train = pd.read_csv('/content/drive/MyDrive/모두의 연구소/Aiffel/DLcon/data/train.csv')
test = pd.read_json('/content/drive/MyDrive/모두의 연구소/Aiffel/DLcon/data/test.json').transpose()

# train과 test 데이터의 텍스트 열 정규화
train['conversation'] = train['conversation'].apply(clean_text)
test['text'] = test['text'].apply(clean_text)

# 지정된 클래스를 숫자로 인코딩
label_dict = {
    '협박 대화': 0,
    '갈취 대화': 1,
    '직장 내 괴롭힘 대화': 2,
    '기타 괴롭힘 대화': 3
}
train['label_encoded'] = train['class'].map(label_dict)

In [28]:
# KLUE용 Dataset 정의
class KLUEDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe['conversation' if 'conversation' in dataframe else 'text'].tolist()
        self.labels = dataframe['label_encoded'].tolist() if 'label_encoded' in dataframe else [0] * len(dataframe)
        self.max_len = max_len

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        text = str(self.text[index])
        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True,
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'targets': torch.tensor(self.labels[index], dtype=torch.long)
        }

# KoELECTRA용 Dataset 정의
class KoELECTRADataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe['conversation' if 'conversation' in dataframe else 'text'].tolist()
        self.labels = dataframe['label_encoded'].tolist() if 'label_encoded' in dataframe else [0] * len(dataframe)
        self.max_len = max_len

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        text = str(self.text[index])
        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True,
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'targets': torch.tensor(self.labels[index], dtype=torch.long)
        }

In [29]:
# DataLoader 설정
MAX_LEN = 350
BATCH_SIZE = 4

klue_tokenizer = BertTokenizer.from_pretrained('klue/bert-base')
koelectra_tokenizer = ElectraTokenizer.from_pretrained('monologg/koelectra-base-v3-discriminator')

klue_train_dataset = KLUEDataset(train, klue_tokenizer, MAX_LEN)
koelectra_train_dataset = KoELECTRADataset(train, koelectra_tokenizer, MAX_LEN)

klue_train_data_loader = DataLoader(klue_train_dataset, batch_size=BATCH_SIZE)
koelectra_train_data_loader = DataLoader(koelectra_train_dataset, batch_size=BATCH_SIZE)

klue_test_dataset = KLUEDataset(test, klue_tokenizer, MAX_LEN)
koelectra_test_dataset = KoELECTRADataset(test, koelectra_tokenizer, MAX_LEN)

klue_test_data_loader = DataLoader(klue_test_dataset, batch_size=BATCH_SIZE)
koelectra_test_data_loader = DataLoader(koelectra_test_dataset, batch_size=BATCH_SIZE)

In [30]:
# 모델 정의
class KLUEClass(torch.nn.Module):
    def __init__(self):
        super(KLUEClass, self).__init__()
        self.l1 = BertForSequenceClassification.from_pretrained('klue/bert-base', num_labels=4)

    def forward(self, ids, mask):
        output = self.l1(ids, attention_mask=mask)
        return output.logits

class KoELECTRAClass(torch.nn.Module):
    def __init__(self):
        super(KoELECTRAClass, self).__init__()
        self.l1 = ElectraForSequenceClassification.from_pretrained('monologg/koelectra-base-v3-discriminator', num_labels=4)

    def forward(self, ids, mask):
        output = self.l1(ids, attention_mask=mask)
        return output.logits

In [31]:
# KLUE 모델 로드하기
SAVE_PATH = '/content/drive/MyDrive/모두의 연구소/Aiffel/DLcon/klue_model.pth'
klue_model = KLUEClass()
#klue_model.load_state_dict(torch.load(SAVE_PATH))
klue_model.load_state_dict(torch.load(SAVE_PATH), strict=False)
klue_model.to(torch.device("cuda"))
klue_model.eval()  # 항상 eval 모드로 설정한 후 예측을 수행해야 합니다.

# KoELECTRA 모델 인스턴스 생성
koelectra_model = KoELECTRAClass()
koelectra_model.to(torch.device("cuda"))

# Electra 모델 로드하기
# loaded_electra_model = KoELECTRAlass()
# loaded_electra_model.load_state_dict(torch.load(SAVE_PATH_el))
# loaded_electra_model.to(torch.device("cuda"))
# loaded_electra_model.eval()

# 손실 함수 및 최적화 함수 설정
loss_function = torch.nn.CrossEntropyLoss()
klue_optimizer = torch.optim.Adam(params=klue_model.parameters(), lr=1e-5)
koelectra_optimizer = torch.optim.Adam(params=koelectra_model.parameters(), lr=1e-5)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at klue/bert-base and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at monologg/koelectra-base-v3-discriminator and are newly initialized: ['classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [32]:
# 학습 함수 정의
def train(model, tokenizer, optimizer, data_loader, epoch):
    model.train()
    for _, data in tqdm(enumerate(data_loader), total=len(data_loader)):
        ids = data['ids'].to(torch.device("cuda"), dtype=torch.long)
        mask = data['mask'].to(torch.device("cuda"), dtype=torch.long)
        targets = data['targets'].to(torch.device("cuda"), dtype=torch.long)

        outputs = model(ids, mask)
        optimizer.zero_grad()
        loss = loss_function(outputs, targets)
        loss.backward()
        optimizer.step()

In [33]:
# 학습 실행
# KLUE_EPOCHS = 2 이미 best condition
KOELECTRA_EPOCHS = 4

#for epoch in range(KLUE_EPOCHS):
#    print("Training KLUE model, Epoch:", epoch)
#    train(klue_model, klue_tokenizer, klue_optimizer, klue_train_data_loader, epoch)

for epoch in range(KOELECTRA_EPOCHS):
    print("Training KoELECTRA model, Epoch:", epoch)
    train(koelectra_model, koelectra_tokenizer, koelectra_optimizer, koelectra_train_data_loader, epoch)

Training KoELECTRA model, Epoch: 0


  0%|          | 0/988 [00:00<?, ?it/s]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Training KoELECTRA model, Epoch: 1


  0%|          | 0/988 [00:00<?, ?it/s]

Training KoELECTRA model, Epoch: 2


  0%|          | 0/988 [00:00<?, ?it/s]

Training KoELECTRA model, Epoch: 3


  0%|          | 0/988 [00:00<?, ?it/s]

In [34]:
# 확률 예측 함수 정의
def predict_proba(model, data_loader):
    model.eval()
    all_probs = []
    with torch.no_grad():
        for _, data in tqdm(enumerate(data_loader), total=len(data_loader)):
            ids = data['ids'].to(torch.device("cuda"), dtype=torch.long)
            mask = data['mask'].to(torch.device("cuda"), dtype=torch.long)

            outputs = model(ids, mask)
            probs = F.softmax(outputs, dim=1)
            all_probs.extend(probs.cpu().numpy().tolist())
    return all_probs

# 각 모델로 확률 예측 실행
klue_probs = predict_proba(klue_model, klue_test_data_loader)
koelectra_probs = predict_proba(koelectra_model, koelectra_test_data_loader)

# KLUE 예측값을 submission.csv로 저장
klue_predictions = [np.argmax(prob) for prob in klue_probs]
klue_submission = pd.DataFrame({'file_name': test.index, 'class': klue_predictions})
klue_submission.to_csv('./klue_submission.csv', index=False)

# Koelectra 예측값을 submission.csv로 저장
koelectra_predictions = [np.argmax(prob) for prob in koelectra_probs]
koelectra_submission = pd.DataFrame({'file_name': test.index, 'class': koelectra_predictions})
koelectra_submission.to_csv('./koelectra_submission.csv', index=False)

# 확률을 평균내어 최종 클래스 결정
final_predictions = []
for klue_prob, k_electra_prob in zip(klue_probs, koelectra_probs):
    avg_prob = [(a+b)/2 for a, b in zip(klue_prob, k_electra_prob)]
    # avg_prob = [(a*1.1+b*1.1)/2 for a, b in zip(klue_prob, k_electra_prob)]
    final_predictions.append(np.argmax(avg_prob))

# 앙상블 결과를 submission.csv로 저장
ensemble_submission = pd.DataFrame({'file_name': test.index, 'class': final_predictions})
ensemble_submission.to_csv('./ensemble_submission.csv', index=False)

  0%|          | 0/100 [00:00<?, ?it/s]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


  0%|          | 0/100 [00:00<?, ?it/s]

In [35]:
# 90일 경우 저장

# Ko-Electra 모델 저장하기
# SAVE_PATH_el = './koelectra_model.pth'
# torch.save(koelectra_model.state_dict(), SAVE_PATH_el)

In [36]:
_

''