In [1]:
!pip install transformers



In [2]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, ElectraTokenizer, BertForSequenceClassification, ElectraForSequenceClassification
from tqdm.notebook import tqdm
import re
import torch.nn.functional as F
import numpy as np

In [3]:
def clean_text(text):
    # Convert to lowercase
    text = text.lower()

    # Remove special characters and numbers
    text = re.sub(r'[^가-힣a-z\s]', ' ', text)

    # Remove extra spaces
    text = ' '.join(text.split())

    # 한국어 불용어 리스트
    stopwords = [
        '이', '있', '하', '것', '들', '그', '되', '수', '이', '보', '않', '없', '나', '사람', '주', '아니',
        '등', '같', '우리', '때', '년', '가', '한', '지', '대하', '오', '말', '일', '그렇', '위하',
        '때문', '그것', '두', '말하', '알', '그러나', '받', '못하', '일', '그런', '또', '문제', '더', '사회',
        '많', '그리고', '좋', '크', '따르', '중', '나오', '가지', '씨', '시키', '만들', '지금', '생각하',
        '그러', '속', '하나', '집', '살', '모르', '적', '월', '데', '자신', '안', '어떤', '내', '내', '경우',
        '명', '생각', '시간', '그녀', '다시', '이런', '앞', '보이', '번', '나', '다른', '어떻', '여자', '개',
        '전', '들', '사실', '이렇', '점', '싶', '말', '정도', '좀', '원', '잘', '통하', '소리', '놓'
    ]

    # 불용어 제거
    text = ' '.join(word for word in text.split() if word not in stopwords)

    return text

In [4]:
# 데이터 로드
train = pd.read_csv('/content/drive/MyDrive/모두의 연구소/Aiffel/DLcon/data/train.csv')
test = pd.read_json('/content/drive/MyDrive/모두의 연구소/Aiffel/DLcon/data/test.json').transpose()

# train과 test 데이터의 텍스트 열 정규화
train['conversation'] = train['conversation'].apply(clean_text)
test['text'] = test['text'].apply(clean_text)
test.columns=['conversation']

# 지정된 클래스를 숫자로 인코딩
label_dict = {
    '협박 대화': 0,
    '갈취 대화': 1,
    '직장 내 괴롭힘 대화': 2,
    '기타 괴롭힘 대화': 3
}
train['label_encoded'] = train['class'].map(label_dict)

In [5]:
train.head()

Unnamed: 0,idx,class,conversation,label_encoded
0,0,협박 대화,너 스스로를 죽여달라고 애원하는 것인가 아닙니다 죄송합니다 죽을 거면 혼자 죽지 우...,0
1,1,협박 대화,길동경찰서입니다 시 분 마트에 폭발물을 설치할거다 네 똑바로 들어 한번만 얘기한다 ...,0
2,2,기타 괴롭힘 대화,너 되게 귀여운거 알지 나보다 작은 남자는 첨봤어 그만해 니들 놀리는거 재미없어 지...,3
3,3,갈취 대화,어이 거기 예 너 말이야 너 이리 오라고 무슨 너 옷 좋아보인다 얘 돈 있나봐 아니...,1
4,4,갈취 대화,저기요 혹시 날이 너무 뜨겁잖아요 저희 회사에서 선크림 파는데 손등에 발라보실래요 ...,1


In [6]:
test.head()

Unnamed: 0,conversation
t_000,아가씨 담배한갑주소 네 원입니다 어 네 지갑어디갔지 에이 버스에서 잃어버렸나보네 그...
t_001,우리팀에서 다른팀으로 갈 없나 그럼 영지씨가 가는건 어때 네 제가요 그렇지 달만 파...
t_002,너 오늘 그게 뭐야 네 제가 뭘 잘못했나요 제대로 하지 네 똑바로 하지 행실이 맘에...
t_004,아무튼 앞으로 니가 와이파이야 응 와이파이 온 켰어 반말 주인님이라고도 말해야지 켰...
t_005,그러니까 빨리 말해 선생님 제발 살려주십시오 비밀번호 틀릴 때마다 손톱 하나씩 뺀찌...


In [7]:
# KoBERT용 Dataset 정의
class KoBERTDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe['conversation'].tolist()
        self.labels = dataframe['label_encoded'].tolist() if 'label_encoded' in dataframe else [0] * len(dataframe)
        self.max_len = max_len

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        text = str(self.text[index])
        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True,
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'targets': torch.tensor(self.labels[index], dtype=torch.long)
        }

# KoELECTRA용 Dataset 정의
class KoELECTRADataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe['conversation'].tolist()
        self.labels = dataframe['label_encoded'].tolist() if 'label_encoded' in dataframe else [0] * len(dataframe)
        self.max_len = max_len

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        text = str(self.text[index])
        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True,
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'targets': torch.tensor(self.labels[index], dtype=torch.long)
        }

In [8]:
# DataLoader 설정
MAX_LEN = 350
BATCH_SIZE = 4

kobert_tokenizer = BertTokenizer.from_pretrained('monologg/kobert')
koelectra_tokenizer = ElectraTokenizer.from_pretrained('monologg/koelectra-base-v3-discriminator')

kobert_train_dataset = KoBERTDataset(train, kobert_tokenizer, MAX_LEN)
koelectra_train_dataset = KoELECTRADataset(train, koelectra_tokenizer, MAX_LEN)

kobert_train_data_loader = DataLoader(kobert_train_dataset, batch_size=BATCH_SIZE)
koelectra_train_data_loader = DataLoader(koelectra_train_dataset, batch_size=BATCH_SIZE)

kobert_test_dataset = KoBERTDataset(test, kobert_tokenizer, MAX_LEN)
koelectra_test_dataset = KoELECTRADataset(test, koelectra_tokenizer, MAX_LEN)

kobert_test_data_loader = DataLoader(kobert_test_dataset, batch_size=BATCH_SIZE)
koelectra_test_data_loader = DataLoader(koelectra_test_dataset, batch_size=BATCH_SIZE)

In [9]:
# 모델 정의
class KoBERTClass(torch.nn.Module):
    def __init__(self):
        super(KoBERTClass, self).__init__()
        self.l1 = BertForSequenceClassification.from_pretrained('monologg/kobert', num_labels=4)

    def forward(self, ids, mask):
        output = self.l1(ids, attention_mask=mask)
        return output.logits

class KoELECTRAClass(torch.nn.Module):
    def __init__(self):
        super(KoELECTRAClass, self).__init__()
        self.l1 = ElectraForSequenceClassification.from_pretrained('monologg/koelectra-base-v3-discriminator', num_labels=4)

    def forward(self, ids, mask):
        output = self.l1(ids, attention_mask=mask)
        return output.logits

In [10]:
# 모델 인스턴스 생성
kobert_model = KoBERTClass()
kobert_model.to(torch.device("cuda"))

koelectra_model = KoELECTRAClass()
koelectra_model.to(torch.device("cuda"))

# 손실 함수 및 최적화 함수 설정
loss_function = torch.nn.CrossEntropyLoss()
kobert_optimizer = torch.optim.Adam(params=kobert_model.parameters(), lr=1e-5)
koelectra_optimizer = torch.optim.Adam(params=koelectra_model.parameters(), lr=1e-5)

# 학습 함수 정의
def train(model, tokenizer, optimizer, data_loader, epoch):
    model.train()
    for _, data in tqdm(enumerate(data_loader), total=len(data_loader)):
        ids = data['ids'].to(torch.device("cuda"), dtype=torch.long)
        mask = data['mask'].to(torch.device("cuda"), dtype=torch.long)
        targets = data['targets'].to(torch.device("cuda"), dtype=torch.long)

        outputs = model(ids, mask)
        optimizer.zero_grad()
        loss = loss_function(outputs, targets)
        loss.backward()
        optimizer.step()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at monologg/kobert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at monologg/koelectra-base-v3-discriminator and are newly initialized: ['classifier.dense.bias', 'classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
# 학습 실행
EPOCHS = 5
for epoch in range(EPOCHS):
    print("Training KoBERT model, Epoch:", epoch)
    for bertE in range(2):
      train(kobert_model, kobert_tokenizer, kobert_optimizer, kobert_train_data_loader, epoch)

    print("Training KoELECTRA model, Epoch:", epoch)
    train(koelectra_model, koelectra_tokenizer, koelectra_optimizer, koelectra_train_data_loader, epoch)

Training KoBERT model, Epoch: 0


  0%|          | 0/988 [00:00<?, ?it/s]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


  0%|          | 0/988 [00:00<?, ?it/s]

Training KoELECTRA model, Epoch: 0


  0%|          | 0/988 [00:00<?, ?it/s]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Training KoBERT model, Epoch: 1


  0%|          | 0/988 [00:00<?, ?it/s]

  0%|          | 0/988 [00:00<?, ?it/s]

Training KoELECTRA model, Epoch: 1


  0%|          | 0/988 [00:00<?, ?it/s]

Training KoBERT model, Epoch: 2


  0%|          | 0/988 [00:00<?, ?it/s]

  0%|          | 0/988 [00:00<?, ?it/s]

Training KoELECTRA model, Epoch: 2


  0%|          | 0/988 [00:00<?, ?it/s]

Training KoBERT model, Epoch: 3


  0%|          | 0/988 [00:00<?, ?it/s]

  0%|          | 0/988 [00:00<?, ?it/s]

Training KoELECTRA model, Epoch: 3


  0%|          | 0/988 [00:00<?, ?it/s]

Training KoBERT model, Epoch: 4


  0%|          | 0/988 [00:00<?, ?it/s]

  0%|          | 0/988 [00:00<?, ?it/s]

Training KoELECTRA model, Epoch: 4


  0%|          | 0/988 [00:00<?, ?it/s]

In [12]:
# 확률 예측 함수 정의

def predict_proba(model, data_loader):
    model.eval()
    all_probs = []
    with torch.no_grad():
        for _, data in tqdm(enumerate(data_loader), total=len(data_loader)):
            ids = data['ids'].to(torch.device("cuda"), dtype=torch.long)
            mask = data['mask'].to(torch.device("cuda"), dtype=torch.long)

            outputs = model(ids, mask)
            probs = F.softmax(outputs, dim=1)
            all_probs.extend(probs.cpu().numpy().tolist())
    return all_probs

# 각 모델로 확률 예측 실행
kobert_probs = predict_proba(kobert_model, kobert_test_data_loader)
koelectra_probs = predict_proba(koelectra_model, koelectra_test_data_loader)

# Kobert 예측값을 submission.csv로 저장
kobert_predictions = [np.argmax(prob) for prob in kobert_probs]
kobert_submission = pd.DataFrame({'file_name': test.index, 'class': kobert_predictions})
kobert_submission.to_csv('./kobert_submission.csv', index=False)

# Koelectra 예측값을 submission.csv로 저장
koelectra_predictions = [np.argmax(prob) for prob in koelectra_probs]
koelectra_submission = pd.DataFrame({'file_name': test.index, 'class': koelectra_predictions})
koelectra_submission.to_csv('./koelectra_submission.csv', index=False)

# 확률을 평균내어 최종 클래스 결정
final_predictions = []
for k_bert_prob, k_electra_prob in zip(kobert_probs, koelectra_probs):
    avg_prob = [(a+b)/2 for a, b in zip(k_bert_prob, k_electra_prob)]
    final_predictions.append(np.argmax(avg_prob))

# 결과를 submission.csv로 저장
submission = pd.DataFrame({'file_name': test.index, 'class': final_predictions})
submission.to_csv('./ensemble_submission.csv', index=False)

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]