In [1]:
#!pip install mxnet
#!pip install gluonnlp pandas tqdm
#!pip install sentencepiece
#!pip install transformers
#!pip install torch

In [2]:
import pandas as pd
import numpy as np
import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from transformers import AdamW, get_linear_schedule_with_warmup
from transformers import BertModel, BertTokenizer
from tqdm.notebook import tqdm
import gluonnlp as nlp
from gluonnlp.data import SentencepieceTokenizer
from transformers import BertForSequenceClassification
import re

In [3]:
def clean_text(text):
    # Convert to lowercase
    text = text.lower()
    
    # Remove special characters and numbers
    text = re.sub(r'[^가-힣a-z\s]', ' ', text)
    
    # Remove extra spaces
    text = ' '.join(text.split())
    
    # 한국어 불용어 리스트
    stopwords = [
        '이', '있', '하', '것', '들', '그', '되', '수', '이', '보', '않', '없', '나', '사람', '주', '아니', 
        '등', '같', '우리', '때', '년', '가', '한', '지', '대하', '오', '말', '일', '그렇', '위하', 
        '때문', '그것', '두', '말하', '알', '그러나', '받', '못하', '일', '그런', '또', '문제', '더', '사회', 
        '많', '그리고', '좋', '크', '따르', '중', '나오', '가지', '씨', '시키', '만들', '지금', '생각하', 
        '그러', '속', '하나', '집', '살', '모르', '적', '월', '데', '자신', '안', '어떤', '내', '내', '경우',
        '명', '생각', '시간', '그녀', '다시', '이런', '앞', '보이', '번', '나', '다른', '어떻', '여자', '개',
        '전', '들', '사실', '이렇', '점', '싶', '말', '정도', '좀', '원', '잘', '통하', '소리', '놓'
    ]
    
    # 불용어 제거
    text = ' '.join(word for word in text.split() if word not in stopwords)
    
    return text

In [4]:
# 데이터 로드
train = pd.read_csv('./data/train.csv')
test = pd.read_json('./data/test.json').transpose()

# train과 test 데이터의 텍스트 열 정규화
train['conversation'] = train['conversation'].apply(clean_text)
test['text'] = test['text'].apply(clean_text)

# 지정된 클래스를 숫자로 인코딩
label_dict = {
    '협박 대화': 0,
    '갈취 대화': 1,
    '직장 내 괴롭힘 대화': 2,
    '기타 괴롭힘 대화': 3
}
train['label_encoded'] = train['class'].map(label_dict)

# Tokenizer 설정
from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained('monologg/kobert')

# Dataset 정의
class BERTDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe['conversation'].tolist()
        self.labels = dataframe['label_encoded'].tolist()
        self.max_len = max_len

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        text = str(self.text[index])
        text = " ".join(text.split())
        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True,
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'targets': torch.tensor(self.labels[index], dtype=torch.long)
        }
        
# DataLoader 설정
MAX_LEN = 512
TRAIN_BATCH_SIZE = 8
VALID_BATCH_SIZE = 8
train_dataset = BERTDataset(train, tokenizer, MAX_LEN)
train_data_loader = DataLoader(train_dataset, batch_size=TRAIN_BATCH_SIZE)


In [5]:
# 모델 정의
class BERTClass(torch.nn.Module):
    def __init__(self):
        super(BERTClass, self).__init__()
        self.l1 = BertForSequenceClassification.from_pretrained('monologg/kobert', num_labels=4)
        
    def forward(self, ids, mask):
        output = self.l1(ids, attention_mask=mask)
        return output.logits

model = BERTClass()
model.to(torch.device("cuda"))

# 손실 함수 및 최적화 함수 설정
loss_function = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-5)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at monologg/kobert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
# 학습 함수 정의
def train(epoch):
    model.train()
    for _, data in tqdm(enumerate(train_data_loader), total=len(train_data_loader)):
        ids = data['ids'].to(torch.device("cuda"), dtype=torch.long)
        mask = data['mask'].to(torch.device("cuda"), dtype=torch.long)
        targets = data['targets'].to(torch.device("cuda"), dtype=torch.long)

        outputs = model(ids, mask)
        optimizer.zero_grad()
        loss = loss_function(outputs, targets)
        loss.backward()
        optimizer.step()

# 학습 실행
EPOCHS = 3
for epoch in range(EPOCHS):
    train(epoch)


  0%|          | 0/494 [00:00<?, ?it/s]

  0%|          | 0/494 [00:00<?, ?it/s]

  0%|          | 0/494 [00:00<?, ?it/s]

In [9]:
# 테스트 데이터를 위한 DataLoader 설정
class BERTTestDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe['text'].tolist()
        self.max_len = max_len

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        text = str(self.text[index])
        text = " ".join(text.split())
        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True,
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long)
        }

test_dataset = BERTTestDataset(test, tokenizer, MAX_LEN)
test_data_loader = DataLoader(test_dataset, batch_size=VALID_BATCH_SIZE)

# 예측 함수 정의
def predict():
    model.eval()
    predictions = []
    with torch.no_grad():
        for _, data in tqdm(enumerate(test_data_loader), total=len(test_data_loader)):
            ids = data['ids'].to(torch.device("cuda"), dtype=torch.long)
            mask = data['mask'].to(torch.device("cuda"), dtype=torch.long)
            
            outputs = model(ids, mask)
            _, predicted = torch.max(outputs, 1)
            
            predictions.extend(predicted.cpu().numpy().tolist())
    return predictions

# 예측 실행
predictions = predict()

# 결과를 submission.csv로 저장
submission = pd.DataFrame({'file_name': test.index, 'class': predictions})
submission.to_csv('./final_submission.csv', index=False)


  0%|          | 0/50 [00:00<?, ?it/s]