In [1]:
#!pip install mxnet
#!pip install gluonnlp pandas tqdm
#!pip install sentencepiece
#!pip install transformers
#!pip install torch

In [3]:
import pandas as pd
import numpy as np
import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from transformers import AdamW, get_linear_schedule_with_warmup
from transformers import BertModel, BertTokenizer
from tqdm.notebook import tqdm
import gluonnlp as nlp
from gluonnlp.data import SentencepieceTokenizer
from transformers import BertForSequenceClassification

In [5]:
# Load the data
train = pd.read_csv('./data/train.csv')
# The JSON frame is transposed after loading, so what pandas parsed as columns
# becomes the row index (used later as the submission's file names).
test = pd.read_json('./data/test.json').transpose()

# Label-encode the training classes: every distinct value of `class` gets an
# integer id, numbered in order of first appearance in the CSV.
label_dict = {label: idx for idx, label in enumerate(train['class'].unique())}
train['label_encoded'] = train['class'].map(label_dict)

# Tokenizer setup. BertTokenizer is already imported in the imports cell, so it
# is not re-imported here.
# NOTE(review): KoBERT is usually paired with a SentencePiece tokenizer (this
# notebook imports gluonnlp's SentencepieceTokenizer but does not use it in the
# visible cells). Confirm that the stock BertTokenizer tokenizes Korean text as
# intended for this checkpoint and does not map most tokens to [UNK].
tokenizer = BertTokenizer.from_pretrained('monologg/kobert')

# Dataset definition
class BERTDataset(Dataset):
    """Map-style dataset yielding tokenized conversations with their labels.

    Args:
        dataframe: frame providing a 'conversation' column (raw text) and a
            'label_encoded' column (integer class id).
        tokenizer: object exposing ``encode_plus``; its result must contain
            'input_ids' and 'attention_mask'.
        max_len: length passed to the tokenizer as ``max_length``; sequences
            are padded and truncated to it.
    """

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe['conversation'].tolist()
        self.labels = dataframe['label_encoded'].tolist()
        self.max_len = max_len

    def __len__(self):
        """Return the number of samples."""
        return len(self.text)

    def __getitem__(self, index):
        """Tokenize sample ``index``.

        Returns:
            dict with long tensors 'ids' (token ids), 'mask' (attention mask)
            and 'targets' (the encoded label).
        """
        # Collapse every run of whitespace (including newlines) to one space.
        text = " ".join(str(self.text[index]).split())
        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            # Explicit replacements for the deprecated `pad_to_max_length=True`,
            # which also relied on truncation being switched on implicitly.
            padding='max_length',
            truncation=True,
            return_token_type_ids=True,
        )

        return {
            'ids': torch.tensor(inputs['input_ids'], dtype=torch.long),
            'mask': torch.tensor(inputs['attention_mask'], dtype=torch.long),
            'targets': torch.tensor(self.labels[index], dtype=torch.long),
        }
        
# DataLoader configuration
MAX_LEN = 256          # sequence length every sample is padded / truncated to
TRAIN_BATCH_SIZE = 8
VALID_BATCH_SIZE = 4
train_dataset = BERTDataset(train, tokenizer, MAX_LEN)
# Reshuffle the training samples every epoch: without it the batches follow the
# CSV row order on every pass, which biases the updates if rows are grouped by
# class.
train_data_loader = DataLoader(train_dataset, batch_size=TRAIN_BATCH_SIZE, shuffle=True)


In [None]:
# Model definition
class BERTClass(torch.nn.Module):
    """Thin wrapper around ``BertForSequenceClassification`` that returns logits.

    Args:
        num_labels: size of the classification head. Defaults to 4, the value
            this notebook originally hard-coded; it should match the number of
            distinct encoded labels.
        model_name: pretrained checkpoint to load. Defaults to
            'monologg/kobert'.
    """

    def __init__(self, num_labels=4, model_name='monologg/kobert'):
        super().__init__()
        self.l1 = BertForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

    def forward(self, ids, mask):
        """Return the classifier logits for token ids ``ids`` and attention mask ``mask``."""
        output = self.l1(ids, attention_mask=mask)
        return output.logits

# Prefer the GPU, but fall back to the CPU so that this cell does not raise on
# a machine without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BERTClass()
model.to(device)

# Loss function and optimizer
loss_function = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-5)

In [None]:
# Training function definition
# NOTE(review): `train` is also the name the data-loading cell gives to the
# training DataFrame. Defining this function rebinds the name, so re-running
# the data-loading cell afterwards makes `train(epoch)` fail. Consider giving
# the DataFrame and the function distinct names.
def train(epoch):
    """Run one optimisation pass over ``train_data_loader``.

    Uses the notebook-level ``model``, ``optimizer`` and ``loss_function``.

    Args:
        epoch: epoch index; used only to label the progress bar.

    Returns:
        float: mean loss over the batches processed (0.0 if the loader
        yields no batches).
    """
    # Move batches to whichever device the model's parameters live on instead
    # of assuming "cuda".
    device = next(model.parameters()).device
    model.train()

    running_loss = 0.0
    num_batches = 0
    progress = tqdm(train_data_loader, total=len(train_data_loader), desc=f"epoch {epoch}")
    for batch in progress:
        ids = batch['ids'].to(device, dtype=torch.long)
        mask = batch['mask'].to(device, dtype=torch.long)
        targets = batch['targets'].to(device, dtype=torch.long)

        # Clear gradients left over from the previous step before computing
        # new ones.
        optimizer.zero_grad()
        outputs = model(ids, mask)
        loss = loss_function(outputs, targets)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        num_batches += 1

    return running_loss / num_batches if num_batches else 0.0

# Run training: call train() once per epoch, EPOCHS times in total.
EPOCHS = 3
for epoch in range(EPOCHS):
    train(epoch)


In [None]:
# Dataset for the (unlabelled) test data
class BERTTestDataset(Dataset):
    """Map-style dataset yielding tokenized test texts (no labels).

    Args:
        dataframe: frame providing a 'text' column with the raw text.
        tokenizer: object exposing ``encode_plus``; its result must contain
            'input_ids' and 'attention_mask'.
        max_len: length passed to the tokenizer as ``max_length``; sequences
            are padded and truncated to it.
    """

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe['text'].tolist()
        self.max_len = max_len

    def __len__(self):
        """Return the number of samples."""
        return len(self.text)

    def __getitem__(self, index):
        """Tokenize sample ``index``.

        Returns:
            dict with long tensors 'ids' (token ids) and 'mask' (attention
            mask).
        """
        # Collapse every run of whitespace (including newlines) to one space.
        text = " ".join(str(self.text[index]).split())
        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            # Explicit replacements for the deprecated `pad_to_max_length=True`,
            # which also relied on truncation being switched on implicitly.
            padding='max_length',
            truncation=True,
            return_token_type_ids=True,
        )

        return {
            'ids': torch.tensor(inputs['input_ids'], dtype=torch.long),
            'mask': torch.tensor(inputs['attention_mask'], dtype=torch.long),
        }

# Wrap the test frame in a dataset and batch it in its original row order
# (shuffle=False is the DataLoader default, spelled out here for clarity).
test_dataset = BERTTestDataset(dataframe=test, tokenizer=tokenizer, max_len=MAX_LEN)
test_data_loader = DataLoader(dataset=test_dataset, batch_size=VALID_BATCH_SIZE, shuffle=False)

# Prediction function definition
def predict():
    """Predict a class index for every sample in ``test_data_loader``.

    Uses the notebook-level ``model`` in eval mode with gradients disabled.

    Returns:
        list[int]: predicted class indices, in loader order.
    """
    # Move batches to whichever device the model's parameters live on instead
    # of assuming "cuda".
    device = next(model.parameters()).device
    model.eval()

    predictions = []
    with torch.no_grad():
        for batch in tqdm(test_data_loader, total=len(test_data_loader)):
            ids = batch['ids'].to(device, dtype=torch.long)
            mask = batch['mask'].to(device, dtype=torch.long)

            logits = model(ids, mask)
            # Index of the highest logit along the class dimension.
            predicted = torch.max(logits, 1).indices

            predictions.extend(predicted.cpu().tolist())
    return predictions

# Run prediction
predictions = predict()

# Save the results as a CSV submission file.
# Invert label_dict once (class index -> class name) instead of rebuilding the
# key/value lists and scanning them for every single prediction.
idx_to_label = {idx: label for label, idx in label_dict.items()}
submission = pd.DataFrame({
    'file_name': test.index,
    'class': [idx_to_label[pred] for pred in predictions],
})
submission.to_csv('./final_submission.csv', index=False)