In [1]:
# Mount Google Drive (Colab only) so the CSV files under /content/drive/MyDrive are reachable.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# Install the runtime dependencies. No versions are pinned, so the resolved
# packages can differ from one session to the next.
!pip install mxnet
!pip install gluonnlp pandas tqdm
!pip install sentencepiece
!pip install transformers
!pip install torch
# Install the KoBERT tokenizer package (the kobert_hf subdirectory) directly from GitHub.
!pip install 'git+https://github.com/SKTBrain/KoBERT.git#egg=kobert_tokenizer&subdirectory=kobert_hf'


Collecting kobert_tokenizer
  Cloning https://github.com/SKTBrain/KoBERT.git to /tmp/pip-install-qftchu8o/kobert-tokenizer_ffae42fa2f624c238afca48a78e04c87
  Running command git clone --filter=blob:none --quiet https://github.com/SKTBrain/KoBERT.git /tmp/pip-install-qftchu8o/kobert-tokenizer_ffae42fa2f624c238afca48a78e04c87
  Resolved https://github.com/SKTBrain/KoBERT.git to commit 5c46b1c68e4755b54879431bd302db621f4d2f47
  Preparing metadata (setup.py) ... [?25l[?25hdone


In [3]:
import numpy as np
# Re-create the `np.bool` alias, which was removed in NumPy 1.24, before the
# imports in the next cell run.
# NOTE(review): presumably needed by mxnet/gluonnlp, which still reference np.bool — confirm.
np.bool = np.bool_

In [4]:
import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import gluonnlp as nlp
import numpy as np
from tqdm import tqdm, tqdm_notebook
from kobert_tokenizer import KoBERTTokenizer
from transformers import BertModel

from transformers import AdamW
from transformers.optimization import get_cosine_schedule_with_warmup
# Prefer the first CUDA GPU but fall back to the CPU, so the notebook still runs
# on a runtime without an accelerator (a hard-coded "cuda:0" fails at the first
# `.to(device)` call there).
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# Pretrained KoBERT tokenizer and encoder, fetched from the Hugging Face Hub.
tokenizer = KoBERTTokenizer.from_pretrained('skt/kobert-base-v1')
# return_dict=False: the encoder returns a plain tuple, which BERTClassifier.forward unpacks.
bertmodel = BertModel.from_pretrained('skt/kobert-base-v1', return_dict=False)
# gluonnlp vocabulary built from the tokenizer's SentencePiece model file.
vocab = nlp.vocab.BERTVocab.from_sentencepiece(tokenizer.vocab_file, padding_token='[PAD]')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'XLNetTokenizer'. 
The class this function is called from is 'KoBERTTokenizer'.


In [5]:
import pandas as pd

## Silence all warnings (note: this also hides deprecation warnings from the libraries used below)
import warnings
warnings.filterwarnings('ignore')

## Visualization
import matplotlib.pyplot as plt

## Label encoding and train/validation splitting
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
# Unused imports, kept commented out:
# from sklearn.utils import class_weight
# from tensorflow.keras import regularizers

# Data 준비

In [6]:
# Load the preprocessed train/test CSVs from the mounted Drive folder.
# Later cells read the 'conversation' and 'class' columns of the train frame
# and the 'text' column of the test frame.
preprocessed_train = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/preprocessed_train_v0.csv')
preprocessed_test = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/preprocessed_test_v0.csv')

In [7]:
# Longest conversation, measured in whitespace-separated words. Stored in a
# variable because a bare expression in the middle of a cell is silently discarded.
max_word_count = preprocessed_train["conversation"].apply(lambda x: len(x.split())).max()

# Class names in label-id order: threat, extortion, workplace harassment,
# other harassment, and ordinary conversation ('일반 대화') as the fifth class.
CLASS_NAMES = ['협박 대화', '갈취 대화', '직장 내 괴롭힘 대화', '기타 괴롭힘 대화', '일반 대화']

# Manual label mapping: class name -> integer id (its position in CLASS_NAMES).
class_mapping = {name: idx for idx, name in enumerate(CLASS_NAMES)}

# Cast the 'class' column to str before applying the mapping.
mapped_labels = preprocessed_train['class'].astype(str).map(class_mapping)

# Series.map yields NaN for every value missing from the mapping, e.g. an
# unexpected class name in the CSV, or this cell being run a second time on
# already-mapped ids. Fail loudly instead of letting NaN labels reach training.
unmapped = preprocessed_train.loc[mapped_labels.isna(), 'class']
if not unmapped.empty:
    raise ValueError(
        "Cannot map 'class' values {} to label ids; expected one of {}. "
        "If this cell was already run, reload preprocessed_train first.".format(
            unmapped.unique().tolist(), CLASS_NAMES))

preprocessed_train['class'] = mapped_labels.astype(int)
# labels = preprocessed_train['class']

In [8]:
# Pair every conversation with its (stringified) label id as a [text, label] row.
dataset_list = [
    [conversation, str(class_id)]
    for conversation, class_id in zip(preprocessed_train['conversation'], preprocessed_train['class'])
]

from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for validation; the fixed seed keeps the split reproducible.
dataset_train, dataset_valid = train_test_split(
    dataset_list,
    test_size=0.2,
    shuffle=True,
    random_state=1004,
)

In [9]:
# NOTE(review): this repeats the tokenizer load from the setup cell with identical arguments.
tokenizer = KoBERTTokenizer.from_pretrained('skt/kobert-base-v1')
# NOTE(review): `tok` is reassigned to `tokenizer.tokenize` in a later cell before
# it is first passed to BERTDataset, so this wrapper object is never used.
tok = nlp.data.BERTSPTokenizer(tokenizer, vocab, lower = False)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'XLNetTokenizer'. 
The class this function is called from is 'KoBERTTokenizer'.


In [10]:
# Dataset wrapper that turns raw [text, label] rows into BERT model inputs.
class BERTDataset(Dataset):
    """In-memory classification dataset for BERT.

    Every row of ``dataset`` is run through ``nlp.data.BERTSentenceTransform``
    once, at construction time, and the transformed results are cached.

    Args:
        dataset: iterable of indexable rows.
        sent_idx: position of the text inside a row.
        label_idx: position of the label inside a row (anything ``np.int32`` accepts).
        bert_tokenizer: tokenizer handed to the transform.
        vocab: vocabulary handed to the transform.
        max_len: forwarded to the transform as ``max_seq_length``.
        pad: forwarded to the transform as ``pad``.
        pair: forwarded to the transform as ``pair``.
    """

    def __init__(self, dataset, sent_idx, label_idx, bert_tokenizer, vocab, max_len,
                 pad, pair):
        to_features = nlp.data.BERTSentenceTransform(
            bert_tokenizer,
            max_seq_length=max_len,
            vocab=vocab,
            pad=pad,
            pair=pair,
        )

        # The transform receives each text wrapped in a one-element list.
        self.sentences = [to_features([row[sent_idx]]) for row in dataset]
        self.labels = [np.int32(row[label_idx]) for row in dataset]

    def __getitem__(self, i):
        """Return the transformed features of item ``i`` with its label appended."""
        features = self.sentences[i]
        target = self.labels[i]
        return features + (target,)

    def __len__(self):
        """Return the number of examples."""
        return len(self.labels)

In [11]:
max_len = 191  # max_seq_length handed to BERTSentenceTransform via BERTDataset
batch_size = 16  # batch size of the train / validation / test DataLoaders
warmup_ratio = 0.1  # fraction of the total scheduler steps used for warmup
num_epochs = 10  # number of passes over the training set
max_grad_norm = 1  # gradient-norm clipping threshold
log_interval = 200  # print training progress every this many batches
learning_rate =  5e-5  # learning rate passed to AdamW

## 토큰화 및 모델 정의

In [12]:
# BERTDataset tokenizes, integer-encodes and pads each example so it can be fed to BERT.
# The tokenizer's `tokenize` method is used as the tokenizer callable.
tok = tokenizer.tokenize

data_train = BERTDataset(dataset_train, sent_idx=0, label_idx=1, bert_tokenizer=tok,
                         vocab=vocab, max_len=max_len, pad=True, pair=False)
data_valid = BERTDataset(dataset_valid, sent_idx=0, label_idx=1, bert_tokenizer=tok,
                         vocab=vocab, max_len=max_len, pad=True, pair=False)

# Wrap the datasets in torch DataLoaders to finish input preprocessing.
# NOTE(review): neither loader shuffles; consider shuffle=True for training — confirm intent.
train_dataloader = DataLoader(data_train, batch_size=batch_size, num_workers=5)
valid_dataloader = DataLoader(data_valid, batch_size=batch_size, num_workers=5)

In [13]:
class BERTClassifier(nn.Module):
    """Linear classification head on top of a BERT encoder's pooled output.

    Args:
        bert: encoder module. It is called with ``input_ids``, ``token_type_ids``,
            ``attention_mask`` and ``return_dict=False`` and must return a 2-tuple
            whose second item is the pooled representation.
        hidden_size: width of the pooled representation fed to the linear layer.
        num_classes: number of output logits.
        dr_rate: dropout probability applied to the pooled output; ``None`` or
            ``0`` disables dropout.
        params: unused; kept so existing call sites keep working.
    """

    def __init__(self,
                 bert,
                 hidden_size = 768,
                 num_classes = 5,   # adjust to the number of classes
                 dr_rate = None,
                 params = None):
        super().__init__()
        self.bert = bert
        self.dr_rate = dr_rate

        self.classifier = nn.Linear(hidden_size, num_classes)
        if dr_rate:
            self.dropout = nn.Dropout(p = dr_rate)

    def gen_attention_mask(self, token_ids, valid_length):
        """Build a float mask with 1.0 on the first ``valid_length[i]`` positions of row ``i``.

        Args:
            token_ids: 2-D tensor of token ids (rows are examples).
            valid_length: one length per row (tensor or sequence of ints).

        Returns:
            float32 tensor shaped like ``token_ids``, on the same device.
        """
        target_device = token_ids.device
        lengths = torch.as_tensor(valid_length, device=target_device).reshape(-1, 1)
        positions = torch.arange(token_ids.size(1), device=target_device).unsqueeze(0)
        # Broadcasting compares every position index with its row's length.
        return (positions < lengths).float()

    def forward(self, token_ids, valid_length, segment_ids):
        """Return class logits of shape (batch, num_classes)."""
        attention_mask = self.gen_attention_mask(token_ids, valid_length)

        _, pooler = self.bert(input_ids = token_ids,
                              token_type_ids = segment_ids.long(),
                              attention_mask = attention_mask,
                              return_dict = False)
        # Without dropout the pooled output goes straight to the classifier.
        # (Previously `out` was only assigned inside the dropout branch, so
        # dr_rate=None raised UnboundLocalError.)
        out = self.dropout(pooler) if self.dr_rate else pooler
        return self.classifier(out)

In [14]:
# Wrap the pretrained encoder in the classifier (dropout 0.5) and move it to the target device.
model = BERTClassifier(bertmodel,  dr_rate = 0.5).to(device)

# Optimizer parameter groups: parameters whose name contains one of the
# `no_decay` fragments get weight decay 0.0, everything else gets 0.01.
no_decay = ['bias', 'LayerNorm.weight']
decayed_params, undecayed_params = [], []
for param_name, param in model.named_parameters():
    if any(nd in param_name for nd in no_decay):
        undecayed_params.append(param)
    else:
        decayed_params.append(param)

optimizer_grouped_parameters = [
    {'params': decayed_params, 'weight_decay': 0.01},
    {'params': undecayed_params, 'weight_decay': 0.0},
]

optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate)
loss_fn = nn.CrossEntropyLoss()  # loss function for multi-class classification

# The scheduler is stepped once per batch, so the schedule spans
# (batches per epoch) * (epochs) steps, with the first `warmup_ratio` share as warmup.
t_total = len(train_dataloader) * num_epochs
warmup_step = int(t_total * warmup_ratio)

# Cosine schedule with warmup.
scheduler = get_cosine_schedule_with_warmup(
    optimizer,
    num_warmup_steps=warmup_step,
    num_training_steps=t_total,
)

# calc_accuracy: helper that measures classification accuracy for one batch
def calc_accuracy(X,Y):
    """Return the fraction of rows in ``X`` whose arg-max column equals ``Y``.

    Args:
        X: 2-D tensor of scores, one row per example.
        Y: 1-D tensor of target class indices.

    Returns:
        NumPy float: number of matches divided by the number of rows.
    """
    _, predicted = torch.max(X, 1)
    n_correct = (predicted == Y).sum().data.cpu().numpy()
    n_rows = predicted.size()[0]
    return n_correct / n_rows

# Leftover inspection: as the cell's last expression this only displays the DataLoader's repr.
train_dataloader

<torch.utils.data.dataloader.DataLoader at 0x799db35a4a90>

In [15]:

# Loss history: `train_losses` gets one entry per training batch,
# `test_losses` gets one entry (mean validation loss) per epoch.
train_losses = []
test_losses = []

for e in range(num_epochs):
    train_acc = 0.0
    model.train()

    # Training phase
    for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(train_dataloader)):
        optimizer.zero_grad()
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        label = label.long().to(device)

        # Forward pass and loss
        out = model(token_ids, valid_length, segment_ids)
        loss = loss_fn(out, label)

        # Record the per-batch training loss
        train_losses.append(loss.item())

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        optimizer.step()
        scheduler.step()  # advance the learning-rate schedule once per batch
        train_acc += calc_accuracy(out, label)

        if batch_id % log_interval == 0:
            print("epoch {} batch id {} loss {} train acc {}".format(e + 1, batch_id + 1, loss.data.cpu().numpy(), train_acc / (batch_id + 1)))

    print("epoch {} train acc {}".format(e + 1, train_acc / (batch_id + 1)))

    model.eval()
    test_acc = 0.0
    test_loss = 0.0  # running sum of validation batch losses

    # Validation phase. No gradients are needed here, so autograd is disabled
    # to avoid building a graph for every batch (saves memory and time).
    with torch.no_grad():
        for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(valid_dataloader)):
            token_ids = token_ids.long().to(device)
            segment_ids = segment_ids.long().to(device)
            label = label.long().to(device)

            out = model(token_ids, valid_length, segment_ids)
            test_loss_value = loss_fn(out, label)  # validation loss of this batch
            test_loss += test_loss_value.item()

            test_acc += calc_accuracy(out, label)

    # Mean validation loss over the epoch's batches
    test_losses.append(test_loss / (batch_id + 1))

    print("epoch {} test acc {}".format(e + 1, test_acc / (batch_id + 1)))

  0%|          | 0/249 [00:00<?, ?it/s]

epoch 1 batch id 1 loss 1.638222575187683 train acc 0.1875
epoch 1 batch id 201 loss 0.494468092918396 train acc 0.5102611940298507
epoch 1 train acc 0.5715361445783133


  0%|          | 0/63 [00:00<?, ?it/s]

epoch 1 test acc 0.8353174603174603


  0%|          | 0/249 [00:00<?, ?it/s]

epoch 2 batch id 1 loss 0.9004606604576111 train acc 0.5625
epoch 2 batch id 201 loss 0.31487852334976196 train acc 0.8666044776119403
epoch 2 train acc 0.8689759036144579


  0%|          | 0/63 [00:00<?, ?it/s]

epoch 2 test acc 0.875


  0%|          | 0/249 [00:00<?, ?it/s]

epoch 3 batch id 1 loss 0.5615015625953674 train acc 0.875
epoch 3 batch id 201 loss 0.0910734310746193 train acc 0.9123134328358209
epoch 3 train acc 0.9159136546184738


  0%|          | 0/63 [00:00<?, ?it/s]

epoch 3 test acc 0.8591269841269841


  0%|          | 0/249 [00:00<?, ?it/s]

epoch 4 batch id 1 loss 0.7308026552200317 train acc 0.8125
epoch 4 batch id 201 loss 0.016920344904065132 train acc 0.9437189054726368
epoch 4 train acc 0.944611780455154


  0%|          | 0/63 [00:00<?, ?it/s]

epoch 4 test acc 0.876984126984127


  0%|          | 0/249 [00:00<?, ?it/s]

epoch 5 batch id 1 loss 0.24691163003444672 train acc 0.875
epoch 5 batch id 201 loss 0.010991675779223442 train acc 0.9654850746268657
epoch 5 train acc 0.9667838018741632


  0%|          | 0/63 [00:00<?, ?it/s]

epoch 5 test acc 0.9107142857142857


  0%|          | 0/249 [00:00<?, ?it/s]

epoch 6 batch id 1 loss 0.41748562455177307 train acc 0.875
epoch 6 batch id 201 loss 0.004867465700954199 train acc 0.9782338308457711
epoch 6 train acc 0.9803380187416332


  0%|          | 0/63 [00:00<?, ?it/s]

epoch 6 test acc 0.9067460317460317


  0%|          | 0/249 [00:00<?, ?it/s]

epoch 7 batch id 1 loss 0.035874512046575546 train acc 1.0
epoch 7 batch id 201 loss 0.003728854935616255 train acc 0.9872512437810945
epoch 7 train acc 0.9881191432396251


  0%|          | 0/63 [00:00<?, ?it/s]

epoch 7 test acc 0.9017857142857143


  0%|          | 0/249 [00:00<?, ?it/s]

epoch 8 batch id 1 loss 0.0037296840455383062 train acc 1.0
epoch 8 batch id 201 loss 0.0025781402364373207 train acc 0.9906716417910447
epoch 8 train acc 0.9916331994645248


  0%|          | 0/63 [00:00<?, ?it/s]

epoch 8 test acc 0.9067460317460317


  0%|          | 0/249 [00:00<?, ?it/s]

epoch 9 batch id 1 loss 0.0023771352134644985 train acc 1.0
epoch 9 batch id 201 loss 0.0022388817742466927 train acc 0.9953358208955224
epoch 9 train acc 0.9956492637215528


  0%|          | 0/63 [00:00<?, ?it/s]

epoch 9 test acc 0.9087301587301587


  0%|          | 0/249 [00:00<?, ?it/s]

epoch 10 batch id 1 loss 0.003704391187056899 train acc 1.0
epoch 10 batch id 201 loss 0.0021964809857308865 train acc 0.996268656716418
epoch 10 train acc 0.9964022757697456


  0%|          | 0/63 [00:00<?, ?it/s]

epoch 10 test acc 0.9047619047619048


In [17]:
# Wrap each test text as [text, label]. BERTDataset reads a label from every
# row, so "0" is used as a placeholder that is never looked at afterwards.
dataset_test_list = [[text, str(0)] for text in preprocessed_test['text']]

bert_dataset_test = BERTDataset(dataset_test_list, 0, 1, tok, vocab, max_len, True, False)
test_dataloader = torch.utils.data.DataLoader(bert_dataset_test, batch_size=batch_size, num_workers=5)

model.eval()
output=[]
# Inference only: disable autograd so no graph is built for the forward passes.
with torch.no_grad():
    for token_ids, valid_length, segment_ids, _ in tqdm(test_dataloader):
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)

        out = model(token_ids, valid_length, segment_ids)

        # Collect one 1-D logits array per test example, in loader order.
        output.extend(out.cpu().numpy())

100%|██████████| 32/32 [00:05<00:00,  5.48it/s]


In [20]:
# Sanity check: number of collected logit rows (one per test example).
len(output)

500

# submission

In [22]:
# Predicted class id for each test example = index of its largest logit.
y_pred = np.argmax(output, axis=1)

In [26]:
def save_submission(y_pred, user_name, f1_score=None,
                    data_path="/content/drive/MyDrive/Colab Notebooks/",
                    save_path="/content/drive/MyDrive/Colab Notebooks/"):
    """Fill the submission template with predictions and write it as a new CSV.

    Reads ``new_submission.csv`` from ``data_path``, sets its ``class`` column to
    ``y_pred`` and writes ``submission_<user_name>_f1score_<f1_score>.csv`` into
    ``save_path``.

    Args:
        y_pred: predicted class ids, one per row of the template.
        user_name: tag embedded in the output file name.
        f1_score: score embedded in the output file name (file name only).
        data_path: directory containing ``new_submission.csv``.
        save_path: directory the result is written to.

    Raises:
        ValueError: raised by pandas when ``len(y_pred)`` does not match the template.
    """
    import os  # local import keeps this cell runnable on its own

    # os.path.join avoids the doubled '/' the old '{}/...' format string produced
    # when the directory already ended with a slash.
    submission_path = os.path.join(data_path, 'new_submission.csv')
    submission = pd.read_csv(submission_path)
    submission['class'] = y_pred
    submission_csv_path = os.path.join(
        save_path, 'submission_{}_f1score_{}.csv'.format(user_name, f1_score))
    submission.to_csv(submission_csv_path, index=False)
    print('{} saved!'.format(submission_csv_path))

In [27]:
# Write the submission file; the '0.89' literal is only embedded in the output file name.
save_submission(y_pred,'lhc_v2_kobert','0.89')

/content/drive/MyDrive/Colab Notebooks//submission_lhc_v2_kobert_f1score_0.89.csv saved!
