In [None]:
!pip install mxnet
!pip install gluonnlp tqdm
!pip install sentencepiece
!pip install transformers
!pip install soynlp
!pip install emoji
!pip install AdamP

In [2]:
import pandas as pd
import numpy as np
import torch
from torch.nn import functional as F
from torch.utils.data import DataLoader, Dataset
from torch.utils.data import TensorDataset, RandomSampler, SequentialSampler
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW, BertConfig
from tqdm.notebook import tqdm
from adamp import AdamP

from sklearn.model_selection import train_test_split
from keras.preprocessing.sequence import pad_sequences

# Data

In [3]:
train_ds = pd.read_excel("/content/drive/MyDrive/비타민 컨퍼런스/텍스트 분석/train.xlsx")
test_ds = pd.read_excel("/content/drive/MyDrive/비타민 컨퍼런스/텍스트 분석/test.xlsx")

In [None]:
train_ds['Emotion'].value_counts()

sad         25637
fear        18162
happy       15606
angry       13896
disgust     12871
surprise    11728
Name: Emotion, dtype: int64

In [4]:
train_sentence, train_emotion = train_ds.Sentence, train_ds.Emotion
test_sentence, test_emotion  = test_ds.Sentence, test_ds.Emotion
train_sentence = ["[CLS] " + str(s) + " [SEP]" for s in train_sentence]
test_sentence = ["[CLS] " + str(s) + " [SEP]" for s in test_sentence]

#beomi/KcELECTRA-base
#monologg/koelectra-base-v3-discriminator
#beomi/kcbert-base
tokenizer = AutoTokenizer.from_pretrained("beomi/kcbert-base", do_lower_case=False)

Downloading:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/619 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/244k [00:00<?, ?B/s]

In [5]:
from sklearn.preprocessing import LabelEncoder

Encoder = LabelEncoder()

train_emotion =Encoder.fit_transform(train_emotion)
test_emotion = Encoder.transform(test_emotion)

In [None]:
# import re
# import emoji
# from soynlp.normalizer import repeat_normalize

# emojis = list({y for x in emoji.UNICODE_EMOJI.values() for y in x.keys()})
# emojis = ''.join(emojis)
# pattern = re.compile(f'[^ .,?!/@$%~％·∼()\x00-\x7Fㄱ-ㅣ가-힣{emojis}]+')
# url_pattern = re.compile(
#     r'https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)')

# def clean(x):
#     x = pattern.sub(' ', x)
#     x = url_pattern.sub('', x)
#     x = x.strip()
#     x = repeat_normalize(x, num_repeats=2)
#     return x

In [None]:
# function ClickConnect(){
# console.log("Working"); 
# document.querySelector("colab-toolbar-button#connect").click() 
# }setInterval(ClickConnect, 1800000)

In [6]:
x_train,x_valid,y_train,y_valid = train_test_split(train_sentence,train_emotion,test_size=0.3)

y_train = y_train
y_valid = y_valid
y_test = test_emotion

train_tokenized_texts = [tokenizer.tokenize(s) for s in x_train]
valid_tokenized_texts = [tokenizer.tokenize(s) for s in x_valid]
test_tokenized_texts = [tokenizer.tokenize(s) for s in test_sentence]

MAX_LEN = 128 #최대 시퀀스 길이 설정
train_input_ids = [tokenizer.convert_tokens_to_ids(x) for x in train_tokenized_texts]
valid_input_ids = [tokenizer.convert_tokens_to_ids(x) for x in valid_tokenized_texts]
test_input_ids = [tokenizer.convert_tokens_to_ids(x) for x in test_tokenized_texts]

train_input_ids = pad_sequences(train_input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")
valid_input_ids = pad_sequences(valid_input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")
test_input_ids = pad_sequences(test_input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")

def make_seg_mask(input_ids):
  attention_mask = []
  for seg in input_ids:
    seg_mask = [float(i>0) for i in seg]
    attention_mask.append(seg_mask)
  
  return attention_mask


train_attention_masks = make_seg_mask(train_input_ids)
valid_attention_masks = make_seg_mask(valid_input_ids)
test_attention_masks = make_seg_mask(test_input_ids)


train_inputs = torch.tensor(train_input_ids)
train_labels = torch.tensor(y_train)
train_masks = torch.tensor(train_attention_masks)

validation_inputs = torch.tensor(valid_input_ids)
validation_labels = torch.tensor(y_valid)
validation_masks = torch.tensor(valid_attention_masks)

test_inputs = torch.tensor(test_input_ids)
test_labels = torch.tensor(y_test)
test_masks = torch.tensor(test_attention_masks)

In [7]:
batch_size = 32

train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

test_data = TensorDataset(test_inputs, test_masks, test_labels)
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)

# Model

In [19]:
cuda = torch.device('cuda')

config = BertConfig.from_pretrained('beomi/kcbert-base')
config.num_labels = 6
model = AutoModelForSequenceClassification.from_pretrained("beomi/kcbert-base",
                                                         config = config).to(cuda)

Some weights of the model checkpoint at beomi/kcbert-base were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initiali

## Hyperparameter

In [20]:
epochs = 10
learning_rate = 0.0001
optimizer = AdamP(model.parameters(), lr=learning_rate)

## Training

In [None]:
losses = []
accuracies = []
epoch_cnt = 1

for i in range(epochs):
  total_loss = 0.0
  correct = 0
  total = 0
  batches = 0

  print(f"{epoch_cnt} Training...")

  model.train()

  for input_ids_batch, attention_masks_batch, y_batch in tqdm(train_dataloader):
    optimizer.zero_grad()
    y_batch = y_batch.to(cuda)
    y_pred = model(input_ids_batch.to(cuda), attention_mask=attention_masks_batch.to(cuda))[0]
    loss = F.cross_entropy(y_pred, y_batch)
    loss.backward()
    optimizer.step()

    total_loss += loss.item()

    _, predicted = torch.max(y_pred, 1)
    correct += (predicted == y_batch).sum()
    total += len(y_batch)

    batches += 1
    if batches % 500 == 0:
      print("Batch Loss:", total_loss, "Train_accuracy:", correct.float() / total)
  
  losses.append(total_loss)
  accuracies.append(correct.float() / total)
  print("Train Loss:", total_loss, "Train_accuracy:", correct.float() / total)

  print("")
  print("Validation...")

  model.eval()

  # 변수 초기화
  valid_correct = 0
  valid_total = 0

  for input_ids_batch, attention_masks_batch, y_batch in tqdm(validation_dataloader):
    y_batch = y_batch.to(cuda)
    y_pred = model(input_ids_batch.to(cuda), attention_mask=attention_masks_batch.to(cuda))[0]
    _, predicted = torch.max(y_pred, 1)
    valid_correct += (predicted == y_batch).sum()
    valid_total += len(y_batch)

  epoch_cnt += 1

  optimizer = AdamP(model.parameters(), lr=learning_rate*0.8)
  
  print("Validatoion_accuracy:", valid_correct.float() / valid_total)
  print("Next Epoch")
  print("")

1 Training...


  0%|          | 0/2142 [00:00<?, ?it/s]

Batch Loss: 646.4032490849495 Train_accuracy: tensor(0.5164, device='cuda:0')
Batch Loss: 1245.524569272995 Train_accuracy: tensor(0.5397, device='cuda:0')
Batch Loss: 1831.191876590252 Train_accuracy: tensor(0.5497, device='cuda:0')
Batch Loss: 2412.3084183335304 Train_accuracy: tensor(0.5560, device='cuda:0')
Train Loss: 2581.674887597561 Train_accuracy: tensor(0.5574, device='cuda:0')

Validation...


  0%|          | 0/918 [00:00<?, ?it/s]

Validatoion_accuracy: tensor(0.5734, device='cuda:0')
Next Epoch

2 Training...


  0%|          | 0/2142 [00:00<?, ?it/s]

Batch Loss: 522.8455384969711 Train_accuracy: tensor(0.6266, device='cuda:0')
Batch Loss: 1033.7512140870094 Train_accuracy: tensor(0.6337, device='cuda:0')
Batch Loss: 1543.8034824728966 Train_accuracy: tensor(0.6353, device='cuda:0')
Batch Loss: 2044.8598702549934 Train_accuracy: tensor(0.6367, device='cuda:0')
Train Loss: 2186.8724579811096 Train_accuracy: tensor(0.6370, device='cuda:0')

Validation...


  0%|          | 0/918 [00:00<?, ?it/s]

Validatoion_accuracy: tensor(0.5910, device='cuda:0')
Next Epoch

3 Training...


  0%|          | 0/2142 [00:00<?, ?it/s]

Batch Loss: 432.1069944500923 Train_accuracy: tensor(0.7018, device='cuda:0')
Batch Loss: 878.488820284605 Train_accuracy: tensor(0.6979, device='cuda:0')
Batch Loss: 1319.9470502138138 Train_accuracy: tensor(0.6956, device='cuda:0')
Batch Loss: 1762.9815773963928 Train_accuracy: tensor(0.6935, device='cuda:0')
Train Loss: 1894.7409998774529 Train_accuracy: tensor(0.6927, device='cuda:0')

Validation...


  0%|          | 0/918 [00:00<?, ?it/s]

Validatoion_accuracy: tensor(0.6045, device='cuda:0')
Next Epoch

4 Training...


  0%|          | 0/2142 [00:00<?, ?it/s]

Batch Loss: 353.79922434687614 Train_accuracy: tensor(0.7587, device='cuda:0')
Batch Loss: 728.856386244297 Train_accuracy: tensor(0.7506, device='cuda:0')
Batch Loss: 1117.5141346156597 Train_accuracy: tensor(0.7439, device='cuda:0')
Batch Loss: 1497.6448447406292 Train_accuracy: tensor(0.7418, device='cuda:0')
Train Loss: 1607.6439443528652 Train_accuracy: tensor(0.7414, device='cuda:0')

Validation...


  0%|          | 0/918 [00:00<?, ?it/s]

Validatoion_accuracy: tensor(0.5964, device='cuda:0')
Next Epoch

5 Training...


  0%|          | 0/2142 [00:00<?, ?it/s]

Batch Loss: 287.03994496166706 Train_accuracy: tensor(0.8083, device='cuda:0')
Batch Loss: 594.3509060740471 Train_accuracy: tensor(0.8004, device='cuda:0')
Batch Loss: 905.0046787261963 Train_accuracy: tensor(0.7975, device='cuda:0')
Batch Loss: 1226.318619415164 Train_accuracy: tensor(0.7933, device='cuda:0')
Train Loss: 1323.7806705385447 Train_accuracy: tensor(0.7913, device='cuda:0')

Validation...


  0%|          | 0/918 [00:00<?, ?it/s]

Validatoion_accuracy: tensor(0.5668, device='cuda:0')
Next Epoch

6 Training...


  0%|          | 0/2142 [00:00<?, ?it/s]

Batch Loss: 234.72801759094 Train_accuracy: tensor(0.8486, device='cuda:0')
Batch Loss: 488.36367709189653 Train_accuracy: tensor(0.8415, device='cuda:0')
Batch Loss: 747.1316600739956 Train_accuracy: tensor(0.8372, device='cuda:0')
Batch Loss: 1010.3218374699354 Train_accuracy: tensor(0.8349, device='cuda:0')
Train Loss: 1088.6397218704224 Train_accuracy: tensor(0.8332, device='cuda:0')

Validation...


  0%|          | 0/918 [00:00<?, ?it/s]

Validatoion_accuracy: tensor(0.5819, device='cuda:0')
Next Epoch

7 Training...


  0%|          | 0/2142 [00:00<?, ?it/s]

In [None]:
# 모델 저장하기
torch.save(model.state_dict(), "/content/drive/MyDrive/비타민 컨퍼런스/Model/kcbert_4")

# Test

In [None]:
file_path = "/content/drive/MyDrive/비타민 컨퍼런스/Model/kcbert_3"
model.load_state_dict(torch.load(file_path))
model.to(cuda)

In [None]:
model.eval()

test_correct = 0
test_total = 0

for input_ids_batch, attention_masks_batch, y_batch in tqdm(test_dataloader):
  y_batch = y_batch.to(cuda)
  y_pred = model(input_ids_batch.to(cuda), attention_mask=attention_masks_batch.to(cuda))[0]
  _, predicted = torch.max(y_pred, 1)
  test_correct += (predicted == y_batch).sum()
  test_total += len(y_batch)

print("Accuracy:", test_correct.float() / test_total)

  0%|          | 0/340 [00:00<?, ?it/s]

Accuracy: tensor(0.6021, device='cuda:0')


## New Data

In [None]:
# 입력 데이터 변환
def convert_input_data(sentences):
    global tokenizer

    # BERT의 토크나이저로 문장을 토큰으로 분리
    tokenized_texts = tokenizer.tokenize(sentences)

    # 입력 토큰의 최대 시퀀스 길이
    MAX_LEN = 128

    # 토큰을 숫자 인덱스로 변환
    input_ids = [[tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts]]
    
    # 문장을 MAX_LEN 길이에 맞게 자르고, 모자란 부분을 패딩 0으로 채움
    input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")

    # 어텐션 마스크 초기화
    attention_masks = []

    # 어텐션 마스크를 패딩이 아니면 1, 패딩이면 0으로 설정
    # 패딩 부분은 BERT 모델에서 어텐션을 수행하지 않아 속도 향상
    for seq in input_ids:
        seq_mask = [float(i>0) for i in seq]
        attention_masks.append(seq_mask)

    # 데이터를 파이토치의 텐서로 변환
    inputs = torch.tensor(input_ids)
    masks = torch.tensor(attention_masks)

    return inputs, masks

In [None]:
def logits_to_softmax(logits):
  odds = np.exp(logits)
  total = odds.sum()
  softmax = odds/total
  return softmax

In [None]:
def classify_sentence(sentence):
  model.eval()

  inputs, masks = convert_input_data(new_sentence)
  b_input_ids = inputs.to(cuda)
  b_input_mask = masks.to(cuda)

  with torch.no_grad():     
    # Forward 수행
    outputs = model(b_input_ids, 
                    token_type_ids=None, 
                    attention_mask=b_input_mask)

  logits = outputs[0]
  logits = logits.detach().cpu().numpy()[0]

  result = logits_to_softmax(logits)

  emotion_dict = {0:"angry",1:"disgust",2:"fear",3:"happy",4:"sad",5:"surprise"}

  for i in range(len(result)):
    print(f"{emotion_dict[i]} : {round(result[i]*100,3)}%")

In [None]:
new_sentence = "올해도 좋은일만 가득하길!"
classify_sentence(new_sentence)