<a href="https://colab.research.google.com/github/GwonilJoo/Coding/blob/master/bert_sentiment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## **준비 사항**

In [None]:
%cd drive/MyDrive/Pytorch

/content/drive/MyDrive/Pytorch


In [None]:
# Hugging Face의 trainsformer model 설치
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/98/87/ef312eef26f5cecd8b17ae9654cdd8d1fae1eb6dbd87257d6d73c128a4d0/transformers-4.3.2-py3-none-any.whl (1.8MB)
[K     |████████████████████████████████| 1.8MB 5.5MB/s 
Collecting tokenizers<0.11,>=0.10.1
[?25l  Downloading https://files.pythonhosted.org/packages/fd/5b/44baae602e0a30bcc53fbdbc60bd940c15e143d252d658dfdefce736ece5/tokenizers-0.10.1-cp36-cp36m-manylinux2010_x86_64.whl (3.2MB)
[K     |████████████████████████████████| 3.2MB 23.8MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 42.1MB/s 
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
  Created wheel for sacremoses: filename=sacremoses-0.0.43-cp36-none-any.whl size=893261 sha256=bdde04c1424

In [None]:
import torch

from transformers import BertTokenizer
from transformers import BertForSequenceClassification, AdamW, BertConfig
from transformers import get_linear_schedule_with_warmup
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

import pandas as pd
import numpy as np
import random
import time
import datetime

<br>
<br>

## **Load Data**

In [None]:
# 디렉토리의 파일 목록
!ls datasets -la

total 3308
-rw------- 1 root root 1848305 Feb 20 14:25 sentiment_4.csv
-rw------- 1 root root 1538457 Feb 20 14:25 sentiment_5.csv


In [None]:
# Load both sentiment CSV files with pandas
train1 = pd.read_csv("datasets/sentiment_4.csv")
train2 = pd.read_csv("datasets/sentiment_5.csv")

# ignore_index=True renumbers the rows 0..N-1. Without it the index labels of
# train2 repeat those of train1, so a label-based lookup such as train.loc[0]
# would match two rows.
train = pd.concat([train1, train2], ignore_index=True)

# Row/column counts of each part and of the combined frame
print(train1.shape)
print(train2.shape)
print(train.shape)

train.head(10)

(13912, 13)
(10011, 13)
(23923, 13)


Unnamed: 0,wav_id,발화문,상황,1번 감정,1번 감정세기,2번 감정,2번 감정세기,3번 감정,3번 감정세기,4번 감정,4번감정세기,5번 감정,5번 감정세기
0,5e258fd1305bcf3ad153a6a4,청소 네가 대신 해 줘,anger,Neutral,0,Angry,1,Neutral,0,Neutral,0,Angry,1
1,5e258fe2305bcf3ad153a6a5,둘 다 청소 하기 싫어 귀찮아,anger,Neutral,0,Angry,1,Neutral,0,Neutral,0,Angry,1
2,5e258ff5305bcf3ad153a6a6,둘 다 하기 싫어서 화내,anger,Angry,1,Angry,1,Neutral,0,Angry,1,Angry,1
3,5e25902f305bcf3ad153a6a9,그럼 방세는 어떡해,anger,Sadness,1,Sadness,1,Sadness,1,Sadness,1,Sadness,1
4,5e27f90b5807b852d9e0157b,권택인 줄 알았는데 그런 사람이 생겼나 보더라고,sad,Sadness,1,Sadness,1,Sadness,1,Sadness,2,Sadness,1
5,5e27fa1c5807b852d9e01586,그냥 걷고 있어,sad,Neutral,0,Neutral,0,Neutral,0,Sadness,2,Neutral,0
6,5e27fb575807b852d9e01595,고등학교 동창인데,anger,Disgust,2,Disgust,1,Angry,1,Disgust,2,Angry,1
7,5e2840225807b852d9e01618,처음 학원에서 만났다가 서로 좋아해서 사귀게 되었지,sad,Neutral,0,Neutral,0,Happiness,1,Neutral,0,Neutral,0
8,5e2840415807b852d9e01619,내가 애정 표현을 잘 못해서 자주 싸우긴 했어,sad,Neutral,0,Sadness,1,Sadness,1,Sadness,1,Sadness,2
9,5e28405b5807b852d9e0161a,오늘 헤어졌어,sad,Sadness,1,Neutral,0,Sadness,1,Sadness,1,Sadness,1


<br>
<br>

## **전처리**

In [None]:
# Pull out the utterance text column ('발화문') and preview the first ten rows
sentences = train.loc[:, '발화문']
sentences.head(10)

0                    청소 네가 대신 해 줘
1                둘 다 청소 하기 싫어 귀찮아
2                   둘 다 하기 싫어서 화내
3                      그럼 방세는 어떡해
4      권택인 줄 알았는데 그런 사람이 생겼나 보더라고
5                        그냥 걷고 있어
6                       고등학교 동창인데
7    처음 학원에서 만났다가 서로 좋아해서 사귀게 되었지
8       내가 애정 표현을 잘 못해서 자주 싸우긴 했어
9                         오늘 헤어졌어
Name: 발화문, dtype: object

In [None]:
# Wrap every utterance in BERT's special tokens: [CLS] at the start, [SEP] at the end.
# str() is applied first so that non-string cells are still concatenated as text.
sentences = list(map(lambda utterance: "[CLS] " + str(utterance) + " [SEP]", sentences))
sentences[:10]

['[CLS] 청소 네가 대신 해 줘 [SEP]',
 '[CLS] 둘 다 청소 하기 싫어 귀찮아 [SEP]',
 '[CLS] 둘 다 하기 싫어서 화내 [SEP]',
 '[CLS] 그럼 방세는 어떡해 [SEP]',
 '[CLS] 권택인 줄 알았는데 그런 사람이 생겼나 보더라고 [SEP]',
 '[CLS] 그냥 걷고 있어 [SEP]',
 '[CLS] 고등학교 동창인데 [SEP]',
 '[CLS] 처음 학원에서 만났다가 서로 좋아해서 사귀게 되었지 [SEP]',
 '[CLS] 내가 애정 표현을 잘 못해서 자주 싸우긴 했어 [SEP]',
 '[CLS] 오늘 헤어졌어 [SEP]']

In [None]:
# Emotion classes; a class's position in this tuple is its integer label id
labels_names = ('Happiness', 'Angry', 'Disgust', 'Fear', 'Neutral', 'Sadness', 'Surprise')

# Collect the five annotator emotion columns into one (rows x 5) array
annotator_columns = ['1번 감정', '2번 감정', '3번 감정', '4번 감정', '5번 감정']
tmp_labels = pd.concat([train[column] for column in annotator_columns], axis=1)
tmp_labels = tmp_labels.values

# Majority vote per utterance: count the votes for each class id and keep the
# most frequent one. argmax returns the first maximum, so ties go to the class
# that appears earliest in labels_names.
labels = []
for votes in tmp_labels:
    class_ids = np.array([labels_names.index(name) for name in votes], dtype=np.int64)
    vote_counts = np.bincount(class_ids, minlength=len(labels_names))
    labels.append(int(np.argmax(vote_counts)))

labels = np.array(labels)
labels

array([4, 4, 1, ..., 1, 5, 5])

In [None]:
from collections import Counter
# Class distribution of the majority-vote label ids (keys index into labels_names)
count = Counter(labels)
print(count)

Counter({5: 12518, 1: 6256, 3: 1923, 4: 1625, 2: 1200, 0: 342, 6: 59})


In [None]:
# Distribution of the '상황' (situation) column; note that this rebinds `count`
count = Counter(train['상황'])
print(count)

Counter({'sad': 10879, 'anger': 8088, 'fear': 2679, 'disgust': 2268, 'neutral': 9})


In [None]:
# Split each sentence into tokens with the pretrained multilingual BERT tokenizer
# (cased vocabulary, so the text is not lower-cased)
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased', do_lower_case=False)
tokenized_texts = list(map(tokenizer.tokenize, sentences))

# Show one sentence next to its token sequence
print(sentences[0])
print(tokenized_texts[0])

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=995526.0, style=ProgressStyle(descripti…


[CLS] 청소 네가 대신 해 줘 [SEP]
['[CLS]', '청', '##소', '네', '##가', '대신', '해', '줘', '[SEP]']


In [None]:
# Maximum sequence length: the longest tokenized sentence plus one extra position
# NOTE(review): there is no cap here; the model's position embeddings cover 512 positions.
MAX_LEN = max((len(tokens) for tokens in tokenized_texts), default=0) + 1

# Map every token to its vocabulary index
input_ids = [tokenizer.convert_tokens_to_ids(tokens) for tokens in tokenized_texts]

# Cut or zero-pad every sequence at the end so that all have length MAX_LEN
input_ids = pad_sequences(
    input_ids,
    maxlen=MAX_LEN,
    dtype="long",
    truncating="post",
    padding="post",
)

input_ids[0]

array([  101,  9751, 22333,  9011, 11287, 82642,  9960,  9695,   102,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0])

In [None]:
# Attention masks: 1.0 for a real token, 0.0 for padding (padding has id 0),
# so the model does not attend to the padded positions.
attention_masks = [
    [float(token_id > 0) for token_id in seq]
    for seq in input_ids
]

print(attention_masks[0])

[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]


In [None]:
# Split ids, labels and attention masks into train / validation parts (90% / 10%).
# All three arrays go through a single call, so every row keeps its own label
# and mask; the split itself is fixed by random_state.
(train_inputs, validation_inputs,
 train_labels, validation_labels,
 train_masks, validation_masks) = train_test_split(
    input_ids,
    labels,
    attention_masks,
    random_state=2018,
    test_size=0.1,
)

# Convert every part to a PyTorch tensor
train_inputs = torch.tensor(train_inputs)
validation_inputs = torch.tensor(validation_inputs)
train_labels = torch.tensor(train_labels)
validation_labels = torch.tensor(validation_labels)
train_masks = torch.tensor(train_masks)
validation_masks = torch.tensor(validation_masks)

# Print the first example of each tensor as a sanity check
for sample in (train_inputs[0], train_labels[0], train_masks[0],
               validation_inputs[0], validation_labels[0], validation_masks[0]):
    print(sample)

tensor([   101,   9641,  10739,   9249,  16985,  12508,  25503, 118671,   9998,
         12945,   9056,  25503,  12310,   9490,  12965,    102,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0])
tensor(1)
tensor([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0

In [None]:
# Mini-batch size
batch_size = 32

# Training set: bundle ids, masks and labels; batches are drawn in random order
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(
    dataset=train_data,
    batch_size=batch_size,
    sampler=train_sampler,
)

# Validation set: same bundling, but iterated in a fixed sequential order
validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(
    dataset=validation_data,
    batch_size=batch_size,
    sampler=validation_sampler,
)

<br>
<br>

## **모델 생성**

In [None]:
# Pick the compute device: CUDA when a GPU is visible, otherwise the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

if device.type == "cuda":
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')

There are 1 GPU(s) available.
We will use the GPU: Tesla K80


In [None]:
# Build a BERT sequence classifier with one output per emotion class
model = BertForSequenceClassification.from_pretrained("bert-base-multilingual-cased", num_labels=len(labels_names))

# Move the model to the device selected earlier. Unlike model.cuda(), this also
# works on the CPU fallback; it returns the model, so the cell still displays it.
model.to(device)

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model ch

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elemen

In [None]:
# Optimizer: AdamW over every model parameter
optimizer = AdamW(
    model.parameters(),
    lr=2e-5,   # learning rate
    eps=1e-8,  # epsilon that guards against division by zero
)

# Number of passes over the training set
epochs = 10

# Total optimizer steps = batches per epoch * epochs
total_steps = epochs * len(train_dataloader)

# Learning-rate schedule spanning total_steps, with no warm-up steps
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

<br>
<br>

## **모델 학습**

In [None]:
# 정확도 계산 함수
def flat_accuracy(preds, labels):
    
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()

    return np.sum(pred_flat == labels_flat) / len(labels_flat)

In [None]:
# 시간 표시 함수
def format_time(elapsed):

    # 반올림
    elapsed_rounded = int(round((elapsed)))
    
    # hh:mm:ss으로 형태 변경
    return str(datetime.timedelta(seconds=elapsed_rounded))

In [None]:
# Fix the random seeds for reproducibility
seed_val = 42
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

# Best validation accuracy seen during this run
best_acc = 0.0

# Resume from the last checkpoint when one exists. On a fresh run the file is
# absent, so training starts from the pretrained weights instead of crashing.
# map_location lets a checkpoint saved on one device load onto the current `device`.
# NOTE: only the model weights are restored; the optimizer/scheduler state and
# best_acc start over.
try:
    model.load_state_dict(torch.load('bert_sentiment_last.pth', map_location=device))
    print('Resumed from bert_sentiment_last.pth')
except FileNotFoundError:
    print('No checkpoint found; starting from the pretrained weights.')

# Repeat for the configured number of epochs
for epoch_i in range(0, epochs):

    # ========================================
    #               Training
    # ========================================

    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    # Start time of this epoch's training phase
    t0 = time.time()

    # Running sum of the batch losses
    total_loss = 0

    # Training mode (enables dropout)
    model.train()

    # Iterate over the training batches
    for step, batch in enumerate(train_dataloader):
        # Progress report every 100 batches
        if step % 100 == 0 and not step == 0:
            elapsed = format_time(time.time() - t0)
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

        # Move the batch to the selected device
        batch = tuple(t.to(device) for t in batch)

        # Unpack ids, attention mask and labels
        b_input_ids, b_input_mask, b_labels = batch

        # Clear gradients left over from the previous step
        model.zero_grad()

        # Forward pass; because labels are passed, the model also returns the loss
        outputs = model(b_input_ids,
                        token_type_ids=None,
                        attention_mask=b_input_mask,
                        labels=b_labels)

        # With labels supplied, the first output is the loss
        loss = outputs[0]

        # Accumulate the loss for the epoch average
        total_loss += loss.item()

        # Backward pass
        loss.backward()

        # Clip the gradient norm to 1.0
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # Update the parameters
        optimizer.step()

        # Advance the learning-rate schedule
        scheduler.step()

    # Mean loss over the batches of this epoch
    avg_train_loss = total_loss / len(train_dataloader)

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epcoh took: {:}".format(format_time(time.time() - t0)))

    # ========================================
    #               Validation
    # ========================================

    print("")
    print("Running Validation...")

    # Start time of the validation phase
    t0 = time.time()

    # Evaluation mode (disables dropout)
    model.eval()

    # Running totals: sum of (batch accuracy * batch size) and examples seen
    eval_accuracy = 0
    nb_eval_examples = 0

    # Iterate over the validation batches
    for batch in validation_dataloader:
        # Move the batch to the selected device
        batch = tuple(t.to(device) for t in batch)

        # Unpack ids, attention mask and labels
        b_input_ids, b_input_mask, b_labels = batch

        # No gradients are needed for evaluation
        with torch.no_grad():
            # Forward pass without labels
            outputs = model(b_input_ids,
                            token_type_ids=None,
                            attention_mask=b_input_mask)

        # No labels were passed, so the first output is the logits
        logits = outputs[0]

        # Move the results to the CPU as numpy arrays
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        # Weight each batch accuracy by its size so that a smaller final batch
        # does not count as much as a full one in the overall accuracy
        batch_examples = len(label_ids)
        eval_accuracy += flat_accuracy(logits, label_ids) * batch_examples
        nb_eval_examples += batch_examples

    # Accuracy over all validation examples (0.0 if the loader was empty)
    val_accuracy = eval_accuracy / max(nb_eval_examples, 1)

    print("  Accuracy: {0:.2f}".format(val_accuracy))
    print("  Validation took: {:}".format(format_time(time.time() - t0)))

    # Keep a copy of the weights whenever validation accuracy improves
    if val_accuracy > best_acc:
        best_acc = val_accuracy
        torch.save(model.state_dict(), 'bert_sentiment_best.pth')

    # Always save the latest weights so that training can be resumed
    torch.save(model.state_dict(), 'bert_sentiment_last.pth')

print("")
print("Training complete!")
print("Best Accuracy: {0:.2f}".format(best_acc))


Training...
  Batch   100  of    673.    Elapsed: 0:01:34.
  Batch   200  of    673.    Elapsed: 0:03:08.
  Batch   300  of    673.    Elapsed: 0:04:43.
  Batch   400  of    673.    Elapsed: 0:06:17.
  Batch   500  of    673.    Elapsed: 0:07:52.
  Batch   600  of    673.    Elapsed: 0:09:27.

  Average training loss: 0.31
  Training epcoh took: 0:10:36

Running Validation...
  Accuracy: 0.84
  Validation took: 0:00:24

Training...
  Batch   100  of    673.    Elapsed: 0:01:35.
  Batch   200  of    673.    Elapsed: 0:03:09.
  Batch   300  of    673.    Elapsed: 0:04:44.
  Batch   400  of    673.    Elapsed: 0:06:19.
  Batch   500  of    673.    Elapsed: 0:07:53.
  Batch   600  of    673.    Elapsed: 0:09:28.

  Average training loss: 0.22
  Training epcoh took: 0:10:36

Running Validation...
  Accuracy: 0.83
  Validation took: 0:00:24

Training...
  Batch   100  of    673.    Elapsed: 0:01:34.
  Batch   200  of    673.    Elapsed: 0:03:09.
  Batch   300  of    673.    Elapsed: 0:04:43

<br>
<br>

## **새로운 문장 테스트**

In [None]:
# 입력 데이터 변환
def convert_input_data(sentences, MAX_LEN):

    # BERT의 tokenizer로 문장을 토큰으로 분리
    tokenized_texts = [tokenizer.tokenize(sent) for sent in sentences]

    # 토큰을 숫자 인덱스로 변환
    input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts]
    
    # 문장을 MAX_LEN 길이에 맞게 자르고, 모자란 부분을 패딩 0으로 채움
    input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")

    # initialize attention mask
    attention_masks = []

    # attention mask를 패딩이 아니면 1, 패딩이면 0으로 설정
    # 패딩 부분은 BERT 모델에서 attention을 수행하지 않아 속도 향상
    for seq in input_ids:
        seq_mask = [float(i>0) for i in seq]
        attention_masks.append(seq_mask)

    # 데이터를 파이토치의 텐서로 변환
    inputs = torch.tensor(input_ids)
    masks = torch.tensor(attention_masks)

    return inputs, masks

In [None]:
# 문장 테스트
def test_sentences(sentences):

    model.load_state_dict(torch.load('bert_sentiment_last.pth'))

    # test mode
    model.eval()

    # 문장을 입력 데이터로 변환
    inputs, masks = convert_input_data(sentences, MAX_LEN)

    # 데이터를 GPU에 할당
    b_input_ids = inputs.to(device)
    b_input_mask = masks.to(device)
            
    # gradient 계산 안함
    with torch.no_grad():     
        # forward
        outputs = model(b_input_ids, 
                        token_type_ids=None, 
                        attention_mask=b_input_mask)

    # loss
    logits = outputs[0]

    # CPU로 데이터 이동
    logits = logits.detach().cpu().numpy()

    return logits

In [None]:
# Classify one example sentence and show the raw logits, the winning class id and its name
logits = test_sentences(['이거 재밌는데!'])
predicted_id = np.argmax(logits)

print(logits)
print(predicted_id)
print(labels_names[predicted_id])

[[ 2.7215378  -3.1550646  -1.4041913  -0.33851758  0.82448757  1.5097432
  -0.5408663 ]]
0
Happiness
