In [3]:
# !pip install transformers
# !pip install tensorflow_addons
! pip install torch

Collecting torch
  Using cached torch-1.12.0-cp39-cp39-win_amd64.whl (161.8 MB)
Installing collected packages: torch
Successfully installed torch-1.12.0


In [4]:
import os
import pandas as pd
import numpy as np
import re
from tqdm import tqdm
import urllib.request
import tensorflow as tf
import tensorflow_addons as tfa
from transformers import BertTokenizer, TFBertForSequenceClassification

# Checkpoint name shared by the classifier and its tokenizer.
MODEL_NAME = "klue/bert-base"

# from_pt=True loads the checkpoint's PyTorch weights into the TensorFlow
# model; the 7-label classification head is not part of the checkpoint and is
# therefore newly initialized (see the warning printed below this cell).
model = TFBertForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=7,
    from_pt=True,
)

# Tokenizer matching the checkpoint's vocabulary.
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertForSequenceClassification: ['bert.embeddings.position_ids']
- This IS expected if you are initializing TFBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading vocab.txt:   0%|          | 0.00/243k [00:00<?, ?B/s]

Downloading special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

Downloading tokenizer_config.json:   0%|          | 0.00/289 [00:00<?, ?B/s]

In [9]:
# Read the raw news text and force a single column named 'message'.
# NOTE(review): read_csv treats the first line of the file as a header row;
# confirm the file really has one, otherwise the first message is consumed.
raw_news = pd.read_csv('./news_labeling.txt')
raw_news.columns = ['message']

# Add a placeholder 'sentiment' column (0 for every row), drop rows with
# missing values, and renumber the index from 0.
data_set = (
    raw_news
    .assign(sentiment=0)
    .dropna()
    .reset_index(drop=True)
)

# Model inputs (text) and placeholder targets.
X_data, y_data = data_set['message'], data_set['sentiment']

In [None]:
# data_set

In [5]:
# Maximum length, in tokens, of an input sentence; used as the truncation and
# padding length when encoding.
MAX_SEQ_LEN = 64

def convert_data(X_data, y_data):
    """Encode sentences and their labels into arrays for the BERT classifier.

    Every sentence is tokenized with the module-level ``tokenizer`` and
    truncated / padded to exactly ``MAX_SEQ_LEN`` token ids.

    Args:
        X_data: Iterable of input sentences.
        y_data: Iterable of labels, paired element-wise with ``X_data``.

    Returns:
        A pair ``([tokens, masks, segments], targets)`` of numpy arrays:
        ``tokens`` holds the token ids, ``masks`` is 1 for real tokens and 0
        for padding, ``segments`` is all zeros (single-sentence input), and
        ``targets`` holds the labels in input order.
    """
    # Per-sentence accumulators for token ids, masks, segment ids and labels.
    tokens, masks, segments, targets = [], [], [], []

    # Tell tqdm how many items to expect (when the inputs are sized) so it can
    # render a real progress bar with an ETA instead of a bare counter.
    try:
        total = min(len(X_data), len(y_data))
    except TypeError:
        total = None

    for X, y in tqdm(zip(X_data, y_data), total=total):
        # Let the tokenizer produce the attention mask itself rather than
        # counting occurrences of id 0: this does not depend on the pad id
        # being 0, on 0 never appearing as a real token, or on padding side.
        encoded = tokenizer(
            X,
            truncation=True,
            padding='max_length',
            max_length=MAX_SEQ_LEN,
            return_attention_mask=True,
        )

        tokens.append(encoded['input_ids'])
        masks.append(encoded['attention_mask'])
        # Segment ids separate sentence A from sentence B; there is only one
        # sentence per input, so every position is 0.
        segments.append([0] * MAX_SEQ_LEN)
        targets.append(y)

    # Convert the accumulated lists to numpy arrays.
    tokens = np.array(tokens)
    masks = np.array(masks)
    segments = np.array(segments)
    targets = np.array(targets)

    return [tokens, masks, segments], targets

In [7]:
# Load the best-performing saved model.
# The custom_objects mapping lets Keras resolve the Hugging Face BERT class
# by name while deserializing the file.
bert_custom_objects = {'TFBertForSequenceClassification': TFBertForSequenceClassification}
sentiment_model_best = tf.keras.models.load_model(
    './best_model.h5',
    custom_objects=bert_custom_objects,
)

In [None]:
# Encode every message into the [tokens, masks, segments] inputs plus targets.
data_X, data_y = convert_data(X_data, y_data)
# Run the loaded classifier over the encoded inputs.
predicted_value = sentiment_model_best.predict(data_X)
# Index of the largest value along axis 1 for each row.
# NOTE(review): assumes predict returns a 2-D (samples, classes) array — confirm.
predicted_label = np.argmax(predicted_value, axis = 1)

20725it [00:13, 1482.93it/s]


 80/648 [==>...........................] - ETA: 1:03:37

In [None]:
# Emotion names, positioned so that index k is the name for predicted class k.
EMOTION_LABELS = np.array(
    ['공포', '놀람', '분노', '슬픔', '중립', '행복', '혐오'],
    dtype=object,
)

# Translate every predicted class index to its name with one vectorized
# lookup instead of a per-row .loc loop, which wrote strings into an integer
# column one cell at a time. The assignment replaces the whole column, so
# pandas raises ValueError if the number of predictions does not match the
# number of rows.
data_set['sentiment'] = EMOTION_LABELS[predicted_label]

data_set

In [None]:
# Count how many rows carry each distinct value of the 'sentiment' column.
data_set['sentiment'].value_counts()