In [1]:
!pip install transformers



In [2]:
import pandas as pd
import numpy as np
import urllib.request
import os
import tensorflow as tf
from tqdm import tqdm
from transformers import AutoTokenizer, TFGPT2Model
from tensorflow.keras.preprocessing.sequence import pad_sequences

# 1) 데이터 확인

In [3]:
urllib.request.urlretrieve("https://raw.githubusercontent.com/e9t/nsmc/master/ratings_train.txt", filename="ratings_train.txt")
urllib.request.urlretrieve("https://raw.githubusercontent.com/e9t/nsmc/master/ratings_test.txt", filename="ratings_test.txt")

('ratings_test.txt', <http.client.HTTPMessage at 0x7a87d68811d0>)

In [4]:
train_data = pd.read_table('ratings_train.txt')
test_data = pd.read_table('ratings_test.txt')

In [5]:
print(len(train_data), len(test_data))

150000 50000


In [6]:
#중복, 결측값 제거
train_data = train_data.drop_duplicates(subset = ['document'])
train_data = train_data.dropna(how = 'any')
len(train_data)

146182

In [7]:
test_data = test_data.dropna(how = 'any')
len(test_data)

49997

# 2) GPT의 입력

In [8]:
tokenizer = AutoTokenizer.from_pretrained("skt/kogpt2-base-v2", bos_token = '</s>', eos_token = '</s>', pad_token = '<pad>')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

In [9]:
print(tokenizer.tokenize("보는내내 그대로 들어맞는 예측 카리스마 없는 악역"))

['▁보는', '내', '내', '▁그대로', '▁들어', '맞', '는', '▁예측', '▁카', '리스', '마', '▁없는', '▁악', '역']


In [10]:
tokenizer.decode(tokenizer.encode("보는내내 그대로 들어맞는 예측 카리스마 없는 악역"))

'보는내내 그대로 들어맞는 예측 카리스마 없는 악역'

In [11]:
print(tokenizer.decode(3))

<pad>


In [12]:
max_seq_len = 128
encoded_result = tokenizer.encode("전율을 일으키는 영화. 다시 보고 싶은 영화", max_length = max_seq_len, padding = 'max_length', truncation = True)
print(encoded_result)
print(len(encoded_result))

[9034, 13555, 16447, 10584, 389, 9427, 10056, 22386, 10584, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3]
128


In [13]:
# 전체 데이터 전처리
def convert_features(examples,labels, max_seq_len, tokenizer):
  input_ids, data_labels = [], []

  for example, label in tqdm(zip(examples, labels), total = len(examples)):
    eos_token = [tokenizer.eos_token]
    bos_token = [tokenizer.bos_token]
    tokens = bos_token + tokenizer.tokenize(example) + eos_token

    input_id = tokenizer.convert_tokens_to_ids(tokens)
    input_id = pad_sequences([input_id] , maxlen = max_seq_len, value = tokenizer.pad_token_id, padding = 'post')[0]

    assert len(input_id) == max_seq_len, "Error with input length {} vs {}".format(len(input_id), max_seq_len)

    input_ids.append(input_id)
    data_labels.append(label)

  input_ids = np.array(input_ids, dtype = 'int')
  data_labels = np.asarray(data_labels, dtype = np.int32)
  return input_ids, data_labels

In [14]:
train_X, train_y = convert_features(train_data['document'], train_data['label'], max_seq_len = max_seq_len, tokenizer = tokenizer)

100%|██████████| 146182/146182 [00:27<00:00, 5274.84it/s]


In [15]:
test_X, test_y = convert_features(test_data['document'], test_data['label'], max_seq_len=max_seq_len, tokenizer=tokenizer)

100%|██████████| 49997/49997 [00:07<00:00, 6306.34it/s]


In [16]:
input_id = train_X[0]
label = train_y[0]

print('단어에 대한 정수 인코딩 :',input_id)
print('각 인코딩의 길이 :', len(input_id))
print('정수 인코딩 복원 :',tokenizer.decode(input_id))
print('레이블 :',label)

단어에 대한 정수 인코딩 : [    1  9050  9267  7700  9705 23971 12870  8262  7055  7098  8084 48213
     1     3     3     3     3     3     3     3     3     3     3     3
     3     3     3     3     3     3     3     3     3     3     3     3
     3     3     3     3     3     3     3     3     3     3     3     3
     3     3     3     3     3     3     3     3     3     3     3     3
     3     3     3     3     3     3     3     3     3     3     3     3
     3     3     3     3     3     3     3     3     3     3     3     3
     3     3     3     3     3     3     3     3     3     3     3     3
     3     3     3     3     3     3     3     3     3     3     3     3
     3     3     3     3     3     3     3     3     3     3     3     3
     3     3     3     3     3     3     3     3]
각 인코딩의 길이 : 128
정수 인코딩 복원 : </s> 아 더빙.. 진짜 짜증나네요 목소리</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pa

# 3) GPT를 이용한 텍스트 분류 모델 만들기

In [17]:
model = TFGPT2Model.from_pretrained("skt/kogpt2-base-v2", from_pt = True)

pytorch_model.bin:   0%|          | 0.00/513M [00:00<?, ?B/s]

TensorFlow and JAX classes are deprecated and will be removed in Transformers v5. We recommend migrating to PyTorch classes or pinning your version of Transformers.
Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFGPT2Model: ['transformer.h.8.attn.masked_bias', 'lm_head.weight', 'transformer.h.1.attn.masked_bias', 'transformer.h.6.attn.masked_bias', 'transformer.h.3.attn.masked_bias', 'transformer.h.0.attn.masked_bias', 'transformer.h.10.attn.masked_bias', 'transformer.h.4.attn.masked_bias', 'transformer.h.7.attn.masked_bias', 'transformer.h.11.attn.masked_bias', 'transformer.h.9.attn.masked_bias', 'transformer.h.5.attn.masked_bias', 'transformer.h.2.attn.masked_bias']
- This IS expected if you are initializing TFGPT2Model from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFGPT2Model from 

In [18]:
class TFGPT2ForSequenceClassification(tf.keras.Model):
  def __init__(self, model_name):
    super(TFGPT2ForSequenceClassification, self).__init__()
    self.gpt = TFGPT2Model.from_pretrained(model_name, from_pt = True)
    self.dropout = tf.keras.layers.Dropout(0.2)
    self.classifier = tf.keras.layers.Dense(1, kernel_initializer =tf.keras.initializers.TruncatedNormal(0.02), activation = 'sigmoid', name = 'classifier')

  def call(self, inputs):
    outputs = self.gpt(input_ids = inputs)
    cls_token = outputs[0][:, -1]
    cls_token = self.dropout(cls_token)
    prediction = self.classifier(cls_token)

    return prediction

In [19]:
model = TFGPT2ForSequenceClassification("skt/kogpt2-base-v2")
optimizer = tf.keras.optimizers.Adam(learning_rate = 5e-5)
loss = tf.keras.losses.BinaryCrossentropy()
model.compile(optimizer = optimizer, loss = loss, metrics = ['accuracy'])

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFGPT2Model: ['transformer.h.8.attn.masked_bias', 'lm_head.weight', 'transformer.h.1.attn.masked_bias', 'transformer.h.6.attn.masked_bias', 'transformer.h.3.attn.masked_bias', 'transformer.h.0.attn.masked_bias', 'transformer.h.10.attn.masked_bias', 'transformer.h.4.attn.masked_bias', 'transformer.h.7.attn.masked_bias', 'transformer.h.11.attn.masked_bias', 'transformer.h.9.attn.masked_bias', 'transformer.h.5.attn.masked_bias', 'transformer.h.2.attn.masked_bias']
- This IS expected if you are initializing TFGPT2Model from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFGPT2Model from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All t

In [20]:
model.fit(train_X, train_y, epochs=2, batch_size=32, validation_split=0.2)


Epoch 1/2
[1m3655/3655[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1157s[0m 312ms/step - accuracy: 0.4960 - loss: 0.8772 - val_accuracy: 0.5989 - val_loss: 0.6709
Epoch 2/2
[1m3655/3655[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1156s[0m 311ms/step - accuracy: 0.6028 - loss: 0.6656 - val_accuracy: 0.6553 - val_loss: 0.6409


<keras.src.callbacks.history.History at 0x7a87d5a439d0>

In [21]:
results = model.evaluate(test_X, test_y, batch_size = 1024)
print('test loss, test acc: ', results)

[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m354s[0m 7s/step - accuracy: 0.6421 - loss: 0.6461
test loss, test acc:  [0.6450586915016174, 0.6477188467979431]


#4) 리뷰 예측해보기

In [22]:
def sentiment_predict(new_sentence):
  bos_token = [tokenizer.bos_token]
  eos_token = [tokenizer.eos_token]
  tokens = bos_token + tokenizer.tokenize(new_sentence) + eos_token
  input_id = tokenizer.convert_tokens_to_ids(tokens)
  input_id = pad_sequences([input_id], maxlen = max_seq_len, value = tokenizer.pad_token_id, padding = 'post')[0]
  input_id = np.array([input_id])
  score = model.predict(input_id)[0][0]

  if (score > 0.5):
    print("{:.2f}% 확률로 긍정 리뷰입니다.\n".format(score * 100))
  else:
    print("{:.2f}% 확률로 부정 리뷰입니다.\n".format((1- score) * 100))

In [23]:
sentiment_predict("보던거라 계속보고있는데 전개도 느리고 주인공인 은희는 한두컷 나오면서 소극적인모습에")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4s/step
60.10% 확률로 부정 리뷰입니다.



In [24]:
sentiment_predict("스토리는 확실히 실망이였지만 배우들 연기력이 대박이였다 특히 이제훈 연기 정말 ... 이 배우들로 이렇게밖에 만들지 못한 영화는 아쉽지만 배우들 연기력과 사운드는 정말 빛났던 영화. 기대하고 극장에서 보면 많이 실망했겠지만 평점보고 기대없이 집에서 편하게 보면 괜찮아요. 이제훈님 연기력은 최고인 것 같습니다")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
51.65% 확률로 긍정 리뷰입니다.



In [25]:
sentiment_predict('이 영화 개꿀잼 ㅋㅋㅋ')

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 66ms/step
68.17% 확률로 긍정 리뷰입니다.

