<a href="https://colab.research.google.com/github/Hwangtaehun/gameAI/blob/main/final_exam_first.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

데이터 로드하기

In [None]:
from google.colab import drive, files

# Mount Google Drive into the Colab filesystem, then open the upload dialog;
# the result of the upload call is kept in `uploaded`.
drive.mount('/content/drive')
uploaded = files.upload()

In [None]:
import tensorflow as tf
# Evaluate the installed TensorFlow version so the notebook shows it as the cell output.
tf.__version__

In [None]:
pip install konlpy

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import re
from konlpy.tag import Okt
from tqdm import tqdm
from tensorflow.keras.preprocessing.text import Tokenizer
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# Load both splits from tab-separated text files in the working directory.
train_data, test_data = (
    pd.read_table(path) for path in ('train_data.txt', 'test_data.txt')
)

In [None]:
# Report how many rows the training frame holds.
num_train_reviews = len(train_data)
print('훈련용 리뷰 개수 : ', num_train_reviews)

In [None]:
# Preview the first five training rows.
train_data.head(5)

In [None]:
# Report how many rows the test frame holds.
num_test_reviews = len(test_data)
print("테스트용 리뷰 개수 : ", num_test_reviews)

In [None]:
# Preview the first five test rows.
test_data.head(5)

train_data 정제화

In [None]:
# Number of distinct review texts and distinct labels in the training data.
unique_train_documents = train_data['document'].nunique()
unique_train_labels = train_data['label'].nunique()
(unique_train_documents, unique_train_labels)

In [None]:
# Drop rows whose review text repeats an earlier row; train_data is modified in place.
train_data.drop_duplicates(inplace=True, subset='document')

In [None]:
# Row count after de-duplication.
total_samples = len(train_data)
print('총 샘플의 수:', total_samples)

In [None]:
# Bar chart of how many training rows carry each label.
label_counts = train_data['label'].value_counts()
label_counts.plot(kind='bar')

In [None]:
# Same label distribution, printed as a two-column table (label, count).
rows_per_label = train_data.groupby('label').size()
label_table = rows_per_label.reset_index(name='count')
print(label_table)

In [None]:
# True if any cell of the training frame is missing.
has_missing_values = train_data.isnull().values.any()
print(has_missing_values)

In [None]:
# Remove every character except Hangul (syllables and jamo) and spaces.
# regex=True is mandatory: from pandas 2.0 on, Series.str.replace treats the
# pattern as a literal string by default, which would leave the text unchanged.
train_data['document'] = train_data['document'].str.replace(
    r"[^ㄱ-ㅎㅏ-ㅣ가-힣 ]", "", regex=True
)
train_data[:5]

test_data 정제화

In [None]:
# Number of distinct review texts and distinct labels in the test data.
unique_test_documents = test_data['document'].nunique()
unique_test_labels = test_data['label'].nunique()
(unique_test_documents, unique_test_labels)

In [None]:
# Drop rows whose review text repeats an earlier row; test_data is modified in place.
test_data.drop_duplicates(inplace=True, subset='document')

In [None]:
# Remove every character except Hangul (syllables and jamo) and spaces.
# The source column must be test_data's own text: reading from train_data
# would overwrite the test reviews with index-aligned training reviews.
# regex=True is needed because pandas >= 2.0 matches literally by default.
test_data['document'] = test_data['document'].str.replace(
    r"[^ㄱ-ㅎㅏ-ㅣ가-힣 ]", "", regex=True
)

In [None]:
# Row count of the test frame after preprocessing.
remaining_test_samples = len(test_data)
print('전처리 후 테스트용 샘플의 개수 :', remaining_test_samples)

토큰화

In [None]:
# Stopwords that are filtered out of every tokenized review.
stopwords = [
    '의', '가', '이', '은', '들', '는', '좀', '잘', '걍',
    '과', '도', '를', '으로', '자', '에', '와', '한', '하다',
]
# Single Okt analyzer instance reused by all tokenization steps.
okt = Okt()

train_data 토큰화

In [None]:
# Tokenize each training review with Okt (stem=True) and drop stopwords;
# tqdm shows progress over the document column.
X_train = [
    [token for token in okt.morphs(document, stem=True) if token not in stopwords]
    for document in tqdm(train_data['document'])
]

In [None]:
# Tokenize each test review with Okt (stem=True) and drop stopwords;
# tqdm shows progress over the document column.
X_test = [
    [token for token in okt.morphs(document, stem=True) if token not in stopwords]
    for document in tqdm(test_data['document'])
]

In [None]:
# First pass: fit an unrestricted tokenizer on the training tokens so that
# per-word counts are available for the rare-word analysis.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts=X_train)

In [None]:
# Words occurring fewer than `threshold` times are counted as rare.
threshold = 3
word_counts = tokenizer.word_counts

# Vocabulary size and total number of word occurrences.
total_cnt = len(tokenizer.word_index)
total_freq = sum(word_counts.values())

# Number of rare words and the occurrences they account for.
rare_counts = [count for count in word_counts.values() if count < threshold]
rare_cnt = len(rare_counts)
rare_freq = sum(rare_counts)

In [None]:
# Vocabulary without the rare words, plus one extra slot
# (presumably for the reserved index 0 used as padding — confirm against the Tokenizer docs).
kept_words = total_cnt - rare_cnt
vocab_size = kept_words + 1
print('단어 집합의 크기 :', vocab_size)

In [None]:
# Second pass: refit a tokenizer capped at vocab_size words, then turn both
# token lists into integer sequences with it.
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(X_train)
X_train, X_test = (
    tokenizer.texts_to_sequences(texts) for texts in (X_train, X_test)
)

In [None]:
# Show the first three encoded training reviews.
first_three_encoded = X_train[:3]
print(first_three_encoded)

In [None]:
# Copy the label columns into NumPy arrays, one per split.
y_train, y_test = (
    np.array(frame['label']) for frame in (train_data, test_data)
)

빈 샘플 제거

In [None]:
# Positions of reviews that became empty after encoding, collected per split.
drop_train = [pos for pos, encoded in enumerate(X_train) if len(encoded) == 0]
drop_test = [pos for pos, encoded in enumerate(X_test) if len(encoded) == 0]

train_data 빈 샘플 제거

In [None]:
# Remove the empty training reviews and their labels so both stay aligned.
# X_train is a list of variable-length sequences: passing it to np.delete
# would force a ragged ndarray, which NumPy >= 1.24 rejects with ValueError,
# so it is filtered as a plain list. y_train is a flat array, so np.delete
# is still fine there.
_empty_train_positions = set(drop_train)
X_train = [
    sequence
    for position, sequence in enumerate(X_train)
    if position not in _empty_train_positions
]
y_train = np.delete(y_train, drop_train, axis=0)
print(len(X_train))
print(len(y_train))

test_data 빈 샘플 제거

In [None]:
# Remove the empty test reviews and their labels so both stay aligned.
# X_test is a list of variable-length sequences: passing it to np.delete
# would force a ragged ndarray, which NumPy >= 1.24 rejects with ValueError,
# so it is filtered as a plain list. y_test is a flat array, so np.delete
# is still fine there.
_empty_test_positions = set(drop_test)
X_test = [
    sequence
    for position, sequence in enumerate(X_test)
    if position not in _empty_test_positions
]
y_test = np.delete(y_test, drop_test, axis=0)
print(len(X_test))
print(len(y_test))

In [None]:
# Length (in tokens) of every training review, computed once and reused below.
review_lengths = [len(review) for review in X_train]

print('리뷰의 최대 길이 :', max(review_lengths))
print('리뷰의 평균 길이 :', sum(review_lengths) / len(review_lengths))

# Histogram of review lengths.
plt.hist(review_lengths, bins=50)
plt.xlabel('length of samples')
plt.ylabel('number of samples')
plt.show()

In [None]:
def below_threshold_len(max_len, nested_list):
    """Print the percentage of samples whose length is at most ``max_len``.

    Args:
        max_len: Inclusive length cutoff.
        nested_list: Sized collection of sequences (each must support ``len``).

    Returns:
        None. The percentage is only printed. An empty ``nested_list`` is
        reported as 0.0 instead of raising ``ZeroDivisionError``.
    """
    total = len(nested_list)
    count = sum(1 for sentence in nested_list if len(sentence) <= max_len)
    # Guard the division so an empty collection does not crash the cell.
    ratio = (count / total) * 100 if total else 0.0
    print('전체 샘플 중 길이가 %s 이하인 샘플의 비율: %s'%(max_len, ratio))

In [None]:
# Padding length used for every review; print how many training reviews fit within it.
max_len = 17
below_threshold_len(max_len=max_len, nested_list=X_train)

In [None]:
# Pad/truncate every encoded review in both splits to exactly max_len entries.
X_train, X_test = (
    pad_sequences(sequences, maxlen=max_len) for sequences in (X_train, X_test)
)

LSTM 분류

In [None]:
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.layers import Dense, Embedding, LSTM
from tensorflow.keras.models import Sequential, load_model

embedding_dim = 100
hidden_units = 128

# Embedding -> LSTM -> one sigmoid unit, trained with binary cross-entropy.
model = Sequential([
    Embedding(vocab_size, embedding_dim),
    LSTM(hidden_units),
    Dense(1, activation='sigmoid'),
])
model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])

# Stop when validation loss has not improved for 4 epochs, and keep only the
# checkpoint with the best validation accuracy in best_model.h5.
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=4)
mc = ModelCheckpoint('best_model.h5', monitor='val_acc', mode='max', verbose=1, save_best_only=True)

# 20% of the training data is held out for validation.
history = model.fit(
    X_train,
    y_train,
    epochs=15,
    batch_size=64,
    validation_split=0.2,
    callbacks=[es, mc],
)

In [None]:
# Reload the checkpoint written during training and evaluate it on the test split.
loaded_model = load_model('best_model.h5')
evaluation = loaded_model.evaluate(X_test, y_test)
# Index 1 of the evaluation result is printed as the accuracy.
print("\n 테스트 정확도: %.4f" % (evaluation[1]))

리뷰 예측

In [None]:
def sentiment_predict(new_sentence):
    """Print the model's score for a single raw sentence.

    The sentence goes through the same steps as the training data: strip
    everything except Hangul and spaces, tokenize with Okt (stem=True),
    drop stopwords, encode with the fitted tokenizer and pad to ``max_len``.

    Args:
        new_sentence: Raw input text.

    Returns:
        None. The score (output of the model's single sigmoid unit) is printed.
    """
    cleaned = re.sub(r'[^ㄱ-ㅎㅏ-ㅣ가-힣 ]', '', new_sentence)
    tokens = [word for word in okt.morphs(cleaned, stem=True) if word not in stopwords]
    encoded = tokenizer.texts_to_sequences([tokens])
    pad_new = pad_sequences(encoded, maxlen=max_len)
    # predict() returns a 2-D array for the one-sentence batch; pick the single
    # element explicitly, because float() on an array with ndim > 0 is
    # deprecated since NumPy 1.25.
    score = float(loaded_model.predict(pad_new)[0][0])
    print(score)

In [None]:
# Score each single emotion word in turn.
for emotion_word in ('기쁨', '놀람', '분노', '불안', '혐오', '슬픔'):
    sentiment_predict(emotion_word)

In [None]:
# Score one full example sentence.
sample_sentence = '나 오늘 합격했어'
sentiment_predict(sample_sentence)

In [None]:
# Score one full example sentence.
sample_sentence = '짜증나! 오늘 과제 너무 많아'
sentiment_predict(sample_sentence)

In [None]:
# Score one full example sentence.
sample_sentence = '앗, 깜짝이야!'
sentiment_predict(sample_sentence)

In [None]:
# Score one full example sentence.
sample_sentence = '오늘 큰이모께서 돌아가셨어!'
sentiment_predict(sample_sentence)