<a href="https://colab.research.google.com/github/JSJeong-me/AI-Innovation-2024/blob/main/NLP/4-2-Semantic-Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import tensorflow as tf
from tensorflow.keras.datasets import imdb
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

# Hyperparameters
max_features = 10000  # 단어 사전의 크기 (빈도수가 높은 상위 10,000개 단어만 사용)
maxlen = 500  # 각 리뷰의 최대 길이 (500 단어로 자르거나 패딩)
embedding_dim = 128  # 임베딩 벡터의 차원

# 데이터셋 로드
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)

# 시퀀스 패딩 (모든 시퀀스를 동일한 길이로 맞춤)
x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
x_test = sequence.pad_sequences(x_test, maxlen=maxlen)

# 모델 정의
model = Sequential()
model.add(Embedding(input_dim=max_features, output_dim=embedding_dim, input_length=maxlen))
model.add(LSTM(128, return_sequences=False))  # LSTM 층 추가
model.add(Dropout(0.5))  # 과적합 방지를 위한 드롭아웃
model.add(Dense(1, activation='sigmoid'))  # 출력층 (이진 분류)

# 모델 컴파일
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# 모델 요약
model.summary()

# 모델 학습
batch_size = 64
epochs = 5
history = model.fit(x_train, y_train, batch_size=batch_size, epochs=epochs, validation_split=0.2)

# 모델 평가
test_loss, test_acc = model.evaluate(x_test, y_test)
print(f'Test Accuracy: {test_acc:.4f}')


Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz
[1m17464789/17464789[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step




Epoch 1/5
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m400s[0m 1s/step - accuracy: 0.6458 - loss: 0.6128 - val_accuracy: 0.7786 - val_loss: 0.4950
Epoch 2/5
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m462s[0m 1s/step - accuracy: 0.8610 - loss: 0.3534 - val_accuracy: 0.8730 - val_loss: 0.3122
Epoch 3/5
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m440s[0m 1s/step - accuracy: 0.9176 - loss: 0.2210 - val_accuracy: 0.8638 - val_loss: 0.3202
Epoch 4/5
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m403s[0m 1s/step - accuracy: 0.9385 - loss: 0.1682 - val_accuracy: 0.8664 - val_loss: 0.3550
Epoch 5/5
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m443s[0m 1s/step - accuracy: 0.9531 - loss: 0.1287 - val_accuracy: 0.7548 - val_loss: 0.5060
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m214s[0m 273ms/step - accuracy: 0.7535 - loss: 0.5014
Test Accuracy: 0.7573


In [3]:
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences # import the pad_sequences function

# IMDB 단어 인덱스 로드
word_index = imdb.get_word_index()

# 텍스트를 시퀀스로 변환하는 함수
def text_to_sequence(text):
    # 텍스트를 소문자로 변환하고 공백으로 나눔
    words = text.lower().split()
    # 각 단어를 사전에 매핑된 숫자로 변환, 사전에 없는 단어는 0으로 처리
    sequence = [word_index.get(word, 0) for word in words]
    return sequence

# 새로운 리뷰 예측하기 위한 함수
def predict_review(text):
    # 입력 텍스트를 시퀀스로 변환
    sequence = text_to_sequence(text)
    # 모델의 입력 길이에 맞게 시퀀스를 패딩
    padded_sequence = pad_sequences([sequence], maxlen=maxlen) # call pad_sequences correctly
    # 예측 수행
    prediction = model.predict(padded_sequence)
    # 예측 결과 출력 (0에 가까우면 부정, 1에 가까우면 긍정)
    print(f'Review: {text}')
    print(f'Prediction (0 = Negative, 1 = Positive): {prediction[0][0]:.4f}')
    if prediction[0][0] >= 0.5:
        print('Sentiment: Positive')
    else:
        print('Sentiment: Negative')

# 예측 예제
sample_review = "The movie was fantastic and thrilling"
predict_review(sample_review)

sample_review2 = "It was a terrible movie, I hated it"
predict_review(sample_review2)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 460ms/step
Review: The movie was fantastic and thrilling
Prediction (0 = Negative, 1 = Positive): 0.7125
Sentiment: Positive
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 156ms/step
Review: It was a terrible movie, I hated it
Prediction (0 = Negative, 1 = Positive): 0.7943
Sentiment: Positive
