<a href="https://colab.research.google.com/github/Hyuncastl/MACHINE_LEARNING/blob/main/10%EC%A3%BC%EC%B0%A8.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import tensorflow as tf

import warnings
warnings.filterwarnings(action='ignore')

In [None]:
# Load the Naver Sentiment Movie Corpus v1.0 training split.
# get_file downloads the file and returns its local path. The file is a plain
# tab-separated text file, not an archive, so no extraction is requested.
train_file = tf.keras.utils.get_file(
    'ratings_train.txt',
    origin='https://raw.githubusercontent.com/e9t/nsmc/master/ratings_train.txt')

train = pd.read_csv(train_file, sep='\t')

In [None]:
# Check the dataset size and preview the first rows
print("train shape: ", train.shape)
train.head()

In [None]:
# Display the type of the object returned by read_csv
type(train)

In [None]:
# Number of rows for each label value
cnt = train['label'].value_counts()
print(cnt)

In [None]:
# Class balance: bar chart of the row count for each label
sns.countplot(x='label',data=train)

In [None]:
# Count the missing values in each column
train.isnull().sum() 

In [None]:
# Show the rows whose review text is missing, to see whether they fall under a single label
train[train['document'].isnull()]

In [None]:
# Character-length distribution of the reviews, one histogram panel per label
fig,(ax1,ax2)=plt.subplots(1,2,figsize=(10,5))
for axis, label_value, panel_title in ((ax1, 1, 'positive'), (ax2, 0, 'negative')):
    # length in characters of every review carrying this label
    data_len = train.loc[train['label'] == label_value, 'document'].str.len()
    axis.hist(data_len)
    axis.set_title(panel_title)
fig.suptitle('Number of characters')
plt.show()

In [None]:
# Fetch the helper repository containing the Mecab-ko install script for Colab
! git clone https://github.com/SOMJANG/Mecab-ko-for-Google-Colab.git

In [None]:
cd Mecab-ko-for-Google-Colab/

In [None]:
# Run the repository's Mecab-ko install script (relative path, so the working directory must be the cloned repo)
! bash install_mecab-ko_on_colab_light_220429.sh

In [None]:
# Smoke-test the freshly installed tagger.
# Mecab is imported here because the notebook's konlpy import cell comes later;
# without this import a fresh kernel raises NameError on the next line.
from konlpy.tag import Mecab

mecab = Mecab()
# Prints the bound `pos` method itself (no text is tagged here)
print(mecab.pos)

In [None]:
# Load KoNLPy and create one instance of each morphological analyzer: Kkma, Komoran, Okt and Mecab
import konlpy
from konlpy.tag import Kkma, Komoran, Okt, Mecab

# Module-level instances; later cells call them by these names
kkma = Kkma()
komoran = Komoran()
okt = Okt()
mecab = Mecab()

In [None]:
# Sample sentence used to compare the output of the analyzers
text = "영실아안녕오늘날씨어때?"

def sample_ko_pos(text):
    """Print the POS tagging of `text` produced by each of the four analyzers.

    Output is a header line with the text, one line per analyzer
    (kkma, komoran, okt, mecab) and a trailing blank separator.
    """
    print(f"==== {text} ====")
    labelled_taggers = (
        ("kkma:", kkma),
        ("komoran:", komoran),
        ("okt:", okt),
        ("mecab:", mecab),
    )
    for prefix, tagger in labelled_taggers:
        print(prefix, tagger.pos(text))
    print("\n")

sample_ko_pos(text)

In [None]:
# Text cleaning: keep only English letters, Hangul syllables/jamo and spaces.
# regex=True must be explicit: since pandas 2.0 Series.str.replace treats the
# pattern as a literal string by default, which would leave every review unchanged.
train['document'] = train['document'].str.replace("[^A-Za-z가-힣ㄱ-ㅎㅏ-ㅣ ]", "", regex=True)
train['document'].head()

In [None]:
# Drop the rows that contain missing values and show the remaining shape
train = train.dropna()
train.shape

In [None]:
# Morpheme tokenisation with stop-word removal
def word_tokenization(text):
  """Split `text` into Mecab morphemes and return them without the Korean stop words."""
  stop_words = ["는", "을", "를", '이', '가', '의', '던', '고', '하', '다', '은', '에', '들', '지', '게', '도'] # Korean stop words
  kept = []
  for morph in mecab.morphs(text):
    if morph in stop_words:
      continue
    kept.append(morph)
  return kept

In [None]:
# Tokenise every review; the result is a Series of token lists
data = train['document'].apply(word_tokenization)
data.head()

In [None]:
# Split the tokenised reviews and their labels into training and validation parts

training_size = 120000

# The first `training_size` rows are used for training, the remainder for validation
train_sentences, valid_sentences = data[:training_size], data[training_size:]
train_labels, valid_labels = train['label'][:training_size], train['label'][training_size:]

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Fit a tokenizer on the whole tokenised corpus to measure the vocabulary
tokenizer = Tokenizer()
tokenizer.fit_on_texts(data)
print("총 단어 갯수 : ",len(tokenizer.word_index))

def get_vocab_size(threshold):
  """Return how many distinct words occur at least `threshold` times in the fitted tokenizer."""
  return sum(1 for occurrences in tokenizer.word_counts.values() if occurrences >= threshold)

vocab_size = get_vocab_size(5)  # words that appear 5 times or more
print("vocab_size: ", vocab_size)

In [None]:
# NOTE(review): the OOV token is an empty string here; presumably a visible
# placeholder such as "<OOV>" was intended — confirm.
oov_tok = "" # token used for words that are not in the vocabulary
# Fixed vocabulary cap; this overrides the frequency-based vocab_size computed in the previous cell
vocab_size = 15000

# Rebuild the tokenizer with the OOV token and the vocabulary cap, then fit it on the full corpus
tokenizer = Tokenizer(oov_token=oov_tok, num_words=vocab_size)
tokenizer.fit_on_texts(data)
print(tokenizer.word_index)
print("단어 사전 개수:", len(tokenizer.word_counts))

In [None]:
# Convert the token lists into integer sequences with the fitted tokenizer
print(train_sentences[:2])
print(valid_sentences[:2])
train_sequences = tokenizer.texts_to_sequences(train_sentences)
valid_sequences = tokenizer.texts_to_sequences(valid_sentences)
print(train_sequences[:2])
# Show the encoded validation samples (previously the raw tokens were printed a second time)
print(valid_sequences[:2])

In [None]:
# Longest encoded training review, measured in tokens
max_length = max(map(len, train_sequences))
print("문장 최대 길이:", max_length)

In [None]:
# Bring every sequence to the same length (max_length)
# Both truncation and padding are applied at the end of a sequence
trunc_type='post'
padding_type='post'

train_padded = pad_sequences(train_sequences, truncating=trunc_type, padding=padding_type, maxlen=max_length)
valid_padded = pad_sequences(valid_sequences, truncating=trunc_type, padding=padding_type, maxlen=max_length)

# Convert the labels to column arrays of shape (n, 1)
train_labels = np.asarray(train_labels).reshape(-1,1)
valid_labels = np.asarray(valid_labels).reshape(-1,1)

print("샘플:", train_padded[:1])

In [None]:
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, LSTM, Embedding, Bidirectional

def create_model():
    """Build and compile the bidirectional-LSTM binary sentiment classifier.

    The recurrent layer returns only its final output, so the network emits a
    single probability per review, matching the (n, 1) label arrays it is
    trained on. With return_sequences=True the Dense head would be applied at
    every timestep and the model would output one score per token position.

    Returns:
        A compiled Sequential model (binary cross-entropy loss, Adam optimizer,
        accuracy metric). The embedding size is taken from the module-level
        `vocab_size`.
    """
    model = Sequential([
                Embedding(vocab_size, 32),
                Bidirectional(LSTM(32)),
                Dense(32, activation='relu'),
                Dense(1, activation='sigmoid')
    ])
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

# Build the model and print its layer summary
model = create_model()
model.summary()

In [None]:
# Save the weights of the epoch with the best (lowest) validation loss.
# The '.weights.h5' suffix is what Keras 3 requires for weights-only checkpoints;
# older tf.keras versions accept it as well and write HDF5 weights.
checkpoint_path = 'best_performed_model.weights.h5'
checkpoint = tf.keras.callbacks.ModelCheckpoint(checkpoint_path,
                                                save_weights_only=True,
                                                save_best_only=True,
                                                monitor='val_loss',
                                                verbose=1)

In [None]:
# Early stopping: monitors val_loss with a patience of 2 epochs
early_stop = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=2)

# Train on the padded training set and validate on the held-out split;
# both the early-stopping and the checkpoint callbacks are active
history = model.fit(train_padded, train_labels, 
                validation_data=(valid_padded, valid_labels), 
                callbacks=[early_stop, checkpoint], batch_size=64, epochs=10, verbose=2)

In [None]:
def plot_graphs(history, metric):
  """Plot the training curve of `metric` and its 'val_' counterpart against the epoch index."""
  curves = history.history
  val_key = 'val_'+metric
  plt.plot(curves[metric])
  plt.plot(curves[val_key], '')
  plt.xlabel("Epochs")
  plt.ylabel(metric)
  plt.legend([metric, val_key])
  plt.show()
plot_graphs(history, 'accuracy')

In [None]:
# Training and validation loss per epoch
plot_graphs(history, 'loss')

In [None]:
# Load the test split of the corpus.
# The file is a plain tab-separated text file, not an archive, so no extraction is requested.
test_file = tf.keras.utils.get_file(
    'ratings_test.txt',
    origin='https://raw.githubusercontent.com/e9t/nsmc/master/ratings_test.txt')

test = pd.read_csv(test_file, sep='\t')
test.head()
     

In [None]:
# Data preprocessing
def preprocessing(df):
  """Turn a raw review DataFrame into padded integer sequences and labels.

  Applies the same steps as the training data: character filtering, dropping
  rows with missing values, morpheme tokenisation, integer encoding with the
  fitted tokenizer and padding to max_length.

  Args:
    df: DataFrame with 'document' and 'label' columns. It is not modified.

  Returns:
    A (test_data, test_label) tuple: the padded sequences and the label array.
  """
  # assign() builds a new frame so the caller's DataFrame keeps its raw text.
  # regex=True is required because pandas >= 2.0 replaces literally by default.
  cleaned = df.assign(
      document=df['document'].str.replace("[^A-Za-z가-힣ㄱ-ㅎㅏ-ㅣ ]", "", regex=True)
  ).dropna()
  test_label = np.asarray(cleaned['label'])
  test_data = cleaned['document'].apply(word_tokenization)
  test_data = tokenizer.texts_to_sequences(test_data)
  test_data = pad_sequences(test_data, truncating=trunc_type, padding=padding_type, maxlen=max_length)
  return test_data, test_label

# Preprocess the test split and evaluate the trained model on it
test_data, test_label = preprocessing(test)
print(model.evaluate(test_data, test_label))

In [None]:
# Baseline: build a fresh model with create_model() and evaluate it before any weights are loaded
model2 = create_model()
model2.evaluate(test_data, test_label)

In [None]:
# Load the checkpointed weights into the model and evaluate it again
model2.load_weights(checkpoint_path)
model2.evaluate(test_data, test_label)

In [None]:
import pickle
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import urllib.request
from konlpy.tag import Okt
from tqdm import tqdm
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
okt=Okt() ## Okt (Open Korean Text): a morphological analyzer created at Twitter
# Tokenise a sample sentence into morphemes with stemming enabled
okt.morphs('와 이런 것도 영화라고 차라리 뮤직비디오를 만드는 게 나을 뻔',stem = True)

In [None]:
## String substitution: re.sub(search pattern, replacement, string to search)
new_sentence = '이 영화 개꿀잼ㅎㅎㅎㅋㅋㅋ'
# Remove every character that is not Hangul jamo, a Hangul syllable or a space
new_sentence = re.sub(r'[^ㄱ-ㅎ ㅏ-ㅣ 가-힣]','',new_sentence)
new_sentence

In [None]:
# Tokenise the cleaned sentence into stemmed morphemes with Okt (the string becomes a list of tokens)
new_sentence = okt.morphs(new_sentence, stem=True)
new_sentence

In [None]:
# Korean stop words to be removed from token lists
stop_words = ["는", "을", "를", '이', '가', '의', '던', '고', '하', '다', '은', '에', '들', '지', '게', '도']

In [None]:
# Drop the stop words from the token list
new_sentence = [word for word in new_sentence if not word in stop_words]
new_sentence

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# vocab_size 설정
tokenizer = Tokenizer()

In [None]:
oov_tok = ""#사전에 없는 단어
vocab_size = 15000

tokenizer = Tokenizer(oov_token=oov_tok, num_words=vocab_size)
tokenizer.fit_on_texts(new_sentence)
     

In [None]:
# Encode the token list as ONE sequence. texts_to_sequences expects a list of texts,
# so passing the bare token list would turn every token into its own one-word "sentence".
encoded = tokenizer.texts_to_sequences([new_sentence])
encoded

In [None]:
# Pad the same way as the training data (post-truncation, post-padding, max_length);
# the pad_sequences defaults pad at the front, which the model never saw during training.
pad_new = pad_sequences(encoded, truncating=trunc_type, padding=padding_type, maxlen=max_length)
pad_new.shape

In [None]:
# Shape of the model output for the padded sample
model2.predict(pad_new).shape

In [None]:
# Raw model output for the padded sample
model2.predict(pad_new)

In [None]:
# Take the largest value in the model output as the score and print it
score=np.max(model2.predict(pad_new))
print(score)

In [None]:
# Kept for backward compatibility; sentiment_predict pads to `max_length`, the
# length measured on the training data, instead of this hard-coded value.
max_len = 74
stop_words = ["는", "을", "를", '이', '가', '의', '던', '고', '하', '다', '은', '에', '들', '지', '게', '도']

def sentiment_predict(new_sentence):
  """Print the predicted sentiment of one raw review string.

  The sentence goes through the same pipeline as the training data: character
  filtering, word_tokenization (Mecab morphemes without stop words), integer
  encoding with the fitted tokenizer and post-padding to max_length. It is then
  scored with model2.

  Args:
    new_sentence: raw review text.

  Prints:
    The probability of the predicted class (positive if the score is above 0.5,
    otherwise negative).
  """
  import re  # local import keeps the function usable regardless of cell order

  # str.replace is literal, so the regex filter has to go through re.sub;
  # the pattern is the one used to clean the training data.
  cleaned = re.sub("[^A-Za-z가-힣ㄱ-ㅎㅏ-ㅣ ]", "", new_sentence)
  # Same tokeniser as training, so the tokens match the fitted vocabulary
  tokens = word_tokenization(cleaned)
  # Wrap in a list: the whole sentence is one sequence, not one sequence per token
  encoded = tokenizer.texts_to_sequences([tokens])
  pad_new = pad_sequences(encoded, truncating=trunc_type, padding=padding_type, maxlen=max_length)
  score = float(np.max(model2.predict(pad_new)))
  if score > 0.5 :
    print("{:.2f}% 확률로 긍정 리뷰 입니다.\n".format(score*100))
  else:
    # the probability of the negative class is the complement of the score
    print("{:.2f}% 확률로 부정 리뷰 입니다.\n".format((1 - score)*100))

In [None]:
# Score a sample review that contains slang, jamo and punctuation
sentiment_predict('이 영화 개꿀잼 ㅋㅋㅋ~~!!')

In [None]:
# Score a second sample review
sentiment_predict('이딴게 영화냐 ㅉㅉ')

In [None]:
# Score a third sample review
sentiment_predict('와 개쩐다 정말 세계관 최강자들의 영화다')

In [None]:
# Score a sample review phrased as a question
sentiment_predict('감독 뭐하는 놈이냐?')