<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"></ul></div>

In [6]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras import Input, Model, Sequential
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing import image_dataset_from_directory
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.image import load_img, img_to_array, ImageDataGenerator
from tensorflow.keras.layers import Layer, Dense, Flatten, Dropout, Concatenate, Add, Dot, Multiply, Reshape, Activation, BatchNormalization, SimpleRNNCell, RNN, SimpleRNN, LSTM, Embedding, Bidirectional, TimeDistributed, Conv1D, Conv2D, MaxPool1D, MaxPool2D, GlobalMaxPool1D, GlobalMaxPool2D, AveragePooling1D, AveragePooling2D, GlobalAveragePooling1D, GlobalAveragePooling2D, ZeroPadding2D
from tensorflow.keras.optimizers import SGD, Adam, Adagrad
from tensorflow.keras.metrics import MeanSquaredError, RootMeanSquaredError, MeanAbsoluteError, MeanAbsolutePercentageError, BinaryCrossentropy, CategoricalCrossentropy, SparseCategoricalCrossentropy, CosineSimilarity
from tensorflow.keras.layers.experimental.preprocessing import Rescaling
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.activations import linear, sigmoid, relu
from tensorflow.keras.initializers import RandomNormal, glorot_uniform, he_uniform, Constant
from tensorflow.keras.models import load_model
import urllib.request
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sb
import string
# from hanspell import spell_checker
from tqdm.notebook import tqdm
# from soynlp.normalizer import *
import MeCab
from konlpy.tag import *
import json
import re
from sklearn.model_selection import train_test_split

class Mecab:
    def pos(self, text):
        p = re.compile(".+\t[A-Z]+")
        return [tuple(p.match(line).group().split("\t")) for line in MeCab.Tagger().parse(text).splitlines()[:-1]]
    
    def morphs(self, text):
        p = re.compile(".+\t[A-Z]+")
        return [p.match(line).group().split("\t")[0] for line in MeCab.Tagger().parse(text).splitlines()[:-1]]
    
    def nouns(self, text):
        p = re.compile(".+\t[A-Z]+")
        temp = [tuple(p.match(line).group().split("\t")) for line in MeCab.Tagger().parse(text).splitlines()[:-1]]
        nouns=[]
        for word in temp:
            if word[1] in ["NNG", "NNP", "NNB", "NNBC", "NP", "NR"]:
                nouns.append(word[0])
        return nouns
mcb = Mecab()
tqdm.pandas()
plt.style.use("dark_background")

  from pandas import Panel


In [2]:
urllib.request.urlretrieve("https://raw.githubusercontent.com/bab2min/corpus/master/sentiment/steam.txt", filename="./Datasets/Steam Reviews/steam.txt")
data = pd.read_table("./Datasets/Steam Reviews/steam.txt", names=["label", "review"])

data["review"] = data["review"].str.replace(f"[{string.punctuation}]", " ")
data["review"] = data["review"].str.replace(r" +", " ")
data = data[data["review"]!=" "]
data = data.dropna(axis=0)
data = data.drop_duplicates(["review"], keep="first")
data["morphs"] = data["review"].progress_apply(mcb.morphs)

HBox(children=(FloatProgress(value=0.0, max=99755.0), HTML(value='')))




In [7]:
tr, te = train_test_split(data, test_size=0.2, shuffle=True, random_state=231)
tr_X = tr["morphs"]
tr_y = np.array(tr["label"])

te_X = te["morphs"]
te_y = np.array(te["label"])

tokenizer = tf.keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(tr_X)
word2idx = tokenizer.word_index
cnts = sorted(tokenizer.word_counts.values(), reverse=True)
ratio = 0.99
for vocab_size, value in enumerate(np.cumsum(cnts)/np.sum(cnts)):
    if value >= ratio:
        break
print(f"{vocab_size:,}개의 단어로 전체 data의 {ratio:.0%}를 표현할 수 있습니다.")
print(f"{len(word2idx):,}개의 단어 중 {vocab_size/len(word2idx):.1%}에 해당합니다.")

20,753개의 단어로 전체 data의 99%를 표현할 수 있습니다.
37,753개의 단어 중 55.0%에 해당합니다.


In [8]:
tokenizer = Tokenizer(num_words=vocab_size + 2, oov_token="UNK")
tokenizer.fit_on_texts(tr_X)
word2idx = tokenizer.word_index
word2cnt = dict(sorted(tokenizer.word_counts.items(), key=lambda x:x[1], reverse=True))

tr_X = tokenizer.texts_to_sequences(tr_X)
te_X = tokenizer.texts_to_sequences(te_X)

lens = sorted([len(doc) for doc in tr_X])
ratio = 0.99
max_len = int(np.quantile(lens, 0.99))
print(f"길이가 가장 긴 문장의 길이는 {np.max(lens)}이고 길이가 {max_len} 이하인 문장이 전체의 {ratio:.0%}를 차지합니다.")

길이가 가장 긴 문장의 길이는 92이고 길이가 65 이하인 문장이 전체의 99%를 차지합니다.


In [13]:
tr_X = pad_sequences(tr_X, maxlen=max_len)
te_X = pad_sequences(te_X, maxlen=max_len)

inputs = Input(shape=(max_len))
z = Embedding(input_dim=vocab_size + 1, output_dim=64)(inputs)
h_size = 128
z = Bidirectional(LSTM(units=h_size))(z)
outputs = Dense(units=1, activation="sigmoid")(z)

model = Model(inputs=inputs, outputs=outputs)

# model = Sequential()
# model.add(Embedding(input_dim=vocab_size+2, output_dim=64))
# model.add(Bidirectional(LSTM(units=h_size)))
# model.add(Dense(units=1, activation="sigmoid"))

es = EarlyStopping(monitor="val_loss", mode="auto", verbose=1, patience=2)
model_path = "steam_review_bidirectional_lstm.h5"
mc = ModelCheckpoint(filepath=model_path, monitor="val_binary_accuracy", mode="auto", verbose=1, save_best_only=True)
model.compile(optimizer="rmsprop", loss="binary_crossentropy", metrics=["binary_accuracy"])
model.summary()

hist = model.fit(x=tr_X, y=tr_y, validation_split=0.2, batch_size=256, epochs=16, verbose=1, callbacks=[es, mc])

NotImplementedError: Cannot convert a symbolic Tensor (bidirectional_2/forward_lstm_2/strided_slice:0) to a numpy array. This error may indicate that you're trying to pass a Tensor to a NumPy call, which is not supported

In [None]:
# tr_X = pad_sequences(tr_X, maxlen=max_len)
# te_X = pad_sequences(te_X, maxlen=max_len)

# model = Sequential()
# model.add(Embedding(input_dim=vocab_size+2, output_dim=64))
# hidden_size = 128
# model.add(Bidirectional(LSTM(units=hidden_size)))
# model.add(Dense(units=1, activation="sigmoid"))

# es = EarlyStopping(monitor="val_loss", mode="auto", verbose=1, patience=2)
# model_path = "steam_review_bidirectional_lstm.h5"
# mc = ModelCheckpoint(filepath=model_path, monitor="val_binary_accuracy", mode="auto", verbose=1, save_best_only=True)

# model.compile(optimizer="rmsprop", loss="binary_crossentropy", metrics=["binary_accuracy"])
# model.summary()

# batch_size = 256
# hist = model.fit(x=tr_X, y=tr_y, validation_split=0.2, batch_size=batch_size, epochs=10, verbose=1, callbacks=[es, mc])

Epoch 1/10


KeyboardInterrupt: 

In [None]:
model = tf.keras.models.load_model(model_path)

model.evaluate(x=te_X, y=te_y, batch_size=batch_size, verbose=1)



[0.4575434923171997, 0.7930930852890015]

In [None]:
def sentiment_predict(sent):
    sent = mcb.morphs(sent)
    seq = tokenizer.texts_to_sequences([sent])
    pad = tf.keras.preprocessing.sequence.pad_sequences(seq, maxlen=max_len)
    score = float(model.predict(pad))
    print(f"이 문장의 긍정도는 {score:.1%}입니다.")

In [None]:
sentiment_predict("노잼 ..완전 재미 없음 ㅉㅉ")

이 문장의 긍정도는 3.4%입니다.


In [None]:
sentiment_predict("조금 어렵지만 재밌음ㅋㅋ")

이 문장의 긍정도는 98.7%입니다.


In [None]:
sentiment_predict("케릭터가 예뻐고 재밌기도 합니다만 애매하네요")

이 문장의 긍정도는 34.1%입니다.
