In [1]:
import numpy as np
import pandas as pd
from collections import Counter
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense
from keras.datasets import imdb

In [2]:
(X_train_raw, y_train_raw), (X_test_raw,y_test_raw) = imdb.load_data()

In [3]:
word_index = imdb.get_word_index()

In [4]:
index_word = { v:k for k,v in word_index.items()}

In [5]:
def make_sentence(x):
    doc = []
    for i in x:
        i = np.array(i)[np.array(i) <= max(word_index.values())]
        tmp = []
        for j in range(len(i)):
            tmp.append(index_word[i[j]])
            sentence = ' '.join(tmp)
        doc.append(sentence)
    return pd.DataFrame(doc,columns=['text'])

In [6]:
X_train = make_sentence(X_train_raw)

In [7]:
X_test = make_sentence(X_test_raw)

In [8]:
X = pd.concat([X_train,X_test])

In [10]:
y = np.concatenate([y_train_raw,y_test_raw])

In [11]:
X['label'] = y

In [57]:
# preprocessing
# 단어 unique -full_text, --> unique
# word_index (사전 정수인덱스:단어)
# index_word (사전 단어:정수인덱스)
# 정수 인코딩
# X_train, y_train, X_test, y_test, (8:2) 
# X_train, y_train, X_val, y_val (8:2)
# X_train, val, test ==> one-hot

In [13]:
X.reset_index(inplace=True)

In [14]:
full_text = []
for i in X.text:
    tmp = i.split()
    full_text.extend(tmp)

In [15]:
word_cnt = Counter(full_text)

In [16]:
common_word = word_cnt.most_common(1000)

In [17]:
used_word = [ i for i,j in common_word]

In [18]:
unique_word = list(set(used_word))

In [19]:
word_index = { k+1:v for k, v in enumerate(unique_word)}

In [20]:
index_word = { v:k for k,v in word_index.items()}

In [21]:
max(word_index.keys())

1000

In [22]:
def make_sentence(x):
    encoded_x = []
    for i in x.split():
        encoded_x.append(index_word.get(i,0))
    return encoded_x

In [23]:
encoded_x = X.text.apply(make_sentence)

In [24]:
def vectorize_word(x,dimension=1000):
    t = np.zeros((x.shape[0],dimension))
    for k,v in enumerate(x.values):
        for i in v:
            if i < dimension:
                t[k,i] += 1
    return t

In [25]:
vectorized_x = vectorize_word(encoded_x)

In [26]:
y = X.label.values

In [27]:
y = y.reshape(-1,1)

In [28]:
idx = int(vectorized_x.shape[0]*0.8)

In [29]:
X_train = vectorized_x[:idx]
y_train = y[:idx]
X_test = vectorized_x[idx:]
y_test = y[idx:]

In [30]:
idx = int(X_train.shape[0]*0.8)

In [31]:
X_val = X_train[idx:]
X_train = X_train[:idx]
y_val = y_train[idx:]
y_train = y_train[:idx]

In [32]:
input_shape = X_train.shape[1]
output_shape = y_train.shape[1]

In [33]:
# 모델생성
model = Sequential()
model.add(Dense(64,activation='relu',input_shape=(input_shape,)))
model.add(Dense(32,activation='relu'))
model.add(Dense(output_shape,activation='sigmoid'))

2022-05-03 13:56:03.609737: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-05-03 13:56:03.610530: I tensorflow/core/common_runtime/process_util.cc:146] Creating new thread pool with default inter op setting: 2. Tune using inter_op_parallelism_threads for best performance.


In [34]:
# 모델컴파일
optimizer = 'adam'
loss = 'binary_crossentropy'
metrics = ['accuracy']
model.compile(optimizer=optimizer,
             loss=loss,
             metrics=metrics)

In [35]:
# 모델학습
epochs = 10
batch_size = 100
validation_data = (X_val, y_val)
model.fit(X_train,
          y_train,
         epochs=epochs,
         batch_size= batch_size,
         validation_data=validation_data)

2022-05-03 13:56:09.958968: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f21d06e6130>

In [36]:
# 모델 테스트
model.evaluate(X_test,y_test)



[0.5050464868545532, 0.8456000089645386]

In [63]:
comment = X.text[0]

def make_sentence(x):
    encoded_x = []
    for i in x.split():
        encoded_x.append(index_word.get(i,0))
    return encoded_x

In [73]:
comment = X.text[0]
x = make_sentence(comment)
def vectorize(x):
    t = np.zeros(X_train.shape[1])
    for i in x:
        t[i] += 1
    return t.reshape(1,-1)
x = vectorize(x)
result = np.where(model.predict(x) > 0.5,1,0)
result

array([[1]])

In [81]:
# 1-긍정 0-부정
def answer(x):
    x = make_sentence(x)
    x = vectorize(x)
    result = np.where(model.predict(x) >0.5,'긍정','부정')
    print(str(result[0][0]))
    return result

In [82]:
comment = X.text[3]
comment

"the of bernadette mon they halfway of identity went plot actors watch of share was well these can this only coe ten so failing feels only novak killer theo of bill br gretal would find of films saw grade about hated it for br so ten remain by in of songs are of sahib gigantic is morality it's her or know would care i i br screen that obvious plot actors new would with paris not have attempt lead or of too would local that of every their it coming this eleven of information to concocts br singers movie was anxious that film is under by left this troble is entertainment ok this in own be house of sticks worker in bound my i i obviously sake things just as lost lot br comes never like thing start of obviously comes indeed coming want no bad than history from lost comes accidentally young to movie bad facts dream from reason these honor movie elizabeth it's movie so fi implanted enough to computer duo film paraphrasing almost jeffrey rarely obviously snag alive to appears i i only human i

In [83]:
answer(comment)

긍정


array([['긍정']], dtype='<U2')

In [1]:
from konlpy.tag import Komoran, Kkma, Okt, Hannanum

In [2]:
sentence = '아버지가방에들어가신다'

In [3]:
komoran = Komoran()
okt = Okt()
kkma = Kkma()
han = Hannanum()

In [4]:
komoran.nouns(sentence)

['아버지', '가방']

In [5]:
okt.nouns(sentence)

['아버지', '가방']

In [6]:
kkma.nouns(sentence)

['아버지', '아버지가방', '가방']

In [7]:
han.nouns(sentence)

['아버지가방에들어가']

In [8]:
komoran.pos(sentence)

[('아버지', 'NNG'),
 ('가방', 'NNP'),
 ('에', 'JKB'),
 ('들어가', 'VV'),
 ('시', 'EP'),
 ('ㄴ다', 'EC')]