# 감성분석

In [1]:
import pandas as pd

In [2]:
review = pd.read_excel('review.xlsx', index_col=0)

In [3]:
from kiwipiepy import Kiwi

kiwi = Kiwi()
kiwi.prepare()

0

In [4]:
def extract_keywords(text):
    result = kiwi.analyze(text)
    for token, pos, _, _ in result[0][0]:
        if pos[0] in 'NV':
            yield f'{token}/{pos}'

In [5]:
from sklearn.feature_extraction.text import CountVectorizer

In [6]:
cv = CountVectorizer(max_features=800, tokenizer=extract_keywords)
dtm = cv.fit_transform(review['review'])

In [7]:
words = cv.get_feature_names()

In [8]:
# 불용어 사전 만들기
stop_words = ['1스쿱','2스쿱','개','것','곳','그거','내','개인','나','때문','마이프로틴','보충제','마프','한스쿱','두스쿱','아이솔레이트','아이솔레이','아이솔']
# 이해가 안가거나 필요없는 단어들을 제거

In [9]:
cv = CountVectorizer(max_features=500, stop_words = stop_words, tokenizer=extract_keywords)
dtm = cv.fit_transform(review['review'])

  'stop_words.' % sorted(inconsistent))


In [10]:
words = cv.get_feature_names()

In [11]:
import joblib
joblib.dump({'words': words, 'dtm': dtm}, 'nsmc.pkl')

['nsmc.pkl']

In [12]:
data = joblib.load('nsmc.pkl')

In [13]:
locals().update(data)

In [14]:
len(review.query('label=="1"'))

3840

In [15]:
len(review.query('label=="0"'))

1084

In [16]:
# 긍정의 데이터가 많이 편향되어있어서 긍정데이터에서 2500개 삭제
review1 = review.drop(review[review['label'].eq(1)].sample(2500).index)

In [17]:
x = dtm
y = review.label

In [18]:
from sklearn.model_selection import train_test_split

In [19]:
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.3, random_state=0)

In [20]:
import tensorflow as tf

In [29]:
model = tf.keras.models.Sequential()
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

In [30]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [31]:
model.fit(x_train.A, y_train, epochs=100, validation_split=0.1,
         callbacks=[tf.keras.callbacks.EarlyStopping(monitor='val_accuracy')])

Epoch 1/100
Epoch 2/100
Epoch 3/100


<tensorflow.python.keras.callbacks.History at 0x1d6c7d7c248>

In [32]:
model.evaluate(x_test.A, y_test)



[0.5654105544090271, 0.7767252922058105]

In [33]:
w, b = model.weights

In [34]:
word_sent = pd.DataFrame({'토큰': words, '가중치': w.numpy().flat})

In [40]:
word_sent.sort_values('가중치').head(15)

Unnamed: 0,토큰,가중치
351,일반/NNG,-0.131544
283,쓴맛/NNG,-0.119568
245,설탕/NNG,-0.116767
292,아이스크림/NNG,-0.113999
200,버리/VX,-0.112348
287,아쉽/VA,-0.112054
495,흔들/VV,-0.10729
99,달/VA,-0.107233
269,스푼/NNG,-0.104571
426,취향/NNG,-0.104236


In [41]:
word_sent.sort_values('가중치').tail(15)

Unnamed: 0,토큰,가중치
423,추/NNG,0.147196
117,둘/NR,0.148408
478,하루/NNG,0.151357
7,가성비/NNG,0.151916
376,제일/NNG,0.153165
178,무맛/NNG,0.153778
106,대비/NNG,0.154327
309,여기/NP,0.155522
2,가격/NNG,0.158482
248,성분/NNG,0.15943
