### 감성 분석 : Sentimental Analysis

#### [1] 데이터 준비

In [1]:
# 모듈 로딩
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow_datasets as tfds
from tensorflow.keras.layers import Embedding, SimpleRNN, RNN, LSTM, GRU, Input, Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
import string

In [2]:
# 데이터 로딩
imdb_df=pd.read_csv('imdb_reviews.csv')

In [3]:
imdb_df.head()

Unnamed: 0,review,sentiment
0,My family and I normally do not watch local mo...,1
1,"Believe it or not, this was at one time the wo...",0
2,"After some internet surfing, I found the ""Home...",0
3,One of the most unheralded great works of anim...,1
4,"It was the Sixties, and anyone with long hair ...",0


In [4]:
imdb_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 781.4+ KB


In [5]:
# sentiment 컬럼 데이터 분포 확인
imdb_df.sentiment.value_counts()

1    25000
0    25000
Name: sentiment, dtype: int64

#### [2] 데이터 전처리

[2-1] 기본적인 전처리 : 결측치, 중복값, 이상값, 노이즈 ..

In [6]:
# 결측치 체크
imdb_df.isnull().sum()

review       0
sentiment    0
dtype: int64

In [7]:
# 결측치 체크2 (공백)
imdb_df[imdb_df.review==''].shape[0]

0

In [8]:
# 중복값 체크
imdb_df.duplicated().sum()

418

In [9]:
imdb_df.review.duplicated().sum()

418

In [10]:
imdb_df[imdb_df.review.duplicated()].value_counts()

review                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                  

In [11]:
# 중복값 제거
imdb_df.drop_duplicates(subset=['review'],inplace=True)

[2-2] 텍스트 전처리 : 특정문자 외 제거, 구두점, 불용어 ..  

In [12]:
# a~z 외 제거 : 대소문자 통일

# review컬럼 소문자 변환
imdb_df.review=imdb_df.review.str.lower()

In [13]:
# review 컬럼 a~z만 남김
imdb_df.review=imdb_df.review.str.replace("[^a-z ]",'',regex=True)

In [14]:
# 구두점, 특수문자 제거
# imdb_df.review=imdb_df.review.str.replace(f'[{string.punctuation}]','',regex=True)
# 위에서 a~z,띄어쓰기 제외 다 날렸는데 구두점, 특수문자가 있나?

In [15]:
# 불용어
with open('../../datasets/nlp/stopwords_eng.txt') as f:
    stopwords=f.readlines()
def replace_n(text):
    text=text.replace('\n','')
    return text
stopwords=list(map(replace_n,stopwords))+['movie','movies','film','films','scene','scenes']

In [16]:
print(f'불용어 개수 : {len(stopwords)}',stopwords, sep='\n')

불용어 개수 : 858
['able', 'about', 'above', 'abroad', 'according', 'accordingly', 'across', 'actually', 'adj', 'after', 'afterwards', 'again', 'against', 'ago', 'ahead', "ain't", 'all', 'allow', 'allows', 'almost', 'alone', 'along', 'alongside', 'already', 'also', 'although', 'always', 'am', 'amid', 'amidst', 'among', 'amongst', 'an', 'and', 'another', 'any', 'anybody', 'anyhow', 'anyone', 'anything', 'anyway', 'anyways', 'anywhere', 'apart', 'appear', 'appreciate', 'appropriate', 'are', "aren't", 'around', 'as', "a's", 'aside', 'ask', 'asking', 'associated', 'at', 'available', 'away', 'awfully', 'back', 'backward', 'backwards', 'be', 'became', 'because', 'become', 'becomes', 'becoming', 'been', 'before', 'beforehand', 'begin', 'behind', 'being', 'believe', 'below', 'beside', 'besides', 'best', 'better', 'between', 'beyond', 'both', 'brief', 'but', 'by', 'came', 'can', 'cannot', 'cant', "can't", 'caption', 'cause', 'causes', 'certain', 'certainly', 'changes', 'clearly', "c'mon", 'co', 'co.

In [17]:
# 예시
removed=[word for word in imdb_df.review[0].split() if word not in stopwords]
' '.join(removed)

'family watch local simple reason lack depth worth timebr trailer nasaan ka man caught attention daughter laws daughters time watch afternoon exceeded expectations cinematography good story beautiful acting awesome jericho rosales good sos claudine barretto fact despised diether ocampo proves effective role touched moved local imagine cynic dabbing eyes congratulations star cinema jericho claudine'

In [18]:
for i in range(imdb_df.review.shape[0]):
    imdb_df.iloc[i,0]=' '.join([word for word in str(imdb_df.iloc[i,0]).split() if word not in stopwords])

In [19]:
imdb_df

Unnamed: 0,review,sentiment
0,family watch local simple reason lack depth wo...,1
1,time worst time worse fair tough,0
2,internet surfing homefront series dvd iofferco...,0
3,unheralded great works animation sophisticated...,1
4,sixties long hair hip distant attitude money m...,0
...,...,...
49995,people sick twisted freaks hell exploit people...,0
49996,script laughable turn actors lines sound stiff...,0
49997,bride crushed death statue falls day wedding y...,0
49998,mind satisfied nobudget doomsday thriller pray...,0


#### [3] 학습용 데이터 가공

[3-1] 텍스트 -> 수치화 : 토큰화, 벡터화  
- 텍스트 데이터와 맵핑될 숫자 테이블 생성 -> 단어사전(voca)

In [20]:
tk=Tokenizer()

In [21]:
tk.fit_on_texts(imdb_df.review)

In [22]:
print(f'사전 단어 수 : {len(tk.word_index)}')

사전 단어 수 : 175338


In [23]:
wordFreq=pd.DataFrame(tk.word_counts.items())

In [24]:
wordFreq[wordFreq[1]>500]

Unnamed: 0,0,1
0,family,5460
1,watch,13444
2,local,1723
3,simple,1994
4,reason,4375
...,...,...
12488,mike,545
12509,christopher,728
13821,realism,517
15248,desire,517


In [25]:
WORD_NUMS=wordFreq[wordFreq[1]>500].shape[0]
print(f'분석에 사용할 단어 수 : {WORD_NUMS}')

분석에 사용할 단어 수 : 1430


In [26]:
# 토큰화 시 상수
OOV='<OOV>'

In [27]:
tk=Tokenizer(oov_token=OOV,num_words=WORD_NUMS)
tk.fit_on_texts(imdb_df.review)

In [28]:
# 텍스트 -> 숫자
seqText=tk.texts_to_sequences(imdb_df.review)

In [29]:
seqTextLen=[len(text) for text in seqText]
print(f'최소 토큰수 : {min(seqTextLen)} \n최대 토큰수 : {max(seqTextLen)} \n평균토큰수 : {sum(seqTextLen)/len(seqTextLen)} \n토큰 중앙값 : {sorted(seqTextLen)[int(len(seqTextLen)/2)]}')

최소 토큰수 : 2 
최대 토큰수 : 1129 
평균토큰수 : 85.66021943447218 
토큰 중앙값 : 63


In [30]:
TEXT_LENGTH=sorted(seqTextLen)[int(len(seqTextLen)/2)]

In [31]:
# 토큰수 결정 -> review는 가변길이
# 중앙값으로 길이 일치
textData=pad_sequences(seqText,maxlen=sorted(seqTextLen)[int(len(seqTextLen)/2)])

In [32]:
textData.shape, type(textData), imdb_df.sentiment.shape

((49582, 63), numpy.ndarray, (49582,))

In [33]:
textLabel=imdb_df.sentiment

[3-2] 학습/검증/테스트 데이터 준비

In [34]:
X_train,X_test,y_train,y_test=train_test_split(textData,textLabel,test_size=0.2,stratify=textLabel)

In [35]:
X_train.shape, X_test.shape

((39665, 63), (9917, 63))

In [36]:
y_train.value_counts(), y_test.value_counts()

(1    19907
 0    19758
 Name: sentiment, dtype: int64,
 1    4977
 0    4940
 Name: sentiment, dtype: int64)

#### [4] 모델 설계 및 생성  
- Sequential API 방식
- SimpleRNN 층 <- 입력층
- Dense 층 <- 출력층

In [37]:
print(f'VOCA_SIZE : {len(tk.word_index)}, WORD_NUMS : {WORD_NUMS}, TEXT_LENGTH : {TEXT_LENGTH}')

VOCA_SIZE : 175339, WORD_NUMS : 1430, TEXT_LENGTH : 63


[4-1] RNN Layer에 맞는 shape으로 변환 -> 3D Tensor

In [38]:
#                            배치사이즈      문장당 토큰수   사용단어수
print(f'RNN input_size : ({X_train.shape[0]},{TEXT_LENGTH},{WORD_NUMS})')

RNN input_size : (39665,63,1430)


In [39]:
# 원핫인코딩
X_train=to_categorical(X_train)
X_test=to_categorical(X_test)

In [40]:
X_train.shape, X_test.shape[1:]

((39665, 63, 1430), (63, 1430))

[4-2] 모델 생성

In [41]:
model=Sequential()
model.add(SimpleRNN(8,input_shape=(X_train.shape[1:]))) # input=63,1430 output=8
model.add(Dense(1,activation='sigmoid'))    # input=8 output=1

In [42]:
model.summary()
# 파라미터 : (1430W + 1b+ 8(hidden_state))*8) = 11512 

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 simple_rnn (SimpleRNN)      (None, 8)                 11512     
                                                                 
 dense (Dense)               (None, 1)                 9         
                                                                 
Total params: 11,521
Trainable params: 11,521
Non-trainable params: 0
_________________________________________________________________


In [43]:
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])

[4-3] 모델 학습

In [44]:
model.fit(X_train,y_train,epochs=2)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x1f3936dbd00>

In [46]:
model.evaluate(X_test,y_test)



[0.3728998303413391, 0.8311989307403564]

[4-4] 모델2 LSTM

In [47]:
model2=Sequential()
model2.add(LSTM(8,input_shape=(X_train.shape[1:])))
model2.add(Dense(1,activation='sigmoid'))

In [48]:
model2.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm (LSTM)                 (None, 8)                 46048     
                                                                 
 dense_1 (Dense)             (None, 1)                 9         
                                                                 
Total params: 46,057
Trainable params: 46,057
Non-trainable params: 0
_________________________________________________________________


In [49]:
model2.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])

In [50]:
model2.fit(X_train,y_train,epochs=2)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x1f394cf9190>

In [51]:
model2.evaluate(X_test,y_test)



[0.3559083044528961, 0.842290997505188]

[4-6] 모델3

In [52]:
model3=Sequential()
model3.add(LSTM(8,input_shape=(X_train.shape[1:]),return_sequences=True))
model3.add(LSTM(8))
model3.add(Dense(1,activation='sigmoid'))
model3.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_1 (LSTM)               (None, 63, 8)             46048     
                                                                 
 lstm_2 (LSTM)               (None, 8)                 544       
                                                                 
 dense_2 (Dense)             (None, 1)                 9         
                                                                 
Total params: 46,601
Trainable params: 46,601
Non-trainable params: 0
_________________________________________________________________


In [53]:
model3.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])

In [54]:
model3.fit(X_train,y_train,epochs=2)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x1f39534ee20>

In [55]:
model3.evaluate(X_test,y_test)



[0.35951220989227295, 0.8395684361457825]

#### [5] 테스트 진행

#### [6] 예측 및 저장여부 결정

- 입력 : 텍스트
- 텍스트 -> 숫자 : Tokenizer.text_to_sequences()
    * 임베딩층 사용 -> 그대로
    * 사용 x ->  One-Hot-Encoding
- predict( fit()할때 사용된 데이터와 동일한 shape )

In [None]:
model.predict()