# 데이터 전처리

In [35]:
import re
import pandas as pd
import numpy as np
import json
import nltk
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from tensorflow.python.keras.preprocessing.sequence import pad_sequences
from tensorflow.python.keras.preprocessing.text import Tokenizer

# Fetch the NLTK 'stopwords' corpus; stopwords.words('english') is read from it later on.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ysy\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [25]:
# Load the labeled training data.
data_in_path = './sources/'
# header=0 takes column names from the first row, delimiter='\t' reads the file as
# tab-separated, and quoting=3 (the value of csv.QUOTE_NONE) keeps double quotes as
# literal characters instead of treating them as field quoting.
train_data =  pd.read_csv(data_in_path + 'labeledTrainData.tsv', header=0, delimiter='\t', quoting=3)


In [26]:
# Show the first raw review to see what has to be cleaned up.
print(train_data['review'][0])

"With all this stuff going down at the moment with MJ i've started listening to his music, watching the odd documentary here and there, watched The Wiz and watched Moonwalker again. Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent. Moonwalker is part biography, part feature film which i remember going to see at the cinema when it was originally released. Some of it has subtle messages about MJ's feeling towards the press and also the obvious message of drugs are bad m'kay.<br /><br />Visually impressive but of course this is all about Michael Jackson so unless you remotely like MJ in anyway then you are going to hate this and find it boring. Some may call MJ an egotist for consenting to the making of this movie BUT MJ and most of his fans would say that he made it for the fans which if true is really nice of him.<br /><br />The actual feature film bit when it finally sta

- 리뷰 텍스트에 HTML 태그(`<br />`)와 특수문자가 포함되어 있음을 확인함

In [27]:
# Strip HTML tags from the first review, then turn every non-letter character into a space.
review = train_data['review'][0]
review_text = re.sub(
    '[^a-zA-Z]',
    " ",
    BeautifulSoup(review, "html.parser").get_text(),
)

- BeautifulSoup
    - HTML 태그 제거
    - get_text() 함수를 사용해서 html 태그를 제외한 나머지 텍스트를 얻음

- re
    - sub() 함수를 사용해서 알파벳이 아닌 문자(숫자, 특수문자 등)를 공백(" ")으로 치환

In [28]:
# Check the text after tag removal and non-letter replacement.
print(review_text)

With all this stuff going down at the moment with MJ i ve started listening to his music  watching the odd documentary here and there  watched The Wiz and watched Moonwalker again  Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent  Moonwalker is part biography  part feature film which i remember going to see at the cinema when it was originally released  Some of it has subtle messages about MJ s feeling towards the press and also the obvious message of drugs are bad m kay Visually impressive but of course this is all about Michael Jackson so unless you remotely like MJ in anyway then you are going to hate this and find it boring  Some may call MJ an egotist for consenting to the making of this movie BUT MJ and most of his fans would say that he made it for the fans which if true is really nice of him The actual feature film bit when it finally starts is only on for    min

### NLTK를 사용해서 불용어(stopword) 제거하기

In [36]:
# English stopwords held in a set for constant-time membership tests.
stop_words = set(stopwords.words('english'))

# Lower-case the review, split it on whitespace and keep only non-stopword tokens.
review_text = review_text.lower()
words = [token for token in review_text.split() if token not in stop_words]

- set 데이터 타입을 이용해서 속도를 향상 시킴
- lower() 함수를 이용해서 리뷰를 소문자로 변경
- split() 함수를 이용해서 띄어쓰기를 기준으로 텍스트 리뷰를 단어 리스트로 변경
- for와 if를 이용해서 불용어인 단어는 제외시킴

In [37]:
# Inspect the remaining tokens after stopword removal.
print(words)

['stuff', 'going', 'moment', 'mj', 'started', 'listening', 'music', 'watching', 'odd', 'documentary', 'watched', 'wiz', 'watched', 'moonwalker', 'maybe', 'want', 'get', 'certain', 'insight', 'guy', 'thought', 'really', 'cool', 'eighties', 'maybe', 'make', 'mind', 'whether', 'guilty', 'innocent', 'moonwalker', 'part', 'biography', 'part', 'feature', 'film', 'remember', 'going', 'see', 'cinema', 'originally', 'released', 'subtle', 'messages', 'mj', 'feeling', 'towards', 'press', 'also', 'obvious', 'message', 'drugs', 'bad', 'kay', 'visually', 'impressive', 'course', 'michael', 'jackson', 'unless', 'remotely', 'like', 'mj', 'anyway', 'going', 'hate', 'find', 'boring', 'may', 'call', 'mj', 'egotist', 'consenting', 'making', 'movie', 'mj', 'fans', 'would', 'say', 'made', 'fans', 'true', 'really', 'nice', 'actual', 'feature', 'film', 'bit', 'finally', 'starts', 'minutes', 'excluding', 'smooth', 'criminal', 'sequence', 'joe', 'pesci', 'convincing', 'psychopathic', 'powerful', 'drug', 'lord', 

- 하나의 문자열이었던 리뷰가 단어 리스트로 변경됨을 확인


In [38]:
# Join the word list back into a single space-separated string so it can be fed to the model.
clean_review = ' '.join(words)
print(clean_review)

stuff going moment mj started listening music watching odd documentary watched wiz watched moonwalker maybe want get certain insight guy thought really cool eighties maybe make mind whether guilty innocent moonwalker part biography part feature film remember going see cinema originally released subtle messages mj feeling towards press also obvious message drugs bad kay visually impressive course michael jackson unless remotely like mj anyway going hate find boring may call mj egotist consenting making movie mj fans would say made fans true really nice actual feature film bit finally starts minutes excluding smooth criminal sequence joe pesci convincing psychopathic powerful drug lord wants mj dead bad beyond mj overheard plans nah joe pesci character ranted wanted people know supplying drugs etc dunno maybe hates mj music lots cool things like mj turning car robot whole speed demon sequence also director must patience saint came filming kiddy bad sequence usually directors hate working

- 지금까지 하나의 리뷰를 전처리하는 과정
- 여러 리뷰들을 전처리하기 위해 함수를 구현함


In [40]:
# Preprocessing function and its cached helpers.

# Compiled once: matches every character that is not an ASCII letter.
_NON_ALPHA_RE = re.compile('[^a-zA-Z]')

# Filled lazily so the stopword corpus is read from disk at most once per session
# instead of once per review.
_STOPWORDS_CACHE = {}


def _english_stopwords():
    """Return the NLTK English stopwords as a frozenset, loading them only once."""
    if 'english' not in _STOPWORDS_CACHE:
        _STOPWORDS_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOPWORDS_CACHE['english']


def preprocessing(review, remove_stopwords=False):
    """Clean one raw review and return it as a single space-separated string.

    Steps:
        1. Strip HTML tags with BeautifulSoup.
        2. Replace every non-letter character (digits, punctuation, ...) with a space.
        3. Lower-case the text and split it on whitespace.
        4. Optionally drop English stopwords.
        5. Join the remaining words with single spaces.

    Args:
        review: Raw review text, possibly containing HTML markup.
        remove_stopwords: When True, NLTK English stopwords are removed.

    Returns:
        The cleaned review as one string.
    """
    review_text = BeautifulSoup(review, 'html.parser').get_text()
    review_text = _NON_ALPHA_RE.sub(' ', review_text)
    words = review_text.lower().split()

    if remove_stopwords:
        # The corpus is only touched when stopword removal is requested, so calls with
        # remove_stopwords=False still work without the NLTK data installed.
        stops = _english_stopwords()
        words = [w for w in words if w not in stops]

    return ' '.join(words)

- 불용어 제거는 옵션

- 전처리 함수 순서
    1. HTML 태그 제거
    2. 알파벳이 아닌 문자(숫자, 특수문자 등)를 공백(" ")으로 변경
    3. 대문자를 소문자로 바꾸고 공백 단위로 텍스트를 나눠서 리스트로 만듦
    4. 불용어 제거 시
        - 영어 불용어 불러오기(stops)
        - 불용어가 아닌 단어로 이뤄진 새로운 리스트 생성(words)
    5. 단어 리스트를 공백을 넣어서 하나의 글로 합침(clean_review)


In [41]:
# Run preprocessing() over every review in the training set.
clean_train_reviews = []
for review in train_data['review']:
    # Stopword removal is switched on for the whole data set.
    clean_review = preprocessing(review, remove_stopwords=True)
    clean_train_reviews.append(clean_review)

In [42]:
# Display the first preprocessed review.
clean_train_reviews[0]

'stuff going moment mj started listening music watching odd documentary watched wiz watched moonwalker maybe want get certain insight guy thought really cool eighties maybe make mind whether guilty innocent moonwalker part biography part feature film remember going see cinema originally released subtle messages mj feeling towards press also obvious message drugs bad kay visually impressive course michael jackson unless remotely like mj anyway going hate find boring may call mj egotist consenting making movie mj fans would say made fans true really nice actual feature film bit finally starts minutes excluding smooth criminal sequence joe pesci convincing psychopathic powerful drug lord wants mj dead bad beyond mj overheard plans nah joe pesci character ranted wanted people know supplying drugs etc dunno maybe hates mj music lots cool things like mj turning car robot whole speed demon sequence also director must patience saint came filming kiddy bad sequence usually directors hate workin

In [43]:
# Collect the cleaned reviews and their sentiment labels in a pandas DataFrame.
# The frame is kept so the cleaned text can be saved later together with the other
# preprocessed data.
clean_train_df = pd.DataFrame({
    'review': clean_train_reviews,
    'sentiment': train_data['sentiment']
})

In [45]:
# Create a Tokenizer, fit its vocabulary on the cleaned reviews, then convert each
# review into a sequence of integer word indices.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(clean_train_reviews)
text_sequences = tokenizer.texts_to_sequences(clean_train_reviews)

- 각 리뷰가 텍스트가 아닌 인덱스의 벡터로 구성됨

In [48]:
# Show the index sequence produced for the first review.
print(text_sequences[0])

[404, 70, 419, 8815, 506, 2456, 115, 54, 873, 516, 178, 18686, 178, 11242, 165, 78, 14, 662, 2457, 117, 92, 10, 499, 4074, 165, 22, 210, 581, 2333, 1194, 11242, 71, 4826, 71, 635, 2, 253, 70, 11, 302, 1663, 486, 1144, 3265, 8815, 411, 793, 3342, 17, 441, 600, 1500, 15, 4424, 1851, 998, 146, 342, 1442, 743, 2424, 4, 8815, 418, 70, 637, 69, 237, 94, 541, 8815, 26055, 26056, 120, 1, 8815, 323, 8, 47, 20, 323, 167, 10, 207, 633, 635, 2, 116, 291, 382, 121, 15535, 3315, 1501, 574, 734, 10013, 923, 11578, 822, 1239, 1408, 360, 8815, 221, 15, 576, 8815, 22224, 2274, 13426, 734, 10013, 27, 28606, 340, 16, 41, 18687, 1500, 388, 11243, 165, 3962, 8815, 115, 627, 499, 79, 4, 8815, 1430, 380, 2163, 114, 1919, 2503, 574, 17, 60, 100, 4875, 5100, 260, 1268, 26057, 15, 574, 493, 744, 637, 631, 3, 394, 164, 446, 114, 615, 3266, 1160, 684, 48, 1175, 224, 1, 16, 4, 8815, 3, 507, 62, 25, 16, 640, 133, 231, 95, 7426, 600, 3439, 8815, 37248, 1864, 1, 128, 342, 1442, 247, 3, 865, 16, 42, 1487, 997, 2333, 12

- 텍스트로 돼 있던 리뷰가 각 단어의 인덱스로 구성됨을 확인


In [55]:
# Fetch the word -> index dictionary to see which word each index stands for,
# and report how many words it holds.
word_vocab = tokenizer.word_index
print('단어 사전의 단어 개수: ', len(word_vocab))

단어 사전의 단어 개수:  74065


In [56]:
# Metadata kept for the model: the vocabulary and its size.
# NOTE(review): the +1 presumably accounts for index 0, which is used for padding — confirm.
data_configs = {
    'vocab': word_vocab,
    'vocab_size': len(word_vocab) + 1,
}

- 이후 모델에 사용하기 위해 데이터에 대한 정보를 저장
- 단어 사전과 단어 개수를 저장 (vocab_size는 단어 개수에 1을 더한 값으로, 패딩에 쓰이는 인덱스 0을 포함하기 위함)

### 데이터의 길이를 통일 시키기
- 특정 길이보다 길면 초과분을 잘라냄 (아래 코드는 truncating을 지정하지 않으므로 기본값 'pre'가 적용되어 앞부분이 제거됨; 뒷부분을 제거하려면 truncating='post'를 지정)
- 특정 길이보다 작으면 0 값으로 패딩하기

In [58]:
# Use the preprocessing utilities that ship with TensorFlow.
# Maximum sequence length.
MAX_SEQUENCE_LENGTH = 174

# padding='post' appends zeros to reviews shorter than MAX_SEQUENCE_LENGTH.
# NOTE(review): `truncating` is not passed, so the library default applies. Per the Keras
# pad_sequences documentation that default is 'pre', i.e. over-long reviews lose their
# leading tokens. Pass truncating='post' if the tail should be dropped instead — confirm intent.
train_inputs = pad_sequences(text_sequences, maxlen=MAX_SEQUENCE_LENGTH, padding='post')

print('Shape of train data: ', train_inputs.shape)

Shape of train data:  (25000, 174)


- pad_sequences(패딩을 적용할 데이터, 최대 길이값, 0값을 데이터 앞에 넣을지 뒤에 넣을지 여부)

- 최대 길이는 일반적으로 데이터(리뷰별 단어 수)의 중앙값(median)을 사용함

- 패딩 처리 후 데이터의 형태 확인

In [59]:
# Convert the sentiment column into a NumPy array of labels, one entry per review.
train_labels = np.array(train_data['sentiment'])
print('Shape of label tensor: ', train_labels.shape)

Shape of label tensor:  (25000,)


- 데이터 하나당 하나의 값을 가지는 형태
