### popcorn 데이터 세트 전처리 과정

### [1] 라이브러리 설정

In [3]:
# Directory that holds the raw TSV inputs; the processed outputs are written here too.
# Paths are built by plain string concatenation, so the trailing slash is required.
DATA_IN_PATH = "/dataset/"

In [1]:
import re, json
import pandas as pd
import numpy as np

from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from tensorflow.python.keras.preprocessing.sequence import pad_sequences
from tensorflow.python.keras.preprocessing.text import Tokenizer

### [2] Preprocessing 과정 진행

In [8]:
def preprocessing(review, remove_stopwords=False):
    """Clean one raw review and return it as a lowercase, space-separated string.

    Steps: strip HTML markup, replace every character that is not an English
    letter with a space, lowercase, split on whitespace, optionally drop
    English stopwords, then re-join the remaining words with single spaces.

    Args:
        review: Raw review text, possibly containing HTML markup.
        remove_stopwords: When True, words found in NLTK's English stopword
            list are removed. Defaults to False.

    Returns:
        The cleaned review as a single string ("" when no words remain).
    """
    # 1. Strip HTML tags and keep only the text content.
    review_text = BeautifulSoup(review, "html5lib").get_text()

    # 2. Keep English letters only; every other character becomes a space.
    review_text = re.sub("[^a-zA-Z]", " ", review_text)

    # 3. Lowercase and split on whitespace into a list of words.
    words = review_text.lower().split()

    # 4. Optionally remove stopwords.
    if remove_stopwords:
        # Needs the NLTK "stopwords" corpus to be available locally.
        stops = set(stopwords.words('english'))
        words = [w for w in words if w not in stops]

    # 5. Re-join the words. This happens outside the `if` so the result is
    #    defined for both settings of remove_stopwords (it used to be assigned
    #    only inside the branch, which made the default call raise
    #    UnboundLocalError).
    clean_review = " ".join(words)
    return clean_review

In [4]:
# Read the labeled training TSV into a DataFrame.
# quoting=3 is csv.QUOTE_NONE: quote characters are kept as ordinary text.
train_data = pd.read_csv(
    DATA_IN_PATH + 'labeledTrainData.tsv',
    sep='\t',
    header=0,
    quoting=3,
)
train_data.head()

Unnamed: 0,id,sentiment,review
0,"""5814_8""",1,"""With all this stuff going down at the moment ..."
1,"""2381_9""",1,"""\""The Classic War of the Worlds\"" by Timothy ..."
2,"""7759_3""",0,"""The film starts with a manager (Nicholas Bell..."
3,"""3630_4""",0,"""It must be assumed that those who praised thi..."
4,"""9495_8""",1,"""Superbly trashy and wondrously unpretentious ..."


In [9]:
# Clean every training review with stopword removal enabled, keeping row order.
clean_train_reviews = [
    preprocessing(raw_review, remove_stopwords=True)
    for raw_review in train_data["review"]
]

In [10]:
# Spot-check one cleaned training review.
print(clean_train_reviews[2])

film starts manager nicholas bell giving welcome investors robert carradine primal park secret project mutating primal animal using fossilized dna like jurassik park scientists resurrect one nature fearsome predators sabretooth tiger smilodon scientific ambition turns deadly however high voltage fence opened creature escape begins savagely stalking prey human visitors tourists scientific meanwhile youngsters enter restricted area security center attacked pack large pre historical animals deadlier bigger addition security agent stacy haiduk mate brian wimmer fight hardly carnivorous smilodons sabretooths course real star stars astounding terrifyingly though convincing giant animals savagely stalking prey group run afoul fight one nature fearsome predators furthermore third sabretooth dangerous slow stalks victims movie delivers goods lots blood gore beheading hair raising chills full scares sabretooths appear mediocre special effects story provides exciting stirring entertainment result

### [3] clean data 새 프레임 생성

In [11]:
# Pair each cleaned review with its sentiment label in a new DataFrame.
clean_train_df = pd.DataFrame(
    {
        'review': clean_train_reviews,
        'sentiment': train_data['sentiment'],
    }
)

### [4] Tokenizer 진행 + 정수 인코딩

In [13]:
# Tokenize the cleaned reviews and encode them as integer sequences.
tokenizer = Tokenizer()
# fit_on_texts - builds the vocabulary from word frequencies.
# More frequent words get lower integer indices (inspect them via word_index).
tokenizer.fit_on_texts(clean_train_reviews)
# texts_to_sequences - converts each word of the given corpus to its already-assigned index.
text_sequences = tokenizer.texts_to_sequences(clean_train_reviews)

In [14]:
# Word -> integer index mapping taken from the fitted tokenizer.
# NOTE(review): no copy is made here, so the assignment below presumably also adds
# "<PAD>" to tokenizer.word_index itself - confirm that this is intended.
word_vocab = tokenizer.word_index
word_vocab["<PAD>"] = 0 # add the PAD entry (index 0) used by the padding step

In [15]:
# Report the number of entries in word_vocab (this count includes "<PAD>").
print("단어 개수 : {}".format(len(word_vocab)))

단어 개수 : 74066


### [5] Padding 작업

In [16]:
# Vocabulary metadata that gets written out together with the arrays.
data_configs = {
    'vocab': word_vocab,
    'vocab_size': len(word_vocab),
}

MAX_SEQUENCE_LENGTH = 174 # target length handed to pad_sequences as maxlen

# Bring every index sequence to MAX_SEQUENCE_LENGTH, padding at the end ('post').
train_inputs = pad_sequences(
    text_sequences,
    maxlen=MAX_SEQUENCE_LENGTH,
    padding='post',
)
print('Shape of train data : ', train_inputs.shape)

# Labels as a plain NumPy array, in the same row order as the inputs.
train_labels = np.array(train_data['sentiment'])
print('Shape of train label tensor : ', train_labels.shape)

Shape of train data :  (25000, 174)
Shape of train label tensor :  (25000,)


### [6] 데이터 전처리 끝난 데이터 세트 파일로 저장

In [17]:
import os

# Make sure the output directory exists (no error if it is already there).
os.makedirs(DATA_IN_PATH, exist_ok=True)

TRAIN_INPUT_DATA = "train_input.npy" # padded index sequences, NumPy binary format
TRAIN_LABEL_DATA = "train_label.npy" # sentiment labels, NumPy binary format

TRAIN_CLEAN_DATA = "train_clean.csv" # cleaned review text with sentiment

DATA_CONFIGS = 'data_configs.json' # vocabulary (word -> index) and its size

# Every file is opened in a `with` block so the handle is flushed and closed
# deterministically instead of being left to garbage collection.
with open(DATA_IN_PATH + TRAIN_INPUT_DATA, 'wb') as input_file:
    np.save(input_file, train_inputs)
with open(DATA_IN_PATH + TRAIN_LABEL_DATA, 'wb') as label_file:
    np.save(label_file, train_labels)

clean_train_df.to_csv(DATA_IN_PATH + TRAIN_CLEAN_DATA, index=False)

# ensure_ascii=False writes characters unescaped, so fix the file encoding to
# UTF-8 rather than relying on the platform default.
with open(DATA_IN_PATH + DATA_CONFIGS, 'w', encoding='utf-8') as config_file:
    json.dump(data_configs, config_file, ensure_ascii=False)

### [7] Test 데이터 전처리 진행

In [18]:
# DATA_IN_PATH already ends with "/", so the file name is appended without a
# leading slash (the earlier "/testData.tsv" produced "/dataset//testData.tsv").
TEST_DATA = DATA_IN_PATH + "testData.tsv"
# Same TSV settings as the training file: header row, tab-separated, quoting=3.
test_data = pd.read_csv(TEST_DATA, header=0, delimiter='\t', quoting=3)

In [19]:
test_data.head() # unlike the training set, there is no sentiment column

Unnamed: 0,id,review
0,"""12311_10""","""Naturally in a film who's main themes are of ..."
1,"""8348_2""","""This movie is a disaster within a disaster fi..."
2,"""5828_4""","""All in all, this is a movie for kids. We saw ..."
3,"""7186_2""","""Afraid of the Dark left me with the impressio..."
4,"""12128_7""","""A very accurate depiction of small time mob l..."


In [20]:
# data preprocessing
clean_test_reviews = []
for review in test_data["review"]:
    clean_test_reviews.append(preprocessing(review, remove_stopwords=True))

In [21]:
# Spot-check one cleaned test review.
print(clean_test_reviews[1])

movie disaster within disaster film full great action scenes meaningful throw away sense reality let see word wise lava burns steam burns stand next lava diverting minor lava flow difficult let alone significant one scares think might actually believe saw movie even worse significant amount talent went making film mean acting actually good effects average hard believe somebody read scripts allowed talent wasted guess suggestion would movie start tv look away like train wreck awful know coming watch look away spend time meaningful content


In [22]:
# Wrap the cleaned test reviews in a new single-column DataFrame.
clean_test_df = pd.DataFrame(
    {
        'review': clean_test_reviews,
    }
)

In [23]:
# Write the cleaned test reviews to CSV under DATA_IN_PATH, without the index column.
TEST_CLEAN_DATA = "test_clean.csv"
clean_test_df.to_csv(DATA_IN_PATH + TEST_CLEAN_DATA, index = False)