In [136]:
import pandas as pd
import numpy as np

In [137]:
with open('./data/review/2016_filtered_review.txt', encoding='utf-8') as f:
    docs = [doc.strip().split('\t\t') for doc in f]
    docs = [(doc[1], int(doc[2])) for doc in docs if len(doc) == 3]
    # To read the second and third column info from each row
    texts, scores = zip(*docs[:20000])
    # 둘을 분리해서 별도의 list 변수로 저장

In [138]:
filtered_texts = []
filtered_labels = []

for text, score in zip(texts, scores):
    if 5 <= score <= 9:
        continue
        
    # 평점 기준으로 문서에 label을 부여
    # 1 ~ 3 -> 부정, 0
    # 9 ~ 10 -> 긍정, 1
    filtered_texts.append(text)
    filtered_labels.append(1 if score > 9 else 0)

In [139]:
sum(filtered_labels)/len(filtered_texts)

0.8294408520349943

In [140]:
filtered_texts[0]

' 진심 쓰레기 영화 만들 무서 알 쫄아 틀었 이건 뭐 웃 거리 없는 쓰레기 영화 임'

In [141]:
filtered_texts[:2]

[' 진심 쓰레기 영화 만들 무서 알 쫄아 틀었 이건 뭐 웃 거리 없는 쓰레기 영화 임',
 ' 역대 좀비 영화 가장 최고다 원작 만화 읽어 보려 영화 보고 결정 하려 감독 간츠 실사 했 사람 거르려 그냥 봤 정말 흠잡 없는 최고 좀비 영화 잔인 거 싫어하지 참고 볼 만하 로미 인물 왜 그런 모르']

In [142]:
filtered_words = [doc.strip().split() for doc in filtered_texts]

In [143]:
total_words = []
for words in filtered_words:
    total_words.extend(words)

In [144]:
print(len(total_words))
print(len(set(total_words)))

153375
13909


In [145]:
from collections import Counter
c = Counter(total_words)

In [146]:
max_features = 5000 # 빈도수를 기준으로 상위 5000개의 단어만 사용
common_words = [ word for word, count in c.most_common(max_features)]

In [147]:
len(common_words)

5000

In [148]:
print(common_words)

['영화', '너무', '좋', '봤', '보고', '정말', '연기', '감동', '배우', '진짜', '대통령', '였', '사람', '했', '입니', '것', '먹', '그', '더', '노무현', '이', '그립', '눈물', '보는', '잘', '수', '생각', '가슴', '하는', '분', '마음', '꼭', '봐', '현실', '알', '이런', '최고', '볼', '우리', '때', '있는', '다시', '본', '할', '스토리', '역사', '나', '말', '같은', '좀비', '그냥', '아니', '같아', '보면', '없는', '여운', '시간', '평점', '또', '청춘', '내', '역시', '한번', '울었', '지금', '이었', '많은', '살', '재미', '어른', '점', '이야기', '순정', '있었', '마지막', '내용', '들', '아이', '왜', '한', '친구', '처음', '되', '짱구', '내내', '울', '하게', '재밌게', '된', '슬프', '왔', '공유', '그런', '참', '사랑', '모습', '되는', '장면', '함', '기대', '재밌었', '원피스', '같다', '거', '보러', '저', '국민', '보세', '극장판', '하나', '있', '모르', '좀', '합', '인간', '넘', '손예진', '느낌', '와', '정도', '감사합', '모두', '끝', '인생', '일본', '감정', '당신', '감독', '너무나', '않고', '안', '돈', '덕혜옹주', '재밌어', '아닌', '같', '짱', '듯', '요즘', '세상', '일', '나라', '전도연', '속', '시대', '보기', '부산', '남', '순수한', '계속', '못', '펑펑', '완전', '잼', '없다', '다른', '요', '오랜만', '재미있었', '상영', '하지', '때문', '중간', '이상', '여자', '없었', '있어', '도경수', '않았', '보여', '갔', '걸', '

In [149]:
words_dic ={}  # 각 단어에 index 번호 부여
words_index_dic={} # index 번호가 key가 됨, value는 단어
for index, word in enumerate(common_words):
    words_dic[word]=index
    words_index_dic[index]=word

In [150]:
filtered_indexed_words = [] # index 번호가 부여된 단어들만을 사용해서 각 문서를 표현
for review in filtered_words:
    indexed_words=[]
    for word in review:
        try:
            indexed_words.append(words_dic[word])
        except: 
            pass
    filtered_indexed_words.append(indexed_words)

In [151]:
filtered_indexed_words[0] # 첫번째 영화 리뷰에 사용된 단어들의 인덱스

[227, 349, 0, 319, 34, 225, 182, 1050, 811, 54, 349, 0, 185]

In [152]:
# 어떠한 단어들인지 확인
[words_index_dic[index] for index in filtered_indexed_words[0]]

['진심', '쓰레기', '영화', '만들', '알', '이건', '뭐', '웃', '거리', '없는', '쓰레기', '영화', '임']

In [153]:
from sklearn.model_selection import train_test_split 
X_train, X_test, y_train, y_test = train_test_split(filtered_indexed_words, filtered_labels, test_size=0.2)

In [154]:
import numpy as np

def vectorize_sequences(sequences, dimension=max_features):
    # Create an all-zero matrix of shape (len(sequences), dimension)
    # len(sequences) => number of documents
    results = np.zeros((len(sequences), dimension))
    for i, sequence in enumerate(sequences): # sequence => 단어들의 index로 구성되어 있는 리스트
        results[i, sequence] = 1.  # set specific indices of results[i] to 1s
    return results

In [155]:
X_train_indexed = vectorize_sequences(X_train)
X_test_indexed = vectorize_sequences(X_test)

In [156]:
X_train_indexed[0]

array([0., 0., 0., ..., 0., 0., 0.])

In [157]:
len(X_train_indexed[0])

5000

In [158]:
X_train_indexed.shape

(10516, 5000)

In [159]:
y_train[0]

1

In [160]:
def to_categorical(y, num_classes):
    """ 1-hot encodes a tensor """
    return np.eye(num_classes, dtype='uint8')[y]

In [161]:
y_train_one_hot = to_categorical(y_train, 2)
y_test_one_hot = to_categorical(y_test, 2)

In [162]:
y_train_one_hot

array([[0, 1],
       [0, 1],
       [0, 1],
       ...,
       [0, 1],
       [0, 1],
       [0, 1]], dtype=uint8)

In [163]:
import torch
import torch.utils.data as Data
from torch import nn
from torch.nn import functional as F
from torch import optim


In [164]:
class Model(nn.Module):
    def __init__(self, n_feature):
        super(Model, self).__init__()
        self.Dense1 = nn.Linear(n_feature, 32)
        self.Dense2 = nn.Linear(32, 16)
        self.Classifier = nn.Linear(16, 2)
    
    def forward(self, x):
        x = F.relu(self.Dense1(x))
        x = F.relu(self.Dense2(x))
        return self.Classifier(x)

In [165]:
model = Model(max_features)

In [166]:
criterion = nn.BCEWithLogitsLoss() # softmax가 내장

In [167]:
optimizer = optim.RMSprop(model.parameters(), lr=0.001)

In [168]:
class DataMaker(Data.Dataset):
    def __init__(self, X, y):
        # scaler = StandardScaler()
        self.targets = X.astype(np.float32)
        self.labels = y.astype(np.float32)
    
    def __getitem__(self, i):
        return self.targets[i, :], self.labels[i]

    def __len__(self):
        return len(self.targets)

In [169]:
# X_train_indexed = torch.from_numpy(X_train_indexed)
# X_test_indexed = torch.from_numpy(X_test_indexed)
# y_train_one_hot = torch.from_numpy(y_train_one_hot)
# y_test_one_hot = torch.from_numpy(y_test_one_hot)

In [170]:
X_test_indexed

array([[1., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       ...,
       [0., 1., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.]])

In [171]:
X_train_indexed, X_valid_indexed, y_train_one_hot, y_valid_one_hot = train_test_split(X_train_indexed, y_train_one_hot, test_size=0.2)

In [172]:
train_set = DataMaker(X_train_indexed, y_train_one_hot)
valid_set = DataMaker(X_valid_indexed, y_valid_one_hot)
test_set = DataMaker(X_test_indexed, y_test_one_hot)

In [173]:
batch_size = 128
train_loader = Data.DataLoader(train_set, batch_size=batch_size, shuffle=True)
valid_loader = Data.DataLoader(valid_set, batch_size=batch_size, shuffle=True)
test_loader = Data.DataLoader(test_set, batch_size=batch_size, shuffle=True)

In [174]:
train_loader.dataset[0][0]

array([0., 1., 1., ..., 0., 0., 0.], dtype=float32)

In [175]:
train_loader.dataset[0][1]

array([0., 1.], dtype=float32)

In [176]:
model(torch.from_numpy(train_loader.dataset[0][0]))

tensor([ 0.1986, -0.1723], grad_fn=<AddBackward0>)

In [177]:
epoch_n = 10
total, correct, loss = 0.0, 0.0, 0.0
total_v, correct_v, loss_v = 0.0, 0.0, 0.0
loss_list = []
acc_list = []
loss_v_list = []
acc_v_list = []
for epoch in range(epoch_n):
    model.train()
    for data, labels in train_loader:
        optimizer.zero_grad()

        output = model(data)
        batch_loss = criterion(output, labels)
        loss += batch_loss

        _, pred_y = torch.max(output, 1)
        pred_y = pred_y.view(-1)
        correct += torch.sum(torch.eq(torch.from_numpy(to_categorical(pred_y, 2)), labels)).item()/2
        total += len(labels)

        batch_loss.backward()
        optimizer.step()
    
    model.eval()
    for data, labels in valid_loader:
        output = model(data)
        batch_loss = criterion(output, labels)
        loss_v += batch_loss

        _, pred_y = torch.max(output, 1)
        pred_y = pred_y.view(-1)
        correct_v += torch.sum(torch.eq(torch.from_numpy(to_categorical(pred_y, 2)), labels)).item()/2
        total_v += len(labels)
    
    acc = correct/total
    loss_epoch = loss/total
    loss_list.append(loss_epoch.item())
    acc_list.append(acc)

    acc_v = correct_v/total_v
    loss_epoch_v = loss_v/total_v
    loss_v_list.append(loss_epoch.item())
    acc_v_list.append(acc)
    print(f'Epoch {epoch+1}/{epoch_n}: train_Acc: {acc:.4f}, train_Loss{loss_epoch:.8f}, valid_Acc: {acc_v:.4f}, valid_Loss{loss_epoch_v:.8f}')


Epoch 1/10: train_Acc: 0.8256, train_Loss0.00306676, valid_Acc: 0.8969, valid_Loss0.00216654
Epoch 2/10: train_Acc: 0.8762, train_Loss0.00233419, valid_Acc: 0.9045, valid_Loss0.00202437
Epoch 3/10: train_Acc: 0.9009, train_Loss0.00194276, valid_Acc: 0.9083, valid_Loss0.00194816
Epoch 4/10: train_Acc: 0.9160, train_Loss0.00167881, valid_Acc: 0.9106, valid_Loss0.00194520
Epoch 5/10: train_Acc: 0.9271, train_Loss0.00148585, valid_Acc: 0.9123, valid_Loss0.00197939
Epoch 6/10: train_Acc: 0.9355, train_Loss0.00133658, valid_Acc: 0.9130, valid_Loss0.00202446
Epoch 7/10: train_Acc: 0.9422, train_Loss0.00121645, valid_Acc: 0.9132, valid_Loss0.00207903
Epoch 8/10: train_Acc: 0.9474, train_Loss0.00111753, valid_Acc: 0.9132, valid_Loss0.00215908
Epoch 9/10: train_Acc: 0.9519, train_Loss0.00103512, valid_Acc: 0.9128, valid_Loss0.00224138
Epoch 10/10: train_Acc: 0.9554, train_Loss0.00096438, valid_Acc: 0.9125, valid_Loss0.00232502


In [179]:
total, correct, loss = 0.0, 0.0, 0.0
model.eval()
for data, labels in test_loader:
    output = model(data)
    batch_loss = criterion(output, labels)
    loss += batch_loss

    _, pred_y = torch.max(output, 1)
    pred_y = pred_y.view(-1)
    correct += torch.sum(torch.eq(torch.from_numpy(to_categorical(pred_y, 2)), labels)).item()/2
    total += len(labels)

acc = correct/total
loss_epoch = loss/total
print(f'Test Acc: {acc}, Loss{loss_epoch:.8f}')

Test Acc: 0.9136553822746292, Loss0.00279668


In [180]:
words_dic_reverse = {}
for key in words_dic:
    words_dic_reverse[words_dic[key]]=key

In [181]:
words_dic_reverse[0]

'영화'

In [182]:
# Test data의 첫번째 리뷰
for index in X_test[0]:
    print(words_dic_reverse[index])

일본
깔
목적
평점
리지
개꿀
잼
인한
보는
리뷰
안보
영화
쳐
봐
재미
잇는
영화
잔
또
까
리지
재밋습니
부산
행보
다재
보셔


In [183]:
for index in X_test[4]:
    print(words_dic_reverse[index])

개인
이런
장르
좋아하지
않았
넘
재밌게
봤
지루한
부분
하나
없던
영화
최고
입니
몰입도


In [184]:
for index in X_test[0]:
    print(words_index_dic[index])

일본
깔
목적
평점
리지
개꿀
잼
인한
보는
리뷰
안보
영화
쳐
봐
재미
잇는
영화
잔
또
까
리지
재밋습니
부산
행보
다재
보셔
