In [1]:
# torchtext==0.9.0
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader

from torchtext import data
from torchtext.legacy.data import TabularDataset, Field, Iterator

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import urllib.request
from konlpy.tag import Okt

from tqdm import tqdm

In [3]:
# Download NSMC dataset
# Reference: https://wikidocs.net/44249
# urllib.request.urlretrieve("https://raw.githubusercontent.com/e9t/nsmc/master/ratings_train.txt", filename="ratings_train.txt")
# urllib.request.urlretrieve("https://raw.githubusercontent.com/e9t/nsmc/master/ratings_test.txt", filename="ratings_test.txt")

In [2]:
# Load the NSMC train/test splits (tab-separated text files) and report their sizes.
train_data, test_data = (
    pd.read_table(path) for path in ('ratings_train.txt', 'ratings_test.txt')
)

print(f'Number of Training sample: {len(train_data)}')
print(f'Number of Test sample: {len(test_data)}')

Number of Training sample: 150000
Number of Test sample: 41769


In [4]:
# Preview the first rows of the raw training frame (columns: id, document, label).
train_data.head()

Unnamed: 0,id,document,label
0,9976970,아 더빙.. 진짜 짜증나네요 목소리,0
1,3819312,흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나,1
2,10265843,너무재밓었다그래서보는것을추천한다,0
3,9045019,교도소 이야기구먼 ..솔직히 재미는 없다..평점 조정,0
4,6483659,사이몬페그의 익살스런 연기가 돋보였던 영화!스파이더맨에서 늙어보이기만 했던 커스틴 ...,1


In [5]:
# Cleaning, step 1: drop reviews whose text is duplicated.
print(train_data['document'].nunique())     # distinct review texts (150,000 rows -> 3,818 duplicates)
print(train_data['label'].nunique())        # distinct labels: (0, 1) => binary classification

rows_before = len(train_data)
# Reassign instead of mutating in place so the cell has an explicit input -> output lineage.
train_data = train_data.drop_duplicates(subset=['document'])
print(f'Number of Training sample before delete duplicates: {rows_before}')
print(f'Number of Training sample after delete duplicates: {len(train_data)}')

146182
2
Number of Training sample before delete duplicates: 150000
Number of Training sample after delete duplicates: 146183


In [6]:
# Cleaning, step 2: report missing values, show the affected rows, then drop them.
missing_text_mask = train_data['document'].isna()
print(train_data.isna().sum())
print('=' * 30)
print(train_data[missing_text_mask])

# Keep only rows that have no NaN in any column.
train_data = train_data.dropna()
print('=' * 30)
print(train_data.isna().sum())
print(f'Number of Training sample after delete missing values: {len(train_data)}')

id          0
document    1
label       0
dtype: int64
            id document  label
25857  2172111      NaN      1
id          0
document    0
label       0
dtype: int64
Number of Training sample after delete missing values: 146182


In [7]:
# Keep only Hangul (compatibility jamo and syllables) and spaces; remove everything else.
# regex=True is passed explicitly: the pattern is a character class, and newer pandas
# versions treat the pattern as a literal string by default, which would silently
# leave the text uncleaned (older versions emit a FutureWarning instead).
train_data['document'] = train_data['document'].str.replace("[^ㄱ-ㅎㅏ-ㅣ가-힣 ]", "", regex=True)
train_data.head()

  train_data['document'] = train_data['document'].str.replace("[^ㄱ-ㅎㅏ-ㅣ가-힣 ]", "")


Unnamed: 0,id,document,label
0,9976970,아 더빙 진짜 짜증나네요 목소리,0
1,3819312,흠포스터보고 초딩영화줄오버연기조차 가볍지 않구나,1
2,10265843,너무재밓었다그래서보는것을추천한다,0
3,9045019,교도소 이야기구먼 솔직히 재미는 없다평점 조정,0
4,6483659,사이몬페그의 익살스런 연기가 돋보였던 영화스파이더맨에서 늙어보이기만 했던 커스틴 던...,1


In [8]:
# Reviews that contained no Hangul are now empty (or space-only) strings:
# strip leading spaces, convert empty strings to NaN, then drop those rows.
# regex=True is explicit because '^ +' is a regular expression, not a literal.
train_data['document'] = train_data['document'].str.replace('^ +', '', regex=True)
# Assign the result back instead of calling replace(..., inplace=True) on the selected
# column: the in-place form operates on an intermediate object and is not guaranteed
# to update the DataFrame.
train_data['document'] = train_data['document'].replace('', np.nan)
print(train_data.isnull().sum())
train_data = train_data.dropna(how='any')
print(f'Number of Training sample after delete empty documents: {len(train_data)}')

id            0
document    789
label         0
dtype: int64
Number of Training sample after delete empty documents: 145393


  train_data['document'] = train_data['document'].str.replace('^ +', '')


In [9]:
# Same cleaning procedure for the test dataset.
test_data = test_data.drop_duplicates(subset=['document'])    # Delete duplicates
# regex=True is explicit: both patterns are regular expressions, and newer pandas
# versions would otherwise treat them as literal strings.
test_data['document'] = test_data['document'].str.replace('[^ㄱ-ㅎㅏ-ㅣ가-힣 ]', '', regex=True)
test_data['document'] = test_data['document'].str.replace('^ +', '', regex=True)
# Assign back rather than using inplace=True on the selected column, which is not
# guaranteed to update the DataFrame.
test_data['document'] = test_data['document'].replace('', np.nan)
test_data = test_data.dropna(how='any') # Delete NaN values (originally missing or emptied by cleaning)
print(f'Number of Test sample after preprocessing: {len(test_data)}')

Number of Test sample after preprocessing: 40864


  test_data['document'] = test_data['document'].str.replace('[^ㄱ-ㅎㅏ-ㅣ가-힣 ]', '')
  test_data['document'] = test_data['document'].str.replace('^ +', '')


In [3]:
# Tokenizer: KoNLPy's Okt morphological analyzer, used with stemming enabled.
okt = Okt()
# Sample call on one review; the result is not displayed because it is not the
# last expression of the cell.
okt.morphs('아 더빙 진짜 짜증나네요 목소리', stem=True)

# Stopwords removed from every tokenized review.
stopwords = '의 가 이 은 들 는 좀 잘 걍 과 도 를 으로 자 에 와 한 하다'.split()

In [None]:
# Tokenize every training review (stemmed morphemes) and filter out stopwords.
X_train = [
    [token for token in okt.morphs(review, stem=True) if token not in stopwords]
    for review in tqdm(train_data['document'])
]

In [13]:
# Tokenize every test review (stemmed morphemes) and filter out stopwords.
X_test = [
    [token for token in okt.morphs(review, stem=True) if token not in stopwords]
    for review in tqdm(test_data['document'])
]

100%|██████████| 40864/40864 [04:20<00:00, 157.15it/s]


In [None]:
# Re-join each token list into one space-separated string per review.
X_train_joined = list(map(' '.join, X_train))
X_test_joined = list(map(' '.join, X_test))

In [32]:
# Replace the review text with its tokenized, space-joined form.
# The lists are matched to rows by position, so they must have been built from
# these same (already cleaned) frames.
train_data = train_data.assign(document=X_train_joined)
test_data = test_data.assign(document=X_test_joined)

In [34]:
# Write the preprocessed splits to CSV (without the index column).
for frame, csv_path in ((train_data, "train_data.csv"), (test_data, "test_data.csv")):
    frame.to_csv(csv_path, index=False)

In [5]:
# torchtext field definitions; reviews use a fixed length of 30 tokens (fix_length=30).
# Reference: https://wikidocs.net/60314

# Declared only so the 'id' column can be mapped; it is not actually used.
ID = Field(sequential=False, use_vocab=False)

# Whitespace-tokenized review text, lower-cased, batch dimension first.
TEXT = Field(
    sequential=True,
    use_vocab=True,
    tokenize=str.split,
    lower=True,
    batch_first=True,
    fix_length=30,
)

# Label column, marked as the target field; no vocabulary is built for it.
LABEL = Field(
    sequential=False,
    use_vocab=False,
    batch_first=False,
    is_target=True,
)

In [6]:
# Load the preprocessed CSV files; columns map, in order, to the id / text / label fields.
# (An earlier variant read the raw 'ratings_train.txt' / 'ratings_test.txt' TSV files instead.)
# NOTE: this rebinds train_data / test_data to the datasets returned by TabularDataset.splits.
csv_fields = [('id', ID), ('text', TEXT), ('label', LABEL)]
train_data, test_data = TabularDataset.splits(
    path='.',
    train='train_data.csv',
    test='test_data.csv',
    format='csv',
    fields=csv_fields,
    skip_header=True,
)

In [7]:
# Sanity check: split sizes and the field mapping of the loaded dataset.
n_train, n_test = len(train_data), len(test_data)
print(f'Num training sample: {n_train}')
print(f'Num test sample: {n_test}')
field_items = train_data.fields.items()    # ID, text, label
print(field_items)

Num training sample: 145393
Num test sample: 40864
dict_items([('id', <torchtext.legacy.data.field.Field object at 0x00000238C67F3940>), ('text', <torchtext.legacy.data.field.Field object at 0x00000238C67F39A0>), ('label', <torchtext.legacy.data.field.Field object at 0x00000238C67F38E0>)])


In [8]:
# Build the TEXT vocabulary from the training split only, with a minimum token
# frequency of 3 (presumably rarer tokens map to the unknown token — confirm
# against the torchtext version in use).
TEXT.build_vocab(train_data, min_freq=3)

In [10]:
# One example per batch: the training step below consumes a single sequence at a time.
batch_size = 1
train_loader = Iterator(dataset=train_data, batch_size=batch_size)
test_loader = Iterator(dataset=test_data, batch_size=batch_size)

print(f'Num training batches: {len(train_loader)}')
# Message typo fixed ("batchsed" -> "batches").
print(f'Num test batches: {len(test_loader)}')

Num training batches: 145393
Num test batchsed: 40864


In [18]:
class RNN(nn.Module):
    """Hand-rolled single-layer tanh RNN classifier that processes one sequence at a time.

    Args:
        embedding_size: Number of rows in the embedding table, i.e. the vocabulary
            size (the parameter name is kept for backward compatibility).
        input_size: Dimension of each token embedding fed to the recurrent cell.
        hidden_size: Dimension of the hidden state.
        output_size: Number of output units.
    """

    def __init__(self, embedding_size, input_size, hidden_size, output_size):
        super().__init__()
        # padding_idx=1: assumes index 1 is the padding token in the vocabulary
        # (TODO confirm against the vocab in use).
        self.embedding = nn.Embedding(embedding_size, input_size, padding_idx=1)
        self.hidden_size = hidden_size
        self.input_size = input_size
        self.i2h = nn.Linear(input_size + hidden_size, hidden_size)
        self.i2o = nn.Linear(hidden_size, output_size)
        self.sigmoid = nn.Sigmoid()

    def forward(self, input, hidden):
        """Run the recurrence over every token and classify from the final hidden state.

        Args:
            input: 1-D tensor of token indices with shape (seq_len,).
            hidden: Initial hidden state with shape (1, hidden_size).

        Returns:
            Tuple ``(output, hidden)``: ``output`` has shape (1, output_size) and
            holds sigmoid activations; ``hidden`` is the final hidden state.
        """
        # (seq_len,) -> (seq_len, input_size) -> (seq_len, 1, input_size)
        embedded = self.embedding(input).unsqueeze(1)
        # Iterate over the time steps actually present in the sequence. Looping over
        # range(self.input_size) (the embedding dimension) only works when it happens
        # to equal the sequence length; otherwise it raises IndexError on shorter
        # sequences and ignores trailing tokens on longer ones.
        for step_input in embedded:
            combined = torch.cat((step_input, hidden), dim=1)     # [x_t ; h_{t-1}]
            hidden = torch.tanh(self.i2h(combined))
        output = self.sigmoid(self.i2o(hidden))

        return output, hidden

    def init_hidden(self):
        """Return a zero initial hidden state of shape (1, hidden_size)."""
        return torch.zeros(1, self.hidden_size)

In [41]:
def train(input_tensor, label):
    """Run one optimization step on a single example.

    Uses the module-level ``rnn``, ``criterion`` and ``optimizer``.

    Args:
        input_tensor: Token-index tensor passed to ``rnn`` together with a fresh
            hidden state from ``rnn.init_hidden()``.
        label: Target tensor; converted to float before the loss is computed.

    Returns:
        Tuple ``(output, loss_value)``: the model output and the loss as a Python float.
    """
    # Clear gradients left over from the previous step before computing new ones.
    optimizer.zero_grad()

    initial_state = rnn.init_hidden()
    prediction, _ = rnn(input_tensor, initial_state)

    step_loss = criterion(prediction[0], label.float())
    step_loss.backward()
    optimizer.step()

    return prediction, step_loss.item()

In [57]:
# Model, loss and optimizer setup.
learning_rate = 0.0001
vocab_size = len(TEXT.vocab)

# Embedding dimension 30, hidden state 128, one sigmoid output unit.
rnn = RNN(embedding_size=vocab_size, input_size=30, hidden_size=128, output_size=1)
criterion = nn.BCELoss()
optimizer = optim.SGD(rnn.parameters(), lr=learning_rate)

In [None]:
# Training loop: one example per step; the running loss is averaged every
# `plot_steps` steps and a progress line is printed every `print_steps` steps.
n_epochs = 2
current_loss = 0
plot_steps, print_steps = 1000, 2000
all_losses = []

total_steps = n_epochs * len(train_loader)
# A single counter across epochs: the inner loop no longer shadows the epoch
# variable, the printed progress matches `total_steps`, and every loss-averaging
# window covers exactly `plot_steps` steps even across an epoch boundary.
global_step = 0

for epoch in range(n_epochs):
    for sample in train_loader:
        text = sample.text[0]   # first (only) sequence of the batch, since batch_size is 1
        label = sample.label
        output, loss = train(text, label)
        current_loss += loss
        global_step += 1
        if global_step % plot_steps == 0:
            all_losses.append(current_loss / plot_steps)
            current_loss = 0
        if global_step % print_steps == 0:
            guess = round(output.item())
            target = label.item()
            # Compare plain Python numbers rather than an int against a tensor.
            correct = "CORRECT" if guess == target else f"WRONG ({target}) | {output.item()}"
            print(f"{global_step} {total_steps} {loss:.4f} {guess} {correct}")