2021-2 상명대학교 자연어처리 08실습RNN+MNIST
2021.11.15.월요일

* 데이터 전처리

In [1]:
max_length = 256 # sms 최대 길이

# 1. 데이터 불러오기

In [2]:
import pandas as pd

In [3]:
df = pd.read_csv('sms.tsv', sep='\t', )
print(df.columns)
print(df.shape)

Index(['label', 'sms'], dtype='object')
(5572, 2)


In [4]:
df.head()

Unnamed: 0,label,sms
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
# 클래스 파악
classes = sorted(set(df['label']))
class_to_idx = {}

for i, c in enumerate(classes): # 모든 클래스에 대해
    class_to_idx.update({c: i})
    
nclass = len(classes)

print("# of classes: %d" %nclass)
print(classes)
print(class_to_idx)

# of classes: 2
['ham', 'spam']
{'ham': 0, 'spam': 1}


# 2. 새로운 DataFrame
## 1) 'label, sms'만 남기기
## 2) 최대 텍스트 길이 만큼 자르기 # pandas.Series.str.slice

In [6]:
new_df = pd.DataFrame({'label':df['label'], 'sms':df['sms'].str.slice(start=0, stop=max_length)})

## 3) 중복 제거

In [7]:
len(new_df)

5572

In [8]:
new_df = pd.DataFrame(new_df.drop_duplicates())

In [9]:
len(new_df)

5169

## 4) 셔플

In [10]:
df_shuffled = new_df.sample(frac=1).reset_index(drop=True)
df_shuffled.head()

Unnamed: 0,label,sms
0,ham,K:)all the best:)congrats...
1,ham,Sorry completely forgot * will pop em round th...
2,ham,* Am on my way
3,ham,Fine i miss you very much.
4,ham,I was just callin to say hi. Take care bruv!


## 5) train, test 나누기

In [11]:
# train:test = 9:1
train_ratio = 0.9

# train dataset
s, e = 0, int(df_shuffled.shape[0]*train_ratio) # # of rows
df_shuffled_label = df_shuffled['label'][s:e]
df_shuffled_sms = df_shuffled['sms'][s:e]
df_train = pd.DataFrame({'label':df_shuffled_label, 'sms':df_shuffled_sms})
print("index for train: %d~%d" %(s, e))

#test dataset
# train dataset
s, e = e, e+int(df_shuffled.shape[0]*(1.0-train_ratio)) # # of rows
df_shuffled_label = df_shuffled['label'][s:e]
df_shuffled_sms = df_shuffled['sms'][s:e]
df_test = pd.DataFrame({'label':df_shuffled_label, 'sms':df_shuffled_sms})
print("index for test: %d~%d" %(s, e))

index for train: 0~4652
index for test: 4652~5168


In [12]:
# column 수 확인
print(df_train.shape)
print(df_test.shape)

(4652, 2)
(516, 2)


## 6) 저장

In [13]:
df_train.to_csv('./sms.maxlen.uniq.shuf.train.tsv', header=False, index=False, sep='\t')
df_test.to_csv('./sms.maxlen.uniq.shuf.test.tsv', header=False, index=False, sep='\t')

* 데이터 로더

In [14]:
import torch
print(torch.__version__)

1.4.0


In [15]:
!pip install torchtext==0.4.0

You should consider upgrading via the '/home/ec2-user/anaconda3/envs/pytorch_p36/bin/python -m pip install --upgrade pip' command.[0m


In [16]:
import torchtext
import numpy as np
from data_loader import DataLoader # data_loader.py

* RNN+SMS 구현

# 0.1 라이브러리 임포트

In [17]:
import torch
import torch.nn as nn
import numpy as np
import torchvision.transforms as transforms
import matplotlib.pyplot as plt
from torch.autograd import Variable

# 0.2 하이퍼파라미터 셋팅

In [18]:
# Hyper-parameters
dropout_p = 0.3
word_vec_size = 256 # emb_size

hidden_size = 512
num_layers = 4

batch_size = 128
num_epochs = 10

learning_rate = 0.001 # 교수님께서 추가

input: (batch_size, n, input_size)  
hidden: (batch_size, n, hidden_size*2)  
out: (batch_size, n, # of classes)  
레이어 개수, emb_size  
이런 것들 다 hyperparameters

In [19]:
# Device configuration
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# 1. SMS train, test dataset 가져오기

In [20]:
from data_loader import DataLoader

In [21]:
loaders = DataLoader(
        train_fn='./sms.maxlen.uniq.shuf.train.tsv',
        batch_size=batch_size,
        valid_ratio=.2, # train:val = 8:2
        max_vocab=999999, # 크게
        min_freq=5, # 문장의 최소 단어 개수
)

In [22]:
test_loaders = DataLoader(
        train_fn='./sms.maxlen.uniq.shuf.test.tsv',
        batch_size=batch_size,
        valid_ratio=.01, # val 안 나눈다. 0은 안 받으므로 0.01
        max_vocab=999999,
        min_freq=5,
)

# 2. 대략적인 데이터 형태

In [23]:
print("|train| =", len(loaders.train_loader.dataset),
      "|valid| =", len(loaders.valid_loader.dataset))

vocab_size = len(loaders.text.vocab)
num_classes = len(loaders.label.vocab)
print("|vocab| =", vocab_size, "|classes| =", num_classes)

|train| = 3722 |valid| = 930
|vocab| = 1546 |classes| = 2


# 3. 데이터 로드 함수
학습시킬 때 batch_size 단위로 끊어서 로드하기 위함

# 데이터 로드함수 이해하기

In [24]:
n = 3 # 샘플로 그려볼 데이터 개수
for i, data in enumerate(loaders.train_loader): # batch_size 만큼
    labels = data.label
    texts = data.text
    
    if i>n:
        break
    print ("[%d]" %i)
    print("한 번에 로드되는 데이터 크기: ", len(labels))
    
    # 출력
    for j in range(n):
        label = labels[j].numpy() # tensor -> numpy로 변환
        text = texts[j].numpy()
        print("label: ", label)
        print("text: ", text.shape)

[0]
한 번에 로드되는 데이터 크기:  128
label:  0
text:  (15,)
label:  0
text:  (15,)
label:  0
text:  (15,)
[1]
한 번에 로드되는 데이터 크기:  128
label:  0
text:  (13,)
label:  0
text:  (13,)
label:  0
text:  (13,)
[2]
한 번에 로드되는 데이터 크기:  128
label:  0
text:  (8,)
label:  0
text:  (8,)
label:  0
text:  (8,)
[3]
한 번에 로드되는 데이터 크기:  128
label:  0
text:  (30,)
label:  0
text:  (30,)
label:  0
text:  (30,)


# 4. 모델 선언

In [25]:
# Recurrent neural network (many-to-one)
class RNN(nn.Module):
    def __init__(self,
                 input_size, # vocab_size
                 word_vec_size, # word embbeding vector 차원
                 hidden_size, # bidirectional LSRM의 hidden state & cell state의 size
                 n_classes,
                 num_layers=4, # 쌓을 레이어 개수
                 dropout_p=0.3
                 ):
        super(RNN, self).__init__()
        
        self.input_size = input_size
        self.word_vec_size = word_vec_size
        self.hidden_size = hidden_size
        self.n_classes = n_classes
        self.num_layers = num_layers
        self.dropout_p = dropout_p
        
        # 입력 차원(vocab_size), 출력 차원(word_vec_size)
        self.emb =nn.Embedding(input_size, word_vec_size) # 부터!
        
        self.lstm = nn.LSTM(input_size=word_vec_size,
                            hidden_size=hidden_size,
                            num_layers=num_layers,
                            dropout=dropout_p,
                            batch_first=True,
                            bidirectional=True)
        self.fc  = nn.Linear(hidden_size*2, num_classes)
        # LogSoftmax + NLLLoss instead of Softmax + CrossEntropy
        self.activation = nn.LogSoftmax(dim=1) # 마지막 차원에 softmax
        
    def forward(self, x):
        # x: (batch_size, length)
        x = self.emb(x)
        
        # x: (batch_size, length, word_vec_size)
        x, _ = self.lstm(x) # x: output, _: 마지막 time step의 hidden state & cell state
        
        # x: (batch_size, length, hidden_size*2)
        # x[:,-1]: (batch_size, 1, hidden_size*2)
        out = self.activation(self.fc(x[:,-1])) # 마지막 time step
        # self.fc(x[:,-1]): (batch_size, num_classes)
        
        return out

In [26]:
model = RNN(input_size=vocab_size,
            word_vec_size=word_vec_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            n_classes=num_classes,
            dropout_p=dropout_p).to(device)

In [27]:
def ComputeAccr(dloader, imodel):
    correct = 0
    total = 0
    
    imodel.eval() # test mode
    for i, data in enumerate(dloader): # batch_size만큼
        texts = data.text.to(device) # (batch_size, length)
        labels = data.label.to(device) # (batch_size, num_classes)
        
        # Forward prop.
        output = imodel(texts) # (batch_size, num_classes)
        _, output_index = torch.max(output, 1) # (batch_size, 1)
        
        total += labels.size(0)
        
        correct += (output_index == labels).sum().float()
        # print("Accuracy of Test Data: {}".format(100*correct/total))
    
    imodel.train()
    return (100*correct/total).numpy() # tensor -> numpy

In [28]:
print("Accuracy of Test Data: %.2f" %ComputeAccr(loaders.valid_loader, model))

Accuracy of Test Data: 12.37


# 5. loss, optimizer

In [29]:
# Loss and optimizer
#loss_func = nn.CrossEntropyLoss()
loss_func = nn.NLLLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# 6. 학습

In [30]:
# Train the model
total_step = len(loaders.train_loader)
for epoch in range(num_epochs):
    for i, data in enumerate(loaders.train_loader): # batch_size만큼
        texts = data.text.to(device) # (batch_size, length)
        labels = data.label.to(device) # (batch_size, num_classes)
        
        print("[%d]" %i)
        
        # Forward prop.
        output = model(texts) # (batch_size, num_classes)
        loss = loss_func(output, labels)
        
        # Backward prop. & optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        if (i+1) % 10 == 0:
            print('Epoch[{}/{}], Step [{}/{}], Loss: {:.4f}, Accr: {:.2f}'
                 .format(epoch+1, num_epochs, i+1, total_step,
                        loss.item(),
                        ComputeAccr(loaders.valid_loader, model)))

[0]
[1]
[2]
[3]
[4]
[5]
[6]
[7]
[8]
[9]
Epoch[1/10], Step [10/30], Loss: 0.3581, Accr: 87.63
[10]
[11]
[12]
[13]
[14]
[15]
[16]
[17]
[18]
[19]
Epoch[1/10], Step [20/30], Loss: 0.2312, Accr: 87.63
[20]
[21]
[22]
[23]
[24]
[25]
[26]
[27]
[28]
[29]
Epoch[1/10], Step [30/30], Loss: 0.2016, Accr: 87.63
[0]
[1]
[2]
[3]
[4]
[5]
[6]
[7]
[8]
[9]
Epoch[2/10], Step [10/30], Loss: 0.0899, Accr: 87.63
[10]
[11]
[12]
[13]
[14]
[15]
[16]
[17]
[18]
[19]
Epoch[2/10], Step [20/30], Loss: 0.8529, Accr: 87.63
[20]
[21]
[22]
[23]
[24]
[25]
[26]
[27]
[28]
[29]
Epoch[2/10], Step [30/30], Loss: 0.7128, Accr: 87.63
[0]
[1]
[2]
[3]
[4]
[5]
[6]
[7]
[8]
[9]
Epoch[3/10], Step [10/30], Loss: 0.2029, Accr: 87.63
[10]
[11]
[12]
[13]
[14]
[15]
[16]
[17]
[18]
[19]
Epoch[3/10], Step [20/30], Loss: 0.9569, Accr: 87.63
[20]
[21]
[22]
[23]
[24]
[25]
[26]
[27]
[28]
[29]
Epoch[3/10], Step [30/30], Loss: 0.2592, Accr: 91.40
[0]
[1]
[2]
[3]
[4]
[5]
[6]
[7]
[8]
[9]
Epoch[4/10], Step [10/30], Loss: 0.0926, Accr: 92.90
[10]
[11]


# 7. 테스트

In [31]:
print("Accuracy of Test Data: %.2f" %ComputeAccr(test_loaders.train_loader, model))

Accuracy of Test Data: 84.34


# 8. 학습된 파라미터 저장

In [32]:
netname = './nets/rnn_weight_sms.pkl'
torch.save(model, netname, )

  "type " + obj.__name__ + ". It won't be checked "


# 9. 학습된 파라미터 로드
실무에서 학습된(pretrained) 파라미터 로드 시 5, 6, 8 과정 생략한채 실행

In [33]:
netname = './nets/rnn_weight_sms.pkl'
model = torch.load(netname)

In [34]:
print("Accuracy of Test Data: %.2f" %ComputeAccr(test_loaders.train_loader, model))

Accuracy of Test Data: 84.34
