In [1]:
import re
from copy import deepcopy

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchtext import data

In [2]:
batch_size = 32
embedded_size = 128
hidden_size = 150
n_classes = 5
EPOCHS = 5

In [3]:
TEXT = data.Field(sequential=True,
                  use_vocab=True,
                  tokenize=str.split, 
                  lower=True,
                  batch_first=True,
                  fix_length=None)

LABEL = data.Field(sequential=False,
                   use_vocab=False,
                   is_target=True)

INDEX = data.Field(sequential=False,
                   use_vocab=False,
                   is_target=False)

In [4]:
from torchtext.data import TabularDataset

train_data = TabularDataset(
    path = './data/train.nltk.csv',
    format = 'csv',
    fields = [('text', TEXT), ('label', LABEL)],
    skip_header = True)


test_data = TabularDataset(
    path = './data/test_x.nltk.1.csv',
    format = 'csv',
    fields = [('text', TEXT), ('index', INDEX)],
    skip_header = True)

print(vars(train_data[0]))
TEXT.build_vocab(train_data, min_freq=7, max_size=20000, vectors = "fasttext.en.300d")
train_data, valid_data = train_data.split(split_ratio=0.8, stratified=True)

{'text': ['almost', 'choking', 'much', 'much', 'wanted', 'say', 'strange', 'exclamations', 'came', 'lips', 'pole', 'gazed', 'fixedly', 'bundle', 'notes', 'hand', 'looked', 'odin', 'evident', 'perplexity'], 'label': '3'}


In [5]:
len(train_data), len(valid_data), len(test_data)

(43903, 10976, 19617)

In [6]:
print(vars(train_data[0]))

{'text': ['joe', 'actually', 'laid', 'head', 'pillow', 'side', 'put', 'arm', 'round', 'neck', 'joy', 'knew'], 'label': '0'}


In [7]:
print('단어 집합의 크기 : {}'.format(len(TEXT.vocab)))

단어 집합의 크기 : 11985


In [8]:
from torchtext.data import BucketIterator
from torchtext.data import Iterator

train_loader, valid_loader = BucketIterator.splits(
    (train_data, valid_data),
    batch_size = batch_size,
    device='cuda:0',
    shuffle = True,
    sort_key=lambda x:len(x.text),
    sort_within_batch=True
)
test_loader = Iterator(dataset=test_data, batch_size=batch_size, device='cuda:0')

In [9]:
print('train 데이터의 미니 배치 수 : {}'.format(len(train_loader)))
print('valid 데이터의 미니 배치 수 : {}'.format(len(valid_loader)))
print('test 데이터의 미니 배치 수 : {}'.format(len(test_loader)))

train 데이터의 미니 배치 수 : 1372
valid 데이터의 미니 배치 수 : 343
test 데이터의 미니 배치 수 : 614


In [10]:
class LSTM(nn.Module):
    def __init__(self, 
                 input_size, 
                 embedded_size, 
                 hidden_size, 
                 n_classes, 
                 n_layers=3, 
                 dropout_p=0.3):
        self.input_size = input_size
        self.embedded_size = embedded_size
        self.hidden_size = hidden_size
        self.n_classes = n_classes
        self.n_layers = n_layers
        self.dropout_p = dropout_p
        super().__init__()
        
        # 단순히 숫자로 매칭만 되어있던 것을 neural net에 넣어서 학습시켜줌
#         self.emb = nn.Embedding(self.input_size, self.embedded_size) 
        self.emb = nn.Embedding.from_pretrained(TEXT.vocab.vectors, freeze=False) 
        self.lstm = nn.LSTM(
#             input_size = self.embedded_size,
            input_size = 300,
            hidden_size = self.hidden_size,
            num_layers = self.n_layers,
            dropout = self.dropout_p,
            batch_first = True,
            bidirectional = True
        )
        self.fc_layer = nn.Linear(hidden_size*2, n_classes)
        self.activation = nn.LogSoftmax(dim=-1)
    
    def forward(self, x):
        # |x| = (bs, length)
#         print(x.shape)

        embedded = self.emb(x) 
        # [embedded] = (bs, length, embedded_size)
#         print(embedded.shape)

        y, hidden = self.lstm(embedded)
        # |y| = (bs, length, hidden_size*2)
#         print("aa: ", y.shape)
#         print(y[:, -1, :].shape)
#         print(y[0,:,:])

        y = self.fc_layer(y[:, -1, :]) # lstm의 결과중 마지막것만 가져옴
#         [y] = (bs, n_classes)
#         print("bb: ", y.shape)

        y = self.activation(y)
        # [y] = (bs, n_classes)
#         print("cc: ", y.shape)
        return y

In [11]:
# cnt = 0
# for x, y in train_loader:
#     if cnt == 0:
#         print(x)
#     cnt += 1

In [12]:
from tqdm import tqdm
model = LSTM(len(TEXT.vocab), embedded_size, hidden_size, n_classes).to('cuda:0')

crit = nn.NLLLoss().to('cuda:0')
optimizer = optim.Adam(model.parameters(), lr=0.001)
best_loss = 999999
best_model = None

for i in range(EPOCHS):
    train_loss = 0
    train_accuracy = 0
    model.train()
    for x, y in tqdm(train_loader):
        optimizer.zero_grad()
        x, y = x.to('cuda:0'), y.to('cuda:0')

        y_hat = model(x)
        loss = crit(y_hat, y)
        loss.backward()
        optimizer.step()
        if isinstance(y, torch.LongTensor) or isinstance(y, torch.cuda.LongTensor):
            accuracy = (torch.argmax(y_hat, dim=-1) == y).sum() / float(y.size(0))
        else:
            accuracy = 0
        train_loss += float(loss) / len(train_loader)
        train_accuracy += accuracy / len(train_loader)

    valid_loss = 0
    valid_accuracy = 0
    with torch.no_grad():
        model.eval()
        for x, y in valid_loader:
            x, y = x.to('cuda:0'), y.to('cuda:0')
            y_hat = model(x)
            loss = crit(y_hat, y)
            
            if isinstance(y, torch.LongTensor) or isinstance(y, torch.cuda.LongTensor):
                accuracy = (torch.argmax(y_hat, dim=-1) == y).sum() / float(y.size(0))
            else:
                accuracy = 0
            valid_loss += float(loss) / len(valid_loader)
            valid_accuracy += accuracy / len(valid_loader)
            
        if valid_loss < best_loss:
            best_loss = valid_loss
            best_model = deepcopy(model.state_dict())
            
            
    print("EPOCHS: {:2d} | train_accuracy: {:.4f} / train_loss: {:.4f} / valid_accuracy: {:.4f} / valid_loss: {:.4f} / best_loss: {:.4f}".format
              (i+1, train_accuracy, train_loss, valid_accuracy, valid_loss, best_loss))

100%|██████████████████████████████████████████████████████████████████████████████| 1372/1372 [00:17<00:00, 78.55it/s]
  1%|▊                                                                               | 13/1372 [00:00<00:27, 50.06it/s]

EPOCHS:  1 | train_accuracy: 0.5654 / train_loss: 1.0925 / valid_accuracy: 0.6969 / valid_loss: 0.8154 / best_loss: 0.8154


100%|██████████████████████████████████████████████████████████████████████████████| 1372/1372 [00:17<00:00, 79.29it/s]
  0%|                                                                                 | 1/1372 [00:00<02:28,  9.22it/s]

EPOCHS:  2 | train_accuracy: 0.7580 / train_loss: 0.6605 / valid_accuracy: 0.7175 / valid_loss: 0.7578 / best_loss: 0.7578


100%|██████████████████████████████████████████████████████████████████████████████| 1372/1372 [00:17<00:00, 79.30it/s]
  1%|▍                                                                                | 8/1372 [00:00<00:18, 74.22it/s]

EPOCHS:  3 | train_accuracy: 0.8179 / train_loss: 0.5042 / valid_accuracy: 0.7279 / valid_loss: 0.7548 / best_loss: 0.7548


100%|██████████████████████████████████████████████████████████████████████████████| 1372/1372 [00:17<00:00, 79.10it/s]
  0%|▎                                                                                | 5/1372 [00:00<00:32, 42.13it/s]

EPOCHS:  4 | train_accuracy: 0.8516 / train_loss: 0.4031 / valid_accuracy: 0.7247 / valid_loss: 0.8178 / best_loss: 0.7548


100%|██████████████████████████████████████████████████████████████████████████████| 1372/1372 [00:17<00:00, 79.56it/s]


EPOCHS:  5 | train_accuracy: 0.8783 / train_loss: 0.3312 / valid_accuracy: 0.7198 / valid_loss: 0.9363 / best_loss: 0.7548


In [13]:
torch.save(best_model, 'rnn_model2.pth')

In [14]:
best_model = torch.load('rnn_model1.pth', map_location='cuda:0')

In [15]:
test_model = LSTM(len(TEXT.vocab), embedded_size, hidden_size, n_classes).to('cuda:0')
test_model.load_state_dict(best_model)

<All keys matched successfully>

In [16]:
test_model

LSTM(
  (emb): Embedding(11985, 300)
  (lstm): LSTM(300, 150, num_layers=3, batch_first=True, dropout=0.3, bidirectional=True)
  (fc_layer): Linear(in_features=300, out_features=5, bias=True)
  (activation): LogSoftmax(dim=-1)
)

In [17]:
cnt = 0
for x in test_loader:
    if cnt == 0:
        print(x)
#         print(x.text.shape)
    cnt += 1


[torchtext.data.batch.Batch of size 32]
	[.text]:[torch.cuda.LongTensor of size 32x103 (GPU 0)]
	[.index]:[torch.cuda.LongTensor of size 32 (GPU 0)]


In [18]:
import torch.nn.functional as F

In [19]:
test_model.eval()
y_hats = []
ys = []
with torch.no_grad():
    for x_batch in test_loader:
        x = x_batch.text.to('cuda:0')
        idx = x_batch.index
        y_hat = test_model(x).cpu()
        y_hats += y_hat
        ys += idx.cpu()
    y_hats = torch.stack(y_hats).exp()
    ys = torch.stack(ys)
test_pred = y_hats.numpy()
idx = ys.numpy()

In [20]:
idx

array([15797, 10467, 16085, ..., 18234, 16864,  4559], dtype=int64)

In [21]:
test_pred

array([[0.01727097, 0.96136636, 0.00540816, 0.00408278, 0.01187172],
       [0.03141812, 0.00749922, 0.05443162, 0.01723714, 0.88941383],
       [0.02748302, 0.02230678, 0.03276433, 0.904553  , 0.01289284],
       ...,
       [0.27242175, 0.09312213, 0.1966439 , 0.26261678, 0.17519534],
       [0.04828686, 0.9054456 , 0.01343996, 0.01647181, 0.01635582],
       [0.01606853, 0.00126799, 0.00861753, 0.96599245, 0.0080535 ]],
      dtype=float32)

In [22]:
import pandas as pd
df = pd.DataFrame(test_pred, index = idx)
# df.loc[2]
df.sort_index()

Unnamed: 0,0,1,2,3,4
0,0.018758,0.148417,0.536296,0.118137,0.178393
1,0.046005,0.078767,0.343277,0.130793,0.401158
2,0.521680,0.375053,0.054054,0.028978,0.020234
3,0.032735,0.022664,0.813713,0.005806,0.125082
4,0.094522,0.045346,0.322157,0.108928,0.429047
...,...,...,...,...,...
19612,0.040261,0.946728,0.002206,0.005349,0.005456
19613,0.019459,0.001162,0.004892,0.001272,0.973215
19614,0.021618,0.946908,0.009166,0.013734,0.008574
19615,0.037550,0.691360,0.116279,0.120103,0.034708


In [26]:
import pandas as pd
sub = pd.read_csv('./data/sample_submission.csv', index_col=0)

sub[sub.columns] = df
sub.head()
sub.to_csv('./data/submission2.csv')

test를 shuffle해버림!!!!