In [1]:
import re
from copy import deepcopy

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchtext import data

In [2]:
batch_size = 32
embedded_size = 128
hidden_size = 150
n_classes = 5
EPOCHS = 7

In [3]:
TEXT = data.Field(sequential=True,
                  use_vocab=True,
                  tokenize=str.split, 
                  lower=True,
                  batch_first=True,
                  fix_length=None)

LABEL = data.Field(sequential=False,
                   use_vocab=False,
                   is_target=True)

In [4]:
from torchtext.data import TabularDataset

train_data = TabularDataset(
    path = './data/train.nltk.csv',
    format = 'csv',
    fields = [('text', TEXT), ('label', LABEL)],
    skip_header = True)


test_data = TabularDataset(
    path = './data/test_x.nltk.1.csv',
    format = 'csv',
    fields = [('text', TEXT)],
    skip_header = True)

print(vars(train_data[0]))
TEXT.build_vocab(train_data, min_freq=7, max_size=20000, vectors = "fasttext.en.300d")
train_data, valid_data = train_data.split(split_ratio=0.8, stratified=True)

{'text': ['almost', 'choking', 'much', 'much', 'wanted', 'say', 'strange', 'exclamations', 'came', 'lips', 'pole', 'gazed', 'fixedly', 'bundle', 'notes', 'hand', 'looked', 'odin', 'evident', 'perplexity'], 'label': '3'}


In [5]:
len(train_data), len(valid_data), len(test_data)

(43903, 10976, 19617)

In [6]:
print(vars(train_data[1]))

{'text': ['heard', 'prince', 'replied', 'mr', 'rolles', 'general', 'odin', 'even', 'met', 'society'], 'label': '4'}


In [7]:
print('단어 집합의 크기 : {}'.format(len(TEXT.vocab)))

단어 집합의 크기 : 11985


In [8]:
from torchtext.data import BucketIterator

train_loader, valid_loader = BucketIterator.splits(
    (train_data, valid_data),
    batch_size = batch_size,
    device='cuda:0',
    shuffle = True,
    sort_key=lambda x:len(x.text),
    sort_within_batch=True
)

test_loader = BucketIterator(
    test_data,
    batch_size = batch_size,
    device='cuda:0',
    shuffle = False,
#     sort_key=lambda x:len(x.text),
#     sort_within_batch=True
)

In [9]:
print('train 데이터의 미니 배치 수 : {}'.format(len(train_loader)))
print('valid 데이터의 미니 배치 수 : {}'.format(len(valid_loader)))
print('test 데이터의 미니 배치 수 : {}'.format(len(test_loader)))

train 데이터의 미니 배치 수 : 1372
valid 데이터의 미니 배치 수 : 343
test 데이터의 미니 배치 수 : 614


In [10]:
class LSTM(nn.Module):
    def __init__(self, 
                 input_size, 
                 embedded_size, 
                 hidden_size, 
                 n_classes, 
                 n_layers=3, 
                 dropout_p=0.3):
        self.input_size = input_size
        self.embedded_size = embedded_size
        self.hidden_size = hidden_size
        self.n_classes = n_classes
        self.n_layers = n_layers
        self.dropout_p = dropout_p
        super().__init__()
        
        # 단순히 숫자로 매칭만 되어있던 것을 neural net에 넣어서 학습시켜줌
#         self.emb = nn.Embedding(self.input_size, self.embedded_size) 
        self.emb = nn.Embedding.from_pretrained(TEXT.vocab.vectors, freeze=False) 
        self.lstm = nn.LSTM(
#             input_size = self.embedded_size,
            input_size = 300,
            hidden_size = self.hidden_size,
            num_layers = self.n_layers,
            dropout = self.dropout_p,
            batch_first = True,
            bidirectional = True
        )
        self.fc_layer = nn.Linear(hidden_size*2, n_classes)
        self.activation = nn.LogSoftmax(dim=-1)
    
    def forward(self, x):
        # |x| = (bs, length)
#         print(x.shape)

        embedded = self.emb(x) 
        # [embedded] = (bs, length, embedded_size)
#         print(embedded.shape)

        y, hidden = self.lstm(embedded)
        # |y| = (bs, length, hidden_size*2)
#         print("aa: ", y.shape)
#         print(y[:, -1, :].shape)
#         print(y[0,:,:])

        y = self.fc_layer(y[:, -1, :]) # lstm의 결과중 마지막것만 가져옴
#         [y] = (bs, n_classes)
#         print("bb: ", y.shape)

        y = self.activation(y)
        # [y] = (bs, n_classes)
#         print("cc: ", y.shape)
        return y

In [11]:
# cnt = 0
# for x, y in train_loader:
#     if cnt == 0:
#         print(x)
#     cnt += 1

In [18]:
from tqdm import tqdm
model = LSTM(len(TEXT.vocab), embedded_size, hidden_size, n_classes).to('cuda:0')

crit = nn.NLLLoss().to('cuda:0')
optimizer = optim.Adam(model.parameters(), lr=0.001)
best_loss = 999999
best_model = None

for i in range(EPOCHS):
    train_loss = 0
    train_accuracy = 0
    model.train()
    for x, y in tqdm(train_loader):
        optimizer.zero_grad()
        x, y = x.to('cuda:0'), y.to('cuda:0')

        y_hat = model(x)
        loss = crit(y_hat, y)
        loss.backward()
        optimizer.step()
        if isinstance(y, torch.LongTensor) or isinstance(y, torch.cuda.LongTensor):
            accuracy = (torch.argmax(y_hat, dim=-1) == y).sum() / float(y.size(0))
        else:
            accuracy = 0
        train_loss += float(loss) / len(train_loader)
        train_accuracy += accuracy / len(train_loader)

    valid_loss = 0
    valid_accuracy = 0
    with torch.no_grad():
        model.eval()
        for x, y in valid_loader:
            x, y = x.to('cuda:0'), y.to('cuda:0')
            y_hat = model(x)
            loss = crit(y_hat, y)
            
            if isinstance(y, torch.LongTensor) or isinstance(y, torch.cuda.LongTensor):
                accuracy = (torch.argmax(y_hat, dim=-1) == y).sum() / float(y.size(0))
            else:
                accuracy = 0
            valid_loss += float(loss) / len(valid_loader)
            valid_accuracy += accuracy / len(valid_loader)
            
        if valid_loss < best_loss:
            best_loss = valid_loss
            best_model = deepcopy(model.state_dict())
            
            
    print("EPOCHS: {:2d} | train_accuracy: {:.4f} / train_loss: {:.4f} / valid_accuracy: {:.4f} / valid_loss: {:.4f} / best_loss: {:.4f}".format
              (i+1, train_accuracy, train_loss, valid_accuracy, valid_loss, best_loss))

100%|██████████████████████████████████████████████████████████████████████████████| 1372/1372 [00:17<00:00, 77.60it/s]
  1%|▉                                                                               | 16/1372 [00:00<00:18, 71.50it/s]

EPOCHS:  1 | train_accuracy: 0.5827 / train_loss: 1.0586 / valid_accuracy: 0.6818 / valid_loss: 0.8588 / best_loss: 0.8588


100%|██████████████████████████████████████████████████████████████████████████████| 1372/1372 [00:17<00:00, 77.52it/s]
  0%|▏                                                                                | 4/1372 [00:00<00:36, 37.48it/s]

EPOCHS:  2 | train_accuracy: 0.7628 / train_loss: 0.6456 / valid_accuracy: 0.7421 / valid_loss: 0.6958 / best_loss: 0.6958


100%|██████████████████████████████████████████████████████████████████████████████| 1372/1372 [00:17<00:00, 77.13it/s]
  0%|▎                                                                                | 5/1372 [00:00<00:42, 31.93it/s]

EPOCHS:  3 | train_accuracy: 0.8205 / train_loss: 0.4853 / valid_accuracy: 0.7397 / valid_loss: 0.7411 / best_loss: 0.6958


100%|██████████████████████████████████████████████████████████████████████████████| 1372/1372 [00:17<00:00, 76.33it/s]
  0%|▏                                                                                | 4/1372 [00:00<00:37, 36.92it/s]

EPOCHS:  4 | train_accuracy: 0.8563 / train_loss: 0.3939 / valid_accuracy: 0.7353 / valid_loss: 0.8096 / best_loss: 0.6958


100%|██████████████████████████████████████████████████████████████████████████████| 1372/1372 [00:17<00:00, 79.18it/s]
  0%|▎                                                                                | 5/1372 [00:00<00:27, 48.99it/s]

EPOCHS:  5 | train_accuracy: 0.8796 / train_loss: 0.3283 / valid_accuracy: 0.7324 / valid_loss: 0.9021 / best_loss: 0.6958


100%|██████████████████████████████████████████████████████████████████████████████| 1372/1372 [00:17<00:00, 78.65it/s]
  0%|▎                                                                                | 6/1372 [00:00<00:23, 57.06it/s]

EPOCHS:  6 | train_accuracy: 0.8992 / train_loss: 0.2728 / valid_accuracy: 0.7299 / valid_loss: 1.0335 / best_loss: 0.6958


100%|██████████████████████████████████████████████████████████████████████████████| 1372/1372 [00:17<00:00, 78.95it/s]


EPOCHS:  7 | train_accuracy: 0.9129 / train_loss: 0.2323 / valid_accuracy: 0.7280 / valid_loss: 1.1120 / best_loss: 0.6958


In [35]:
torch.save(best_model, 'rnn_model1.pth')

In [36]:
best_model = torch.load('rnn_model1.pth', map_location='cuda:0')

In [37]:
test_model = LSTM(len(TEXT.vocab), embedded_size, hidden_size, n_classes).to('cuda:0')
test_model.load_state_dict(best_model)

<All keys matched successfully>

In [38]:
test_model

LSTM(
  (emb): Embedding(11985, 300)
  (lstm): LSTM(300, 150, num_layers=3, batch_first=True, dropout=0.3, bidirectional=True)
  (fc_layer): Linear(in_features=300, out_features=5, bias=True)
  (activation): LogSoftmax(dim=-1)
)

In [39]:
cnt = 0
for x in test_loader:
    if cnt == 0:
        print(x.text.shape)
    cnt += 1

torch.Size([32, 12])


In [None]:
test_model.eval()
y_hats = []
with torch.no_grad():
    for x_batch in train_loader:
        x = x_batch.text.to('cuda:0')
        y_hat = test_model(x).cpu()
        y_hats += y_hat
    y_hats = torch.stack(y_hats).exp()
test_pred = y_hats.numpy()

In [46]:
import torch.nn.functional as F

In [53]:
test_model.eval()
y_hats = []
losses = 0
ys = []
with torch.no_grad():
    for x, y in valid_loader:
        x, y = x.to('cuda:0'), y.to('cuda:0')
        y_hat = test_model(x)
        
        loss = F.nll_loss(y_hat, y)
        losses += float(loss) / len(valid_loader)
        ys.append(y)
        y_hat = y_hat.cpu()
        y_hats += y_hat
    y_hats = torch.stack(y_hats).exp()
test_pred = y_hats.numpy()

print(losses)

0.6957581716402271


In [54]:
ys

[tensor([3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 0, 2, 2, 2, 2, 2, 2, 2, 1, 3,
         3, 3, 4, 4, 4, 4, 4, 4], device='cuda:0'),
 tensor([1, 1, 1, 1, 1, 1, 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
         3, 3, 3, 3, 3, 3, 3, 3], device='cuda:0'),
 tensor([2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
         2, 2, 1, 1, 1, 1, 1, 1], device='cuda:0'),
 tensor([2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
         2, 2, 2, 2, 2, 2, 2, 2], device='cuda:0'),
 tensor([4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 2, 2, 2, 2], device='cuda:0'),
 tensor([3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
         4, 4, 4, 4, 4, 4, 4, 4], device='cuda:0'),
 tensor([3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
         3, 3, 3, 3, 3, 3, 3, 3], device='cuda:0'),
 tensor([3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 

In [52]:
test_pred

array([[0.04943195, 0.01957397, 0.07606616, 0.83667743, 0.01825058],
       [0.13303986, 0.12536663, 0.39287797, 0.31331658, 0.03539892],
       [0.13624495, 0.087958  , 0.32081994, 0.3574904 , 0.09748676],
       ...,
       [0.09638662, 0.02545827, 0.13299501, 0.03777713, 0.707383  ],
       [0.00442029, 0.03067481, 0.9069555 , 0.01964252, 0.03830694],
       [0.01845455, 0.9675698 , 0.00316975, 0.00514997, 0.00565592]],
      dtype=float32)

In [29]:
import pandas as pd
sub = pd.read_csv('./data/sample_submission.csv', index_col=0)

sub[sub.columns] = test_pred
sub.head()
sub.to_csv('./data/submission2.csv')