## 1. LSTM 모델을 이용한 NLP Classification (스팸 메일 분류기)

### 이번 실습에서는 LSTM 모델을 사용하여 스팸 메일을 분류하는 과정을 LSTM 설계, 데이터 전처리 과정을 통해 알아봅니다.

**1.1 Fully Connected Layer 복습**

RNN과 LSTM 모델을 학습하기에 앞서 기본적인 ANN (Fully Connected Layer)을 PyTorch로 구성하는 방법을 복습합니다.

In [None]:
import torch
import torch.nn as nn
import numpy as np
from torch.autograd import Variable

class ANN(nn.Module):
    """Fully connected network: two ReLU hidden layers and a linear output.

    Args:
        num_output: Width of the final linear layer.
        input_size: Number of features in each input vector.
        hidden_size: Width of both hidden layers.
        device: Stored on the instance as ``self.device``; the layers
            themselves are not moved by this class.
    """

    def __init__(self, num_output, input_size, hidden_size, device):
        super().__init__()
        self.device = device

        # Layers are declared in the order in which forward() applies them.
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.outlayer = nn.Linear(hidden_size, num_output)

    def forward(self, x):
        """Return the raw outputs of the last linear layer for ``x``."""
        hidden = torch.relu(self.fc1(x))
        hidden = torch.relu(self.fc2(hidden))
        return self.outlayer(hidden)

**1.2 LSTM for NLP**

가장 보편적으로 쓰이는 recurrent neural network 구조인 LSTM을 PyTorch로 구현하는 과정입니다. 텍스트를 다룰 때에는 사전 학습된 word2vec 임베딩을 사용해도 되지만, 여기서는 정수 인코딩된 토큰을 밀집 벡터로 변환하는 nn.Embedding 레이어를 모델과 함께 학습시켜 사용합니다.

In [None]:
class LSTM_net(nn.Module):
    """Text classifier: embedding -> bidirectional LSTM -> two linear layers.

    Args:
        num_output: Number of output units (one logit per class).
        size_vocab: Number of rows of the embedding table. It must be
            strictly greater than the largest token index passed to
            ``forward``.
        dim_embed: Embedding dimension.
        hidden_size: LSTM hidden size per direction.
        linear_size: Width of the intermediate linear layer.
        num_layers: Number of stacked LSTM layers.
        device: Stored as ``self.device`` for callers. The forward pass
            allocates its initial states on the device of its input, so
            it works wherever the module and the batch actually live.
    """

    def __init__(self, num_output, size_vocab, dim_embed, hidden_size, linear_size, num_layers, device):
        super().__init__()
        self.device = device
        self.num_output = num_output
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        self.embed = nn.Embedding(size_vocab, dim_embed)

        # nn.LSTM applies dropout only between stacked layers and warns when
        # a non-zero value is given for a single layer, so disable it there.
        lstm_dropout = 0.3 if num_layers > 1 else 0.0
        self.lstm = nn.LSTM(input_size=dim_embed, hidden_size=hidden_size,
                            num_layers=num_layers, dropout=lstm_dropout,
                            bidirectional=True)
        self.fclayer = nn.Linear(hidden_size, linear_size)
        self.outlayer = nn.Linear(linear_size, num_output)

    def forward(self, x):
        """Return class logits for a batch of integer-encoded sequences.

        Args:
            x: Integer tensor of token indices, indexed as (batch, seq).

        Returns:
            Tensor of shape (batch, num_output).
        """
        num_directions = 2 if self.lstm.bidirectional else 1

        emb = self.embed(x)

        # Zero initial states, created with the dtype and device of the
        # embedded input so they always match the data fed to the LSTM.
        state_shape = (self.num_layers * num_directions, emb.size(0), self.hidden_size)
        h_state = emb.new_zeros(state_shape)
        c_state = emb.new_zeros(state_shape)

        # The LSTM is not batch_first, so swap to (seq, batch, dim_embed).
        _, (h, _) = self.lstm(emb.transpose(1, 0), (h_state, c_state))
        # Keep only the last entry of the final hidden states; for a
        # bidirectional LSTM PyTorch stores the last layer's reverse
        # direction there, which matches fclayer's hidden_size input width.
        h = h[-1]
        h = self.fclayer(h).relu()
        predict = self.outlayer(h)
        return predict

**1.3 Spam Mail Classification: 데이터 전처리**

스팸 메일을 분류할 수 있는 이진 분류기를 LSTM을 이용하여 꾸며보는 예시입니다. 우선 csv 파일을 받아 토큰화, 정제 및 추출, 정수 인코딩 과정을 거칩니다.

In [None]:
import os
import pandas as pd
# Load the e-mail data set from the working directory.
data = pd.read_csv("emails.csv")
# DataFrame.info() prints its report itself and returns None, so it is called
# on its own; passing its result to display() only rendered a stray "None".
data.info()
display(data.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5728 entries, 0 to 5727
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    5728 non-null   object
 1   spam    5728 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 89.6+ KB


None

Unnamed: 0,text,spam
0,Subject: naturally irresistible your corporate...,1
1,Subject: the stock trading gunslinger fanny i...,1
2,Subject: unbelievable new homes made easy im ...,1
3,Subject: 4 color printing special request add...,1
4,"Subject: do not have money , get software cds ...",1


**토큰화**

In [None]:
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import numpy as np
# English stop words as a set so that each membership test is O(1).
stop_words = set(stopwords.words('english'))
data = data.dropna().reset_index(drop=True)
# Iterate over the rows that remain after dropna() instead of a hard-coded
# row count, which would raise IndexError as soon as a row is dropped.
# Column 0 holds the e-mail text; stop words are filtered out per e-mail.
token_text = [
    [w for w in word_tokenize(text) if w not in stop_words]
    for text in data.iloc[:, 0]
]
print('After cleaning :', len(token_text))

After cleaning : 5728


**정수 인코딩**

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
# Fit the word -> integer index on the tokenised e-mails.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(token_text)
# Number of distinct words that received an index.
print(len(tokenizer.word_index))

37231


In [None]:
# Replace every token by its integer index from the fitted tokenizer.
text_encoded = tokenizer.texts_to_sequences(token_text)
# Show the first encoded e-mail as a sanity check.
print(text_encoded[0])

[13, 4, 5717, 12190, 440, 1686, 3812, 377, 820, 4317, 68, 4, 78, 325, 6211, 43, 9097, 39, 120, 4428, 752, 3, 5963, 5964, 1585, 223, 82, 2067, 169, 1756, 2, 1899, 7930, 1613, 4211, 68, 10916, 597, 332, 5718, 4, 6476, 7931, 407, 120, 280, 3, 787, 50, 768, 4429, 2958, 4556, 4318, 78, 39, 1899, 374, 693, 597, 169, 787, 2, 123, 930, 1194, 4, 4430, 4, 1013, 1, 241, 3, 211, 2413, 3, 3448, 477, 1937, 4212, 68, 1166, 2, 1045, 4, 752, 1348, 617, 2475, 39, 749, 1, 1, 115, 603, 77, 188, 4557, 370, 223, 603, 365, 874, 2, 4431, 4, 83, 752, 4098, 197, 424, 50, 217, 2, 4432, 4, 374, 1292, 1, 11, 82, 3652, 1262, 2, 406, 107, 2026, 1195, 4, 190, 1880, 647, 356, 1391, 1957, 4558, 1211, 831, 2304, 2, 105, 608, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 159, 2, 2, 2, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7

**학습을 위한 Label: Spam인 경우 1, Normal Text인 경우 0**

In [None]:
# Column 1 holds the label; copy it out of the frame as a NumPy array.
text_label = data.iloc[:, 1].to_numpy(copy=True)

**Padding 및 데이터 자르기**

이메일은 보통 다수의 문장으로 이루어져 있기 때문에, 정제 및 추출을 거치더라도 1개 샘플의 길이가 길 수 있습니다. 따라서 maxlen(=100)을 설정하여, 토큰 수가 maxlen보다 적은 이메일은 뒤쪽을 0으로 padding하고, maxlen 이상인 이메일은 처음 maxlen개의 토큰만 사용하고 나머지는 버립니다.

In [None]:
# text_encoded is a ragged list of lists, so report its length directly:
# np.shape() on ragged input is rejected by recent NumPy releases.
print((len(text_encoded),))
print(np.shape(text_label))
# Length of the longest e-mail, printed for reference only.
print(max((len(seq) for seq in text_encoded), default=0))

maxlen = 100
# Start from an all-zero integer matrix: rows shorter than maxlen stay
# right-padded with 0, longer ones are cut to their first maxlen tokens.
# A fixed int64 dtype keeps the result integral even for empty e-mails.
text_padded = np.zeros((len(text_encoded), maxlen), dtype=np.int64)
for row, seq in enumerate(text_encoded):
    kept = seq[:maxlen]
    if kept:
        text_padded[row, :len(kept)] = kept
print(np.shape(text_padded))

(5728,)
(5728,)
5599
(5728, 100)


**1.4 학습을 위한 Dataset 만들기 및 학습 과정**

In [None]:
# Use the first GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
from torch.utils.data import DataLoader, TensorDataset, random_split
from torch import LongTensor as LT
from torch import FloatTensor as FT


class Generate_Dataset(torch.utils.data.Dataset):
    """Map-style dataset that serves (x, y) pairs as int64 tensors.

    Args:
        xdata: Indexable collection of integer feature rows.
        ydata: Indexable collection of integer labels, aligned with
            ``xdata``; each label may be a scalar or an array.
        device: Device every returned tensor is moved to.
    """

    def __init__(self, xdata, ydata, device):
        self.x_data = xdata
        self.y_data = ydata
        self.device = device

    def __len__(self):
        """Return the number of samples."""
        return len(self.x_data)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a pair of ``torch.long`` tensors."""
        # torch.as_tensor converts by value and casts to int64. The legacy
        # LongTensor constructor interprets a scalar argument as a size and
        # rejects arrays whose dtype is not already int64.
        x = torch.as_tensor(self.x_data[idx], dtype=torch.long).to(self.device)
        y = torch.as_tensor(self.y_data[idx], dtype=torch.long).to(self.device)
        return x, y

**Generate Dataset**

In [None]:
# Only the first 5000 e-mails are used; the labels become a column vector so
# that each sample's label is a length-1 array.
features = text_padded[:5000, :]
labels = text_label[:5000].reshape([-1, 1])
dataset = Generate_Dataset(features, labels, device)

# Random 4500 / 500 partition into training and test subsets.
trainset, testset = random_split(dataset, [4500, 500])

# Training batches are reshuffled on every pass; the test subset is served
# as one batch of 500.
train_loader = DataLoader(trainset, shuffle = True, batch_size = 256)
test_loader = DataLoader(testset, shuffle = False, batch_size = 500)

**Define Network and Optimizer**

In [None]:
# The Keras Tokenizer numbers words from 1 (index 0 is reserved and is the
# padding value used for short e-mails), so the largest token index equals
# len(word_index) and the embedding table needs one extra row.
lstm_net = LSTM_net(num_output = 2, size_vocab = len(tokenizer.word_index) + 1, dim_embed = 64,
                    hidden_size = 64, linear_size = 64, num_layers = 1, device = device)
# The batches are served on `device`, so the parameters must live there too.
lstm_net = lstm_net.to(device)

optimizer = torch.optim.Adam(lstm_net.parameters(), lr = 0.01)

**Training Session**

In [None]:
from tqdm import tqdm
# Ten passes over the training loader; the loss of the final batch of each
# pass is printed once that pass has finished.
for epoch in range(10):
    print('Epoch',epoch)
    with tqdm(train_loader, unit = 'batch') as progress:
        for inputs, labels in progress:
            optimizer.zero_grad()
            logits = lstm_net(inputs)
            # cross_entropy expects a flat vector of class indices.
            loss = torch.nn.functional.cross_entropy(logits, labels.ravel())
            loss.backward()
            optimizer.step()
        print(loss)

 ... (more hidden) ...

Epoch 0


 ... (more hidden) ...
 ... (more hidden) ...

tensor(0.3029, grad_fn=<NllLossBackward>)
Epoch 1


 ... (more hidden) ...
 ... (more hidden) ...

tensor(0.0551, grad_fn=<NllLossBackward>)
Epoch 2


 ... (more hidden) ...
 ... (more hidden) ...

tensor(0.0084, grad_fn=<NllLossBackward>)
Epoch 3


 ... (more hidden) ...
 ... (more hidden) ...

tensor(0.0022, grad_fn=<NllLossBackward>)
Epoch 4


 ... (more hidden) ...
 ... (more hidden) ...

tensor(0.0595, grad_fn=<NllLossBackward>)
Epoch 5


 ... (more hidden) ...
 ... (more hidden) ...

tensor(0.0032, grad_fn=<NllLossBackward>)
Epoch 6


 ... (more hidden) ...
 ... (more hidden) ...

tensor(0.0107, grad_fn=<NllLossBackward>)
Epoch 7


 ... (more hidden) ...
 ... (more hidden) ...

tensor(0.0001, grad_fn=<NllLossBackward>)
Epoch 8


 ... (more hidden) ...
 ... (more hidden) ...

tensor(3.9004e-05, grad_fn=<NllLossBackward>)
Epoch 9


 ... (more hidden) ...

tensor(2.1635e-05, grad_fn=<NllLossBackward>)





**Test the Performance**

In [None]:
# Inference only: switch off training-time behaviour and gradient tracking.
lstm_net.eval()
score = 0
total = 0
with torch.no_grad(), tqdm(test_loader, unit='batch') as tepoch:
    for x, y in tepoch:
        # .cpu() is required before .numpy() when the batch lives on a GPU.
        predict = lstm_net(x).argmax(1).cpu().numpy()
        answer = y.ravel().cpu().numpy()
        # Accumulate over every batch so the result does not depend on the
        # whole test set fitting into a single batch.
        score += int((predict == answer).sum())
        total += len(answer)
accuracy = score / total * 100 if total else 0.0
print(f"{score} out of {total}, accuracy is {accuracy} %")

 ... (more hidden) ...

486 out of 500, accuracy is 97.2 %





## 2. seq2seq 모델을 이용한 NLP machine translation

### 이번 실습에서는 LSTM 모델을 이용한 seq2seq 모델에서 기계 번역을 구현합니다.

**2.1 Download Dataset**

In [None]:
import os
import subprocess
import sys

import spacy

# Download the models with the interpreter that is running this code, so
# they land in the same environment that spacy.load() reads from. Passing an
# argument list avoids the shell; check=True surfaces a failed download here
# instead of as a less obvious error in spacy.load() below.
for model_name in ("en_core_web_sm", "de_core_news_sm"):
    subprocess.run([sys.executable, "-m", "spacy", "download", model_name], check=True)

# Source from [1]
spacy_german = spacy.load('de_core_news_sm')
spacy_english = spacy.load('en_core_web_sm')


In [None]:
def tokenize_de(text):
    """Split ``text`` with the German tokenizer and return the token strings in reverse order."""
    tokens = [token.text for token in spacy_german.tokenizer(text)]
    tokens.reverse()
    return tokens
def tokenize_en(text):
    """Split ``text`` with the English tokenizer and return the token strings in order."""
    doc = spacy_english.tokenizer(text)
    return [token.text for token in doc]
# Field has to be in scope before the two fields are constructed; without
# this import a top-to-bottom run stops here with a NameError.
from torchtext.legacy.data import Field

# Source (German) and target (English) fields: lower-cased text wrapped in
# <sos> / <eos> markers.
SRC = Field(tokenize = tokenize_de, init_token = '<sos>', eos_token = '<eos>', lower = True)
TRG = Field(tokenize = tokenize_en, init_token = '<sos>', eos_token = '<eos>', lower = True)

In [None]:
from torchtext.legacy.datasets import Multi30k
from torchtext.legacy.data import Field
# Load the three Multi30k splits; the '.de' side is processed by SRC and the
# '.en' side by TRG.
train_data, valid_data, test_data = Multi30k.splits(exts = ('.de', '.en'), fields = (SRC, TRG))

In [None]:
# Build the source and the target vocabulary from the training split only,
# both with min_freq = 2.
for field in (SRC, TRG):
    field.build_vocab(train_data, min_freq = 2)

**2.2 Network Structures**

In [None]:
import torch
import torch.nn as nn
import numpy as np
from torch.autograd import Variable

class seq_Encoder(nn.Module):
    """Seq2seq encoder: embedding -> dropout -> LSTM.

    Args:
        vocab_size: Number of rows of the embedding table.
        dim_embed: Embedding dimension.
        hidden_size: LSTM hidden size.
        num_layers: Number of stacked LSTM layers.
        dropout: Dropout probability applied to the embeddings and, when
            ``num_layers > 1``, between the LSTM layers.
    """

    def __init__(self, vocab_size, dim_embed, hidden_size, num_layers, dropout):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        self.embed = nn.Embedding(vocab_size, dim_embed)
        # nn.LSTM applies dropout only between stacked layers and warns when
        # a non-zero value is given for a single layer, so disable it there.
        lstm_dropout = dropout if num_layers > 1 else 0.0
        self.lstm = nn.LSTM(dim_embed, hidden_size, num_layers, dropout=lstm_dropout)
        self.dropout = nn.Dropout(dropout)

    def forward(self, src):
        """Encode ``src`` and return the LSTM's final ``(hidden, cell)`` states.

        Args:
            src: Integer tensor of token indices; the LSTM is not
                batch_first, so it is read as (seq, batch).

        Returns:
            Tuple ``(hidden, cell)``, each of shape
            (num_layers, batch, hidden_size).
        """
        embedded = self.dropout(self.embed(src))
        # The per-step outputs are not needed, only the final states.
        _, (hidden, cell) = self.lstm(embedded)
        return hidden, cell

In [None]:
class seq_Decoder(nn.Module):
    """Seq2seq decoder that predicts one target token per call.

    Args:
        output_size: Target vocabulary size; used both for the embedding
            table and for the width of the output layer.
        dim_embed: Embedding dimension.
        hidden_size: LSTM hidden size.
        num_layers: Number of stacked LSTM layers.
        dropout: Dropout probability applied to the embeddings and, when
            ``num_layers > 1``, between the LSTM layers.
    """

    def __init__(self, output_size, dim_embed, hidden_size, num_layers, dropout):
        super().__init__()

        self.output_size = output_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        self.embed = nn.Embedding(output_size, dim_embed)
        # nn.LSTM applies dropout only between stacked layers and warns when
        # a non-zero value is given for a single layer, so disable it there.
        lstm_dropout = dropout if num_layers > 1 else 0.0
        self.lstm = nn.LSTM(dim_embed, hidden_size, num_layers, dropout=lstm_dropout)
        self.fclayer = nn.Linear(hidden_size, output_size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input_data, hidden, cell):
        """Run a single decoding step.

        Args:
            input_data: Integer tensor with the current token of every
                sequence in the batch, shape (batch,).
            hidden: Previous hidden state of the LSTM.
            cell: Previous cell state of the LSTM.

        Returns:
            Tuple ``(prediction, hidden, cell)`` where ``prediction`` has
            shape (batch, output_size) and the states are the updated ones.
        """
        # Add a length-1 sequence axis in front: (batch,) -> (1, batch).
        step_input = input_data.unsqueeze(0)
        embedded = self.dropout(self.embed(step_input))
        output, (hidden, cell) = self.lstm(embedded, (hidden, cell))
        # Drop the length-1 sequence axis again before projecting.
        prediction = self.fclayer(output.squeeze(0))

        return prediction, hidden, cell

In [None]:
import random

class seq2seq(nn.Module):
    """Encoder/decoder wrapper that decodes step by step with teacher forcing.

    Args:
        encoder: Module mapping a source batch to ``(hidden, cell)``.
        decoder: Module with an ``output_size`` attribute whose call
            ``decoder(tokens, hidden, cell)`` returns
            ``(prediction, hidden, cell)``.
        device: Device on which the output buffer is allocated.
    """

    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, source, target, tf_ratio = 0.5):
        """Return per-step predictions of shape (target_len, batch, vocab).

        Row 0 of the result is never written and stays zero. At every later
        step the next decoder input is the ground-truth token with
        probability ``tf_ratio`` and the decoder's own argmax otherwise.
        """
        num_steps = target.shape[0]
        num_sequences = target.shape[1]
        vocab_dim = self.decoder.output_size

        predictions = torch.zeros(num_steps, num_sequences, vocab_dim, device=self.device)

        # The encoder's final states initialise the decoder.
        hidden, cell = self.encoder(source)
        # The first decoder input is row 0 of the target.
        next_input = target[0]

        for step in range(1, num_steps):
            step_output, hidden, cell = self.decoder(next_input, hidden, cell)
            predictions[step] = step_output
            # Exactly one random draw per step decides on teacher forcing.
            if random.random() < tf_ratio:
                next_input = target[step]
            else:
                next_input = step_output.argmax(1)
        return predictions

In [None]:
# Use the first GPU when one is available, otherwise fall back to the CPU.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda:0" if use_cuda else "cpu")

# Encoder and decoder share the same sizes: 64-d embeddings, 64 hidden
# units, a single LSTM layer and a dropout probability of 0.3.
EMBED_DIM, HIDDEN_DIM, NUM_LAYERS, DROPOUT = 64, 64, 1, 0.3
enc = seq_Encoder(len(SRC.vocab), EMBED_DIM, HIDDEN_DIM, NUM_LAYERS, DROPOUT)
dec = seq_Decoder(len(TRG.vocab), EMBED_DIM, HIDDEN_DIM, NUM_LAYERS, DROPOUT)

seq_net = seq2seq(enc, dec, device)
seq_net = seq_net.to(device)
optimizer = torch.optim.Adam(seq_net.parameters(), lr = 0.01)



In [None]:
from torchtext.legacy.data import BucketIterator
# Build one batch iterator per split; every batch holds 256 examples and is
# created on `device`.
train_iterator, valid_iterator, test_iterator = BucketIterator.splits((train_data, valid_data, test_data),
                                                                      batch_size = 256, device = device)

**2.3 Train the Translator Network**

In [None]:
# Target positions that hold the padding token are excluded from the loss.
pad_index = TRG.vocab.stoi[TRG.pad_token]
lossfcn = nn.CrossEntropyLoss(ignore_index = pad_index)

# Ten passes over the training iterator; the mean batch loss is printed
# after each pass.
for epoch in range(10):
    running_loss = 0
    for batch in train_iterator:
        optimizer.zero_grad()
        logits = seq_net(batch.src, batch.trg)
        vocab_dim = logits.shape[-1]
        # Row 0 is skipped on both sides (decoding starts at step 1); the
        # remaining steps are flattened into one long batch for the loss.
        flat_logits = logits[1:].view(-1, vocab_dim)
        flat_target = batch.trg[1:].view(-1)
        loss = lossfcn(flat_logits, flat_target)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    print('Epoch',epoch,'Loss',running_loss/len(train_iterator))

Epoch 0 Loss 5.136473542765567
Epoch 1 Loss 4.546293902815434


KeyboardInterrupt: 