In [None]:
!pip install konlpy

In [None]:
# Mount Google Drive at /content/drive so the CSV under "My Drive" can be read below.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## 라이브러리 다운로드

In [None]:
!pip install torchtext==0.4.0

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from torchtext import data, datasets
from torchtext.vocab import GloVe

import re
import os
import math
import random
import numpy as np
import pandas as pd
from tqdm import tqdm, tqdm_notebook
import matplotlib.pyplot as plt

import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
# Fetch the NLTK 'punkt' tokenizer data (needed by word_tokenize / sent_tokenize).
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
# Seed every random number generator used in this notebook with the same value.
# np.random.seed stays last so the cell still ends on an expression returning None.
random.seed(777)
torch.manual_seed(777)
np.random.seed(777)

## Data setting

In [None]:
# Okt tagger instance; its `morphs` method is used below to split reviews into tokens.
from konlpy.tag import Okt
okt = Okt()

In [None]:
# Re-seed here so the shuffle is reproducible when this cell is re-run on its own.
random.seed(777)
torch.manual_seed(777)
np.random.seed(777)

# Load the crawled reviews and shuffle the rows with a seeded permutation.
train = pd.read_csv("/content/drive/My Drive/crawling_20211104.csv")
df_shuffled = train.iloc[np.random.permutation(train.index)].reset_index(drop=True)
train = df_shuffled.copy()

# Map the raw 'scores' strings to class ids; anything not listed becomes class 2.
score_to_label = {"width: 100%": 0, "width: 80%": 1}
label = [score_to_label.get(s, 2) for s in train['scores']]
train['label'] = label

# The last 8000 shuffled rows are held out for validation.
val_df = train[-8000:]
train_df = train[:-8000]

# Tokenize every review (training and validation rows) with okt.morphs.
total_tokens = [okt.morphs(sentence) for sentence in tqdm(train['rvs'])]

stopwords = ['의','가','이','은','들','는','과','도','를','으로','자','에','와','한','하다','대다','년','월','대']

100%|██████████| 30000/30000 [02:12<00:00, 226.40it/s]


In [None]:
# Keep the review text, the label and the categorical metadata columns only.
using_data = train.loc[:, ['rvs', 'label', 'retypes', 'categories',
                           'meta_sizes', 'meta_brights', 'meta_colors', 'meta_thicks']]

In [None]:
# One-hot encode every metadata column (everything after 'rvs' and 'label') and
# join the indicator blocks side by side with a single concat instead of growing
# a DataFrame inside a loop.
meta_columns = using_data.columns.tolist()[2:]
meta_df = pd.concat([pd.get_dummies(using_data[c]) for c in meta_columns], axis=1)

# Same positional split as train_df / val_df: the last 8000 rows are validation.
train_meta_df = meta_df[:-8000]
val_meta_df = meta_df[-8000:]

### 단어 집합 만들기 with GloVe (torchtext)

In [None]:
# Field definitions
TEXT = data.Field(sequential=True,
                  use_vocab=True,
                  tokenize=okt.morphs,
                  lower=True,
                  batch_first=True,
                  stop_words = stopwords,
                  fix_length=100)

LABEL = data.LabelField(sequential=False,
                   use_vocab=False,
                   batch_first=False,
                   is_target=True)

# The vocabulary is built from the pre-tokenized `total_tokens` lists; tokens seen
# fewer than 3 times are dropped and at most 10000 tokens are kept.
# NOTE(review): the GloVe '6B' vectors are attached to the vocab here, but the model
# defined below creates its own nn.Embedding — confirm the vectors are needed.
TEXT.build_vocab(total_tokens,  vectors=GloVe(name='6B', dim=200), min_freq=3, max_size=10000)
# NOTE(review): LABEL is declared with use_vocab=False and labels in this notebook
# take three values (0/1/2) — confirm this two-entry vocab is still wanted.
LABEL.build_vocab(['neg','pos'])

vocab = TEXT.vocab
print('단어 집합의 크기 : {}'.format(len(vocab)))
vocab_size = len(vocab)

# token -> index mapping and its inverse (index -> token)
word_dict = TEXT.vocab.stoi
rev_word_dict = {v:k for k, v in word_dict.items()}

.vector_cache/glove.6B.zip: 862MB [02:45, 5.21MB/s]                           
100%|█████████▉| 399999/400000 [00:32<00:00, 12405.91it/s]


단어 집합의 크기 : 8949


In [None]:
# Columns handed to the datasets: review text first, label second, then metadata.
selected_columns = ['rvs', 'label', 'categories', 'retypes',
                    'meta_sizes', 'meta_brights', 'meta_colors', 'meta_thicks']
# Currently left out: 'pur_option', 'cus_sex', 'cus_height', 'cus_weight'

using_train_data = train_df[selected_columns]
using_val_data = val_df[selected_columns]

In [None]:
 class BinaryDataset(Dataset):
    """Map-style dataset yielding (token ids, label, metadata features) per review.

    Args:
        df: DataFrame whose column 0 holds the review text and column 1 the label.
        meta_df: DataFrame of metadata features, row-aligned with ``df``.
        word_dict: token -> index mapping. Unknown tokens are looked up with
            ``word_dict[token]``, so the mapping must supply a default
            (torchtext's ``vocab.stoi`` does).
        min_len: every sequence is padded or truncated to exactly this length.
        only_ct: when True all metadata columns are used, otherwise columns 3..8.
        test: stored on the instance; not read by this class.
        tokenizer: callable turning a string into a list of tokens. Defaults to
            ``okt.morphs``, the tokenizer the vocabulary was built with. The
            previous NLTK ``word_tokenize`` produced whitespace-level tokens that
            mostly missed the morpheme vocabulary and were indexed as unknown.
        pad_token: token appended to short sequences (must be a key of ``word_dict``).
    """

    def __init__(self, df, meta_df, word_dict, min_len, only_ct = True, test=False,
                 tokenizer=None, pad_token='<pad>'):
        self.df = df
        self.meta_df = meta_df
        self.w2i = word_dict
        self.i2w = {v:k for k, v in word_dict.items()}
        self.min_len = min_len
        self.test = test
        self.only_ct = only_ct
        self.tokenizer = tokenizer
        self.pad_token = pad_token

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        sentence = self.df.iloc[idx, 0]
        label = self.df.iloc[idx, 1]

        if self.only_ct:
            meta = self.meta_df.iloc[idx, :]
        else:
            meta = self.meta_df.iloc[idx, 3:9]

        # Resolve the default lazily so the notebook-level `okt` is only needed
        # when no tokenizer was passed in.
        tokenize = self.tokenizer if self.tokenizer is not None else okt.morphs
        text = list(tokenize(sentence.lower()))
        if len(text) < self.min_len:
            text += [self.pad_token] * (self.min_len - len(text))
        else:
            text = text[:self.min_len]
        indexed = torch.tensor([self.w2i[t] for t in text])

        # Convert the row explicitly: torch.tensor() on a label-indexed pandas
        # Series depends on positional fallback and on the dummy dtype.
        meta_values = meta.to_numpy(dtype='float32')

        return indexed, torch.tensor(label), torch.tensor(meta_values)

BATCH = 16

# Training split: sequences fixed to 64 tokens, batches reshuffled on every pass.
train_data = BinaryDataset(df=using_train_data, meta_df=train_meta_df,
                           word_dict=word_dict, min_len=64)
train_loader = DataLoader(dataset=train_data, batch_size=BATCH, shuffle=True)

# Validation split: same preprocessing, fixed order.
val_data = BinaryDataset(df=using_val_data, meta_df=val_meta_df,
                         word_dict=word_dict, min_len=64)
eval_loader = DataLoader(dataset=val_data, batch_size=BATCH, shuffle=False)

print('훈련 샘플의 개수 : {}'.format(len(train_data)))
print('검증 샘플의 개수 : {}'.format(len(val_data)))

훈련 샘플의 개수 : 22000
검증 샘플의 개수 : 8000


In [None]:
# Use the GPU when one is visible, otherwise fall back to the CPU.
device_type = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_type)

In [None]:
class ConstantsClass():
    ''' Indices and surface forms of the special vocabulary tokens '''

    def __init__(self):
        # Special-token indices.
        self.UNK, self.PAD, self.BOS, self.EOS = 0, 1, 2, 3
        # Special-token strings.
        self.PAD_WORD, self.UNK_WORD = '<pad>', '<unk>'
        self.BOS_WORD, self.EOS_WORD = '<s>', '</s>'


# Shared instance read by the model code below.
Constants = ConstantsClass()


class Linear(nn.Module):
    ''' Simple Linear layer with xavier init

    Args:
        d_in: size of each input feature vector.
        d_out: size of each output feature vector.
        bias: whether the wrapped nn.Linear learns an additive bias.
    '''

    def __init__(self, d_in, d_out, bias=True):
        super(Linear, self).__init__()
        self.linear = nn.Linear(d_in, d_out, bias=bias)
        # In-place initializer; the un-suffixed `xavier_normal` is deprecated.
        torch.nn.init.xavier_normal_(self.linear.weight)

    def forward(self, x):
        return self.linear(x)


class Bottle(nn.Module):
    ''' Mixin that folds the first two dimensions together around the wrapped op.

    Inputs with at most two dimensions go straight to the next ``forward`` in
    the MRO. Larger inputs are viewed as (dim0 * dim1, -1), passed on, and the
    result is viewed back to (dim0, dim1, -1).
    '''

    def forward(self, input):
        if input.dim() <= 2:
            return super(Bottle, self).forward(input)
        first, second = input.size()[:2]
        flattened = input.view(first * second, -1)
        out = super(Bottle, self).forward(flattened)
        return out.view(first, second, -1)


class BottleLinear(Bottle, Linear):
    ''' Linear projection applied through Bottle's flatten / restore reshaping '''


class BottleSoftmax(Bottle, nn.Softmax):
    ''' Perform the reshape routine before and after a softmax operation

    Args:
        dim: dimension the softmax normalizes over. Defaults to the last one,
            which is what the implicit choice resolved to for the 2-D tensors
            Bottle produces; stating it avoids the implicit-dim deprecation warning.
    '''

    def __init__(self, dim=-1):
        super(BottleSoftmax, self).__init__(dim=dim)


class LayerNormalization(nn.Module):
    ''' Layer normalization module

    Normalizes over the last dimension using the mean and torch.std of that
    dimension, then applies a learned per-feature scale (a_2) and shift (b_2).

    Args:
        d_hid: size of the last (normalized) dimension.
        eps: added to the standard deviation to avoid division by zero.
    '''

    def __init__(self, d_hid, eps=1e-3):
        super(LayerNormalization, self).__init__()

        self.eps = eps
        self.a_2 = nn.Parameter(torch.ones(d_hid), requires_grad=True)
        self.b_2 = nn.Parameter(torch.zeros(d_hid), requires_grad=True)

    def forward(self, z):
        # The std of a single element is undefined, so a size-1 feature
        # dimension is passed through. The check uses the normalized (last)
        # dimension; size(1) is the sequence length for 3-D inputs and made
        # length-1 sequences skip normalization.
        if z.size(-1) == 1:
            return z

        mu = torch.mean(z, keepdim=True, dim=-1)
        sigma = torch.std(z, keepdim=True, dim=-1)
        ln_out = (z - mu) / (sigma + self.eps)

        return ln_out * self.a_2 + self.b_2


class BatchBottle(nn.Module):
    ''' Mixin that folds the two trailing dimensions together around the wrapped op.

    Inputs with at most two dimensions go straight to the next ``forward`` in
    the MRO. Larger inputs are viewed as (-1, dim1 * dim2), passed on, and the
    result is viewed back to (-1, dim1, dim2).
    '''

    def forward(self, input):
        if input.dim() <= 2:
            return super(BatchBottle, self).forward(input)
        trailing = input.size()[1:]
        rows, cols = trailing[0], trailing[1]
        merged = input.view(-1, rows * cols)
        out = super(BatchBottle, self).forward(merged)
        return out.view(-1, rows, cols)


class BottleLayerNormalization(BatchBottle, LayerNormalization):
    ''' Layer normalization applied through BatchBottle's flatten / restore reshaping '''


class ScaledDotProductAttention(nn.Module):
    ''' Scaled Dot-Product Attention

    Args:
        d_model: the attention logits are divided by sqrt(d_model).
        attn_dropout: dropout probability applied to the attention weights.
    '''

    def __init__(self, d_model, attn_dropout=0.1):
        super(ScaledDotProductAttention, self).__init__()
        self.temper = d_model ** 0.5
        self.dropout = nn.Dropout(attn_dropout)
        # bmm always yields 3-D logits (batch, len_q, len_k); normalize over keys.
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, q, k, v, attn_mask=None):
        ''' Return (output, attention weights).

        q, k, v are batched matrices consumed by torch.bmm. attn_mask, when
        given, must have the shape of the logits; non-zero / True entries mark
        positions that receive no attention.
        '''
        attn = torch.bmm(q, k.transpose(1, 2)) / self.temper

        if attn_mask is not None:
            assert attn_mask.size() == attn.size(), \
                'Attention mask shape {} mismatch ' \
                'with Attention logit tensor shape ' \
                '{}.'.format(attn_mask.size(), attn.size())

            # Out-of-place fill keeps the operation visible to autograd (the old
            # code mutated `attn.data`); .bool() also accepts uint8 masks.
            attn = attn.masked_fill(attn_mask.bool(), -float('inf'))

        attn = self.softmax(attn)
        attn = self.dropout(attn)
        output = torch.bmm(attn, v)

        return output, attn


class MultiHeadAttention(nn.Module):
    ''' Multi-Head Attention module

    Args:
        n_head: number of attention heads.
        d_model: size of the input / output feature dimension.
        d_k: per-head size of the query and key projections.
        d_v: per-head size of the value projection.
        dropout: dropout probability applied to the projected output.
    '''

    def __init__(self, n_head, d_model, d_k, d_v, dropout=0.1):
        super(MultiHeadAttention, self).__init__()

        self.n_head = n_head
        self.d_k = d_k
        self.d_v = d_v

        # One projection matrix per head; values are set by the xavier init below.
        self.w_qs = nn.Parameter(torch.empty(n_head, d_model, d_k))
        self.w_ks = nn.Parameter(torch.empty(n_head, d_model, d_k))
        self.w_vs = nn.Parameter(torch.empty(n_head, d_model, d_v))

        self.attention = ScaledDotProductAttention(d_model)
        self.layer_norm = LayerNormalization(d_model)
        self.proj = Linear(n_head * d_v, d_model)

        self.dropout = nn.Dropout(dropout)

        # In-place initializers; the un-suffixed `xavier_normal` is deprecated.
        torch.nn.init.xavier_normal_(self.w_qs)
        torch.nn.init.xavier_normal_(self.w_ks)
        torch.nn.init.xavier_normal_(self.w_vs)

    def forward(self, q, k, v, attn_mask=None):
        ''' Return (layer-normalized output with residual, attention weights).

        q, k and v are (mb_size, length, d_model). attn_mask, when given, is
        (mb_size, len_q, len_k) and is tiled once per head.
        '''
        d_k, d_v = self.d_k, self.d_v
        n_head = self.n_head

        residual = q

        mb_size, len_q, d_model = q.size()
        mb_size, len_k, d_model = k.size()
        mb_size, len_v, d_model = v.size()

        # treat as a (n_head) size batch
        q_s = q.repeat(n_head, 1, 1).view(n_head, -1, d_model)  # n_head x (mb_size*len_q) x d_model
        k_s = k.repeat(n_head, 1, 1).view(n_head, -1, d_model)  # n_head x (mb_size*len_k) x d_model
        v_s = v.repeat(n_head, 1, 1).view(n_head, -1, d_model)  # n_head x (mb_size*len_v) x d_model

        # treat the result as a (n_head * mb_size) size batch
        q_s = torch.bmm(q_s, self.w_qs).view(-1, len_q, d_k)  # (n_head*mb_size) x len_q x d_k
        k_s = torch.bmm(k_s, self.w_ks).view(-1, len_k, d_k)  # (n_head*mb_size) x len_k x d_k
        v_s = torch.bmm(v_s, self.w_vs).view(-1, len_v, d_v)  # (n_head*mb_size) x len_v x d_v

        # The signature allows attn_mask=None, so only tile a mask that exists.
        head_mask = attn_mask.repeat(n_head, 1, 1) if attn_mask is not None else None

        # perform attention, result size = (n_head * mb_size) x len_q x d_v
        outputs, attns = self.attention(q_s, k_s, v_s, attn_mask=head_mask)

        # back to original mb_size batch, result size = mb_size x len_q x (n_head*d_v)
        outputs = torch.cat(torch.split(outputs, mb_size, dim=0), dim=-1)

        # project back to residual size
        outputs = self.proj(outputs)
        outputs = self.dropout(outputs)

        return self.layer_norm(outputs + residual), attns


class PositionwiseFeedForward(nn.Module):
    ''' Two kernel-size-1 convolutions with ReLU in between, dropout, residual and layer norm '''

    def __init__(self, d_hid, d_inner_hid, dropout=0.1):
        super(PositionwiseFeedForward, self).__init__()
        # Kernel size 1 applies the same weights at every position.
        self.w_1 = nn.Conv1d(d_hid, d_inner_hid, 1)
        self.w_2 = nn.Conv1d(d_inner_hid, d_hid, 1)
        self.layer_norm = LayerNormalization(d_hid)
        self.dropout = nn.Dropout(dropout)
        self.relu = nn.ReLU()

    def forward(self, x):
        # Conv1d convolves over the last axis, so swap axes 1 and 2 on the way in and out.
        hidden = self.w_1(x.transpose(1, 2))
        hidden = self.relu(hidden)
        projected = self.w_2(hidden).transpose(2, 1)
        return self.layer_norm(self.dropout(projected) + x)


class EncoderLayer(nn.Module):
    ''' One encoder block: self-attention followed by the position-wise feed-forward net '''

    def __init__(self, d_model, d_inner_hid, n_head, d_k, d_v, dropout=0.1):
        super(EncoderLayer, self).__init__()
        self.slf_attn = MultiHeadAttention(n_head, d_model, d_k, d_v, dropout=dropout)
        self.pos_ffn = PositionwiseFeedForward(d_model, d_inner_hid, dropout=dropout)

    def forward(self, enc_input, slf_attn_mask=None):
        # The same tensor serves as query, key and value.
        attended, enc_slf_attn = self.slf_attn(
            enc_input, enc_input, enc_input, attn_mask=slf_attn_mask)
        return self.pos_ffn(attended), enc_slf_attn

In [None]:
def position_encoding_init(n_position, d_pos_vec):
    ''' Build the (n_position, d_pos_vec) sinusoid position table as a float tensor.

    Row 0 stays all zeros; it is reserved for padding positions.
    '''

    def angle(pos, j):
        return pos / np.power(10000, 2 * (j // 2) / d_pos_vec)

    rows = []
    for pos in range(n_position):
        if pos == 0:
            rows.append(np.zeros(d_pos_vec))
        else:
            rows.append([angle(pos, j) for j in range(d_pos_vec)])
    table = np.array(rows)

    # sine on even feature indices, cosine on odd ones (row 0 untouched)
    table[1:, 0::2] = np.sin(table[1:, 0::2])
    table[1:, 1::2] = np.cos(table[1:, 1::2])
    return torch.from_numpy(table).float()


def get_attn_padding_mask(seq_q, seq_k):
    ''' Mask, shaped (batch, len_q, len_k), that is set wherever the key token is padding '''
    assert seq_q.dim() == 2 and seq_k.dim() == 2
    batch, q_len = seq_q.size()
    batch, k_len = seq_k.size()
    key_is_pad = seq_k.data.eq(Constants.PAD)  # batch x len_k
    # add a query axis and repeat the key mask for every query position
    return key_is_pad.unsqueeze(1).expand(batch, q_len, k_len)


def get_attn_subsequent_mask(seq):
    ''' uint8 mask (batch, len, len) with ones strictly above the diagonal (later positions) '''
    assert seq.dim() == 2
    batch, length = seq.size(0), seq.size(1)
    upper = np.triu(np.ones((batch, length, length)), k=1)
    mask = torch.from_numpy(upper.astype('uint8'))
    return mask.cuda() if seq.is_cuda else mask


class Encoder(nn.Module):
    ''' A encoder model with self attention mechanism.

    Args:
        n_src_vocab: vocabulary size of the token embedding.
        n_max_seq: maximum sequence length; positions run from 1 to n_max_seq,
            with 0 reserved for padding.
        n_layers: number of stacked EncoderLayer blocks.
        n_head, d_k, d_v: attention head count and per-head sizes.
        d_word_vec: embedding size.
        d_model: feature size flowing through the layers.
        d_inner_hid: hidden size of the position-wise feed-forward net.
        dropout: dropout probability used inside every layer.
    '''

    def __init__(
            self, n_src_vocab, n_max_seq, n_layers=6, n_head=8, d_k=64, d_v=64,
            d_word_vec=512, d_model=512, d_inner_hid=1024, dropout=0.1):

        super(Encoder, self).__init__()

        n_position = n_max_seq + 1
        self.n_max_seq = n_max_seq
        self.d_model = d_model

        # Padding tokens are given position 0 and position_encoding_init keeps
        # row 0 as the zero vector, so row 0 must be the frozen one. Using the
        # token padding index (Constants.PAD == 1) here froze position 1 and
        # let the padding row be trained.
        self.position_enc = nn.Embedding(n_position, d_word_vec, padding_idx=0)
        self.position_enc.weight.data = position_encoding_init(n_position, d_word_vec)

        self.src_word_emb = nn.Embedding(n_src_vocab, d_word_vec, padding_idx=Constants.PAD)

        self.layer_stack = nn.ModuleList([
            EncoderLayer(d_model, d_inner_hid, n_head, d_k, d_v, dropout=dropout)
            for _ in range(n_layers)])

    def forward(self, src_seq, src_pos, return_attns=False):
        ''' Encode token ids `src_seq` with positions `src_pos` (same shape).

        Returns the encoder output, plus the per-layer attention weights when
        `return_attns` is True.
        '''
        # Word embedding look up plus position encoding
        enc_input = self.src_word_emb(src_seq) + self.position_enc(src_pos)

        enc_slf_attns = []

        enc_output = enc_input
        enc_slf_attn_mask = get_attn_padding_mask(src_seq, src_seq)
        for enc_layer in self.layer_stack:
            enc_output, enc_slf_attn = enc_layer(
                enc_output, slf_attn_mask=enc_slf_attn_mask)
            if return_attns:
                enc_slf_attns.append(enc_slf_attn)

        if return_attns:
            return enc_output, enc_slf_attns
        return enc_output

In [None]:
class MultiHeadAttention_classifier(nn.Module):
    ''' Transformer-encoder text classifier with optional metadata features.

    Args:
        vocab: vocabulary object; only ``len(vocab)`` is used.
        meta_input_dim: number of metadata features per sample.
        meta_hidden_dim: hidden width of the metadata sub-network.
        n_layers, n_head: encoder depth and number of attention heads.
        max_seq_len: fixed token-sequence length the model is built for.
        label_size: number of output classes.
        use_m: when True the metadata features are concatenated to the flattened
            encoder output before classification.
        d_word_vec, d_model, d_inner_hid, d_k, d_v, dropout: encoder sizes.
            d_k and d_v are forwarded to the encoder; previously they were
            accepted but dropped, so the encoder used its own defaults.
        proj_share_weight, embs_share_weight: accepted for compatibility; not used.
    '''

    def __init__(self, vocab, meta_input_dim, meta_hidden_dim, n_layers=6, n_head=8, max_seq_len = 64, label_size = 3, use_m = True,
                 d_word_vec=128, d_model=128, d_inner_hid=256, d_k=32, d_v=32,
                 dropout=0.1, proj_share_weight=True, embs_share_weight=True):
        super(MultiHeadAttention_classifier, self).__init__()
        vocab_size = len(vocab)
        self.use_m = use_m

        self.encoder = Encoder(
            vocab_size, max_seq_len, n_layers=n_layers, n_head=n_head,
            d_k=d_k, d_v=d_v,
            d_word_vec=d_word_vec, d_model=d_model,
            d_inner_hid=d_inner_hid, dropout=dropout)

        self.meta_feature = nn.Sequential(
            nn.Linear(meta_input_dim, meta_hidden_dim),
            nn.Linear(meta_hidden_dim, meta_input_dim)
        )

        # The classifier sees the flattened encoder output, plus the metadata
        # features when they are enabled.
        classifier_in = max_seq_len * d_model
        if self.use_m:
            classifier_in += meta_input_dim
        self.hidden2label = nn.Sequential(
            nn.Linear(classifier_in, d_model),
            nn.Linear(d_model, label_size)
        )

    def forward(self, seq, meta, lengths=None):
        ''' Return class logits of shape (batch, label_size).

        seq holds token ids (batch, max_seq_len); meta holds the metadata
        features (batch, meta_input_dim). `lengths` is accepted but not used.
        '''
        batch_size = seq.size(0)

        # Positions are 1..len for real tokens and 0 for padding. Built on the
        # device of `seq`, instead of a per-element Python loop followed by an
        # unconditional .cuda().
        steps = torch.arange(1, seq.size(1) + 1, dtype=torch.long, device=seq.device)
        pos = steps.unsqueeze(0).expand_as(seq).masked_fill(seq == Constants.PAD, 0)

        enc_output = self.encoder(seq, pos)
        flat = enc_output.reshape((batch_size, -1))

        if self.use_m:
            meta_f = self.meta_feature(meta)
            out = self.hidden2label(torch.cat([flat, meta_f], dim=1))
        else:
            out = self.hidden2label(flat)

        return out

In [None]:
# Width of the metadata vector, taken from the one-hot frame the datasets read
# so the model cannot drift out of sync with the data.
META_INPUT = train_meta_df.shape[1]
META_HIDDEN = 64

# META_HIDDEN is the hidden width of the metadata sub-network; META_INPUT used
# to be passed here twice, leaving META_HIDDEN unused.
model = MultiHeadAttention_classifier(vocab, META_INPUT, META_HIDDEN)
optimizer = optim.Adam(model.parameters())

criterion = nn.CrossEntropyLoss()
model = model.to(device)
criterion = criterion.to(device)



In [None]:
def binary_accuracy(preds, y):
    ''' Return (fraction of rows whose highest-scoring class equals y, number of rows).

    preds holds one row of class scores per sample; y holds the target class ids.
    '''
    log_probs = torch.log_softmax(preds, 1)
    predicted = torch.max(log_probs, 1)[1]
    hits = (predicted == y).float()
    n_rows = len(hits)
    return hits.sum() / n_rows, n_rows

In [None]:
def train(model, iterator, optimizer, criterion):
    ''' Run one training pass over `iterator`.

    Each batch is (token ids, labels, metadata features). The raw model outputs
    go straight to `criterion`: the notebook uses nn.CrossEntropyLoss, which
    applies log-softmax itself, so the earlier extra softmax flattened the loss
    and its gradients.

    Returns:
        (mean loss per batch, accuracy over all samples seen); (0.0, 0.0) when
        the iterator yields no batch.
    '''
    model.train()

    loss_sum = 0.0
    correct_sum = 0.0
    n_samples = 0
    n_batches = 0

    for batch in iterator:
        text, label, meta = batch[0].to(device), batch[1].to(device), batch[2].to(device)

        optimizer.zero_grad()

        outputs = model(text, meta)
        loss = criterion(outputs, label)

        acc, leng = binary_accuracy(outputs, label)

        loss.backward()
        optimizer.step()

        loss_sum += loss.item()
        # Weight by batch size so a smaller final batch does not skew the average.
        correct_sum += acc.item() * leng
        n_samples += leng
        n_batches += 1

    if n_batches == 0:
        return 0.0, 0.0

    return loss_sum / n_batches, correct_sum / n_samples

In [None]:
def evaluate(model, iterator, criterion):
    ''' Evaluate the model on `iterator` without updating any weights.

    Each batch is (token ids, labels, metadata features). The raw model outputs
    go straight to `criterion`: the notebook uses nn.CrossEntropyLoss, which
    applies log-softmax itself, so no softmax is applied beforehand.

    Returns:
        (mean loss per batch, accuracy over all samples seen); (0.0, 0.0) when
        the iterator yields no batch.
    '''
    model.eval()

    loss_sum = 0.0
    correct_sum = 0.0
    n_samples = 0
    n_batches = 0

    with torch.no_grad():

        for batch in iterator:
            text, label, meta = batch[0].to(device), batch[1].to(device), batch[2].to(device)

            outputs = model(text, meta)
            loss = criterion(outputs, label)

            acc, leng = binary_accuracy(outputs, label)

            loss_sum += loss.item()
            # Weight by batch size so a smaller final batch does not skew the average.
            correct_sum += acc.item() * leng
            n_samples += leng
            n_batches += 1

    if n_batches == 0:
        return 0.0, 0.0

    return loss_sum / n_batches, correct_sum / n_samples

In [None]:
import time

def epoch_time(start_time, end_time):
    ''' Split the interval between two timestamps (in seconds) into (whole minutes, whole seconds) '''
    duration = end_time - start_time
    whole_minutes = int(duration / 60)
    leftover_seconds = int(duration - (whole_minutes * 60))
    return whole_minutes, leftover_seconds

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score
from sklearn.metrics import recall_score, f1_score, roc_auc_score

def get_clf_eval(y_test=None, pred=None):
    ''' Print the confusion matrix and summary metrics for hard class predictions.

    With at most two distinct labels the scikit-learn defaults are used, as
    before. With more classes (this notebook's labels take three values) those
    binary defaults raise, so precision / recall / F1 are macro-averaged and the
    AUC is the macro average of one-vs-rest AUCs on one-hot encoded predictions.

    Args:
        y_test: true class ids.
        pred: predicted class ids, same length as y_test.
    '''
    classes = np.union1d(np.asarray(y_test), np.asarray(pred))

    confusion = confusion_matrix(y_test, pred)
    accuracy = accuracy_score(y_test, pred)

    if len(classes) <= 2:
        precision = precision_score(y_test, pred)
        recall = recall_score(y_test, pred)
        f1 = f1_score(y_test, pred)
        roc_auc = roc_auc_score(y_test, pred)
    else:
        precision = precision_score(y_test, pred, average='macro')
        recall = recall_score(y_test, pred, average='macro')
        f1 = f1_score(y_test, pred, average='macro')

        # One indicator column per class, for targets and predictions alike.
        true_onehot = (np.asarray(y_test).reshape(-1, 1) == classes.reshape(1, -1)).astype(int)
        pred_onehot = (np.asarray(pred).reshape(-1, 1) == classes.reshape(1, -1)).astype(int)
        try:
            roc_auc = roc_auc_score(true_onehot, pred_onehot, average='macro')
        except ValueError:
            # AUC is undefined when a class never occurs among the true labels;
            # report NaN instead of aborting the whole report.
            roc_auc = float('nan')

    print("Confusion Matrix")
    print("*"*20)
    print(confusion)
    print("*"*20)
    print("ACC: {0:.4f}, Precision : {1: .4f}, Recall : {2: .4f}, F1_score : {3:.4f}, AUC : {4:.4f}"
    .format(accuracy, precision, recall,f1,roc_auc))

In [None]:
# Train for a fixed number of epochs and keep the weights of the epoch with the
# lowest validation loss.
N_EPOCHS = 5

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    start_time = time.time()
    
    # One pass over the training loader, then one over the validation loader.
    train_loss, train_acc = train(model, train_loader, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, eval_loader, criterion)
    
    end_time = time.time()
    
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    # Checkpoint only when the validation loss improves on the best seen so far.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'lstm_model.pt')
    
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')