In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
from table_LLM import *
from natsort import natsorted

  from .autonotebook import tqdm as notebook_tqdm


In [14]:
# Load the training table: every column except the last is treated as a
# feature column, the last one as the target column.
# NOTE(review): absolute local path — consider a configurable data directory.
data = pd.read_csv('/home/hyun/paper/dataset/train_data_1000.csv')
data_x, data_y = data.iloc[:, :-1], data.iloc[:, -1:]
x_columns, y_column = data_x.columns, data_y.columns

# Wrap every "result" value in <num> markers so numeric spans are delimited
# in the generated text.
data["result"] = data["result"].apply(lambda value: "<num>" + str(value) + "<num>")

# Vocabulary = words collected from the table plus the structural tokens,
# in natural sort order.
special_word = ["is", ",", " ", "<end>", "<start>", "<pad>"]
unique_words = table_util.get_unique_word(data, x_columns=x_columns, y_column=y_column)
unique_words.extend(special_word)
unique_words = natsorted(unique_words)

# "<num>" and the digit / decimal-point tokens are added AFTER sorting so they
# occupy the highest ids; TwoStepTokenizer tags a token as numeric when its id
# is greater than the id of "<num>".
unique_words += ["<num>"] + ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "."]

In [15]:
# Build both lookup directions from the same enumeration: a token's id is its
# position in unique_words.
idx2unique_word = dict(enumerate(unique_words))
unique_word2idx = {word: idx for idx, word in idx2unique_word.items()}


In [16]:
# Display the token -> id mapping for inspection.
unique_word2idx

{' ': 0,
 ',': 1,
 '<end>': 2,
 '<pad>': 3,
 '<start>': 4,
 'Facility1_1': 5,
 'Facility1_2': 6,
 'Facility1_3': 7,
 'Facility1_4': 8,
 'Facility1_5': 9,
 'Facility1_6': 10,
 'Facility1_7': 11,
 'Facility2_1': 12,
 'Facility2_2': 13,
 'Facility2_3': 14,
 'Facility2_4': 15,
 'Facility2_5': 16,
 'Facility2_6': 17,
 'Facility2_7': 18,
 'Facility2_8': 19,
 'Facility3_1': 20,
 'Facility3_2': 21,
 'Facility3_3': 22,
 'Facility3_4': 23,
 'Facility3_5': 24,
 'Facility3_6': 25,
 'Facility4_1': 26,
 'Facility4_2': 27,
 'Facility4_3': 28,
 'Facility4_4': 29,
 'Facility4_5': 30,
 'Facility4_6': 31,
 'Facility4_7': 32,
 'Facility5_1': 33,
 'Facility5_2': 34,
 'Facility5_3': 35,
 'Facility5_4': 36,
 'Facility5_5': 37,
 'Facility5_6': 38,
 'Facility5_7': 39,
 'Facility5_8': 40,
 'Facility6_1': 41,
 'Facility6_2': 42,
 'Facility6_3': 43,
 'Facility6_4': 44,
 'Facility6_5': 45,
 'is': 46,
 'process1': 47,
 'process2': 48,
 'process3': 49,
 'process4': 50,
 'process5': 51,
 'process6': 52,
 'result': 53

In [33]:
class TwoStepTokenizer():
    """Character-scanning vocabulary tokenizer that also tags numeric tokens.

    A sentence is scanned one character at a time; characters are accumulated
    until the accumulated text equals a vocabulary entry, which is then
    emitted as one token (shortest-match-first). Every emitted token also gets
    a running position and a token type: 1 when its id is greater than the id
    of ``"<num>"`` (the digit / "." entries appended after it), else 0.

    Args:
        unique_word2idx: dict mapping token text -> integer id. Must contain
            ``"<pad>"`` and ``"<num>"``.
        idx2unique_word: dict mapping integer id -> token text.
    """

    def __init__(self, unique_word2idx, idx2unique_word):
        self.unique_word2idx = unique_word2idx
        self.idx2unique_word = idx2unique_word
        self.vocab_size = len(unique_word2idx)
        self.eos_token = "<end>"
        self.sos_token = "<start>"
        self.pad_token = "<pad>"
        self.pad_token_idx = self.unique_word2idx[self.pad_token]
        self.num_token = "<num>"
        self.num_token_idx = self.unique_word2idx[self.num_token]

    def _encode(self, sentence):
        """Encode one string.

        Returns:
            (token_list, position_list, token_type_list): three equal-length
            lists of ints. Characters left over at the end of the sentence
            that never complete a vocabulary entry are dropped.
        """
        token_list = []
        position_list = []
        token_type_list = []
        pending = ""
        for char in sentence:
            pending += char
            token_idx = self.unique_word2idx.get(pending)
            if token_idx is None:
                # Not a vocabulary entry yet; keep accumulating characters.
                continue
            token_list.append(token_idx)
            # Positions simply count emitted tokens: 0, 1, 2, ...
            position_list.append(len(position_list))
            # Ids above "<num>" are the numeric (digit / ".") tokens.
            token_type_list.append(1 if token_idx > self.num_token_idx else 0)
            pending = ""
        return token_list, position_list, token_type_list

    def _decode(self, tokens):
        """Map a flat sequence of token ids back to the concatenated text."""
        return "".join(self.idx2unique_word[token] for token in tokens)

    def encode(self, sentence):
        """Encode a string, or each string of a list/tuple.

        Returns:
            The ``_encode`` triple for a single string, or a list of such
            triples for a list/tuple of strings.

        Raises:
            TypeError: if ``sentence`` is neither a str nor a list/tuple
                (previously this case silently returned ``None``).
        """
        if isinstance(sentence, str):
            return self._encode(sentence)
        if isinstance(sentence, (list, tuple)):
            return [self._encode(s) for s in sentence]
        raise TypeError(
            f"sentence must be a str or a list/tuple of str, got {type(sentence).__name__}"
        )

    def decode(self, tokens):
        """Decode a flat id sequence to a string, or a batch to a list of strings.

        An empty sequence decodes to ``""`` instead of raising IndexError.
        """
        if len(tokens) > 0 and isinstance(tokens[0], (list, tuple)):
            return [self._decode(t) for t in tokens]
        return self._decode(tokens)

    def __call__(self, sentence):
        """Alias for :meth:`encode`."""
        return self.encode(sentence)
    



In [34]:
# Build the tokenizer from the forward and reverse vocabulary lookups.
tokenizer = TwoStepTokenizer(unique_word2idx, idx2unique_word)    

In [35]:
# Wrap the DataFrame in the project's TableDataset_v2 and configure it.
dataset = TableDataset_v2.from_pandas(data, preserve_index=False)
# Attach the tokenizer built above.
dataset.set_tokenizer(tokenizer)
# 55 is the same value as max_seq_length in the hyperparameter cell.
# NOTE(review): presumably sequences are padded/truncated to this length — confirm in TableDataset_v2.
dataset.set_length(max_length=55)
dataset.set_generate_mode("train")

In [36]:
# Round-trip check: encode one sample sentence and show its three parallel
# lists (token ids, positions, token types), one per line.
encoded_tokens = tokenizer.encode("process4 is Facility4_3, process6 is Facility6_5, process5 is Facility5_3, process1 is Facility1_2, result is <num>46.13<num>, process3 is Facility3_5<end>")
print(encoded_tokens[0], encoded_tokens[1], encoded_tokens[2], sep="\n")
# Decode a separate, hand-written id sequence back to text (cell output).
tokenizer.decode([50, 0, 46, 0, 28, 1, 0, 49, 0, 46, 0, 24, 1, 0, 47, 0, 46, 0, 6, 1, 0, 48, 0, 46, 0, 16, 1, 0, 51, 0, 46, 0, 35, 1, 0, 53, 0, 46, 0, 54, 59, 61, 65, 56, 58, 54, 1, 0, 52, 0, 46, 0, 45, 2])

[50, 0, 46, 0, 28, 1, 0, 52, 0, 46, 0, 45, 1, 0, 51, 0, 46, 0, 35, 1, 0, 47, 0, 46, 0, 6, 1, 0, 53, 0, 46, 0, 54, 59, 61, 65, 56, 58, 54, 1, 0, 49, 0, 46, 0, 24, 2]
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]


'process4 is Facility4_3, process3 is Facility3_5, process1 is Facility1_2, process2 is Facility2_5, process5 is Facility5_3, result is <num>46.13<num>, process6 is Facility6_5<end>'

In [9]:
# Hyperparameter settings
vocab_size = tokenizer.vocab_size  # number of entries in the tokenizer vocabulary
embed_size = 128
num_heads = 8
hidden_size = 256
num_layers = 4
max_seq_length = 55  # same value as dataset.set_length(max_length=55)
batch_size = 32
num_epochs = 100  # number of passes made by the training loop
# NOTE(review): no visible cell passes embed_size / num_heads / hidden_size /
# num_layers / batch_size to the model or a data loader — confirm where they are consumed.

In [10]:
import copy
import torch
import math
import torch.nn as nn
from torch.nn.parameter import Parameter
from torch.nn import Conv1d as Conv1D
from torch.nn import GELU as gelu
from torch.nn import LayerNorm as LayerNorm

# def gelu(x):
#     return 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))

# class LayerNorm(nn.Module):
#     """
#     Normalizes a layer's output using the mean and variance computed over its last dimension.

#     """
#     def __init__(self, hidden_size, eps=1e-12):
#         """Construct a layernorm module in the TF style (epsilon inside the square root).
#         """
#         super(LayerNorm, self).__init__()
#         self.weight = nn.Parameter(torch.ones(hidden_size))
#         self.bias = nn.Parameter(torch.zeros(hidden_size))
#         self.variance_epsilon = eps

#     def forward(self, x):
#         u = x.mean(-1, keepdim=True)
#         s = (x - u).pow(2).mean(-1, keepdim=True)
#         x = (x - u) / torch.sqrt(s + self.variance_epsilon)
#         return self.weight * x + self.bias

# class Conv1D(nn.Module):
#     """
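#     GPT-2 style "Conv1D": a position-wise linear projection with weight shape (nx, nf). It is NOT a drop-in equivalent of torch.nn.Conv1d, which requires a kernel_size and channels-first input.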
#     """
#     def __init__(self, nf, nx):
#         super(Conv1D, self).__init__()
#         self.nf = nf
#         w = torch.empty(nx, nf)
#         nn.init.normal_(w, std=0.02)
#         self.weight = Parameter(w)
#         self.bias = Parameter(torch.zeros(nf))

#     def forward(self, x):
#         size_out = x.size()[:-1] + (self.nf,)
#         x = torch.addmm(self.bias, x.view(-1, x.size(-1)), self.weight)
#         x = x.view(*size_out)
#         return x

class Attention(nn.Module):
    """Causal multi-head self-attention in the GPT-2 layout.

    Args:
        nx: feature size of the input (and of the output).
        n_ctx: maximum total sequence length; sizes the causal mask buffer.
        config: object exposing ``n_head`` (number of heads; must divide ``nx``).
        scale: when True, attention scores are divided by sqrt(head_features).

    The projections are ``nn.Linear`` layers. The file-level name ``Conv1D`` is
    an alias of ``torch.nn.Conv1d``, whose constructor requires ``kernel_size``,
    so ``Conv1D(n_state * 3, nx)`` could not be constructed. The GPT-2 "Conv1D"
    this code was written against is a plain linear projection, which
    ``nn.Linear`` provides directly.
    """

    def __init__(self, nx, n_ctx, config, scale=False):
        super(Attention, self).__init__()
        n_state = nx  # attention width equals the embedding width
        assert n_state % config.n_head == 0
        # Lower-triangular mask: query position i may attend to key positions <= i.
        self.register_buffer("bias", torch.tril(torch.ones(n_ctx, n_ctx)).view(1, 1, n_ctx, n_ctx))
        self.n_head = config.n_head
        self.split_size = n_state
        self.scale = scale
        self.c_attn = nn.Linear(nx, n_state * 3)  # fused query/key/value projection
        self.c_proj = nn.Linear(n_state, nx)      # output projection
        # Same initialisation as the reference Conv1D kept in the comments of
        # this cell: weights ~ N(0, 0.02), zero bias.
        for proj in (self.c_attn, self.c_proj):
            nn.init.normal_(proj.weight, std=0.02)
            nn.init.zeros_(proj.bias)

    def _attn(self, q, k, v):
        """Masked scaled dot-product attention.

        ``q``/``v`` are (batch, head, seq, head_features) and ``k`` is
        (batch, head, head_features, seq), as produced by ``split_heads``.
        """
        w = torch.matmul(q, k)
        if self.scale:
            w = w / math.sqrt(v.size(-1))
        nd, ns = w.size(-2), w.size(-1)
        # With cached keys ns > nd: take the mask rows of the newest nd queries.
        causal = self.bias[:, :, ns - nd:ns, :ns]
        # Use the dtype's minimum rather than a fixed -1e10 so the masking
        # also stays finite in reduced-precision dtypes.
        w = w.masked_fill(causal == 0, torch.finfo(w.dtype).min)
        w = torch.softmax(w, dim=-1)
        return torch.matmul(w, v)

    def merge_heads(self, x):
        """(batch, head, seq, head_features) -> (batch, seq, head * head_features)."""
        x = x.permute(0, 2, 1, 3).contiguous()
        new_x_shape = x.size()[:-2] + (x.size(-2) * x.size(-1),)
        return x.view(*new_x_shape)

    def split_heads(self, x, k=False):
        """Split the last dimension into heads.

        Returns (batch, head, head_features, seq) when ``k`` is True (keys,
        pre-transposed for the matmul), else (batch, head, seq, head_features).
        """
        new_x_shape = x.size()[:-1] + (self.n_head, x.size(-1) // self.n_head)
        x = x.view(*new_x_shape)
        if k:
            return x.permute(0, 2, 3, 1)
        return x.permute(0, 2, 1, 3)

    def forward(self, x, layer_past=None):
        """Attend over ``x`` (batch, seq, nx), optionally extending a key/value cache.

        Args:
            x: input hidden states.
            layer_past: ``present`` returned by a previous call, or None.

        Returns:
            (a, present): the attention output (batch, seq, nx) and the stacked
            (key, value) cache covering past and current positions.
        """
        x = self.c_attn(x)
        query, key, value = x.split(self.split_size, dim=2)
        query = self.split_heads(query)
        key = self.split_heads(key, k=True)
        value = self.split_heads(value)
        if layer_past is not None:
            # Cached keys are stored transposed (see `present` below); undo that here.
            past_key, past_value = layer_past[0].transpose(-2, -1), layer_past[1]
            key = torch.cat((past_key, key), dim=-1)
            value = torch.cat((past_value, value), dim=-2)
        # Transpose the key so key and value have the same shape and can be stacked.
        present = torch.stack((key.transpose(-2, -1), value))
        a = self._attn(query, key, value)
        a = self.merge_heads(a)
        a = self.c_proj(a)
        return a, present

class MLP(nn.Module):
    """Position-wise feed-forward block: expand, GELU, project back.

    Args:
        n_state: width of the hidden (expanded) layer.
        config: object exposing ``n_embd`` (input/output width).

    Both projections are ``nn.Linear`` and the activation is an ``nn.GELU()``
    instance. The file-level aliases cannot be used here: ``Conv1D`` is
    ``torch.nn.Conv1d`` (its constructor requires ``kernel_size``), and ``gelu``
    is the ``nn.GELU`` class, so ``gelu(tensor)`` would construct a module
    instead of applying the activation.
    """

    def __init__(self, n_state, config):
        super(MLP, self).__init__()
        nx = config.n_embd
        self.c_fc = nn.Linear(nx, n_state)
        self.c_proj = nn.Linear(n_state, nx)
        # Same initialisation as the reference Conv1D kept in the comments of
        # this cell: weights ~ N(0, 0.02), zero bias.
        for proj in (self.c_fc, self.c_proj):
            nn.init.normal_(proj.weight, std=0.02)
            nn.init.zeros_(proj.bias)
        self.act = nn.GELU()

    def forward(self, x):
        """Apply the feed-forward transform to the last dimension of ``x``."""
        h = self.act(self.c_fc(x))
        h2 = self.c_proj(h)
        return h2

class Block(nn.Module):
    """One pre-LayerNorm transformer layer: attention then MLP, each with a residual add.

    Args:
        n_ctx: maximum context length, forwarded to ``Attention``.
        config: object exposing ``n_embd`` and ``layer_norm_epsilon`` (it is
            also passed on to ``Attention`` and ``MLP``).
        scale: forwarded to ``Attention``.
    """

    def __init__(self, n_ctx, config, scale=False):
        super().__init__()
        width = config.n_embd
        self.ln_1 = LayerNorm(width, eps=config.layer_norm_epsilon)
        self.attn = Attention(width, n_ctx, config, scale)
        self.ln_2 = LayerNorm(width, eps=config.layer_norm_epsilon)
        # The feed-forward hidden layer is four times the embedding width.
        self.mlp = MLP(4 * width, config)

    def forward(self, x, layer_past=None):
        """Return ``(hidden_states, present)`` for input ``x`` and optional cache."""
        attn_out, present = self.attn(self.ln_1(x), layer_past=layer_past)
        after_attn = x + attn_out
        mlp_out = self.mlp(self.ln_2(after_attn))
        return after_attn + mlp_out, present

class GPT2Model(nn.Module):
    """Stack of transformer blocks over token + position (+ token-type) embeddings.

    ``config`` must expose ``n_layer``, ``n_embd``, ``vocab_size``,
    ``n_positions``, ``n_ctx`` and ``layer_norm_epsilon``.
    """

    def __init__(self, config):
        super().__init__()
        self.n_layer = config.n_layer
        self.n_embd = config.n_embd
        self.n_vocab = config.vocab_size

        self.wte = nn.Embedding(config.vocab_size, config.n_embd)   # token embeddings
        self.wpe = nn.Embedding(config.n_positions, config.n_embd)  # position embeddings
        # One template block is built and deep-copied for every layer.
        template = Block(config.n_ctx, config, scale=True)
        self.h = nn.ModuleList([copy.deepcopy(template) for _layer in range(config.n_layer)])
        self.ln_f = LayerNorm(config.n_embd, eps=config.layer_norm_epsilon)

    def set_embeddings_weights(self, model_embeddings_weights):
        """Create ``self.decoder``, a bias-free linear layer sharing the given weight.

        ``forward`` below does not use ``self.decoder``.
        """
        n_out, n_in = model_embeddings_weights.shape[0], model_embeddings_weights.shape[1]
        self.decoder = nn.Linear(n_in, n_out, bias=False)
        self.decoder.weight = model_embeddings_weights  # Tied weights

    def forward(self, input_ids, position_ids=None, token_type_ids=None, past=None):
        """Run the transformer stack.

        Args:
            input_ids: token ids; the last dimension is the sequence.
            position_ids: optional position ids; by default they count up
                from the length of the cached past.
            token_type_ids: optional type ids. They are embedded with the
                token table ``wte``, i.e. type id k reuses the embedding of
                vocabulary entry k.
            past: optional list with one cached ``present`` per layer.

        Returns:
            (hidden_states, presents): final normalised hidden states shaped
            ``input_ids.shape + (n_embd,)`` and the per-layer caches.
        """
        if past is not None:
            past_length = past[0][0].size(-2)
        else:
            past_length = 0
            past = [None] * len(self.h)

        if position_ids is None:
            position_ids = torch.arange(
                past_length,
                input_ids.size(-1) + past_length,
                dtype=torch.long,
                device=input_ids.device,
            )
            position_ids = position_ids.unsqueeze(0).expand_as(input_ids)

        # Remember the caller's shape, then flatten any leading dims to 2-D.
        input_shape = input_ids.size()
        input_ids = input_ids.view(-1, input_ids.size(-1))
        position_ids = position_ids.view(-1, position_ids.size(-1))

        inputs_embeds = self.wte(input_ids)
        position_embeds = self.wpe(position_ids)
        if token_type_ids is None:
            token_type_embeds = 0
        else:
            token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1))
            token_type_embeds = self.wte(token_type_ids)
        hidden_states = inputs_embeds + position_embeds + token_type_embeds

        presents = []
        for layer, layer_past in zip(self.h, past):
            hidden_states, present = layer(hidden_states, layer_past)
            presents.append(present)

        hidden_states = self.ln_f(hidden_states)
        output_shape = input_shape + (hidden_states.size(-1),)
        return hidden_states.view(*output_shape), presents

class GPT2LMHead(nn.Module):
    """Language-model head: a bias-free linear layer tied to the embedding matrix.

    Args:
        model_embeddings_weights: 2-D parameter of shape (vocab, n_embd) to share.
        config: object exposing ``n_embd``.
    """

    def __init__(self, model_embeddings_weights, config):
        super().__init__()
        self.n_embd = config.n_embd
        self.set_embeddings_weights(model_embeddings_weights)

    def set_embeddings_weights(self, model_embeddings_weights):
        """(Re)build ``self.decoder`` so that it shares the given weight tensor."""
        n_vocab, n_embd = model_embeddings_weights.shape[0], model_embeddings_weights.shape[1]
        tied = nn.Linear(n_embd, n_vocab, bias=False)
        tied.weight = model_embeddings_weights  # Tied weights
        self.decoder = tied

    def forward(self, hidden_state):
        """Project hidden states to vocabulary logits.

        No truncation of the last position is done here; logits are returned
        for every position that is passed in.
        """
        return self.decoder(hidden_state)

class GPT2LMHeadModel(nn.Module):
    """GPT-2 transformer plus a language-model head tied to its token embeddings."""

    def __init__(self, config):
        super().__init__()
        self.transformer = GPT2Model(config)
        self.lm_head = GPT2LMHead(self.transformer.wte.weight, config)

    def set_tied(self):
        """Re-tie the head to the transformer's token-embedding weight."""
        shared_weight = self.transformer.wte.weight
        self.lm_head.set_embeddings_weights(shared_weight)

    def forward(self, input_ids, position_ids=None, token_type_ids=None, lm_labels=None, past=None):
        """Return the loss when ``lm_labels`` is given, else ``(lm_logits, presents)``.

        The loss is cross-entropy between the logits and ``lm_labels`` at the
        same positions (no shifting is applied here); label value -1 is ignored.
        """
        hidden_states, presents = self.transformer(input_ids, position_ids, token_type_ids, past)
        lm_logits = self.lm_head(hidden_states)
        if lm_labels is None:
            return lm_logits, presents

        criterion = nn.CrossEntropyLoss(ignore_index=-1)
        flat_logits = lm_logits.view(-1, lm_logits.size(-1))
        flat_labels = lm_labels.view(-1)
        return criterion(flat_logits, flat_labels)

In [12]:
# Preview only the first batch from train_iterator: show its first row as
# token ids and as decoded text, then stop iterating.
# NOTE(review): train_iterator is not defined in any visible cell — confirm where it is created.
for i, batch in enumerate(train_iterator):
    print(batch[0]) # tensor
    print(tokenizer.decode(batch[0].tolist())) # string
    break

tensor([47,  0, 46,  0,  8,  1,  0, 53,  0, 46,  0, 54, 59, 58, 65, 64, 64, 54,
         1,  0, 50,  0, 46,  0, 26,  1,  0, 51,  0, 46,  0, 36,  1,  0, 49,  0,
        46,  0, 21,  1,  0, 48,  0, 46,  0, 18,  1,  0, 52,  0, 46,  0, 41,  2,
         3])
process1 is Facility1_4, result is <num>43.99<num>, process4 is Facility4_1, process5 is Facility5_4, process3 is Facility3_2, process2 is Facility2_7, process6 is Facility6_1<end><pad>


In [13]:
# Run num_epochs epochs and print the values returned by train() and evaluate().
# NOTE(review): the recorded run of this cell ended with
#   AttributeError: 'Tensor' object has no attribute 'src'
# Presumably train()/evaluate() read a `.src` field from each batch while
# train_iterator yields plain tensors (see the batch preview cell) — confirm and adapt.
# NOTE(review): model, optimizer, criterion, train and evaluate are not defined
# in any visible cell.
for epoch in range(num_epochs):
    # The trailing 1 is presumably a gradient-clipping value — confirm against train().
    train_loss = train(model, train_iterator, optimizer, criterion, 1)  
    # NOTE(review): evaluation reuses train_iterator, so "Val. Loss" is not a held-out metric.
    valid_loss = evaluate(model, train_iterator, criterion)
    print(f'Epoch: {epoch+1:02}')
    print(f'\tTrain Loss: {train_loss:.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f}')


AttributeError: 'Tensor' object has no attribute 'src'