In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
# from sample_squad_data import *
# from sample_another_data import *
import pickle
import time
from transformers import BertModel, BertTokenizer
from torch.nn.utils.rnn import pad_packed_sequence
from torch.nn.utils.rnn import pack_padded_sequence
from torch.utils.data import Dataset, DataLoader
import random

  torch.utils._pytree._register_pytree_node(
  torch.utils._pytree._register_pytree_node(


In [4]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [41]:

class Encoder(nn.Module):
    def __init__(self,input_size, hidden_size, dropout=0.1):
        super().__init__()
        self.rnn = nn.GRU(input_size, hidden_size, num_layers = 1,
                          dropout= dropout, bidirectional=True, batch_first = True)
        self.dropout = nn.Dropout(dropout)
    def forward(self, input_emb, lengths, device):
        lengths = lengths.cpu()
        emb = pack_padded_sequence(input_emb, lengths, batch_first=False, enforce_sorted=False)
        self.rnn.flatten_parameters()
        outputs, hidden_t = self.rnn(emb)
        outputs = pad_packed_sequence(outputs, batch_first=False)
        return outputs, hidden_t

class Decoder(nn.Module):
    def __init__(self, enc_size, dec_size, n_layers, att_vec_size, bert_model_name, dropout= 0.1): 
        super().__init__()
        self.bert = BertModel.from_pretrained(bert_model_name)
        self.rnn = nn.GRU(
            992, dec_size,
            num_layers=n_layers, dropout=dropout,
            bidirectional=False, batch_first=False) #dec_size : hidden size của GRU
        self.attn = ConcatAttention(enc_size, dec_size, att_vec_size)
        self.dropout = nn.Dropout(dropout)
        self.hidden_size = dec_size

    def forward(self, question_ids, hidden, context, init_attn_weighted_context):#context: decinit, init_weight = 0
        g_outputs = []
        c_outputs = []
        copy_gate_outputs = []
        current_attn_weighted_context = init_attn_weighted_context
        precompute = None
        with torch.no_grad():
            word_emb = self.bert.embeddings.word_embeddings(question_ids)
        emb_t = self.dropout(word_emb)
        emb_t = emb_t.squeeze(1)  # emb_t shape: [batch_size, emb_dim] [32, 768]
        decoder_rnn_input_t = emb_t # 10,768
        decoder_rnn_input_t = torch.cat([emb_t, current_attn_weighted_context], 1).unsqueeze(0) # 768 + weight
        output, hidden = self.rnn(decoder_rnn_input_t, hidden) #1, 32, dim----1, 32, dim
        output= output.squeeze(0)
        current_attn_weighted_context, attn, precompute = self.attn(output, context.transpose(0, 1), precompute)
        return output, hidden, attn, current_attn_weighted_context


class Generator(nn.Module):
    def __init__(self, dec_size, vocab_size):
        super(Generator, self).__init__()
        self.generator = nn.Sequential(
            nn.Linear(dec_size, vocab_size),
            nn.Softmax(dim=1))
    def forward(self, g_output_t):
        return self.generator(g_output_t)
    
class ConcatAttention(nn.Module):
    def __init__(self, context_dim, query_dim, att_dim):
        super(ConcatAttention, self).__init__()
        self.context_dim = context_dim
        self.query_dim = query_dim
        self.att_dim = att_dim
        self.linear_pre = nn.Linear(context_dim, att_dim, bias=True)
        self.linear_q = nn.Linear(query_dim, att_dim, bias=False)
        self.linear_v = nn.Linear(att_dim, 1, bias=False)
        self.sm = nn.Softmax(dim=1)
        self.tanh = nn.Tanh()
    def forward(self, input, context, precompute=None):
        if precompute is None:
            precompute00 = self.linear_pre(context.contiguous().view(-1, context.size(2))) # reshape to (..., hidden_size)
            precompute = precompute00.view(context.size(0), context.size(1), -1)
        targetT = self.linear_q(input).unsqueeze(1)
        tmp10 = precompute + targetT.expand_as(precompute)
        tmp20 = self.tanh(tmp10)
        energy = self.linear_v(tmp20.view(-1, tmp20.size(2))).view(tmp20.size(0), tmp20.size(1))
        score = self.sm(energy)
        score_m = score.view(score.size(0), 1, score.size(1))
        weightedContext = torch.bmm(score_m, context).squeeze(1)
        return weightedContext, score, precompute

class DecIniter(nn.Module):
    def __init__(self, enc_rnn_size, dec_rnn_size):
        super(DecIniter, self).__init__()
        self.initer = nn.Linear(
            enc_rnn_size,
            dec_rnn_size)
        self.tanh = nn.Tanh()
        self.linear = nn.Linear(16, enc_rnn_size//2)
# kết hợp đầu ra encoder và style embeddinh để đưa vào decoder
    def forward(self, enc_list):
        enc_list[1] = self.linear(enc_list[1])#[batch, enc/2] batch, 112
        x = torch.cat((enc_list[0], enc_list[1]), dim=1)#ini h0
        return self.tanh(self.initer(x))  
    
    
class Seq2Seq(nn.Module):
    def __init__(self, embedder, encoder,dec_init, decoder, generator):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.embedder = embedder
        self.decIniter = dec_init
        self.generator= generator
        self.style_emb_mat = nn.Parameter(torch.randn(9,16))
    def forward(self, word_emb, iob_ids, is_clue, pos_ids, ner_ids, lengths, style_ids, ques_ids, device, teacher_force):#Sample gốm data, question, style_id
        out_length = ques_ids.shape[1]
        style_ids = style_ids.cpu()
        outputs = torch.zeros(out_length, ques_ids.shape[0] , 30522)#[length, batch, vocab]
        emb = self.embedder(word_emb, iob_ids, is_clue, pos_ids, ner_ids) 
        emb = emb.transpose(0, 1)# len. batch. dim
        context, hidden_enc = self.encoder(emb, lengths, device)# shape hidden_enc = [2, batch, hidden]
       # context[0] : len, batch, dim
        y_style_one_hot = torch.eye(9)[style_ids]
        y_style_one_hot = y_style_one_hot.to(device)
        style_emb = torch.matmul(y_style_one_hot, self.style_emb_mat)#[batch, 16]
        hidden_0 = [hidden_enc[1], style_emb]
        init_dec_hidden = self.decIniter(hidden_0).unsqueeze(0)
        batch_size = context[0].size(1)  
        h_size = (
            batch_size,
            112 * 2)
        init_attn_weighted_context = context[0].data.new(*h_size).zero_()
        current_attn_weighted_context=init_attn_weighted_context
        hidden = init_dec_hidden
        in_dec = ques_ids[: , 0] 
        for t in range(1, out_length):
            out, hidden, attn, current_attn_weighted_context = self.decoder(in_dec,
                                                                            hidden, 
                                                                            context[0],
                                                                            current_attn_weighted_context)
            
            out = generator(out) #[batch, vocab]
            outputs[t] = out
            top1 = out.argmax(1)
            in_dec = ques_ids[:, t] if random.random() < teacher_force else top1
        
        return outputs

In [6]:
class CustomEmbedding(nn.Module):
    def __init__(self, pos_size, ner_size,  ids_binary_emb_dim=12, out_emb= 300, dropout_rate=0.1):
        super(CustomEmbedding, self).__init__()
        self.iob_embedding = nn.Embedding(3, ids_binary_emb_dim)
        self.is_clue = nn.Embedding(2, ids_binary_emb_dim)
        self.pos_tag_embedding = nn.Embedding(pos_size, ids_binary_emb_dim)
        self.ner_tag_embedding = nn.Embedding(ner_size, ids_binary_emb_dim)
        
        # Dropout layer
        self.dropout = nn.Dropout(dropout_rate)
        self.linear = nn.Linear(348, 300)
        
        
    def forward(self, word_glove_emb, iob, is_clue, pos_tag_ids, ner_tag_ids):
        iob_emb = self.iob_embedding(iob)
        is_clue = self.is_clue(is_clue)
        pos_tag_emb = self.pos_tag_embedding(pos_tag_ids)
        ner_tag_emb = self.ner_tag_embedding(ner_tag_ids)
        # Concatenate all embeddings
        combined_emb = torch.cat((word_glove_emb, iob_emb, is_clue, pos_tag_emb, ner_tag_emb), dim=-1)
        combined_emb = self.dropout(combined_emb)
        combined_emb = combined_emb.to(torch.float32)
        combined_emb = self.linear(combined_emb)
        return combined_emb

In [7]:
# datas = get_squad_raw_examples('train.txt')

In [8]:
with open('list_ids_data1.pkl', 'rb') as file:
    loaded_list = pickle.load(file)
with open('token_sents.pkl', 'rb') as file:
    token_sents = pickle.load(file)
with open('id_big.pkl', 'rb') as file:
    id_big = pickle.load(file)
with open('filtered_questions.pkl', 'rb') as file:
    filtered_questions = pickle.load(file)

In [9]:
len(token_sents)

49235

In [10]:
data_ids = loaded_list
style_ids = data_ids[0]
pos_ids = data_ids[1]

ner_ids = data_ids[2]
iob_tag = data_ids[3]
is_clue = data_ids[4]
iob_ids = [[['I', 'O', 'B'].index(item) for item in iob] for iob in iob_tag]

In [11]:
lengths = []
for sent in token_sents:
    lengths.append( len(sent))
lengths = [int(x) for x in lengths]
lengths = torch.Tensor(lengths)


In [None]:
# KHỐI CODE ĐƯỢC CMT PHÍA DƯỚI LÀ DÙNG ĐỂ LẤY DANH SÁCH EMBEDDING TỪ GLOVE VÀ LƯU VÀO FILE PICKLE

In [11]:
# sent_token_padded = [sentence + ['<pad>'] * (80 - len(sentence)) for sentence in token_sents]

In [12]:
# special_tokens = ['<pad>', '<sos>', '<eos>']
# def load_glove_embeddings(file_path):
#     embeddings_index = {}
#     with open(file_path, encoding='utf-8') as f:
#         for line in f:
#             values = line.split()
#             word = values[0]
#             coefs = np.asarray(values[1:], dtype='float32')
#             embeddings_index[word] = coefs
#     return embeddings_index

# glove_file = './Data/glove.6B.300d.txt'  # Adjust the path to the downloaded GloVe file
# embeddings_index = load_glove_embeddings(glove_file)

In [13]:
# word_glove_emb= []
# for sent in sent_token_padded:
#     t =[]
#     for word in sent:
#         if embeddings_index.get(word.lower()) is not None:
#             t.append(embeddings_index.get(word.lower()))
#         elif word in special_tokens:
#             t.append(np.zeros((300)))
#         else:
#             t.append(np.ones((300)))
#     word_glove_emb.append(t)
# # with open('glove_emb.pkl', 'wb') as file:
# #     pickle.dump(word_glove_emb, file)

In [14]:
# glove_emb_tensor1= torch.as_tensor(word_glove_emb)

In [15]:
# with open('glove_emb_tensor1.pkl', 'wb') as file:
#     pickle.dump(glove_emb_tensor1, file)

In [12]:
# #embedding đã được load sẵn từ glove
with open('glove_emb_tensor1.pkl', 'rb') as file:
    glove_emb_tensor1 = pickle.load(file)

In [13]:
iob_ids_new = [item for idx, item in enumerate(iob_ids[0:50000]) if [idx] not in id_big]
is_clue_new = [item for idx, item in enumerate(is_clue[0:50000]) if [idx] not in id_big]
pos_tag_ids_new = [item for idx, item in enumerate(pos_ids[0:50000]) if [idx] not in id_big]
ner_tag_ids_new = [item for idx, item in enumerate(ner_ids[0:50000]) if [idx] not in id_big]
style_ids  = [item for idx, item in enumerate(style_ids[0:50000]) if [idx] not in id_big]
# lọc ra các ids của các câu có độ dài quá lớn

In [14]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')



In [15]:
def pad_sequence(sequences, max_length):
    padded_sequences = torch.zeros((len(sequences), max_length), dtype=torch.long)
    for i, seq in enumerate(sequences):
        length = min(len(seq), max_length)
        padded_sequences[i, :length] = torch.tensor(seq[:length])
    return padded_sequences

In [16]:
bert_model_name = 'bert-base-uncased'
vocab_size = 30522  
iob_size = 3
pos_size = 17
ner_size = 19
dropout_rate = 0.2
max_length = 79
enc_size= 224
dec_size = 224
n_layers = 1
att_vec_size = 224
input_size = 300
hidden_size = 112
e =10

In [17]:
ques = filtered_questions

In [18]:
tokenized_inputs = tokenizer(ques, padding=True, truncation=True, return_tensors="pt", add_special_tokens=True)
ques_ids = tokenized_inputs['input_ids']

In [19]:
max_length_new = 80
iob_ids_new = torch.tensor(pad_sequence(iob_ids_new, max_length_new))
is_clue_new = torch.tensor(pad_sequence(is_clue_new, max_length_new))
pos_tag_ids_new = torch.tensor(pad_sequence(pos_tag_ids_new, max_length_new))
ner_tag_ids_new = torch.tensor(pad_sequence(ner_tag_ids_new, max_length_new))

  iob_ids_new = torch.tensor(pad_sequence(iob_ids_new, max_length_new))
  is_clue_new = torch.tensor(pad_sequence(is_clue_new, max_length_new))
  pos_tag_ids_new = torch.tensor(pad_sequence(pos_tag_ids_new, max_length_new))
  ner_tag_ids_new = torch.tensor(pad_sequence(ner_tag_ids_new, max_length_new))


In [20]:
style_ids_new = [item for sublist in style_ids for item in sublist] # style_ids_new
style_ids_new = torch.tensor(style_ids_new)

In [42]:
embedding_layer = CustomEmbedding(pos_size, ner_size, dropout_rate=dropout_rate)#1
encoder = Encoder(input_size, hidden_size)#2
dec_init = DecIniter(enc_size, dec_size)
decoder = Decoder(enc_size=enc_size, 
                 dec_size= dec_size, n_layers=n_layers, att_vec_size=att_vec_size, bert_model_name=bert_model_name)

generator = Generator(dec_size = dec_size, vocab_size=vocab_size)
seq2seq = Seq2Seq(embedding_layer, encoder, dec_init, decoder, generator)

# model = Seq2Seq(embedding_layer, encoder, dec_init, decoder)
# state_dict = torch.load('seq2seq_weight_1.pth')
# model.load_state_dict(state_dict)



In [22]:
class CustomDataset(Dataset):
    def __init__(self, emb, iob, clue, pos, ner, lengths, style, ques): 
        self.emb = emb
        self.iob = iob
        self.clue = clue
        self.pos = pos
        self.ner = ner
        self.lengths = lengths
        self.style = style
        self.ques = ques
    
    def __len__(self):
        return self.lengths.shape[0]
    
    def __getitem__(self, idx):
        return (self.emb[idx], self.iob[idx], self.clue[idx], 
                self.pos[idx], self.ner[idx], self.lengths[idx], self.style[idx], self.ques[idx])

dataset = CustomDataset(glove_emb_tensor1, iob_ids_new, is_clue_new, pos_tag_ids_new, ner_tag_ids_new, lengths, style_ids_new, ques_ids)

# Create the dataloader with batch size of 32
dataloader = DataLoader(dataset, batch_size=64, shuffle=True)


In [37]:
criterion = nn.CrossEntropyLoss()
params = list(seq2seq.parameters())
optimizer = optim.Adam(params, lr=0.1, weight_decay=0.1)
num_epochs = 1

In [43]:
seq2seq = seq2seq.to(device)
# generator=generator.to(device)

In [44]:
seq2seq.train()
for epoch in range(1):
    print(epoch)
    i=0
    for batch in dataloader:
        if i %100 ==0:
            print(i)
        seq2seq.zero_grad()
        emb, iob, clue, pos, ner, len, style, ques  = batch
        emb = emb.to(device)
        iob = iob.to(device)
        clue = clue.to(device)
        pos = pos.to(device)
        ner = ner.to(device)
        len = len.to(device)
        style = style.to(device)
        ques = ques.to(device)
        out = seq2seq(emb, iob, clue, pos, ner, len, style, ques, device, 0.2)
        output_dim = out.shape[-1]
        out = out[1:].view(-1, output_dim).to(device)
        ques = ques.permute(1, 0)
        ques = ques[1:].reshape(-1)
        loss = criterion(out, ques)
        loss.backward()
        optimizer.step()
        print(loss.item())
        i+=1


0
0


RuntimeError: Sizes of tensors must match except in dimension 1. Expected size 64 but got size 69 for tensor number 1 in the list.

In [None]:
#NHƯ TRÊN MODEL KHI TRAIN 1 EPOCH MẤT HƠN 1,5 TIẾNG NHƯNG KHÔNG RA KẾT QUẢ

In [44]:
torch.save(seq2seq.state_dict(), 'seq2seq_weight.pth')


In [46]:
seq2seq.eval().to(device)
e = 10
out = seq2seq(glove_emb_tensor1[0:e].to(device), iob_ids_new[0:e].to(device),
              is_clue_new[0:e].to(device), pos_tag_ids_new[0:e].to(device), ner_tag_ids_new[0:e].to(device), 
              lengths[0:e].to(device), style_ids_new[0:e].to(device), ques_ids[0:e].to(device), device, 0)

out = out.permute(1, 0, 2)

In [47]:
ids_prd = torch.argmax(out, dim = 2)
print(ids_prd.shape)
for sent in ids_prd:
    tokens = tokenizer.convert_ids_to_tokens(sent)
    print('----',tokens, '-----')


torch.Size([10, 63])
---- ['[PAD]', 'nipples', 'institutional', 'institutional', 'institutional', 'institutional', 'institutional', 'institutional', 'institutional', 'institutional', 'institutional', 'institutional', 'institutional', 'institutional', 'institutional', 'institutional', 'institutional', 'institutional', 'institutional', 'institutional', 'institutional', 'institutional', 'institutional', 'institutional', 'institutional', 'institutional', 'institutional', 'institutional', 'institutional', 'institutional', 'institutional', 'institutional', 'institutional', 'institutional', 'institutional', 'institutional', 'institutional', 'institutional', 'institutional', 'institutional', 'institutional', 'institutional', 'institutional', 'institutional', 'institutional', 'institutional', 'institutional', 'institutional', 'institutional', 'institutional', 'institutional', 'institutional', 'institutional', 'institutional', 'institutional', 'institutional', 'institutional', 'institutional', '