### **_Import Package_**

In [1]:
import torch
import torch.nn as nn
from transformers import BertModel, BertTokenizer
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset

import numpy as np
import pandas as pd

import spacy
nlp = spacy.load("en_core_web_sm")

from tqdm import tqdm

In [2]:
# NOTE: a `global` statement at module (cell) level is a no-op in Python; any
# name assigned at the top level of a cell is already a module global. These
# lines only list the shared names that a later cell assigns.
global device
global vocab_size
global bert_model_hidden_size
global lstm_hidden_size
global num_layers
global batch_size

In [5]:
# Initialize the global variables shared by the rest of the notebook.
# Use CUDA when it is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
# print(torch.cuda.get_device_name())

# The tokenizer and the model are loaded from the same pretrained checkpoint name.
bert_model_name = 'bert-base-uncased'
bert_tokenizer = BertTokenizer.from_pretrained(bert_model_name)
bert_model = BertModel.from_pretrained(bert_model_name).to(device)

# Sizes read from the loaded BERT configuration, followed by the
# hyperparameters that are passed to Decoder and DataLoader further down.
vocab_size = bert_model.config.vocab_size
bert_model_hidden_size = bert_model.config.hidden_size
lstm_hidden_size = 256
num_layers = 4
batch_size = 8

cpu


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


### **_Encoder_**

In [10]:
class Encoder(nn.Module):
    """Thin wrapper that exposes the final hidden states of a BERT-style model.

    Args:
        bert_model: Module called as ``bert_model(input_ids, attention_mask=...)``
            whose result has a ``last_hidden_state`` attribute.
    """

    def __init__(self, bert_model):
        super().__init__()
        self.bert = bert_model

    def forward(self, input_ids, attention_mask):
        """Return ``last_hidden_state`` of the wrapped model for the given batch."""
        # Gradient tracking is not disabled here, so gradients can flow back
        # into the wrapped model.
        return self.bert(input_ids, attention_mask=attention_mask).last_hidden_state


### **_Decoder_**

In [11]:
class Decoder(nn.Module):
    """LSTM decoder followed by three linear layers that project onto the vocabulary.

    Args:
        lstm_hidden_size: Hidden size of every LSTM layer.
        num_layers: Number of stacked LSTM layers.
        vocab_size: Size of the last output dimension (one logit per vocabulary entry).
        input_size: Feature size of the incoming hidden states. When omitted it
            falls back to the notebook-level ``bert_model.config.hidden_size``,
            which is what this class previously always used.

    All sub-modules are moved to the notebook-level ``device``.
    """

    def __init__(self, lstm_hidden_size, num_layers, vocab_size, input_size=None):
        super(Decoder, self).__init__()
        if input_size is None:
            input_size = bert_model.config.hidden_size
        # batch_first=True: the decoder is fed (batch, seq, feature) tensors
        # (BERT's last_hidden_state layout). With nn.LSTM's default layout the
        # batch axis would be interpreted as the time axis.
        self.lstm = nn.LSTM(input_size=input_size,
                            hidden_size=lstm_hidden_size,
                            num_layers=num_layers,
                            batch_first=True).to(device)
        # NOTE(review): there is no activation between these linear layers, so
        # together they compose to a single affine map. Left unchanged to keep
        # the parameter layout of existing checkpoints.
        self.linear1 = nn.Linear(lstm_hidden_size, lstm_hidden_size * 2).to(device)
        self.linear2 = nn.Linear(lstm_hidden_size * 2, lstm_hidden_size * 4).to(device)
        self.linear3 = nn.Linear(lstm_hidden_size * 4, vocab_size).to(device)

    def forward(self, input):
        """Map ``(batch, seq, input_size)`` states to ``(batch, seq, vocab_size)`` logits."""
        lstm_outputs, _ = self.lstm(input)  # final (h, c) state is not needed
        output = self.linear1(lstm_outputs)
        output = self.linear2(output)
        prediction = self.linear3(output)
        return prediction
    

### **_Seq2Seq_**

In [12]:
class Seq2Seq(nn.Module):
    """Encoder-decoder pipeline in which the decoder consumes the encoder output.

    Args:
        encoder: Callable invoked as ``encoder(input_ids, attention_mask)``.
        decoder: Callable invoked on whatever the encoder returns.
    """

    def __init__(self, encoder, decoder) -> None:
        super().__init__()
        self.encoder, self.decoder = encoder, decoder

    def forward(self, input_ids, attention_mask):
        """Encode the batch and return the decoder's output for those states."""
        return self.decoder(self.encoder(input_ids, attention_mask))
        

In [13]:
class Seq2Seqwithattn(nn.Module):
    """Encoder-decoder model with multi-head cross-attention from target to source.

    Both the source and the target token ids are passed through ``encoder``;
    the target states act as queries over the source states and the attended
    result is handed to ``decoder``.

    Args:
        encoder: Module called as ``encoder(token_ids, attention_mask)`` that
            returns ``(batch, seq, attn_embed_dim)`` states.
        decoder: Module applied to the attention output.
        attn_embed_dim: Embedding size of the attention layer.
        num_heads: Number of attention heads.
        pad_token_id: Id used to pad ``target_ids``. It is only used to derive
            the target attention mask when the caller does not supply one.
            The default 0 is the [PAD] id of the ``bert-base-uncased``
            vocabulary; pass the tokenizer's ``pad_token_id`` for other vocabularies.

    NOTE(review): the full, un-shifted target sequence is used as the query and
    no causal mask is applied, so every output position can see the token it is
    asked to predict. Confirm this is intended.
    """

    def __init__(self, encoder, decoder, attn_embed_dim, num_heads, pad_token_id=0) -> None:
        super(Seq2Seqwithattn, self).__init__()
        self.encoder = encoder
        # batch_first=True: the encoder states are (batch, seq, embed). With the
        # default layout the attention would run across the batch axis and mix
        # different samples.
        self.multihead_attn = nn.MultiheadAttention(attn_embed_dim,
                                                    num_heads,
                                                    batch_first=True).to(device)
        self.decoder = decoder
        self.pad_token_id = pad_token_id

    def forward(self, input_ids, attention_mask, target_ids, target_attention_mask=None):
        """Return the decoder output for a batch.

        Args:
            input_ids: Source token ids, ``(batch, src_len)``.
            attention_mask: Source mask, 1 for real tokens and 0 for padding.
            target_ids: Target token ids, ``(batch, tgt_len)``.
            target_attention_mask: Optional mask for ``target_ids``. When
                omitted it is derived as ``target_ids != pad_token_id``.
        """
        if target_attention_mask is None:
            # The source mask does not describe the target padding, so build a
            # mask from the target ids themselves.
            target_attention_mask = (target_ids != self.pad_token_id).to(attention_mask.dtype)

        encoder_outputs = self.encoder(input_ids, attention_mask)
        target_outputs = self.encoder(target_ids, target_attention_mask)

        # key_padding_mask is True where a key must be ignored (source padding).
        attn_output, _ = self.multihead_attn(target_outputs,
                                             encoder_outputs,
                                             encoder_outputs,
                                             key_padding_mask=(attention_mask == 0),
                                             need_weights=False)
        decoder_outputs = self.decoder(attn_output)
        return decoder_outputs

In [14]:
class bertsformer(nn.Module):
    """Shared encoder followed by a 6-layer Transformer decoder and a vocabulary projection.

    Args:
        encoder: Module called as ``encoder(input_ids=..., attention_mask=...)``
            that returns ``(batch, seq, bert_model_hidden_size)`` states.
        bert_model_hidden_size: Model dimension of the Transformer decoder.
        pad_token_id: Id used to pad ``target_ids``. It is only used to derive
            the target attention mask when the caller does not supply one.
            The default 0 is the [PAD] id of the ``bert-base-uncased``
            vocabulary; pass the tokenizer's ``pad_token_id`` for other vocabularies.

    The output layer size comes from the notebook-level ``vocab_size`` and all
    sub-modules are moved to the notebook-level ``device``.

    NOTE(review): no causal ``tgt_mask`` is applied and the target is not
    shifted, so each position can attend to the token it predicts. Confirm
    this is intended.
    """

    def __init__(self, encoder, bert_model_hidden_size, pad_token_id=0) -> None:
        super(bertsformer, self).__init__()
        self.bert_encoder = encoder
        # batch_first=True: the encoder states are (batch, seq, hidden).
        self.decoder_layer  = nn.TransformerDecoderLayer(d_model=bert_model_hidden_size,
                                                         nhead=8,
                                                         batch_first=True).to(device)
        self.trans_decoder = nn.TransformerDecoder(self.decoder_layer, num_layers=6, norm=nn.LayerNorm(bert_model_hidden_size)).to(device)
        self.output_layer = nn.Linear(bert_model_hidden_size, vocab_size).to(device)
        self.pad_token_id = pad_token_id

    def forward(self, input_ids, attention_mask, target_ids, target_attention_mask=None):
        """Return ``(batch, tgt_len, vocab_size)`` logits.

        Args:
            input_ids: Source token ids, ``(batch, src_len)``.
            attention_mask: Source mask, 1 for real tokens and 0 for padding.
            target_ids: Target token ids, ``(batch, tgt_len)``.
            target_attention_mask: Optional mask for ``target_ids``. When
                omitted it is derived as ``target_ids != pad_token_id``.
        """
        if target_attention_mask is None:
            target_attention_mask = (target_ids != self.pad_token_id).to(attention_mask.dtype)

        input_hidden_state = self.bert_encoder(input_ids=input_ids, attention_mask=attention_mask)
        # Encode the *target* ids here; encoding input_ids a second time would
        # leave target_ids unused.
        target_hidden_state = self.bert_encoder(input_ids=target_ids, attention_mask=target_attention_mask)

        # Padding masks are True where a position must be ignored.
        trans_decoder_output = self.trans_decoder(
            target_hidden_state,
            input_hidden_state,
            tgt_key_padding_mask=(target_attention_mask == 0),
            memory_key_padding_mask=(attention_mask == 0),
        )
        output = self.output_layer(trans_decoder_output)
        return output

    

### **_Preprocess data_**

In [15]:
def preprocess_data(text):
    """Tokenize ``text`` with the notebook-level spaCy pipeline and return the token strings."""
    token_texts = []
    for token in nlp.tokenizer(text):
        token_texts.append(token.text)
    return token_texts

In [16]:
class PubMedDataset(Dataset):
    """Dataset that tokenizes the ``input`` / ``target`` text columns of a DataFrame.

    Args:
        df: DataFrame with string columns ``"input"`` and ``"target"``.
        tokenizer: Object exposing a HuggingFace-style ``encode_plus`` method.
        max_length: Length every sequence is padded / truncated to.
    """

    def __init__(self, df, tokenizer, max_length=128) -> None:
        super().__init__()
        self.df = df
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.df)

    def _encode(self, text):
        """Tokenize one string to fixed-length tensors with a leading batch axis of 1."""
        return self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            padding='max_length',
            max_length=self.max_length,
            truncation=True,
            return_tensors='pt'
        )

    def __getitem__(self, index):
        """Return ``(input_ids, attention_mask, target_ids)`` for the row at position ``index``."""
        # Positional lookup: a Dataset is indexed 0..len-1, while `.loc` is
        # label-based and fails on frames whose index is not 0..n-1
        # (e.g. after filtering or shuffling rows).
        input_seq = self.df["input"].iloc[index]
        target_seq = self.df["target"].iloc[index]

        encoded_inputs = self._encode(input_seq)
        encoded_targets = self._encode(target_seq)

        # Drop the batch axis added by return_tensors='pt'; the DataLoader
        # stacks the samples itself.
        input_ids = encoded_inputs['input_ids'].squeeze(0)
        attention_mask = encoded_inputs['attention_mask'].squeeze(0)
        target_ids = encoded_targets['input_ids'].squeeze(0)

        return input_ids, attention_mask, target_ids

In [17]:
import json

# Location of the training split, relative to the notebook's working directory.
training_set = r'./../spider/meddialog/results/eval_wer_json/pubmed_46374_train.json'
# Read the file as UTF-8 explicitly so the result does not depend on the
# platform's default locale encoding.
with open(training_set, 'r', encoding='utf-8') as td:
    data = json.load(td)

In [18]:
# For every record take the 'pubmed' utterance as the model input and the
# 'pubmed' result, minus its first two characters, as the target.
utterances = [record['utterances']['pubmed'] for record in data]
results = [record['results']['pubmed'][2:] for record in data]

print(len(utterances))

# Wrap the pairs in a DataFrame -> Dataset -> shuffling DataLoader.
pubmed_df = pd.DataFrame({'input': utterances, 'target': results})
PD = PubMedDataset(pubmed_df, bert_tokenizer)
dataloader = DataLoader(PD, batch_size=batch_size, shuffle=True)

print(pubmed_df.head())

46374
                                               input   
0  This study aimed to investigate the efficacy a...  \
1  Over recent decades , the abundance and geogra...   
2  There are few data on the prevalence of obesit...   
3  Cytomegalovirus establishes a lifelong infecti...   
4  Early identification of patients with acute di...   

                                              target  
0  this study aimed toinvestigate the efficacy an...  
1  over recent decades the abundance and geograph...  
2  there are few data and the prevalence of obesi...  
3  cytomegalovirus establishes lifelong infection...  
4  early identification and patients with acute d...  


### **_Train model_**

In [19]:
def train_model(model, criterion, optimizer, dataloader, num_epochs, clip, accum_iter=32):
    """Train ``model`` with gradient accumulation and gradient-norm clipping.

    Args:
        model: Module called as ``model(input_ids, attention_mask, target_ids)``
            that returns logits whose last dimension is the vocabulary size.
        criterion: Loss called as ``criterion(logits_2d, targets_1d)``.
        optimizer: Optimizer over the model parameters.
        dataloader: Sized iterable of ``(input_ids, attention_mask, target_ids)`` batches.
        num_epochs: Number of passes over ``dataloader``.
        clip: Maximum gradient norm applied before every optimizer step.
        accum_iter: Number of batches whose gradients are accumulated per
            optimizer step.

    Returns:
        list[float]: Mean un-scaled batch loss of every epoch (``nan`` for an
        epoch without batches).

    Raises:
        ValueError: If ``accum_iter`` is smaller than 1.

    Batches are moved to the notebook-level ``device``.
    """
    if accum_iter < 1:
        raise ValueError("accum_iter must be a positive integer")

    model.train()
    num_batches = len(dataloader)
    epoch_losses = []

    # Make sure stale gradients from earlier runs are not accumulated.
    optimizer.zero_grad()

    for epoch in range(num_epochs):
        running_loss = 0.0

        # Wrap the dataloader itself (not enumerate(...)) so tqdm knows the total.
        progress = tqdm(dataloader, desc=f'Epoch {epoch + 1}', ncols=30)
        for batch_idx, (input_ids, attention_mask, target_ids) in enumerate(progress):
            # Move the batch to the device.
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            target_ids = target_ids.to(device)

            # Forward pass; flatten to (batch * seq, vocab) vs (batch * seq,).
            outputs = model(input_ids, attention_mask, target_ids)
            loss = criterion(outputs.reshape(-1, outputs.size(-1)), target_ids.reshape(-1))
            running_loss += loss.item()

            # Scale the loss so the accumulated gradient is an average over
            # the accumulation window.
            (loss / accum_iter).backward()

            is_window_end = (batch_idx + 1) % accum_iter == 0
            is_last_batch = batch_idx + 1 == num_batches
            if is_window_end or is_last_batch:
                # Clip once, on the fully accumulated gradient, right before
                # the update; clipping after every micro-batch would rescale
                # partial sums repeatedly.
                torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
                optimizer.step()
                optimizer.zero_grad()

        mean_loss = running_loss / num_batches if num_batches else float('nan')
        epoch_losses.append(mean_loss)
        print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {mean_loss}")

    return epoch_losses

In [20]:
# Build the model. The same encoder and decoder instances are shared by
# whichever wrapper is active; only the attention variant is enabled here.
encoder = Encoder(bert_model=bert_model)
decoder = Decoder(lstm_hidden_size=lstm_hidden_size, num_layers=num_layers, vocab_size=vocab_size)
# bert_lstm_s2s_model = Seq2Seq(encoder, decoder)
# The last argument (8) is the number of attention heads.
seq2seq_with_attn_model = Seq2Seqwithattn(encoder, decoder, bert_model.config.hidden_size, 8)
# bertsformer = bertsformer(encoder, bert_model_hidden_size=bert_model_hidden_size)

# SRC_PAD_IDX = bert_tokenizer.pad_token_id
# TRG_PAD_IDX = bert_tokenizer.pad_token_id

# NOTE(review): the active criterion has no ignore_index, so the padding ids of
# the max_length-padded targets contribute to the loss. The commented-out
# variant below would exclude them - confirm which behavior is wanted.
# criterion = nn.CrossEntropyLoss(ignore_index=TRG_PAD_IDX)
criterion = nn.CrossEntropyLoss()
# optimizer = torch.optim.Adam(bert_lstm_s2s_model.parameters(), lr=0.001)
# optimizer = torch.optim.AdamW(bert_lstm_s2s_model.parameters(), lr=0.001)
optimizer = torch.optim.AdamW(seq2seq_with_attn_model.parameters(), lr=0.00008)
# optimizer = torch.optim.AdamW(bertsformer.parameters(), lr=0.00001)

# Train the attention model for 15 epochs with a gradient-norm clip of 1.
train_model(seq2seq_with_attn_model, \
            criterion=criterion, \
            optimizer=optimizer, \
            dataloader=dataloader, \
            num_epochs=15,
            clip=1)

# torch.save(model.state_dict(), 'model.pth')

Epoch 1: 4it [00:28,  7.06s/it]


KeyboardInterrupt: 

### **_Evaluate the model_**

In [None]:
# Decode greedy predictions of the model that is actually built and trained in
# this notebook (`seq2seq_with_attn_model`). The previously referenced
# `bert_lstm_s2s_model` is never created, so this cell raised a NameError on a
# fresh run.
#
# NOTE(review): Seq2Seqwithattn takes the target ids as an input, so these
# predictions are conditioned on the reference text; they are not free-running
# generations. The loop also iterates over the training dataloader.
predicted_sentences = []
seq2seq_with_attn_model.eval()

with torch.no_grad():
    for input_ids, attention_mask, target_ids in dataloader:
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        target_ids = target_ids.to(device)

        outputs = seq2seq_with_attn_model(input_ids, attention_mask, target_ids)

        # Most likely vocabulary id at every position.
        predicted_ids = outputs.argmax(dim=-1)

        # Convert to plain Python ints before handing the ids to the tokenizer.
        for ids in predicted_ids.tolist():
            tokens = bert_tokenizer.convert_ids_to_tokens(ids, skip_special_tokens=True)
            sentence = bert_tokenizer.convert_tokens_to_string(tokens)
            predicted_sentences.append(sentence)


print(predicted_sentences[:10])


In [None]:
# Smoke test: tokenize one short string with the same encode_plus settings
# that PubMedDataset uses (special tokens, padding/truncation to 128, PyTorch tensors).
testTxt = 'Hello, World'

encoded_inputs = bert_tokenizer.encode_plus(
    testTxt,
    add_special_tokens=True,
    padding='max_length',
    max_length=128,
    truncation=True,
    return_tensors='pt'
)

In [None]:
# NLP Example
batch, sentence_length, embedding_dim = 3, 3, 3
embedding = torch.randn(batch, sentence_length, embedding_dim)
layer_norm = nn.LayerNorm(embedding_dim)
# Activate module
layer_norm(embedding)
# Image Example
N, C, H, W = 3, 3, 3, 3
input = torch.randn(N, C, H, W)
# Normalize over the last three dimensions (i.e. the channel and spatial dimensions)
# as shown in the image below
print(input)
layer_norm = nn.LayerNorm([C, H, W])
output = layer_norm(input)

tensor([[[[ 0.3778, -1.2009, -2.0004],
          [ 1.4171,  0.4454,  1.2879],
          [ 0.5370, -0.5584,  0.1352]],

         [[ 0.3975,  0.0664,  1.0255],
          [ 0.1967,  0.7439,  2.9432],
          [-0.9861, -1.2516, -0.3943]],

         [[-0.3417,  0.4060, -0.9845],
          [-0.3367,  0.1158, -2.1325],
          [-0.7750, -1.3084,  0.6354]]],


        [[[ 1.3053,  0.9232, -1.0590],
          [-0.6940,  1.4665,  0.8403],
          [ 0.9252, -0.3428,  0.3314]],

         [[-0.4104,  0.1264,  0.2063],
          [ 3.0852,  0.3418,  0.4495],
          [ 0.8446,  0.2561, -0.7031]],

         [[ 0.6151,  0.8057,  0.2074],
          [-1.7569,  0.4648, -0.1453],
          [ 0.8271,  0.8471, -1.1014]]],


        [[[-0.2911, -0.9113,  2.4088],
          [ 1.5399, -0.0792, -1.0721],
          [-0.8771, -0.8096, -0.0929]],

         [[ 1.1522,  0.5074, -0.3919],
          [-0.7024,  1.3359,  0.7152],
          [-0.3411, -0.1181,  0.8319]],

         [[ 1.9686,  0.2805,  1.9455],
     

In [None]:
# Show the layer-normalized tensor computed in the previous cell.
print(output)

tensor([[[[ 0.4008, -1.0543, -1.7911],
          [ 1.3586,  0.4631,  1.2395],
          [ 0.5474, -0.4621,  0.1772]],

         [[ 0.4189,  0.1138,  0.9976],
          [ 0.2338,  0.7382,  2.7651],
          [-0.8563, -1.1009, -0.3108]],

         [[-0.2624,  0.4267, -0.8547],
          [-0.2578,  0.1593, -1.9128],
          [-0.6617, -1.1533,  0.6382]]],


        [[[ 1.0517,  0.6436, -1.4734],
          [-1.0835,  1.2238,  0.5551],
          [ 0.6457, -0.7085,  0.0116]],

         [[-0.7807, -0.2075, -0.1221],
          [ 2.9526,  0.0226,  0.1376],
          [ 0.5597, -0.0689, -1.0933]],

         [[ 0.3146,  0.5181, -0.1209],
          [-2.2188,  0.1540, -0.4976],
          [ 0.5409,  0.5623, -1.5187]]],


        [[[-0.4806, -1.0278,  1.9016],
          [ 1.1349, -0.2936, -1.1697],
          [-0.9976, -0.9380, -0.3057]],

         [[ 0.7929,  0.2240, -0.5695],
          [-0.8434,  0.9549,  0.4074],
          [-0.5247, -0.3279,  0.5103]],

         [[ 1.5133,  0.0238,  1.4928],
     