### **_Encoder_**

In [21]:
import torch
import torch.nn as nn
from transformers import BertModel, BertTokenizer
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset

import numpy as np
import pandas as pd

import spacy
nlp = spacy.load("en_core_web_sm")

In [41]:
global device
global vocab_size
global lstm_hidden_size
global num_layers
global batch_size

In [42]:
bert_model_name = 'bert-base-uncased'
bert_tokenizer = BertTokenizer.from_pretrained(bert_model_name)
bert_model = BertModel.from_pretrained(bert_model_name)

# global variable iinitialize
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
vocab_size = bert_model.config.vocab_size
lstm_hidden_size = 256
num_layers = 2
batch_size = 6

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


### **_Encoder_**

In [38]:
class Encoder(nn.Module):

    def __init__(self, bert_model):
        super(Encoder, self).__init__()
        self.bert = bert_model
        
    def forward(self, input_ids, attention_mask):
        # with torch.no_grad():
        outputs = self.bert(input_ids, attention_mask=attention_mask)
        last_hidden_state = outputs.last_hidden_state
        return last_hidden_state


### **_Decoder_**

In [25]:
class Decoder(nn.Module):

    def __init__(self, lstm_hidden_size, num_layers, vocab_size):
        super(Decoder, self).__init__()
        self.lstm = nn.LSTM(input_size=bert_model.config.hidden_size,
                            hidden_size=lstm_hidden_size,
                            num_layers=num_layers)
        self.linear = nn.Linear(lstm_hidden_size, vocab_size)
    
    def forward(self, input):
        lstm_outputs, _ = self.lstm(input)
        output = self.linear(lstm_outputs)
        return output
    

### **_Seq2Seq_**

In [26]:
class Seq2Seq(nn.Module):
    
    def __init__(self, encoder, decoder) -> None:
        '''
        '''
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
    
    def forward(self, input_ids, attention_mask):
        last_hidden_state = self.encoder(input_ids, attention_mask)
        decoder_output = self.decoder(last_hidden_state)
        return decoder_output
        

In [None]:
class Seq2Seqwithattn():

    def __init__(self, encoder, decoder, attn_embed_dim, num_heads) -> None:
        super(Seq2Seqwithattn, self).__init__()
        self.encoder = encoder
        self.multihead_attn = nn.MultiheadAttention(attn_embed_dim, num_heads)
        self.decoder = decoder

    def forward(self, input_ids, attention_mask):
        encoder_outputs  = self.encoder(input_ids, attention_mask)
        attn_output, _ = self.multihead_attn(encoder_outputs,
                                             encoder_outputs,
                                             encoder_outputs)
        decoder_outputs = self.decoder(attn_output)
        return decoder_outputs

In [None]:
class bertsformer(nn.Module):

    def __init__(self, bert_model):
        super(bertsformer, self).__init__()
        self.bert_encoder = bert_model
        self.trans_decoder = nn.TransformerDecoder(d_model=512, nhead=8)

    def forward(self, input_ids, attention_mask):
        bert_output = self.bert_encoder(input_ids=input_ids, attention_mask=attention_mask)
        hidden_states = bert_output['last_hidden_state']
        # Assuming you have the necessary inputs for the TransformerDecoder
        trans_decoder_output = self.trans_decoder(hidden_states, num_layers=6)
        return trans_decoder_output


### **_Preprocess data_**

In [27]:
def preprocess_data(text):
    return [tok.text for tok in nlp.tokenizer(text)]

In [28]:
class PubMedDataset(Dataset):
    
    def __init__(self, df, tokenizer) -> None:
        super().__init__()
        self.df = df
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        input_seq = self.df.loc[index, "input"]
        target_seq = self.df.loc[index, "target"]

        encoded_inputs = self.tokenizer.encode_plus(
            input_seq,
            add_special_tokens=True,
            padding='max_length', 
            max_length=128,
            truncation=True,
            return_tensors='pt'
        )
        encoded_targets = self.tokenizer.encode_plus(
            target_seq,
            add_special_tokens=True,
            padding='max_length',
            max_length=128,
            truncation=True,
            return_tensors='pt'
        )

        input_ids = encoded_inputs['input_ids'].squeeze(0)
        attention_mask = encoded_inputs['attention_mask'].squeeze(0)
        target_ids = encoded_targets['input_ids'].squeeze(0)

        return input_ids, attention_mask, target_ids

In [29]:
import json

training_set = r'./../spider/meddialog/results/eval_wer_json/pubmed_46374_train.json'
with open(training_set, 'r') as td:
    data = json.load(td)

In [36]:
utterances = []
results = []

for d in data:
    utterances.append(d['utterances']['pubmed'])
    results.append(d['results']['pubmed'][2:])

print(len(utterances))

pubmed_df = pd.DataFrame({'input': utterances, 'target': results})
PD = PubMedDataset(pubmed_df, bert_tokenizer)
dataloader = DataLoader(PD, batch_size=6, shuffle=True)

print(pubmed_df.head())

46374
                                               input   
0  This study aimed to investigate the efficacy a...  \
1  Over recent decades , the abundance and geogra...   
2  There are few data on the prevalence of obesit...   
3  Cytomegalovirus establishes a lifelong infecti...   
4  Early identification of patients with acute di...   

                                              target  
0  this study aimed toinvestigate the efficacy an...  
1  over recent decades the abundance and geograph...  
2  there are few data and the prevalence of obesi...  
3  cytomegalovirus establishes lifelong infection...  
4  early identification and patients with acute d...  


### **_Train model_**

In [37]:
def train_model(model, criterion, optimizer, dataloader, num_epochs):
    model.train()

    for epoch in range(num_epochs):
        for i, (input_ids, attention_mask, target_ids) in enumerate(dataloader):
            optimizer.zero_grad()

            # 將資料移到 device
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            target_ids = target_ids.to(device)

            # forward
            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs.view(-1, vocab_size), target_ids.view(-1))

            # backward & optimization
            loss.backward()
            optimizer.step()
            
            print(f"Epoch [{epoch+1}/{num_epochs}], Step [{i+1}/{len(dataloader)}], Loss: {loss.item()}")

In [39]:
encoder = Encoder(bert_model=bert_model)
decoder = Decoder(lstm_hidden_size=lstm_hidden_size, num_layers=num_layers, vocab_size=vocab_size)
bert_lstm_s2s_model = Seq2Seq(encoder, decoder)

criterion = nn.CrossEntropyLoss()
# optimizer = torch.optim.Adam(bert_lstm_s2s_model.parameters(), lr=0.001)
optimizer = torch.optim.AdamW(bert_lstm_s2s_model.parameters(), lr=0.001)

train_model(bert_lstm_s2s_model, \
            criterion=criterion, \
            optimizer=optimizer, \
            dataloader=dataloader, \
            num_epochs=10)

# torch.save(model.state_dict(), 'model.pth')

Epoch [1/10], Step [1/7729], Loss: 10.329257011413574
Epoch [1/10], Step [2/7729], Loss: 10.11476993560791
Epoch [1/10], Step [3/7729], Loss: 9.888408660888672
Epoch [1/10], Step [4/7729], Loss: 9.4196195602417
Epoch [1/10], Step [5/7729], Loss: 9.075414657592773
Epoch [1/10], Step [6/7729], Loss: 7.967618942260742
Epoch [1/10], Step [7/7729], Loss: 7.450380802154541
Epoch [1/10], Step [8/7729], Loss: 6.690471649169922
Epoch [1/10], Step [9/7729], Loss: 8.086164474487305
Epoch [1/10], Step [10/7729], Loss: 6.265773773193359
Epoch [1/10], Step [11/7729], Loss: 8.07442569732666
Epoch [1/10], Step [12/7729], Loss: 6.242560863494873
Epoch [1/10], Step [13/7729], Loss: 4.707840919494629
Epoch [1/10], Step [14/7729], Loss: 6.338712692260742
Epoch [1/10], Step [15/7729], Loss: 3.845024824142456
Epoch [1/10], Step [16/7729], Loss: 5.520803928375244
Epoch [1/10], Step [17/7729], Loss: 4.982219219207764
Epoch [1/10], Step [18/7729], Loss: 4.27101469039917
Epoch [1/10], Step [19/7729], Loss: 4.97

KeyboardInterrupt: 

### **_Evaluate the model_**

In [40]:
predicted_sentences = []
bert_lstm_s2s_model.eval()

with torch.no_grad():
    for input_ids, attention_mask, target_ids in dataloader:
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)

        outputs = bert_lstm_s2s_model(input_ids, attention_mask)

        _, predicted_ids = torch.max(outputs, dim=2)

        for ids in predicted_ids:
            tokens = bert_tokenizer.convert_ids_to_tokens(ids, skip_special_tokens=True)
            sentence = bert_tokenizer.convert_tokens_to_string(tokens)
            predicted_sentences.append(sentence)


print(predicted_sentences[:10])


Seq2Seq(
  (encoder): Encoder(
    (bert): BertModel(
      (embeddings): BertEmbeddings(
        (word_embeddings): Embedding(30522, 768, padding_idx=0)
        (position_embeddings): Embedding(512, 768)
        (token_type_embeddings): Embedding(2, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): BertEncoder(
        (layer): ModuleList(
          (0-11): 12 x BertLayer(
            (attention): BertAttention(
              (self): BertSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): BertSelfOutput(
                (dense): Linear(in_features=768, out_features=768, bias=True)
               

In [51]:
import torch
import numpy

a = np.random.randint(-1,2,(3,3))
print(a)

a = np.matmul(a,a.T)
print(a)

[[ 0 -1  1]
 [-1  1 -1]
 [ 0  1  0]]


In [52]:
a = np.matmul(a,a.T)
print(a)

[[ 2 -2 -1]
 [-2  3  1]
 [-1  1  1]]
