# import packages

In [None]:
import pandas as pd
import numpy as np
import json
import torch
from torch import nn
import math

# preprocess data

In [None]:
# Mount Google Drive at /content/drive. This import only exists inside the
# Google Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import os
# Switch the working directory to the project folder on the mounted Drive so
# that relative paths are resolved against it.
os.chdir('/content/drive/MyDrive/NLP')

In [None]:
def _load_json(path):
    """Read one JSON file and return the decoded Python object.

    Arguments:
        path : location of the JSON file

    Returns:
        whatever ``json.load`` produces for the file content
    """
    # Pass the encoding explicitly so the result does not depend on the
    # platform's default locale encoding.
    with open(path, 'r', encoding='utf-8') as json_file:
        return json.load(json_file)


# The three dataset splits are read with the same helper.
english_train = _load_json('data/english-train.json')
english_dev = _load_json('data/english-dev.json')
english_test = _load_json('data/english-test.json')


In [None]:
# Peek at the second utterance of the first dev dialogue.
english_dev[0]['utterances'][1]

'doctor: in brief: best to stay home right now stay home, consult here. disinfect everything and stay safe. we are here to answer your questions. would you like to video or text chat with me?'

In [None]:
# Pool every dialogue of the three splits, in train -> dev -> test order.
all_dialogues = [*english_train, *english_dev, *english_test]

# Utterance 0 of each dialogue goes to patient_ques and utterance 1 to
# docter_ans.  The first 9 / 8 characters are sliced off (presumably the
# speaker prefixes such as 'doctor: ' -- verify against the data) and the
# remainder is wrapped in SOS / EOS markers.
patient_ques = ['SOS ' + dialogue['utterances'][0][9:] + ' EOS' for dialogue in all_dialogues]
docter_ans = ['SOS ' + dialogue['utterances'][1][8:] + ' EOS' for dialogue in all_dialogues]

# Every text in one list: all questions first, then all answers.
tot_data = patient_ques + docter_ans

In [None]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Initialize the tokenizer.  num_words caps the ids produced when encoding;
# words outside that cap are encoded as the "<OOV>" token.
tokenizer = Tokenizer(num_words=10000, oov_token="<OOV>")
dialogs = tot_data
# Fit the tokenizer on every question and answer text.
tokenizer.fit_on_texts(dialogs)

# Convert texts to sequences of integer ids.  No padding is applied, so the
# sequences keep their individual lengths.
# NOTE: this rebinds patient_ques / docter_ans from lists of strings to lists
# of id lists, so this cell is not idempotent -- re-run the preprocessing cell
# that builds the text lists before running it again.
patient_ques = tokenizer.texts_to_sequences(patient_ques)
docter_ans = tokenizer.texts_to_sequences(docter_ans)

# Print a short summary instead of the whole vocabulary and every sequence,
# which floods the notebook output.
print("Vocabulary size = ", len(tokenizer.word_index))
print("First sequences = ", patient_ques[:3])

Sequences =  [[7, 54, 6, 336, 80, 3, 314, 4, 38, 6, 89, 2781, 1338, 476, 12, 829, 11, 5, 60, 130, 337, 14, 23, 48, 12, 78, 13, 2782, 13, 5, 60, 8], [7, 1339, 71, 2, 14, 55, 140, 34, 15, 187, 6, 120, 3, 55, 6, 202, 593, 47, 99, 120, 15, 5, 338, 256, 58, 2, 14, 48, 189, 961, 39, 2, 78, 20, 111, 39, 2, 184, 6, 111, 8], [7, 2, 14, 6, 536, 3, 962, 63, 13, 6, 148, 40, 56, 47, 3, 56, 512, 121, 21, 537, 19, 65, 8], [7, 67, 45, 1057, 117, 5, 963, 352, 15, 22, 26, 8], [7, 118, 436, 135, 2, 43, 888, 282, 167, 13, 30, 2, 53, 6, 141, 96, 1981, 2783, 1176, 2784, 889, 1058, 2785, 211, 404, 44, 1560, 151, 1982, 3, 477, 1983, 44, 766, 151, 4, 629, 404, 35, 43, 27, 2786, 15, 889, 1058, 830, 44, 203, 322, 2, 379, 6, 2787, 109, 2, 1982, 95, 339, 20, 1984, 630, 10, 202, 122, 712, 29, 4, 2788, 3, 477, 29, 1984, 1340, 290, 120, 25, 294, 184, 57, 15, 256, 58, 8], [7, 25, 9, 78, 65, 16, 68, 60, 37, 5, 713, 64, 37, 964, 4, 160, 37, 6, 831, 11, 1985, 239, 594, 68, 832, 27, 1177, 1986, 8], [7, 283, 405, 32, 2789,

In [None]:
class EmbeddingBlock(nn.Module):
    """
    Token embedding with a fixed sinusoidal positional encoding added on top.

    Arguments:
        num_embeddings : the number of word types
        embedding_dim : the dimension of embedding vector (must be even, because
                        the encoding fills sin/cos channel pairs)

    Raises:
        ValueError : if embedding_dim is odd
    """

    def __init__(self, num_embeddings, embedding_dim):
        super(EmbeddingBlock, self).__init__()

        if embedding_dim % 2 != 0:
            raise ValueError(f"embedding_dim must be even, got {embedding_dim}")

        self.embedding_dim = embedding_dim
        # Divisor for channel pair k: 10000^(2k / embedding_dim).
        self.pos_units = [10000**(2*i/self.embedding_dim) for i in range(self.embedding_dim//2)]

        self.embedding = nn.Embedding(num_embeddings=num_embeddings, embedding_dim=embedding_dim)

    def forward(self, x):
        """
        input : indexed words (batch_size, num_words)
        output : word embeddings (batch_size, num_words, embedding_dim)
        """

        out = self.embedding(x)
        num_words = out.shape[1]

        # angles[p, k] = p / pos_units[k].  The division is done in float64 on
        # the CPU (as the Python-float arithmetic of a per-element loop would
        # be) and the result is then moved to the dtype/device of the
        # embeddings, so the block also works after model.to(device).
        positions = torch.arange(num_words, dtype=torch.float64).unsqueeze(1)
        units = torch.tensor(self.pos_units, dtype=torch.float64)
        angles = (positions / units).to(dtype=out.dtype, device=out.device)

        # Even channels carry the sine, odd channels the cosine of the same angle.
        pos = torch.zeros(num_words, self.embedding_dim, dtype=out.dtype, device=out.device)
        pos[:, 0::2] = torch.sin(angles)
        pos[:, 1::2] = torch.cos(angles)

        # pos is (num_words, embedding_dim) and broadcasts over the batch axis.
        return out + pos


class AttentionBlock(nn.Module):
    """
    A single scaled dot-product attention head with its own Q/K/V projections.

    Arguments:
        in_channel : the dimension of embedding vector
        out_channel : the dimension of query/key/value vector


    Variables:
        in_channel : d_model
        out_channel : d_k
    """

    def __init__(self, in_channel, out_channel):
        super(AttentionBlock, self).__init__()

        self.in_channel = in_channel
        self.out_channel = out_channel

        self.fc_q = nn.Linear(in_channel, out_channel)  # W^Q
        self.fc_k = nn.Linear(in_channel, out_channel)  # W^K
        self.fc_v = nn.Linear(in_channel, out_channel)  # W^V

        # Normalise over the last axis of the score matrix (the keys), so each
        # query's weights sum to 1.
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, Q, K, V):
        """
        input : Q (batch_size, num_queries, in_channel),
                K and V (batch_size, num_keys, in_channel)
        output : attended values (batch_size, num_queries, out_channel)
        """
        out_q = self.fc_q(Q)
        out_k = self.fc_k(K)
        out_v = self.fc_v(V)

        # Scores are (batch_size, num_queries, num_keys), scaled by sqrt(d_k),
        # i.e. the size of the projected query/key vectors.
        scores = out_q @ out_k.transpose(-2, -1) / math.sqrt(self.out_channel)
        weights = self.softmax(scores)

        # Weighted sum of the value vectors for every query position.
        return weights @ out_v


class MultiHeadAttentionBlock(nn.Module):
    """
    One encoder layer: multi-head self-attention followed by a position-wise
    feed-forward network, each with a residual connection and LayerNorm.

    Arguments:
        in_channel : the dimension of embedding vector
        num_attention : the number of attention heads
        hidden_channel : the number of hidden channels in Position-wise Feed-Forward Networks

    Variables:
        in_channel : d_model
        hidden_channel : d_ff
        num_attention : h

    Raises:
        ValueError : if num_attention is not positive or does not divide in_channel
    """

    def __init__(self, in_channel, num_attention, hidden_channel):
        super(MultiHeadAttentionBlock, self).__init__()

        # Each head outputs in_channel // num_attention features and the heads
        # are concatenated before W^O, which expects exactly in_channel inputs.
        # Fail early instead of with a matmul shape error in forward().
        if num_attention <= 0 or in_channel % num_attention != 0:
            raise ValueError(
                f"in_channel ({in_channel}) must be divisible by a positive "
                f"num_attention ({num_attention})"
            )

        self.num_attention = num_attention

        self.heads = nn.ModuleList([AttentionBlock(in_channel, in_channel // self.num_attention) for _ in range(num_attention)])
        # Not used in forward(); kept so the module's attributes stay unchanged.
        self.flatten = nn.Flatten()

        self.fc = nn.Linear(in_channel, in_channel)   # W^O

        self.ln1 = nn.LayerNorm((in_channel))


        self.ffc = nn.Sequential(nn.Linear(in_channel, hidden_channel),        # Position-wise Feed-Forward Networks
                                    nn.ReLU(),
                                    nn.Linear(hidden_channel, in_channel)
                                )

        self.ln2 = nn.LayerNorm((in_channel))


    def forward(self, x):
        """
        input : embedded words (batch_size, num_words, in_channel)
        output : encoded words (batch_size, num_words, in_channel)
        """
        # Self-attention: every head receives x as query, key and value.
        outs = [head(x, x, x) for head in self.heads]
        # Concatenate the heads along the feature axis, then mix them with W^O.
        out = torch.cat(outs, dim=2)
        out = self.fc(out)

        # Residual connection + LayerNorm around the attention sub-layer.
        out = self.ln1(out + x)

        # Residual connection + LayerNorm around the feed-forward sub-layer.
        out = self.ln2(out + self.ffc(out))

        return out


class TransformerEncoder(nn.Module):
    """
    A stack of MultiHeadAttentionBlock layers, optionally preceded by an
    EmbeddingBlock.

    Arguments:
        num_embeddings : the number of word types (only used when use_embedding is True)
        num_enc_layers : the number of stacked encoder layers
        embedding_dim : the dimension of embedding vector
        num_attention : the number of attention heads
        hidden_channel : the number of hidden channels in Position-wise Feed-Forward Networks
        use_embedding : whether the input is first passed through an EmbeddingBlock
    """

    def __init__(self, num_embeddings, num_enc_layers=6, embedding_dim=512, num_attention=8, hidden_channel=2048, use_embedding=True):
        super().__init__()

        self.num_enc_layers = num_enc_layers
        self.embedding_dim = embedding_dim
        self.num_attention = num_attention
        self.hidden_channel = hidden_channel
        self.use_embedding = use_embedding

        # The embedding sub-module exists only when it was requested.
        if self.use_embedding:
            self.embedding = EmbeddingBlock(num_embeddings, embedding_dim)

        # Build the encoder layers one after another, all with the same sizes.
        layers = []
        for _ in range(self.num_enc_layers):
            layers.append(
                MultiHeadAttentionBlock(
                    in_channel=self.embedding_dim,
                    num_attention=self.num_attention,
                    hidden_channel=self.hidden_channel,
                )
            )
        self.multihead_attention_blocks = nn.ModuleList(layers)

    def forward(self, x):
        """
        input : indexed words (batch_size, num_words) when use_embedding is True;
                otherwise x is handed to the first encoder layer unchanged
        output : the result of the last encoder layer
        """
        out = self.embedding(x) if self.use_embedding else x

        # Feed the result of each layer into the next one.
        for layer in self.multihead_attention_blocks:
            out = layer(out)

        return out

In [None]:
class SegmentEmbedding(nn.Embedding):   # referenced from https://github.com/codertimo/BERT-pytorch/blob/master/bert_pytorch/model/embedding/segment.py
    """Embedding table with exactly three rows, each of size ``embedding_dim``."""

    def __init__(self, embedding_dim):
        # The number of rows is fixed at 3; only the vector size is configurable.
        super().__init__(num_embeddings=3, embedding_dim=embedding_dim)


class BERTEmbeddingBlock(nn.Module):
    """
    Sum of three embeddings: token embedding, fixed sinusoidal position
    encoding and segment embedding.

    Arguments:
        num_embeddings : the number of word types
        embedding_dim : the dimension of embedding vector (must be even, because
                        the position encoding fills sin/cos channel pairs)

    Variables:
        embedding_dim : d_model

    Raises:
        ValueError : if embedding_dim is odd
    """

    def __init__(self, num_embeddings, embedding_dim):
        super(BERTEmbeddingBlock, self).__init__()

        if embedding_dim % 2 != 0:
            raise ValueError(f"embedding_dim must be even, got {embedding_dim}")

        self.embedding_dim = embedding_dim
        # Divisor for channel pair k: 10000^(2k / embedding_dim).
        self.pos_units = [10000**(2*i/self.embedding_dim) for i in range(self.embedding_dim//2)]

        self.embedding = nn.Embedding(num_embeddings=num_embeddings, embedding_dim=embedding_dim)
        self.segment_embedding = SegmentEmbedding(self.embedding_dim) # (author's note) this could probably be made smaller

    def forward(self, x, segment_info):
        """
        input : indexed words (batch_size, num_words) and segment ids of the
                same shape (ids index a 3-row SegmentEmbedding table)
        output : word embeddings (batch_size, num_words, embedding_dim)
        """

        out = self.embedding(x)         # Token Embedding
        num_words = out.shape[1]

        # Position Embedding: angles[p, k] = p / pos_units[k].  The division is
        # done in float64 on the CPU and then moved to the dtype/device of the
        # token embeddings, so the block also works after model.to(device).
        positions = torch.arange(num_words, dtype=torch.float64).unsqueeze(1)
        units = torch.tensor(self.pos_units, dtype=torch.float64)
        angles = (positions / units).to(dtype=out.dtype, device=out.device)

        # Even channels carry the sine, odd channels the cosine of the same angle.
        pos = torch.zeros(num_words, self.embedding_dim, dtype=out.dtype, device=out.device)
        pos[:, 0::2] = torch.sin(angles)
        pos[:, 1::2] = torch.cos(angles)

        # pos is (num_words, embedding_dim) and broadcasts over the batch axis.
        out = out + pos

        out = out + self.segment_embedding(segment_info)    # Segment Embedding   # referenced from https://github.com/codertimo/BERT-pytorch/blob/master/bert_pytorch/model/embedding/segment.py

        return out



class BERT(nn.Module):

    """
    BERT-style encoder: a BERTEmbeddingBlock followed by a stack of
    TransformerEncoder blocks.  The forward pass returns the output of the last
    block; no vocabulary projection is applied inside this class.

    Arguments:
        num_embeddings : the number of word types
        num_transformer_block : the number of stacked TransformerEncoder blocks
        num_enc_layers : the number of encoder layers inside each TransformerEncoder
        embedding_dim : the dimension of embedding vector
        num_attention : the number of attention heads
        hidden_channel : the number of hidden channels in Position-wise Feed-Forward Networks

    Variables:
        embedding_dim : d_model
    """
    def __init__(self, num_embeddings=30000, num_transformer_block=6, num_enc_layers=1, embedding_dim=768, num_attention=12, hidden_channel=3072):
        super(BERT, self).__init__()

        self.num_embeddings = num_embeddings
        self.num_transformer_block = num_transformer_block
        self.num_enc_layers = num_enc_layers
        self.embedding_dim = embedding_dim
        self.num_attention = num_attention
        self.hidden_channel = hidden_channel

        self.embedding = BERTEmbeddingBlock(self.num_embeddings, self.embedding_dim)

        # The encoders are built with use_embedding=False, so num_embeddings is
        # passed only to satisfy the required constructor argument.
        self.transformer_blocks = nn.ModuleList([TransformerEncoder(num_embeddings=self.num_embeddings,
                                                                  num_enc_layers=self.num_enc_layers,
                                                                  embedding_dim=self.embedding_dim,
                                                                  num_attention=self.num_attention,
                                                                  hidden_channel=self.hidden_channel,
                                                                  use_embedding=False,)
                                                                        for _ in range(self.num_transformer_block)])

    def forward(self, x, segment_info):         # referenced from https://github.com/codertimo/BERT-pytorch/blob/master/bert_pytorch/model/embedding/segment.py
        """
        input : indexed words and segment ids, both handed to the embedding block
        output : the result of the last transformer block
        """
        x = self.embedding(x, segment_info)

        for transformer in self.transformer_blocks:
            # Call the module itself rather than .forward() so that any
            # registered forward hooks are run.
            x = transformer(x)

        return x

In [None]:
# Only the last bare expression of a cell is displayed, so show both lists in
# a single expression, limited to the first few entries of each.
patient_ques[:3], docter_ans[:3]

In [None]:
from transformers import BertTokenizer, BertModel

In [None]:
import os

# Seed every random number generator used here so the run is repeatable.
seed = 42
np.random.seed(seed)
# NOTE: PYTHONHASHSEED is read when the interpreter starts, so setting it here
# only affects processes launched afterwards.
os.environ['PYTHONHASHSEED'] = str(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# Keras Tokenizer ids start at 1 (0 is reserved), so the largest id equals
# len(word_index) and the embedding table needs one more row than that.
vocab_size = len(tokenizer.word_index) + 1
model = BERT(num_embeddings=vocab_size)
print(sum(param.numel() for param in model.parameters()))

# Smoke test with random token ids in [0, 3): batch_size 2, max_len 3.
sample_tokens = torch.randint(0, 3, (2, 3))
segment_info = torch.tensor([[0, 0, 0], [1, 1, 1]], dtype=torch.int64)

output = model(sample_tokens, segment_info)
print(output)

46685952
tensor([[[ 0.0535, -2.4851, -0.4532,  ...,  0.6194,  1.4986,  1.0238],
         [ 0.4832, -2.6986, -0.1319,  ...,  0.5717,  1.4471,  0.9413],
         [ 1.3806, -1.0021,  1.1133,  ..., -0.9119,  0.9023,  0.3246]],

        [[-1.3049, -1.3911,  0.5762,  ..., -2.0174, -0.7108,  0.8555],
         [-1.1055, -2.6931, -0.2265,  ..., -1.4985, -0.6140,  1.2815],
         [-1.0695, -1.8823,  0.8891,  ..., -1.9811, -0.8192,  0.8120]]],
       grad_fn=<NativeLayerNormBackward0>)


In [None]:
import torch
from torch.utils.data import DataLoader
from torch.nn import CrossEntropyLoss
from torch.optim import Adam

# NOTE(review): ConversationDataset, pat_qus, doc_ans, seg_emb, batch_size,
# num_epochs and device are not defined in the visible part of this notebook,
# and the recorded output of this cell is a NameError.  They have to be
# defined in an earlier cell before this one can run.

# Create dataset and dataloader
conversation_dataset = ConversationDataset(pat_qus, doc_ans, seg_emb)
conversation_dataloader = DataLoader(conversation_dataset, batch_size=batch_size, shuffle=True)

# Initialize the model.  Keras Tokenizer ids start at 1 (0 is reserved), so the
# largest id equals len(word_index) and the embedding table needs one more row.
model = BERT(num_embeddings=len(tokenizer.word_index) + 1)
model.to(device)

# Define loss function and optimizer
loss_function = CrossEntropyLoss()
optimizer = Adam(model.parameters(), lr=0.001)

# Training loop
for epoch in range(num_epochs):  # num_epochs should be defined
    model.train()
    total_loss = 0

    for i, data in enumerate(conversation_dataloader):
        # Move the batch to the same device as the model.
        patient_questions = data['patient_question'].to(device)
        doctor_answers = data['doctor_answer'].to(device)
        segment_embeddings = data['segment_embedding'].to(device)

        optimizer.zero_grad()
        # Forward pass, keeping only sequence positions from index 201 onward.
        # NOTE(review): 201 is presumably where the answer part of the input
        # starts -- confirm against ConversationDataset.
        output = model(patient_questions, segment_embeddings)[:, 201:, :]

        # Compute loss - ensure doctor_answers is the correct target and has the right shape.
        # reshape (not view) is required: slicing the sequence axis leaves a
        # non-contiguous tensor whose first two axes cannot be merged by view
        # when the batch holds more than one sample.
        # NOTE(review): the last axis of output has the model's embedding_dim,
        # not the vocabulary size, so CrossEntropyLoss only accepts targets
        # smaller than embedding_dim -- a vocabulary projection is probably
        # missing; confirm the intended target.
        loss = loss_function(output.reshape(-1, output.size(-1)), doctor_answers.reshape(-1))
        total_loss += loss.item()

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {total_loss/len(conversation_dataloader)}")

# Optionally, you can add validation steps and model saving


NameError: ignored

In [None]:
# Evaluate the loss on the current `output` / `doctor_answers`.  reshape is
# used instead of view because a tensor sliced along its sequence axis is
# non-contiguous and view fails on it for batches of more than one sample.
loss_function(output.reshape(-1, output.size(-1)), doctor_answers.reshape(-1))