In [1]:
import os
import re
import json
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
from collections import Counter

from sklearn.model_selection import train_test_split

from transformers import AutoTokenizer, TFAutoModel

import tensorflow as tf
import tensorflow.keras.backend as K
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Input,Dense
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import Concatenate,Conv1D,MaxPool1D,Dropout

from src.config import Config

In [2]:
corpus_movie_conv = "cornell movie-dialogs corpus/movie_conversations.txt"
corpus_movie_lines = "cornell movie-dialogs corpus/movie_lines.txt"
max_len = 25

In [3]:
dialogpt_tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-small")
dialogpt_model = TFAutoModel.from_pretrained("microsoft/DialoGPT-small")

biobert_tokenizer = AutoTokenizer.from_pretrained("cambridgeltl/BioRedditBERT-uncased")
biobert_model = TFAutoModel.from_pretrained("cambridgeltl/BioRedditBERT-uncased")

All model checkpoint layers were used when initializing TFGPT2Model.

All the layers of TFGPT2Model were initialized from the model checkpoint at microsoft/DialoGPT-small.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2Model for predictions without further training.
All model checkpoint layers were used when initializing TFBertModel.

All the layers of TFBertModel were initialized from the model checkpoint at cambridgeltl/BioRedditBERT-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [5]:
def parse_chat_data():
    file = os.path.join(Config.FILES["RAW_DATA_DIR"], "chat.json")
    with open(file) as f:
        data = json.loads(f.read())
        
    parsed_dials = []
    for line in data:
        tags = line["topic"]
        dialogue = line["dialogue"]
        
        all_tags = []
        all_emotion = []
        all_text = []
        for content in dialogue:
            text = content["text"]
            all_text.append(text)
            emotion = content["emotion"]
            all_emotion.append(emotion)
            all_tags.append(tags)

        parsed_dials.append({"persona1": [], "persona2": [], "turns": all_text, "emotions": all_emotion, "tags": all_tags})
        dial_sent_lens = [len(s) for turn in parsed_dials for s in turn["turns"]]
        
    dialogues_df = pd.DataFrame()
    for turns in parsed_dials:
        temp_df = pd.DataFrame()
        temp_turns = turns["turns"]
        tuples_turns_list = [(temp_turns[i], temp_turns[i+1]) for i in range(0, len(temp_turns)-1, 2)]
        temp_turns_df = pd.DataFrame(tuples_turns_list, columns=["questions", "answers"])
        
        temp_emotions = turns["emotions"]
        tuples_emotions_list = [(temp_emotions[i], temp_emotions[i+1]) for i in range(0, len(temp_emotions)-1, 2)]
        temp_emotions_df = pd.DataFrame(tuples_emotions_list, columns=["questions_emotions", "answers_emotions"])
        
        temp_tags = turns["tags"]
        tuples_tags_list = [(temp_tags[i], temp_tags[i+1]) for i in range(0, len(temp_tags)-1, 2)]
        temp_tags_df = pd.DataFrame(tuples_tags_list, columns=["questions_tags", "answers_tags"])
        
        temp_df = pd.concat([temp_turns_df, temp_emotions_df, temp_tags_df], axis=1)
        dialogues_df = pd.concat([dialogues_df, temp_df])
        
    dialogues_df = dialogues_df.reset_index(drop=True)
    dialogues_df["emotions"] = dialogues_df.apply(lambda row: [row["questions_emotions"], row["answers_emotions"]], axis=1)
    dialogues_df["tags"] = dialogues_df.apply(lambda row: [row["questions_tags"], row["answers_tags"]], axis=1)
    
    dialogues_df["tags_encode"] = dialogues_df["tags"].apply(lambda x: _extract_tags(x))
    dialogues_df["label"] = dialogues_df["tags_encode"].apply(lambda x: _tags_encoder(x))
        
    return data, parsed_dials, dialogues_df


def decontractions(phrase):
    """decontracted takes text and convert contractions into natural form.
     ref: https://stackoverflow.com/questions/19790188/expanding-english-language-contractions-in-python/47091490#47091490"""
    phrase = re.sub(r"won\"t", "will not", phrase)
    phrase = re.sub(r"can\'t", "can not", phrase)
    phrase = re.sub(r"won\’t", "will not", phrase)
    phrase = re.sub(r"can\’t", "can not", phrase)
    phrase = re.sub(r"n\"t", " not", phrase)
    phrase = re.sub(r"\"re", " are", phrase)
    phrase = re.sub(r"\"s", " is", phrase)
    phrase = re.sub(r"\"d", " would", phrase)
    phrase = re.sub(r"\"ll", " will", phrase)
    phrase = re.sub(r"\"t", " not", phrase)
    phrase = re.sub(r"\"ve", " have", phrase)
    phrase = re.sub(r"\"m", " am", phrase)

    phrase = re.sub(r"n\’t", " not", phrase)
    phrase = re.sub(r"\’re", " are", phrase)
    phrase = re.sub(r"\’s", " is", phrase)
    phrase = re.sub(r"\’d", " would", phrase)
    phrase = re.sub(r"\’ll", " will", phrase)
    phrase = re.sub(r"\’t", " not", phrase)
    phrase = re.sub(r"\’ve", " have", phrase)
    phrase = re.sub(r"\’m", " am", phrase)

    return phrase


def preprocess(text):
    # convert all the text into lower letters
    # remove the words betweent brakets ()
    # remove these characters: {"$", ')', '?', '"', '’', '.',  '°', '!', ';', '/', "'", '€', '%', ':', ',', '('}
    # replace these spl characters with space: '\u200b', '\xa0', '-', '/'
    
    text = text.lower()
    text = decontractions(text)
    text = re.sub('[$)\?"’.°!;\'€%:,(/]', "", text)
    text = re.sub("\u200b", " ", text)
    text = re.sub("\xa0", " ", text)
    text = re.sub("-", " ", text)
    return text


def get_chat_data():
    chat_data, parsed_dials, dialogues_df = parse_chat_data()
    
    dialogues_df["preprocessed_question"] = dialogues_df["questions"].apply(preprocess)
    dialogues_df["preprocessed_answer"] = dialogues_df["answers"].apply(preprocess)
    
    dialogues_df["question_len"] = dialogues_df["preprocessed_question"].apply(lambda x: len(x.split(" ")))
    dialogues_df["answer_len"] = dialogues_df["preprocessed_answer"].apply(lambda x: len(x.split(" ")))

    dialogues_df["short_question"] = dialogues_df.apply(lambda x: " ".join(x.preprocessed_question.split(" ")[:500]) if x.question_len>500 else x.preprocessed_question ,axis=1)
    dialogues_df["short_answer"] = dialogues_df.apply(lambda x: " ".join(x.preprocessed_answer.split(" ")[:500]) if x.answer_len>500 else x.preprocessed_answer ,axis=1)
        
    return chat_data, parsed_dials, dialogues_df


def extract_negative_samples(question, tags):
  stop=False
  while (not stop):
    sample_row = dialogues_df.sample()
    sample_tags = dialogues_df["tags"].values[0]
    inter_tags = set(tags[0]).intersection(set(sample_tags))
    
    if len(inter_tags)==0:
      stop=True
  
  return sample_row


def _extract_tags(tags_list):
    if all(x == tags_list[0] for x in tags_list):
        return tags_list[0]
    else:
        return np.nan
    
    
def _tags_encoder(tags):
    if tags == "greeting":
        return 1
    elif tags == "delivery":
        return 2
    elif tags == "complaint":
        return 3
    elif tags == "credit_card":
        return 4
    elif tags == "loan":
        return 5
    elif tags == "insurance":
        return 6
    elif tags == "remittance":
        return 7
    elif tags == "attitude_and_emotion":
        return 8
    elif tags == "relationship":
        return 9
    else:
        return np.nan

In [6]:
chat_data, parsed_dials, dialogues_df = get_chat_data()

In [7]:
dialogues_df.head(2)

Unnamed: 0,questions,answers,questions_emotions,answers_emotions,questions_tags,answers_tags,emotions,tags,tags_encode,label,preprocessed_question,preprocessed_answer,question_len,answer_len,short_question,short_answer
0,Hi,"Hi there, I am RAK-Voice, what can I do for you?",neutral,neutral,greeting,greeting,"[neutral, neutral]","[greeting, greeting]",greeting,1,hi,hi there i am rak voice what can i do for you,1,12,hi,hi there i am rak voice what can i do for you
1,"RAK-Voice, this is the first time I knowing yo...","Glad that you ask this questions, I am an inte...",neutral,neutral,greeting,greeting,"[neutral, neutral]","[greeting, greeting]",greeting,1,rak voice this is the first time i knowing you...,glad that you ask this questions i am an intel...,13,33,rak voice this is the first time i knowing you...,glad that you ask this questions i am an intel...


In [8]:
train, validation = train_test_split(dialogues_df, 
                                     test_size=0.1,
                                     random_state=42,
                                     shuffle=True,
                                     stratify=dialogues_df.label)

## Training Model

In [55]:
MAX_LENGTH = 512

def tokenize_and_filter(q, a):
    tokenized_questions, tokenized_answers = [], []
    
    for (question, answer) in zip (q, a):
        tokenized_question = dialogpt_tokenizer.encode(question)
        tokenized_answer = dialogpt_tokenizer.encode(answer)
        tokenized_questions.append(tokenized_question)
        tokenized_answers.append(tokenized_answer)
        
    # padding the sequences
    tokenized_questions = tf.keras.preprocessing.sequence.pad_sequences(
        tokenized_questions, maxlen=MAX_LENGTH, padding="post")
    tokenized_answers = tf.keras.preprocessing.sequence.pad_sequences(
        tokenized_answers, maxlen=MAX_LENGTH, padding="post")
    
    return tokenized_questions, tokenized_answers


def tokenize_and_filter(q, a):
  tokenized_questions, tokenized_answers = [], []
  
  for (question, answer) in zip(q, a):
    # generating sequences
    tokenized_question =  biobert_tokenizer.encode(question)
    tokenized_answer = biobert_tokenizer.encode(answer)
    tokenized_questions.append(tokenized_question)
    tokenized_answers.append(tokenized_answer)

  # padding the sequences
  tokenized_questions = tf.keras.preprocessing.sequence.pad_sequences(
      tokenized_questions, maxlen=MAX_LENGTH, padding="post")
  tokenized_answers = tf.keras.preprocessing.sequence.pad_sequences(
      tokenized_answers, maxlen=MAX_LENGTH, padding="post")
  
  return tokenized_questions, tokenized_answers

In [56]:
questions = train["short_question"]
answers = train["short_answer"]
labels = train["label"]

questions, answers = tokenize_and_filter(questions, answers)

train_question_mask = [[1 if token!=0 else 0 for token in question] for question in questions]
train_answer_mask = [[1 if token!=0 else 0 for token in answer] for answer in answers]

In [57]:
val_questions = validation["short_question"]
val_answers = validation["short_answer"]
val_labels = validation["label"]

val_questions, val_answers = tokenize_and_filter(val_questions, val_answers)

val_question_mask = [[1 if token!=0 else 0 for token in question] for question in val_questions]
val_answer_mask = [[1 if token!=0 else 0 for token in answer] for answer in val_answers]

In [58]:
BATCH_SIZE = 2
BUFFER_SIZE = 20000

dataset = tf.data.Dataset.from_tensor_slices((
    {
        "question": questions,
        "answer": answers,
        "question_mask": train_question_mask,
        "answer_mask": train_answer_mask
    },
    {
        "label": labels.values
    },
))

dataset = dataset.shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)
dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)


val_dataset = tf.data.Dataset.from_tensor_slices((
    {
        "question": val_questions,
        "answer": val_answers,
        "question_mask": val_question_mask,
        "answer_mask": val_answer_mask
    },
    {
        "label": val_labels.values
    },
))

val_dataset = val_dataset.cache()
val_dataset = val_dataset.shuffle(BUFFER_SIZE)
val_dataset = val_dataset.batch(BATCH_SIZE, drop_remainder=True)
val_dataset = val_dataset.prefetch(tf.data.experimental.AUTOTUNE)

In [59]:
class FFN(tf.keras.layers.Layer):
    def __init__(
            self,
            name="FFN",
            **kwargs):
        """Simple Dense wrapped with various layers
        """

        super(FFN, self).__init__(name=name, **kwargs)
        self.dropout = 0.2
        self.ffn_layer = tf.keras.layers.Dense(
            units=768,
            activation="relu",
            kernel_initializer=tf.keras.initializers.glorot_normal(seed=32),
            name="FC1")
        

    def call(self, inputs):
        ffn_embedding = self.ffn_layer(inputs)
        ffn_embedding = tf.keras.layers.Dropout(
            self.dropout)(ffn_embedding)
        ffn_embedding += inputs
        
        return ffn_embedding

In [60]:
biobert_model.summary()

Model: "tf_bert_model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  108310272 
                                                                 
Total params: 108,310,272
Trainable params: 108,310,272
Non-trainable params: 0
_________________________________________________________________


In [61]:
class DialogQAModelwithBert(tf.keras.Model):
    def __init__(self, trainable=False, name=""):
        super(DialogQAModelwithBert, self).__init__(name=name)
        self.q_ffn_layer = FFN(name="q_ffn")
        self.a_ffn_layer = FFN(name="a_ffn")
        self.dialogpt_model = dialogpt_model
        self.dialogpt_model.trainable = trainable
        self.cos = tf.keras.layers.Dot(axes=1, normalize=True)
        
        
    def call(self, inputs):
        question_embeddings = self.dialoggpt_model(input_ids=inputs["question"], attention_mask=inputs["question_mask"]).pooler_output
        answer_embeddings = self.dialoggpt_model(input_ids=inputs["answer"], attention_mask=inputs["answer_mask"]).pooler_output
        q_ffnn = self.q_ffn_layer(question_embeddings)
        a_ffnn = self.a_ffn_layer(answer_embeddings)
        output = self.cos([q_ffnn,a_ffnn])
        
        return {"label":output}
    

class MedicalQAModelwithBert(tf.keras.Model):
    def __init__(
            self,
            trainable=False,
            name=""):
        super(MedicalQAModelwithBert, self).__init__(name=name)

        self.q_ffn_layer = FFN(name="q_ffn")
        self.a_ffn_layer = FFN(name="a_ffn")
        self.biobert_model = biobert_model
        self.biobert_model.trainable = trainable
        self.cos = tf.keras.layers.Dot(axes=1, normalize=True)

    def call(self, inputs):
      question_embeddings = self.biobert_model(input_ids=inputs["question"], attention_mask=inputs["question_mask"]).pooler_output
      answer_embeddings = self.biobert_model(input_ids=inputs["answer"], attention_mask=inputs["answer_mask"]).pooler_output
      q_ffnn = self.q_ffn_layer(question_embeddings)
      a_ffnn = self.a_ffn_layer(answer_embeddings)
      output = self.cos([q_ffnn, a_ffnn])
      
      return {"label":output}

In [73]:
class Custom_Callback(tf.keras.callbacks.Callback):
    def on_train_begin(self, logs={}):
        self.history = {"acc": []}
        
        
    def on_epoch_end(self, epoch, logs={}):
        self.history["acc"].append(logs.get("custom_metric_acc"))
        
        if (epoch==0) or (logs.get("custom_metric_acc") > self.history(["acc"][epoch-1])):
            self.model.save_weights("dialog_gpt_re" + str(epoch) + "/rakchat" + str(epoch) + "_" + str(logs.get("custom_metric_acc")))
            
        is_nan_values = []
        for i in self.model.get_weights():
            is_nan_values.append(np.isnan(i).any())
            
        if (np.array(is_nan_value).any() or (tf.math.is_nan(logs.get("loss"))) or (np.isinf(logs.get("loss")))):
            self.model.stop_training = True
            

class custom_callback(tf.keras.callbacks.Callback):
  def on_train_begin(self, logs={}):
        ## on begin of training, we are creating a instance varible called history
        ## it is a dict with keys [loss, acc, val_loss, val_acc]
        self.history={"acc": []}

  def on_epoch_end(self, epoch, logs={}):
        self.history["acc"].append(logs.get("custom_metric_acc"))
  
        #saving the model if validation accuracy increased from previous epoch
        if  (epoch==0) or (logs.get("custom_metric_acc")>self.history["acc"][epoch-1]):
          fname = os.path.join(Config.FILES["MODEL_DATA_DIR"], ("medical_bert_re" + str(epoch) + "\medic" + str(epoch) + "_" + str(logs.get("custom_metric_acc"))))
          self.model.save_weights(fname)

        is_nan_values=[]
        for i in self.model.get_weights():
          is_nan_values.append(np.isnan(i).any())

        #stopping the training if weights is nan or loss is nan or inf
        if (np.array(is_nan_values).any() or (tf.math.is_nan(logs.get("loss"))) or (np.isinf(logs.get("loss")))):
          self.model.stop_training = True

In [63]:
batch_size = 2

def custom_metric_acc(y_true, y_pred):
  y_true = tf.reshape(y_true, [tf.constant(batch_size)])
  y_pred = tf.reshape(y_pred, [tf.constant(batch_size)])
  c = tf.constant(0, dtype="float32")
  d = tf.cast(tf.math.greater_equal(y_true,c), dtype="float32")
  e = tf.cast(tf.math.greater_equal(y_pred,c), dtype="float32")
  f = tf.cast(tf.math.equal(d,e), dtype="float32")
  g = tf.reduce_sum(f)
  h = tf.cast(tf.shape(f), dtype="float32")
  i = g/h
  return i

In [74]:
K.set_floatx("float32")
learning_rate = 5e-6
num_epochs = 5
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
medical_qa_model = MedicalQAModelwithBert(trainable=True)
medical_qa_model.compile(
    optimizer=optimizer, loss=tf.keras.losses.mean_squared_error, metrics=[custom_metric_acc])

epochs = num_epochs

medical_qa_model.fit(dataset, validation_data=val_dataset, epochs=epochs, callbacks=[custom_callback()])
medical_qa_model.summary()

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Model: ""
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 q_ffn (FFN)                 multiple                  590592    
                                                                 
 a_ffn (FFN)                 multiple                  590592    
                                                                 
 tf_bert_model (TFBertModel)  multiple                 108310272 
                                                                 
 dot_21 (Dot)                multiple                  0         
                                                                 
Total params: 109,491,456
Trainable params: 109,491,456
Non-trainable params: 0
_________________________________________________________________


In [75]:
K.set_floatx("float32")
medical_qa_model = MedicalQAModelwithBert(trainable=True)

model_fname = "./medical_bert_re0/medic0_1.0.data-00000-of-00001"
medical_qa_model.load_weights(model_fname)
learning_rate = 5e-6
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
medical_qa_model.compile(
    optimizer=optimizer, loss=tf.keras.losses.mean_squared_error, metrics=[custom_metric_acc])

predicted_labels=[]
from tqdm.notebook import tqdm
for i in tqdm(range(len(val_questions))):
  predicted_labels.append(medical_qa_model.predict({"question":np.array([val_questions[i]]),
                                                    "question_mask":np.array([val_question_mask[i]]),
                                                    "answer":np.array([val_answers[i]]),
                                                    "answer_mask":np.array([val_answer_mask[i]])})["label"][0][0])

ValueError: Unable to load weights saved in HDF5 format into a subclassed Model which has not created its variables yet. Call the Model first, then load the weights.

## Apendix

In [None]:
fname = os.path.join
with open('ehealthforumQAs.json') as f1:
  ehealth=json.load(f1)["data"]
with open('icliniqQAs.json') as f2:
  icliniq=json.load(f2)["data"]
with open('questionDoctorQAs.json') as f3:
  questiondoctor=json.load(f3)["data"]
with open('webmdQAs.json') as f4:
  webmd=json.load(f4)["data"]