In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from transformers import BertTokenizer
from google.colab import drive
from tqdm import tqdm
import re
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
def configure_gpu():
    """Configure GPU settings for optimal performance"""
    gpus = tf.config.list_physical_devices('GPU')
    if gpus:
        try:
            # Enable memory growth
            for gpu in gpus:
                tf.config.experimental.set_memory_growth(gpu, True)

            # Use mixed precision
            policy = tf.keras.mixed_precision.Policy('mixed_float16')
            tf.keras.mixed_precision.set_global_policy(policy)

            print(f"Found {len(gpus)} GPU(s). GPU configuration successful.")
            return True
        except RuntimeError as e:
            print(f"GPU configuration failed: {e}")
            return False
    else:
        print("No GPUs found. Using CPU.")
        return False

In [3]:
dialogues_df = pd.read_parquet('dialogues.parquet')
dialogues_df.drop(columns=['Description'], inplace=True)
dialogues_df.rename(columns={'Patient': 'query', 'Doctor': 'response'}, inplace=True)
dialogues_df.drop_duplicates(inplace=True)
dialogues_df.reset_index(drop=True, inplace=True)
dialogues_df

Unnamed: 0,query,response
0,"Hi doctor,I am just wondering what is abutting...",Hi. I have gone through your query with dilige...
1,"Hi doctor, I am a 22-year-old female who was d...",Hi. You have really done well with the hypothy...
2,Hi doctor! I used to have clear skin but since...,Hi there Acne has multifactorial etiology. Onl...
3,"Hello doctor,I am having an uncomfortable feel...",Hello. The popping and discomfort what you fel...
4,"Hello doctor,Before two years had sex with a c...",Hello. The HIV test uses a finger prick blood ...
...,...,...
246522,I am suffering from excessive hairfall. My doc...,"Hello Dear Thanks for writing to us, we are he..."
246523,"Hi Doctor, I have been having severe hair fall...","hello, hair4u is combination of minoxid..."
246524,Hi..i hav sever hair loss problem so consulted...,HI I have evaluated your query thoroughly you...
246525,"Hi, i am 25 year old girl, i am having massive...",Hello and Welcome to ‘Ask A Doctor’ service.I ...


In [4]:
meddatatset_df = pd.read_csv('/content/medDataset_processed.csv')
meddatatset_df.drop(columns=['qtype'], inplace=True)
meddatatset_df.rename(columns={'Question': 'query', 'Answer': 'response'},
inplace=True)
meddatatset_df.drop_duplicates(inplace=True)
meddatatset_df.reset_index(drop=True, inplace=True)
meddatatset_df

Unnamed: 0,query,response
0,Who is at risk for Lymphocytic Choriomeningiti...,LCMV infections can occur after exposure to fr...
1,What are the symptoms of Lymphocytic Choriomen...,LCMV is most commonly recognized as causing ne...
2,Who is at risk for Lymphocytic Choriomeningiti...,Individuals of all ages who come into contact ...
3,How to diagnose Lymphocytic Choriomeningitis (...,"During the first phase of the disease, the mos..."
4,What are the treatments for Lymphocytic Chorio...,"Aseptic meningitis, encephalitis, or meningoen..."
...,...,...
16354,What are the symptoms of Familial visceral myo...,What are the signs and symptoms of Familial vi...
16355,What is (are) Pseudopelade of Brocq ?,Pseudopelade of Brocq (PBB) is a slowly progre...
16356,What are the symptoms of Pseudopelade of Brocq ?,What are the signs and symptoms of Pseudopelad...
16357,What are the treatments for Pseudopelade of Br...,Is there treatment or a cure for pseudopelade ...


In [5]:
df = pd.concat([dialogues_df, meddatatset_df], ignore_index=True)
df =  df.sample(frac=1, random_state=42).reset_index(drop=True)
df

Unnamed: 0,query,response
0,Is Tourette syndrome inherited ?,Is Tourette syndrome inherited? Evidence from ...
1,"My name is Gwendolyn Fails, D.O.B.10/08/1969. ...","Hello Mr fails, I could understand that u have..."
2,I am concerned about my 50 year old brother wh...,In my opinion its not inflammation its swellin...
3,I have had back and hip pain/discomfort since ...,"Hi,i think you should wait for a while before ..."
4,husband going to dr. tomorrow. Trying to figur...,HiThank you for asking HCMI have gone through ...
...,...,...
262881,"Hey, my friend cut her radial artery last nig...","Hi, Welcome to Health care magic forum. ..."
262882,What are the symptoms of Cone-rod dystrophy am...,What are the signs and symptoms of Cone-rod dy...
262883,I am a 55 year old female. My two older sister...,"hi mam, with a strong family history of NHL it..."
262884,My 3 yr old had an asd closure with an amplatz...,Hi...I suggest you call on her cardiologist. A...


In [6]:
tokenizer = BertTokenizer.from_pretrained('dmis-lab/biobert-base-cased-v1.1')
MAX_LENGTH = 256

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [7]:
class BioBERTPreprocessor:
    def __init__(self, model_name='dmis-lab/biobert-base-cased-v1.1'):
        """Initialize the BioBERT Preprocessor with the model's tokenizer."""
        self.tokenizer = BertTokenizer.from_pretrained(model_name)
        self.vocab_size = self.tokenizer.vocab_size
        self.max_position_embeddings = 512  # BioBERT's max position embeddings
        print(f"Initialized BioBERT tokenizer with vocabulary size: {self.vocab_size}")

    def _clean_medical_text(self, text):
        """Clean medical text while preserving important terms like medical abbreviations and units."""
        if isinstance(text, bytes):
            text = text.decode('utf-8')

        # Remove multiple spaces and newlines
        text = re.sub(r'\s+', ' ', text)

        # Handle medical abbreviations and units
        text = re.sub(r'(?<=[a-zA-Z])-(?=[a-zA-Z])', ' - ', text)  # Preserve hyphenated terms
        text = re.sub(r'(\d+)([a-zA-Z]+)', r'\1 \2', text)  # Add space between numbers and units
        text = re.sub(r'([A-Za-z])\.([A-Za-z])\.', r'\1\2', text)  # Handle abbreviations

        # Keep medical symbols while removing other special characters
        text = re.sub(r'[^A-Za-z0-9\s\-\+°%/()[\]{}±→←↔≥≤=≠μαβγ.]', ' ', text)

        return text.strip()

    def prepare_dialogue_pair(self, query, response, max_length):
        """Prepare a single dialogue pair (query and response) for tokenization."""
        # Clean query and response
        clean_query = self._clean_medical_text(query)
        clean_response = self._clean_medical_text(response)

        # Tokenize query and response
        query_encodings = self.tokenizer(
            clean_query,
            padding='max_length',
            add_special_tokens=True,
            truncation=True,
            max_length=max_length,
            return_tensors='tf',
            return_token_type_ids=True,  # To include token type ids (segment IDs)
        )

        response_encodings = self.tokenizer(
            clean_response,
            padding='max_length',
            truncation=True,
            add_special_tokens=True,
            max_length=max_length,
            return_tensors='tf',
            return_token_type_ids=True,  # To include token type ids (segment IDs)
        )

        return {
            'query_input_ids': query_encodings['input_ids'][0],
            'query_attention_mask': query_encodings['attention_mask'][0],
            'response_input_ids': response_encodings['input_ids'][0],
            'response_attention_mask': response_encodings['attention_mask'][0]
        }

    def prepare_dataset(self, df, max_length, batch_size=32):
            """Process a DataFrame and convert it to a TensorFlow dataset."""
            query_input_ids = []
            query_attention_masks = []
            response_input_ids = []
            response_attention_masks = []

            print("Processing dialogue pairs...")
            for row in tqdm(df.itertuples(index=False), total=len(df)):
                try:
                    # Prepare each dialogue pair
                    dialogue_pair = self.prepare_dialogue_pair(
                        row.query,
                        row.response,
                        max_length
                    )
                    # Append results to respective lists
                    query_input_ids.append(dialogue_pair['query_input_ids'])
                    query_attention_masks.append(dialogue_pair['query_attention_mask'])
                    response_input_ids.append(dialogue_pair['response_input_ids'])
                    response_attention_masks.append(dialogue_pair['response_attention_mask'])
                except Exception as e:
                    print(f"Error processing row: {e}")
                    continue

            print("Creating TensorFlow dataset...")
            # Directly convert lists into tensors
            dataset = tf.data.Dataset.from_tensor_slices({
                'query_input_ids': tf.convert_to_tensor(query_input_ids),
                'query_attention_mask': tf.convert_to_tensor(query_attention_masks),
                'response_input_ids': tf.convert_to_tensor(response_input_ids),
                'response_attention_mask': tf.convert_to_tensor(response_attention_masks)
            })

            return dataset.batch(batch_size)

In [8]:
def save_dataset_to_tfrecord(dataset, filename):
    """Save TensorFlow dataset to a TFRecord file."""
    print(f"Saving dataset to {filename}")
    with tf.io.TFRecordWriter(filename) as writer:
        # Unbatch the dataset first to handle individual examples
        unbatched_dataset = dataset.unbatch()
        for example in tqdm(unbatched_dataset):
            features = {
                'query_input_ids': tf.train.Feature(
                    int64_list=tf.train.Int64List(value=tf.cast(example['query_input_ids'], tf.int64).numpy())),
                'query_attention_mask': tf.train.Feature(
                    int64_list=tf.train.Int64List(value=tf.cast(example['query_attention_mask'], tf.int64).numpy())),
                'response_input_ids': tf.train.Feature(
                    int64_list=tf.train.Int64List(value=tf.cast(example['response_input_ids'], tf.int64).numpy())),
                'response_attention_mask': tf.train.Feature(
                    int64_list=tf.train.Int64List(value=tf.cast(example['response_attention_mask'], tf.int64).numpy()))
            }
            tf_example = tf.train.Example(features=tf.train.Features(feature=features))
            writer.write(tf_example.SerializeToString())
    print(f"Dataset saved to {filename}")

def verify_tfrecord(filename):
    """Verify the TFRecord file by reading a few examples."""
    print(f"Verifying TFRecord file: {filename}")
    try:
        dataset = tf.data.TFRecordDataset(filename)
        feature_description = {
            'query_input_ids': tf.io.FixedLenSequenceFeature([], tf.int64, allow_missing=True),
            'query_attention_mask': tf.io.FixedLenSequenceFeature([], tf.int64, allow_missing=True),
            'response_input_ids': tf.io.FixedLenSequenceFeature([], tf.int64, allow_missing=True),
            'response_attention_mask': tf.io.FixedLenSequenceFeature([], tf.int64, allow_missing=True),
        }

        def _parse_function(example_proto):
            return tf.io.parse_single_example(example_proto, feature_description)

        parsed_dataset = dataset.map(_parse_function)
        for i, example in enumerate(parsed_dataset.take(1)):
            print(f"Successfully read example {i+1}")
            print(f"Shape of query_input_ids: {example['query_input_ids'].shape}")
        return True
    except Exception as e:
        print(f"Error verifying TFRecord: {e}")
        return False

In [9]:
def configure_gpu():
    """Check for GPU availability and configure TensorFlow accordingly."""
    if tf.config.list_physical_devices('GPU'):
        print("GPU is available.")
        return True
    else:
        print("No GPU found.")
        return False

In [10]:
def main():
    # Constants
    MAX_LENGTH = 256
    MODEL_NAME = 'dmis-lab/biobert-base-cased-v1.1'
    OUTPUT_FILE = 'chatbot_med_df.tfrecord'

    try:
        # Initialize preprocessor
        print("Initializing BioBERT preprocessor...")
        preprocessor = BioBERTPreprocessor(MODEL_NAME)

        # Prepare dataset
        print("Preparing dataset...")
        processed_dataset = preprocessor.prepare_dataset(df, MAX_LENGTH)

        # Save to TFRecord
        print("Saving processed dataset...")
        save_dataset_to_tfrecord(processed_dataset, OUTPUT_FILE)

        # Verify the saved file
        if verify_tfrecord(OUTPUT_FILE):
            print("TFRecord file verified successfully!")
        else:
            print("Failed to verify TFRecord file.")

        return processed_dataset

    except Exception as e:
        print(f"An error occurred: {e}")
        raise

In [11]:
# Assuming main() has been run and processed_dataset is available
processed_dataset = main()

# Take the first 5 entries from the processed dataset
top_5 = processed_dataset.take(5)  # No need for list()

for idx, example in enumerate(top_5):
    # Convert each example to a NumPy-friendly format
    example = {key: value.numpy() for key, value in example.items()}

    print(f"Example {idx + 1}:")
    print(f"Query input IDs: {example['query_input_ids']}")
    print(f"Query attention mask: {example['query_attention_mask']}")
    print(f"Response input IDs: {example['response_input_ids']}")
    print(f"Response attention mask: {example['response_attention_mask']}")
    print("-" * 50)

Initializing BioBERT preprocessor...
Initialized BioBERT tokenizer with vocabulary size: 28996
Preparing dataset...
Processing dialogue pairs...


100%|██████████| 262886/262886 [39:00<00:00, 112.34it/s]


Creating TensorFlow dataset...
Saving processed dataset...
Saving dataset to chatbot_med_df.tfrecord


262886it [07:03, 620.79it/s]


Dataset saved to chatbot_med_df.tfrecord
Verifying TFRecord file: chatbot_med_df.tfrecord
Successfully read example 1
Shape of query_input_ids: (256,)
TFRecord file verified successfully!
Example 1:
Query input IDs: [[  101  1110  2465 ...     0     0     0]
 [  101  1139  1271 ...     0     0     0]
 [  101   178  1821 ...     0     0     0]
 ...
 [  101 20844  1103 ...     0     0     0]
 [  101  1139  1797 ...     0     0     0]
 [  101  1184  1110 ...     0     0     0]]
Query attention mask: [[1 1 1 ... 0 0 0]
 [1 1 1 ... 0 0 0]
 [1 1 1 ... 0 0 0]
 ...
 [1 1 1 ... 0 0 0]
 [1 1 1 ... 0 0 0]
 [1 1 1 ... 0 0 0]]
Response input IDs: [[  101  1110  2465 ...     0     0     0]
 [  101 19082   182 ...     0     0     0]
 [  101  1107  1139 ...     0     0     0]
 ...
 [  101  7059  4795 ...     0     0     0]
 [  101 20844  4208 ...     0     0     0]
 [  101  1143  4371 ...     0     0     0]]
Response attention mask: [[1 1 1 ... 0 0 0]
 [1 1 1 ... 0 0 0]
 [1 1 1 ... 0 0 0]
 ...
 [1 1 1

In [12]:
processed_dataset

<_BatchDataset element_spec={'query_input_ids': TensorSpec(shape=(None, 256), dtype=tf.int32, name=None), 'query_attention_mask': TensorSpec(shape=(None, 256), dtype=tf.int32, name=None), 'response_input_ids': TensorSpec(shape=(None, 256), dtype=tf.int32, name=None), 'response_attention_mask': TensorSpec(shape=(None, 256), dtype=tf.int32, name=None)}>