In [1]:
def setup_gpu_strategy():
    try:
        # Configure GPU memory growth
        physical_devices = tf.config.list_physical_devices('GPU')
        if physical_devices:
            for device in physical_devices:
                try:
                    tf.config.experimental.set_memory_growth(device, True)
                except RuntimeError as e:
                    print(f"Could not set memory growth for {device.name}: {e}")
            print(f"Found {len(physical_devices)} GPU(s). GPU configuration successful.")

            # Create and return GPU strategy
            strategy = tf.distribute.MirroredStrategy()
            print(f"Number of devices: {strategy.num_replicas_in_sync}")
            return strategy
        else:
            print("No GPUs found. Falling back to CPU strategy.")
            return tf.distribute.OneDeviceStrategy(device="/cpu:0")
    except RuntimeError as e:
        print(f"GPU configuration failed: {e}")
        print("Falling back to default strategy.")
        return tf.distribute.OneDeviceStrategy(device="/cpu:0")

In [2]:
!pip install keras-tuner
import tensorflow as tf
import os
import logging
from tensorflow.keras import layers, models
from tensorflow.keras.regularizers import l2
import keras_tuner as kt
from transformers import AutoConfig, TFBertModel



In [3]:
# Configuration Constants
FILE_NAME = '/content/chatbot-med-df.tfrecord'
MAX_LENGTH = 256
BATCH_SIZE = 1
SUBSET_SIZE = 100
VAL_SPLIT = 0.1
TEST_SPLIT = 0.1
NUM_EPOCHS = 100
BERT_MODEL_NAME = 'dmis-lab/biobert-base-cased-v1.1'
MODEL_SAVE_PATH = '/saved_model'

In [4]:
def load_dataset_for_tuning(filename, batch_size, subset_size=None, val_split=0.1):
    """
    Load and split the dataset into training and validation datasets, tailored for hyperparameter tuning.
    """

    # Feature description for parsing the TFRecord
    feature_description = {
        'query_input_ids': tf.io.FixedLenFeature([MAX_LENGTH], tf.int64),
        'query_attention_mask': tf.io.FixedLenFeature([MAX_LENGTH], tf.int64),
        'response_input_ids': tf.io.FixedLenFeature([MAX_LENGTH], tf.int64)
    }

    def parse_example(example):
        parsed = tf.io.parse_single_example(example, feature_description)

        # Define inputs and targets
        inputs = {
            'query_input_ids': tf.cast(parsed['query_input_ids'], tf.int32),
            'query_attention_mask': tf.cast(parsed['query_attention_mask'], tf.int32),
        }
        targets = tf.cast(parsed['response_input_ids'], tf.int32)
        return inputs, targets

    # Load the TFRecord dataset
    raw_dataset = tf.data.TFRecordDataset([filename])
    parsed_dataset = raw_dataset.map(parse_example, num_parallel_calls=tf.data.AUTOTUNE)

    # Determine dataset size
    total_size = sum(1 for _ in parsed_dataset)
    print(f"Original dataset size: {total_size}")

    # Subset handling
    if subset_size and subset_size < total_size:
        total_size = subset_size
        parsed_dataset = parsed_dataset.take(subset_size)
        print(f"Taking subset of size: {subset_size}")

    # Calculate split sizes
    train_size = int(total_size * (1 - val_split))
    val_size = total_size - train_size

    # Splits
    train_dataset = parsed_dataset.take(train_size).shuffle(buffer_size=10000)
    train_dataset = train_dataset.batch(batch_size).prefetch(tf.data.AUTOTUNE)

    val_dataset = parsed_dataset.skip(train_size)
    val_dataset = val_dataset.batch(batch_size).prefetch(tf.data.AUTOTUNE)

    print(f"Dataset successfully loaded for tuning:")
    print(f"  - Total size: {total_size}")
    print(f"  - Train size: {train_size}")
    print(f"  - Validation size: {val_size}")

    # Return only the training and validation datasets, along with their sizes
    return train_dataset, val_dataset, train_size, val_size

In [5]:
@tf.keras.utils.register_keras_serializable(package="Custom")
class BioBertEncoder(tf.keras.layers.Layer):
    def __init__(self, bert_model_name, trainable=False, **kwargs):
        super().__init__(**kwargs)
        self.bert_model_name = bert_model_name
        self.trainable = trainable
        self.bert_model = None

    def build(self, input_shape):
        # Initialize the BERT model
        self.bert_config = AutoConfig.from_pretrained(self.bert_model_name)
        self.bert_model = TFBertModel.from_pretrained(
            self.bert_model_name, config=self.bert_config, from_pt=True
        )
        self.bert_model.trainable = self.trainable
        super().build(input_shape)

    def call(self, inputs, training=False):
        input_ids, attention_mask = inputs
        outputs = self.bert_model(input_ids=input_ids, attention_mask=attention_mask, training=training)
        return outputs.last_hidden_state

    def get_config(self):
        config = super().get_config()
        config.update({"bert_model_name": self.bert_model_name, "trainable": self.trainable})
        return config

In [6]:
@tf.keras.utils.register_keras_serializable(package="Custom")
def masked_loss(y_true, y_pred):
    mask = tf.cast(y_true != 0, tf.float32)
    loss = tf.keras.losses.sparse_categorical_crossentropy(y_true, y_pred)
    loss *= mask
    return tf.reduce_sum(loss) / tf.reduce_sum(mask)

@tf.keras.utils.register_keras_serializable(package="Custom")
def masked_loss_metric(y_true, y_pred):
    # Convert the loss function into a metric
    mask = tf.cast(y_true != 0, tf.float32)
    loss = tf.keras.losses.sparse_categorical_crossentropy(y_true, y_pred)
    loss *= mask
    return tf.reduce_sum(loss) / tf.reduce_sum(mask)

In [7]:
class BioBertCnnBiLSTM:
    def __init__(self, bert_model_name, vocab_size):
        self.bert_model_name = bert_model_name
        self.vocab_size = vocab_size

    def build(self, config):
        # Define inputs
        query_input_ids = tf.keras.Input(shape=(MAX_LENGTH,), dtype=tf.int32, name='query_input_ids')
        query_attention_mask = tf.keras.Input(shape=(MAX_LENGTH,), dtype=tf.int32, name='query_attention_mask')

        bert_layer = BioBertEncoder(self.bert_model_name, trainable=False)

        query_bert_output = bert_layer([query_input_ids, query_attention_mask])

        # CNN Layer
        kernel_sizes = config['kernels']
        query_cnn = []
        for kernel_size in kernel_sizes:
            conv = tf.keras.layers.Conv1D(
                filters=config['cnn_filters'],
                kernel_size=kernel_size,
                padding='same',
                activation=config['activation_fn'],
                kernel_regularizer=l2(config['cnn_regularization'])
            )(query_bert_output)
            bn = tf.keras.layers.BatchNormalization()(conv)
            dropout = tf.keras.layers.Dropout(config['dropout_cnn'])(bn)
            query_cnn.append(dropout)

        query_cnn = tf.keras.layers.Concatenate()(query_cnn)

        # LSTM Layer
        query_lstm = tf.keras.layers.Bidirectional(
            tf.keras.layers.LSTM(config['lstm_units'], dropout=config['dropout_lstm'], return_sequences=True)
        )(query_cnn)

        # Output Layer
        output = tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(self.vocab_size, activation='softmax'))(query_lstm)

        model = tf.keras.Model(inputs={'query_input_ids': query_input_ids, 'query_attention_mask': query_attention_mask}, outputs=output)

        # Optimizer with learning rate schedule
        lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
            initial_learning_rate=config['learning_rate'],
            decay_steps=100000,
            decay_rate=0.96,
            staircase=True
        )

        optimizer = tf.keras.optimizers.Adam(learning_rate=lr_schedule)

        model.compile(optimizer=optimizer, loss=masked_loss, metrics=[masked_loss_metric, 'accuracy'])

        return model

In [8]:
def build_model(hp):
    config = {
        'kernels': [hp.Choice('kernels', values=[3, 5, 7])],
        'cnn_filters': hp.Int('cnn_filters', min_value=32, max_value=128, step=32),
        'activation_fn': hp.Choice('activation_fn', values=['relu', 'tanh', 'swish']),
        'cnn_regularization': hp.Float('cnn_regularization', min_value=0.01, max_value=0.1, step=0.01),
        'dropout_cnn': hp.Float('dropout_cnn', min_value=0.1, max_value=0.5, step=0.1),
        'lstm_units': hp.Int('lstm_units', min_value=32, max_value=128, step=32),
        'dropout_lstm': hp.Float('dropout_lstm', min_value=0.1, max_value=0.5, step=0.1),
        'learning_rate': hp.Float('learning_rate', min_value=1e-5, max_value=1e-3, sampling='log'),
    }

    model = BioBertCnnBiLSTM(bert_model_name=BERT_MODEL_NAME, vocab_size=28996)
    return model.build(config)

In [9]:
def tune_hyperparameters(file_path):
    """
    Tune the model's hyperparameters using Keras Tuner.
    """
    # 2.1. Load and split dataset
    print("Loading datasets...")
    train_dataset, val_dataset, train_size, val_size = load_dataset_for_tuning(
        filename=file_path,
        batch_size=BATCH_SIZE,
        subset_size=SUBSET_SIZE,
        val_split=VAL_SPLIT
    )

    # 2.2. Set up Keras Tuner with Hyperband search
    tuner = kt.Hyperband(
        build_model,
        objective=kt.Objective('masked_loss_metric', direction='min'),
        max_epochs=NUM_EPOCHS,
        factor=3,
        hyperband_iterations=1,
        directory='tuner_dir',
        project_name='chatbot_tuning'
    )

    # 2.3. Start hyperparameter tuning search
    tuner.search(
        train_dataset,
        validation_data=val_dataset,
        epochs=NUM_EPOCHS,
        steps_per_epoch=train_size // BATCH_SIZE,
        validation_steps=val_size // BATCH_SIZE
    )

    # 2.4. Get best hyperparameters and best model
    best_hp = tuner.get_best_hyperparameters(num_trials=1)[0]
    print(f"Best Hyperparameters: {best_hp.values}")

    best_model = tuner.get_best_models(num_models=1)[0]

    # 2.5. Evaluate the best model on the test set
    test_results = best_model.evaluate(test_dataset)
    print(f"Test results: Loss = {test_results[0]:.4f}, Accuracy = {test_results[1]:.4f}")

    return best_model, test_results

In [None]:
if __name__ == "__main__":
    logging.getLogger("tensorflow").setLevel(logging.ERROR)
    logging.getLogger("transformers").setLevel(logging.CRITICAL)
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

    file_path = FILE_NAME

    best_model, test_results = tune_hyperparameters(file_path)

Trial 64 Complete [00h 02m 43s]
masked_loss_metric: 0.0

Best masked_loss_metric So Far: 0.0
Total elapsed time: 03h 17m 36s

Search: Running Trial #65

Value             |Best Value So Far |Hyperparameter
7                 |5                 |kernels
96                |32                |cnn_filters
tanh              |relu              |activation_fn
0.1               |0.01              |cnn_regularization
0.5               |0.2               |dropout_cnn
64                |128               |lstm_units
0.2               |0.4               |dropout_lstm
4.9007e-05        |3.3907e-05        |learning_rate
17                |6                 |tuner/epochs
6                 |0                 |tuner/initial_epoch
2                 |2                 |tuner/bracket
1                 |0                 |tuner/round
0049              |None              |tuner/trial_id

Epoch 7/17
[1m26/90[0m [32m━━━━━[0m[37m━━━━━━━━━━━━━━━[0m [1m10s[0m 163ms/step - accuracy: 4.5255e-04 - loss: 21.0