**Installation**

# Translating medical terminology for rare diseases used by laypeople into the Human Phenotype Ontology with NLP and the BERT model

## Package Installation

Below is the command to install all the required packages for the project.

In [1]:
# Installing necessary packages
!pip install transformers pandas numpy torch scikit-learn keras-preprocessing plotly prettytable

Collecting keras-preprocessing
  Downloading Keras_Preprocessing-1.1.2-py2.py3-none-any.whl (42 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.6/42.6 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: keras-preprocessing
Successfully installed keras-preprocessing-1.1.2


## Importing Required Libraries and Packages

Below are the import statements required for the project.

In [2]:
# Importing necessary libraries from the transformers package for BERT model
from transformers import BertTokenizer, BertForSequenceClassification

# Importing data manipulation libraries
import pandas as pd
import numpy as np

# Importing torch for neural network and related operations
import torch
import torch.nn as nn
import torch.nn.functional as F

# Importing DataLoader and TensorDataset for batching and managing datasets in PyTorch
from torch.utils.data import DataLoader, TensorDataset

# Importing loss functions from PyTorch for binary and multi-class classification
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss

# Importing optimization algorithms from PyTorch
from torch.optim import AdamW, SGD

# Importing learning rate scheduler from PyTorch
from torch.optim.lr_scheduler import StepLR

# Importing train-test split function from scikit-learn for splitting datasets
from sklearn.model_selection import train_test_split

# Importing accuracy score from scikit-learn for model evaluation
from sklearn.metrics import accuracy_score

# Importing sequence padding function from Keras preprocessing
from keras_preprocessing.sequence import pad_sequences

# Importing various metrics from scikit-learn for model evaluation
from sklearn.metrics import (
    precision_recall_fscore_support, 
    hamming_loss, 
    jaccard_score, 
    log_loss, 
    accuracy_score, 
    roc_auc_score
)

# Importing Plotly's graph objects for data visualization
import plotly.graph_objects as go

# Importing PrettyTable for tabular data representation
from prettytable import PrettyTable

caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so: undefined symbol: _ZN3tsl6StatusC1EN10tensorflow5error4CodeESt17basic_string_viewIcSt11char_traitsIcEENS_14SourceLocationE']
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: undefined symbol: _ZTVN10tensorflow13GcsFileSystemE']


## Data Loading and Preliminary Analysis

We load a dataset from a specified path, display its dimensions, show a few rows of data, compute and display the maximum sentence length in a specific column, and report the total number of records in the following code snippets. Finally, the text data and labels are separated for further processing.

### Note
If running on local or any other machine, please change the file path below.

In [3]:
# Load the data from a CSV file
# The commented line is an alternative path for a different dataset
# df = pd.read_csv('')
df = pd.read_csv('/kaggle/input/r-diseases-dataset/model_input_latest_subset.csv')

# Print the shape of the dataframe to get an understanding of its size
print(df.shape)

(1237, 81)


In [4]:
# Display the first 5 rows of the data to understand its structure
df.head(5)

Unnamed: 0,week_description,HP:0000010,HP:0000012,HP:0000019,HP:0000211,HP:0000219,HP:0000467,HP:0000522,HP:0000717,HP:0000756,...,HP:0030237,HP:0030319,HP:0031744,HP:0032365,HP:0033679,HP:0040282,HP:0040283,HP:0100524,HP:0100759,HP:0100852
0,"The dizziness has become more frequent, and it...",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,"The muscle twitches are still there, but now I...",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2,"The muscle twitches are still there, but now I...",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
3,"This week, the tingling sensations in my finge...",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
4,"This week, the tingling sensations in my finge...",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


In [5]:
# Initialize a variable to keep track of the maximum sentence length
max_sentence_length = 0

# Iterate through each record in the dataset
for index, row in df.iterrows():
    # Extract the week description from the current record
    week_description = row['week_description']
    
    # Compute the length of the sentence by splitting it into words
    sentence_length = len(week_description.split())
    
    # Update the maximum sentence length if the current sentence is longer
    max_sentence_length = max(max_sentence_length, sentence_length)

# Print the maximum sentence length
print(f"Maximum sentence length: {max_sentence_length}")

Maximum sentence length: 89


In [6]:
# Report the total number of records in the dataset
print('Number of training sentences: {:,}\n'.format(df.shape[0]))

# Separate the text data and labels for further processing
# Extract the 'week_description' column as the text data
texts = df['week_description'].values

# Drop the 'week_description' column and use the remaining columns as labels
labels = df.drop('week_description', axis=1).values

Number of training sentences: 1,237



## Text Tokenization and Input Preparation

In this part of the code, we focus on preparing our text data for the BERT model by tokenizing the text, padding and truncating the token sequences, and creating attention masks to indicate which tokens are meaningful and which are padding.

In [7]:
# Instantiate a BERT tokenizer to convert text to token IDs
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Step 1: Tokenize Text
# Tokenize the text data, adding the special tokens [CLS] and [SEP] as required by BERT
input_ids = [tokenizer.encode(text, add_special_tokens=True) for text in texts]

# Step 2: Padding and Truncating
# Ensure that all sequences are of the same length by padding and truncating
# We choose a maximum length of 128 tokens for this purpose
input_ids = pad_sequences(
    input_ids, 
    maxlen=128, 
    dtype="long", 
    value=0,  # Value used for padding
    truncating="post",  # Truncate sequences from the end if necessary
    padding="post"  # Pad sequences at the end if necessary
)

# Step 3: Create Attention Masks
# Create attention masks to differentiate actual tokens from padding
# A mask value of 1 indicates a real token, while a value of 0 indicates padding
attention_masks = []
for seq in input_ids:
    # Create a mask for the current sequence
    seq_mask = [float(i > 0) for i in seq]  # i > 0 checks whether the token ID is not padding (0)
    attention_masks.append(seq_mask)

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

## Data Splitting and DataLoader Preparation

In this section, we split the input data and labels into training, validation, and test sets to prepare them for model training, validation, and testing. Additionally, we convert the data into PyTorch tensors and organize them into DataLoaders for efficient batch processing during training and evaluation.

In [8]:
# Step 1: Splitting the Data
# Split the input data and labels into training and temporary sets (for further splitting)
train_inputs, temp_inputs, train_labels, temp_labels = train_test_split(
    input_ids, labels, 
    random_state=42,  # Ensures reproducibility
    test_size=0.2  # Specifies the proportion of the data to include in the test set
)

# Split the temporary sets into validation and test sets
val_inputs, test_inputs, val_labels, test_labels = train_test_split(
    temp_inputs, temp_labels, 
    random_state=42,  # Ensures reproducibility
    test_size=0.5  # Specifies the proportion of the data to include in the test set
)

# Also split the attention masks in a similar fashion
train_masks, temp_masks, _, _ = train_test_split(attention_masks, labels, random_state=42, test_size=0.2)
val_masks, test_masks, _, _ = train_test_split(temp_masks, temp_labels, random_state=42, test_size=0.5)

# Step 2: Converting Data to PyTorch Tensors
# Convert all inputs, masks, and labels to PyTorch tensors as required for training in PyTorch
train_inputs = torch.tensor(train_inputs)
val_inputs = torch.tensor(val_inputs)
test_inputs = torch.tensor(test_inputs)

train_masks = torch.tensor(train_masks)
val_masks = torch.tensor(val_masks)
test_masks = torch.tensor(test_masks)

train_labels = torch.tensor(train_labels)
val_labels = torch.tensor(val_labels)
test_labels = torch.tensor(test_labels)

# Step 3: Creating DataLoaders
# Organize the data into DataLoaders for efficient batch processing during training and evaluation
# Set the batch size to 32
batch_size = 32

# Create DataLoader for the training set
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_dataloader = DataLoader(train_data, batch_size=batch_size)

# Create DataLoader for the validation set
val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_dataloader = DataLoader(val_data, batch_size=batch_size)

# Create DataLoader for the test set
test_data = TensorDataset(test_inputs, test_masks, test_labels)
test_dataloader = DataLoader(test_data, batch_size=batch_size)


## Model Initialization and Device Preparation

In this section, we initialize the BERT model for sequence classification and a linear classifier. We also prepare the computing device (CPU or GPU) for training and evaluation.

In [9]:
# Step 1: Initialize BERT Model
# Initialize a BERT model for sequence classification with the required number of output labels
# We use the 'bert-base-uncased' pre-trained model as a starting point
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased", 
    num_labels=train_labels.shape[1]  # The number of output labels equals the number of columns in train_labels
)

# Step 2: Initialize Linear Classifier
# Initialize a linear classifier to be trained alongside the BERT model
# The input and output dimensions are both equal to the number of labels
classifier = torch.nn.Linear(
    train_labels.shape[1], 
    train_labels.shape[1]
)

# Step 3: Prepare Computing Device
# Determine the computing device (CPU or GPU) and send the model and classifier to this device
device = torch.device(
    "cuda" if torch.cuda.is_available() else "cpu"  # Use GPU (cuda) if available, otherwise fall back to CPU
)

# Send the model and classifier to the chosen device
model.to(device)
classifier.to(device)

Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly i

Linear(in_features=80, out_features=80, bias=True)

## Calculating Class Weights

In this section, we calculate the class weights which are useful to handle imbalanced datasets during training.

In [10]:
# Step 1: Count Positive and Negative Samples
# Count the number of positive samples for each label by summing up the training labels along the column axis
n_pos = train_labels.sum(axis=0)
# Uncomment the line below to print the count of positive samples for debugging
# print(n_pos)

# Calculate the number of negative samples for each label by subtracting the count of positive samples from the total count
n_neg = len(train_labels) - n_pos
# Uncomment the line below to print the count of negative samples for debugging
# print(n_neg)

# Step 2: Calculate Positive Class Weights
# The positive class weights are calculated as the ratio of negative samples to positive samples for each label
# Adding a small constant (1e-5) to avoid division by zero
pos_weights = (n_neg + 1e-5) / (n_pos + 1e-5)

# Uncomment the line below to print the calculated positive class weights for debugging
# print(f"Positive Weights: {pos_weights}")

# Step 3: Convert to Tensor and Send to Device
# Convert the numpy array of positive class weights to a PyTorch tensor
# Then send the tensor to the chosen computing device (CPU or GPU)
pos_weights = torch.tensor(pos_weights).to(device)


  pos_weights = torch.tensor(pos_weights).to(device)


## Optimizer, Learning Rate Scheduler, and Loss Function Initialization

In this section, we initialize the optimizer, learning rate scheduler, and the loss function which are essential components for training the neural network.

In [11]:
# Step 1: Initialize Optimizer
# We use the AdamW optimizer which is an extension of Adam optimized for training deep neural networks
# We include the parameters of both the BERT model and the linear classifier in the optimizer
optimizer = AdamW(
    list(model.parameters()) + list(classifier.parameters()),  # Combine the parameters of model and classifier
    lr=0.0001  # Set the learning rate
)

# Step 2: Initialize Learning Rate Scheduler
# The StepLR scheduler adjusts the learning rate at regular intervals for better convergence
scheduler = StepLR(
    optimizer, 
    step_size=10,  # Decrease the learning rate every 10 epochs
    gamma=0.7  # Multiplicative factor to decrease the learning rate
)

# Step 3: Initialize Loss Function
# The pos_weight argument helps handle imbalanced datasets by scaling the loss for positive samples
criterion = BCEWithLogitsLoss(
    pos_weight=pos_weights  # Set the positive weights to handle class imbalance
)

# Alternative Loss Function
# The Focal Loss is another alternative for handling imbalanced datasets
# Uncomment the following lines to use Focal Loss instead of BCEWithLogitsLoss
# criterion = FocalLoss(
#     alpha=1, 
#     gamma=2, 
#     logits=True, 
#     reduce=True
# )


## Definition of Focal Loss

In this section, we define a custom loss function known as Focal Loss.

In [None]:
class FocalLoss(nn.Module):
    def __init__(self, alpha=1, gamma=2, logits=False, reduce=True):
        """
        Constructor for the FocalLoss module.
        :param alpha: (float) Scaling factor for positive class.
        :param gamma: (float) Focusing parameter to down-weight easy examples.
        :param logits: (bool) Flag to indicate whether inputs are logits.
        :param reduce: (bool) Flag to indicate whether to reduce loss to a scalar.
        """
        super(FocalLoss, self).__init__()
        self.alpha = alpha  # Scaling factor for the positive class
        self.gamma = gamma  # Focusing parameter
        self.logits = logits  # Indicates if the inputs are logits
        self.reduce = reduce  # Indicates if we should reduce loss to a scalar

    def forward(self, inputs, targets):
        """
        Forward pass of the FocalLoss module.
        :param inputs: (tensor) The input logits.
        :param targets: (tensor) The ground truth labels.
        :return: (tensor) The computed Focal Loss value.
        """
        # Compute binary cross-entropy loss
        # If logits flag is set, use binary cross-entropy with logits, else use regular binary cross-entropy
        BCE_loss = F.binary_cross_entropy_with_logits(inputs, targets, reduce=False) if self.logits else F.binary_cross_entropy(inputs, targets, reduce=False)
        
        pt = torch.exp(-BCE_loss)  # Convert BCE loss values to probabilities
        F_loss = self.alpha * (1 - pt)**self.gamma * BCE_loss  # Compute Focal Loss
        
        return torch.mean(F_loss) if self.reduce else F_loss


## Utility Functions for Printing Metrics

In this section, we introduce two utility functions for pretty-printing the metrics during training and evaluation.

In [12]:
def pretty_print_metrics(title, metrics):
    """
    Prints the metrics in a tabular format.
    
    :param title: (str) Title to be displayed above the table.
    :param metrics: (dict) Dictionary containing metric names as keys and their values.
    """
    
    # Display the title for the metrics
    print(f"\n{title}")
    
    # Initialize the table
    table = PrettyTable()
    
    # Set the column names for the table
    table.field_names = ["Metric", "Value"]
    
    # Add rows to the table using metrics data
    for metric, value in metrics.items():
        table.add_row([metric, f"{value:.4f}"])
    
    # Print the table
    print(table)

def pretty_print_epoch(epoch, train_loss, val_loss):
    """
    Prints the training and validation loss for each epoch in a tabular format.
    
    :param epoch: (int) Current epoch number.
    :param train_loss: (float) Training loss for the current epoch.
    :param val_loss: (float) Validation loss for the current epoch.
    """
    
    # Display epoch information
    print(f"\n{'='*40}")
    print(f"Epoch {epoch}")
    print(f"{'='*40}")
    
    # Initialize the table
    table = PrettyTable()
    
    # Set the column names for the table
    table.field_names = ["Data Type", "Loss"]
    
    # Add rows to the table for training and validation data
    table.add_row(["Training", f"{train_loss:.4f}"])
    table.add_row(["Validation", f"{val_loss:.4f}"])
    
    # Print the table
    print(table)

## Updating Label Data Types

In this section, we are updating the data types of the label tensors to torch.float32. This data type conversion is essential for ensuring compatibility with PyTorch operations, when using loss functions like BCEWithLogitsLoss.

In [13]:
# Converting the data type of training labels to float32
train_labels = torch.tensor(train_labels, dtype=torch.float32)

# Converting the data type of validation labels to float32
val_labels = torch.tensor(val_labels, dtype=torch.float32)

# Converting the data type of testing labels to float32
test_labels = torch.tensor(test_labels, dtype=torch.float32)

  train_labels = torch.tensor(train_labels, dtype=torch.float32)
  val_labels = torch.tensor(val_labels, dtype=torch.float32)
  test_labels = torch.tensor(test_labels, dtype=torch.float32)


## Training and Validation Loop

In this section, we set up and run the training and validation loop. We initialize arrays to keep track of training and validation losses over the epochs. In each epoch, we perform a forward and backward pass on the training data, update the model parameters, and compute the average training loss. Then, we evaluate the model on the validation set, compute the average validation loss, and evaluate several performance metrics.

In [14]:
# Arrays to store training and validation loss values across epochs
train_loss_values = []
val_loss_values = []

# Training loop across epochs
for epoch in range(16):
    model.train()  # Set model to training mode
    
    avg_train_loss = 0  # Initialize average training loss for the epoch
    avg_val_loss = 0  # Initialize average validation loss for the epoch
    
    # Loop over batches of training data
    for i, batch in enumerate(train_dataloader):
        
        # Send input data and labels to the device
        inputs, masks, labels = tuple(t.to(device) for t in batch)
        
        # Ensure labels are float for loss computation
        labels = labels.float()

        # Forward pass: compute predictions
        outputs = model(inputs, attention_mask=masks)
        logits_from_model = outputs.logits
        logits = classifier(logits_from_model)
        
        # Compute loss
        loss = criterion(logits, labels)
        
        # Perform backward pass to compute gradients
        loss.backward()
        
        # Gradient clipping to prevent exploding gradients
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        
        # Update model parameters
        optimizer.step()
        optimizer.zero_grad()
        
        # Update average training loss
        avg_train_loss += loss.item() / len(train_dataloader)
        
        # Print training loss every 100 batches
        if i % 100 == 0:
            print(f"Epoch: {epoch}, Batch: {i}, Training Loss: {loss.item()}")
        
    # Store average training loss for the epoch
    train_loss_values.append(avg_train_loss)

    # Validation: Evaluate model on validation data
    model.eval()  # Set model to evaluation mode
    val_preds = []
    val_true = []
    with torch.no_grad():  # No gradient computation
        for batch in val_dataloader:
            
            # Send input data and labels to the device
            inputs, masks, labels = tuple(t.to(device) for t in batch)
            
            # Forward pass: compute predictions
            outputs = model(inputs, attention_mask=masks)
            logits_from_model = outputs.logits
            logits = classifier(logits_from_model)
            
            # Store predictions and true labels for later evaluation
            preds = torch.sigmoid(logits).cpu().numpy()
            val_preds.extend(preds)
            val_true.extend(labels.cpu().numpy())
            
            # Update average validation loss
            avg_val_loss += loss.item() / len(val_dataloader)
        val_loss_values.append(avg_val_loss)

    # Convert prediction probabilities to binary predictions
    val_preds = np.array(val_preds) >= 0.7
    val_true = np.array(val_true)
    
    # Compute validation accuracy
    val_accuracy = accuracy_score(val_true, val_preds)
    
    # Print summary for the epoch
    pretty_print_epoch(epoch, avg_train_loss, avg_val_loss)
    
    # Compute and print several performance metrics on validation data
    precision_micro, recall_micro, f1_micro, _ = precision_recall_fscore_support(val_true, val_preds, average='micro')
    precision_macro, recall_macro, f1_macro, _ = precision_recall_fscore_support(val_true, val_preds, average='macro')
    hamming = hamming_loss(val_true, val_preds)
    jaccard = jaccard_score(val_true, val_preds, average='samples')
    logloss = log_loss(val_true, val_preds)
    
    val_metrics = {
        'Validation Accuracy': val_accuracy,
        'Micro-average Precision': precision_micro,
        'Micro-average Recall': recall_micro,
        'Micro-average F1': f1_micro,
        'Macro-average Precision': precision_macro,
        'Macro-average Recall': recall_macro,
        'Macro-average F1': f1_macro,
        'Hamming Loss': hamming,
        'Jaccard Similarity': jaccard,
        'Log Loss': logloss
    }

    pretty_print_metrics('Validation Metrics', val_metrics)  # Use PrettyTable to print validation metrics


Epoch: 0, Batch: 0, Training Loss: 1.3420305252075195

Epoch 0
+------------+--------+
| Data Type  |  Loss  |
+------------+--------+
|  Training  | 1.3238 |
| Validation | 1.1733 |
+------------+--------+

Validation Metrics
+-------------------------+---------+
|          Metric         |  Value  |
+-------------------------+---------+
|   Validation Accuracy   |  0.0242 |
| Micro-average Precision |  0.0852 |
|   Micro-average Recall  |  0.1210 |
|     Micro-average F1    |  0.1000 |
| Macro-average Precision |  0.0327 |
|   Macro-average Recall  |  0.0802 |
|     Macro-average F1    |  0.0337 |
|       Hamming Loss      |  0.0272 |
|    Jaccard Similarity   |  0.0612 |
|         Log Loss        | 17.3227 |
+-------------------------+---------+


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Epoch: 1, Batch: 0, Training Loss: 1.172568678855896

Epoch 1
+------------+--------+
| Data Type  |  Loss  |
+------------+--------+
|  Training  | 1.0910 |
| Validation | 0.9795 |
+------------+--------+

Validation Metrics
+-------------------------+--------+
|          Metric         | Value  |
+-------------------------+--------+
|   Validation Accuracy   | 0.0484 |
| Micro-average Precision | 0.0959 |
|   Micro-average Recall  | 0.8065 |
|     Micro-average F1    | 0.1714 |
| Macro-average Precision | 0.1553 |
|   Macro-average Recall  | 0.5992 |
|     Macro-average F1    | 0.2059 |
|       Hamming Loss      | 0.0975 |
|    Jaccard Similarity   | 0.1709 |
|         Log Loss        | 7.8362 |
+-------------------------+--------+


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Epoch: 2, Batch: 0, Training Loss: 0.9479204416275024

Epoch 2
+------------+--------+
| Data Type  |  Loss  |
+------------+--------+
|  Training  | 0.9164 |
| Validation | 0.8335 |
+------------+--------+

Validation Metrics
+-------------------------+--------+
|          Metric         | Value  |
+-------------------------+--------+
|   Validation Accuracy   | 0.0323 |
| Micro-average Precision | 0.1051 |
|   Micro-average Recall  | 0.9113 |
|     Micro-average F1    | 0.1885 |
| Macro-average Precision | 0.1698 |
|   Macro-average Recall  | 0.6596 |
|     Macro-average F1    | 0.2382 |
|       Hamming Loss      | 0.0981 |
|    Jaccard Similarity   | 0.1712 |
|         Log Loss        | 4.8578 |
+-------------------------+--------+


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Epoch: 3, Batch: 0, Training Loss: 0.8092786073684692

Epoch 3
+------------+--------+
| Data Type  |  Loss  |
+------------+--------+
|  Training  | 0.7836 |
| Validation | 0.7078 |
+------------+--------+

Validation Metrics
+-------------------------+--------+
|          Metric         | Value  |
+-------------------------+--------+
|   Validation Accuracy   | 0.0565 |
| Micro-average Precision | 0.1380 |
|   Micro-average Recall  | 0.9194 |
|     Micro-average F1    | 0.2400 |
| Macro-average Precision | 0.2421 |
|   Macro-average Recall  | 0.6637 |
|     Macro-average F1    | 0.3074 |
|       Hamming Loss      | 0.0728 |
|    Jaccard Similarity   | 0.2317 |
|         Log Loss        | 4.5575 |
+-------------------------+--------+


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Epoch: 4, Batch: 0, Training Loss: 0.6726325750350952

Epoch 4
+------------+--------+
| Data Type  |  Loss  |
+------------+--------+
|  Training  | 0.6576 |
| Validation | 0.5940 |
+------------+--------+

Validation Metrics
+-------------------------+--------+
|          Metric         | Value  |
+-------------------------+--------+
|   Validation Accuracy   | 0.0887 |
| Micro-average Precision | 0.1729 |
|   Micro-average Recall  | 0.9274 |
|     Micro-average F1    | 0.2915 |
| Macro-average Precision | 0.2621 |
|   Macro-average Recall  | 0.6762 |
|     Macro-average F1    | 0.3265 |
|       Hamming Loss      | 0.0564 |
|    Jaccard Similarity   | 0.2895 |
|         Log Loss        | 3.7857 |
+-------------------------+--------+


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Epoch: 5, Batch: 0, Training Loss: 0.5445205569267273

Epoch 5
+------------+--------+
| Data Type  |  Loss  |
+------------+--------+
|  Training  | 0.5415 |
| Validation | 0.4781 |
+------------+--------+

Validation Metrics
+-------------------------+--------+
|          Metric         | Value  |
+-------------------------+--------+
|   Validation Accuracy   | 0.2500 |
| Micro-average Precision | 0.2457 |
|   Micro-average Recall  | 0.9194 |
|     Micro-average F1    | 0.3878 |
| Macro-average Precision | 0.3241 |
|   Macro-average Recall  | 0.6621 |
|     Macro-average F1    | 0.3883 |
|       Hamming Loss      | 0.0363 |
|    Jaccard Similarity   | 0.4275 |
|         Log Loss        | 3.9316 |
+-------------------------+--------+


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Epoch: 6, Batch: 0, Training Loss: 0.4461219310760498

Epoch 6
+------------+--------+
| Data Type  |  Loss  |
+------------+--------+
|  Training  | 0.4501 |
| Validation | 0.4025 |
+------------+--------+

Validation Metrics
+-------------------------+--------+
|          Metric         | Value  |
+-------------------------+--------+
|   Validation Accuracy   | 0.3468 |
| Micro-average Precision | 0.2687 |
|   Micro-average Recall  | 0.9274 |
|     Micro-average F1    | 0.4167 |
| Macro-average Precision | 0.3466 |
|   Macro-average Recall  | 0.6762 |
|     Macro-average F1    | 0.4098 |
|       Hamming Loss      | 0.0325 |
|    Jaccard Similarity   | 0.4962 |
|         Log Loss        | 3.0011 |
+-------------------------+--------+


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Epoch: 7, Batch: 0, Training Loss: 0.3676368296146393

Epoch 7
+------------+--------+
| Data Type  |  Loss  |
+------------+--------+
|  Training  | 0.3757 |
| Validation | 0.3243 |
+------------+--------+

Validation Metrics
+-------------------------+--------+
|          Metric         | Value  |
+-------------------------+--------+
|   Validation Accuracy   | 0.3387 |
| Micro-average Precision | 0.3018 |
|   Micro-average Recall  | 0.9274 |
|     Micro-average F1    | 0.4554 |
| Macro-average Precision | 0.3543 |
|   Macro-average Recall  | 0.6746 |
|     Macro-average F1    | 0.4190 |
|       Hamming Loss      | 0.0277 |
|    Jaccard Similarity   | 0.5077 |
|         Log Loss        | 2.6842 |
+-------------------------+--------+


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Epoch: 8, Batch: 0, Training Loss: 0.2997286915779114

Epoch 8
+------------+--------+
| Data Type  |  Loss  |
+------------+--------+
|  Training  | 0.3192 |
| Validation | 0.2775 |
+------------+--------+

Validation Metrics
+-------------------------+--------+
|          Metric         | Value  |
+-------------------------+--------+
|   Validation Accuracy   | 0.4113 |
| Micro-average Precision | 0.3574 |
|   Micro-average Recall  | 0.9194 |
|     Micro-average F1    | 0.5147 |
| Macro-average Precision | 0.4138 |
|   Macro-average Recall  | 0.6621 |
|     Macro-average F1    | 0.4669 |
|       Hamming Loss      | 0.0217 |
|    Jaccard Similarity   | 0.5648 |
|         Log Loss        | 2.8326 |
+-------------------------+--------+


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Epoch: 9, Batch: 0, Training Loss: 0.2542976140975952

Epoch 9
+------------+--------+
| Data Type  |  Loss  |
+------------+--------+
|  Training  | 0.2748 |
| Validation | 0.2435 |
+------------+--------+

Validation Metrics
+-------------------------+--------+
|          Metric         | Value  |
+-------------------------+--------+
|   Validation Accuracy   | 0.4435 |
| Micro-average Precision | 0.3750 |
|   Micro-average Recall  | 0.9194 |
|     Micro-average F1    | 0.5327 |
| Macro-average Precision | 0.4334 |
|   Macro-average Recall  | 0.6621 |
|     Macro-average F1    | 0.4828 |
|       Hamming Loss      | 0.0202 |
|    Jaccard Similarity   | 0.5911 |
|         Log Loss        | 2.7826 |
+-------------------------+--------+


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Epoch: 10, Batch: 0, Training Loss: 0.21940842270851135

Epoch 10
+------------+--------+
| Data Type  |  Loss  |
+------------+--------+
|  Training  | 0.2400 |
| Validation | 0.2116 |
+------------+--------+

Validation Metrics
+-------------------------+--------+
|          Metric         | Value  |
+-------------------------+--------+
|   Validation Accuracy   | 0.4677 |
| Micro-average Precision | 0.3966 |
|   Micro-average Recall  | 0.9274 |
|     Micro-average F1    | 0.5556 |
| Macro-average Precision | 0.4580 |
|   Macro-average Recall  | 0.6663 |
|     Macro-average F1    | 0.5034 |
|       Hamming Loss      | 0.0185 |
|    Jaccard Similarity   | 0.6137 |
|         Log Loss        | 2.4529 |
+-------------------------+--------+


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Epoch: 11, Batch: 0, Training Loss: 0.1919858604669571

Epoch 11
+------------+--------+
| Data Type  |  Loss  |
+------------+--------+
|  Training  | 0.2103 |
| Validation | 0.1848 |
+------------+--------+

Validation Metrics
+-------------------------+--------+
|          Metric         | Value  |
+-------------------------+--------+
|   Validation Accuracy   | 0.4839 |
| Micro-average Precision | 0.4107 |
|   Micro-average Recall  | 0.9274 |
|     Micro-average F1    | 0.5693 |
| Macro-average Precision | 0.4808 |
|   Macro-average Recall  | 0.6746 |
|     Macro-average F1    | 0.5225 |
|       Hamming Loss      | 0.0175 |
|    Jaccard Similarity   | 0.6239 |
|         Log Loss        | 2.4280 |
+-------------------------+--------+


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Epoch: 12, Batch: 0, Training Loss: 0.17076902091503143

Epoch 12
+------------+--------+
| Data Type  |  Loss  |
+------------+--------+
|  Training  | 0.1883 |
| Validation | 0.1634 |
+------------+--------+

Validation Metrics
+-------------------------+--------+
|          Metric         | Value  |
+-------------------------+--------+
|   Validation Accuracy   | 0.5000 |
| Micro-average Precision | 0.4238 |
|   Micro-average Recall  | 0.9194 |
|     Micro-average F1    | 0.5802 |
| Macro-average Precision | 0.4684 |
|   Macro-average Recall  | 0.6621 |
|     Macro-average F1    | 0.5114 |
|       Hamming Loss      | 0.0166 |
|    Jaccard Similarity   | 0.6378 |
|         Log Loss        | 2.6833 |
+-------------------------+--------+


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Epoch: 13, Batch: 0, Training Loss: 0.14972208440303802

Epoch 13
+------------+--------+
| Data Type  |  Loss  |
+------------+--------+
|  Training  | 0.1699 |
| Validation | 0.1519 |
+------------+--------+

Validation Metrics
+-------------------------+--------+
|          Metric         | Value  |
+-------------------------+--------+
|   Validation Accuracy   | 0.4839 |
| Micro-average Precision | 0.4302 |
|   Micro-average Recall  | 0.9194 |
|     Micro-average F1    | 0.5861 |
| Macro-average Precision | 0.4774 |
|   Macro-average Recall  | 0.6621 |
|     Macro-average F1    | 0.5161 |
|       Hamming Loss      | 0.0162 |
|    Jaccard Similarity   | 0.6306 |
|         Log Loss        | 2.6795 |
+-------------------------+--------+


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Epoch: 14, Batch: 0, Training Loss: 0.13372908532619476

Epoch 14
+------------+--------+
| Data Type  |  Loss  |
+------------+--------+
|  Training  | 0.1573 |
| Validation | 0.1386 |
+------------+--------+

Validation Metrics
+-------------------------+--------+
|          Metric         | Value  |
+-------------------------+--------+
|   Validation Accuracy   | 0.4758 |
| Micro-average Precision | 0.4101 |
|   Micro-average Recall  | 0.9194 |
|     Micro-average F1    | 0.5672 |
| Macro-average Precision | 0.4705 |
|   Macro-average Recall  | 0.6621 |
|     Macro-average F1    | 0.5110 |
|       Hamming Loss      | 0.0175 |
|    Jaccard Similarity   | 0.6195 |
|         Log Loss        | 2.7120 |
+-------------------------+--------+


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Epoch: 15, Batch: 0, Training Loss: 0.12760165333747864

Epoch 15
+------------+--------+
| Data Type  |  Loss  |
+------------+--------+
|  Training  | 0.1458 |
| Validation | 0.1300 |
+------------+--------+

Validation Metrics
+-------------------------+--------+
|          Metric         | Value  |
+-------------------------+--------+
|   Validation Accuracy   | 0.4839 |
| Micro-average Precision | 0.4330 |
|   Micro-average Recall  | 0.9113 |
|     Micro-average F1    | 0.5870 |
| Macro-average Precision | 0.4815 |
|   Macro-average Recall  | 0.6517 |
|     Macro-average F1    | 0.5140 |
|       Hamming Loss      | 0.0160 |
|    Jaccard Similarity   | 0.6329 |
|         Log Loss        | 3.2193 |
+-------------------------+--------+


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Evaluation on Test Set

This section presents the evaluation of the model on the test set. The model's performance is assessed across various metrics including loss, accuracy, precision, recall, F1-score, Hamming Loss, Jaccard Similarity, and Log Loss. The evaluation is carried out batch-wise, and the predictions along with labels are stored for metric computation.

In [15]:
# Set the model to evaluation mode
model.eval()

# Initialize variables to keep track of total loss and correct predictions
total_loss = 0
total_correct = 0

# Lists to store all predictions and labels
all_preds = []
all_labels = []

# Loop over batches in the test dataloader
for i, batch in enumerate(test_dataloader):
    
    # Unpack the batch and move tensors to the appropriate device
    inputs, masks, labels = tuple(t.to(device) for t in batch)
    
    # Ensure labels are float for loss computation
    labels = labels.float()
    
    # No gradient computation in evaluation phase
    with torch.no_grad():
        # Forward pass: Compute predictions
        outputs = model(inputs, attention_mask=masks)
        logits_from_model = outputs.logits
        logits = classifier(logits_from_model)
        
        # Compute loss
        loss = criterion(logits, labels)
        
        # Accumulate loss
        total_loss += loss.item()
        
        # Convert logits to probabilities and store predictions and labels
        preds = torch.sigmoid(logits).cpu().numpy()
        all_preds.extend(preds)
        all_labels.extend(labels.cpu().numpy())

# Convert list of predictions and labels to NumPy arrays for easier manipulation
all_preds_np = np.array(all_preds)
all_labels_np = np.array(all_labels)

# Binarize predictions based on threshold
binary_preds = all_preds_np >= 0.7

# Calculate Metrics
precision_micro, recall_micro, f1_micro, _ = precision_recall_fscore_support(all_labels_np, binary_preds, average='micro')
precision_macro, recall_macro, f1_macro, _ = precision_recall_fscore_support(all_labels_np, binary_preds, average='macro')
hamming = hamming_loss(all_labels_np, binary_preds)
jaccard = jaccard_score(all_labels_np, binary_preds, average='samples')
logloss = log_loss(all_labels_np, all_preds_np)
accuracy = accuracy_score(all_labels_np, binary_preds)

# Organize metrics in a dictionary and print using pretty_print_metrics function
test_metrics = {
    'Test Loss': total_loss / len(test_dataloader),
    'Test Accuracy': accuracy,
    'Micro-Average Precision': precision_micro,
    'Micro-Average Recall': recall_micro,
    'Micro-Average F1-score': f1_micro,
    'Macro-Average Precision': precision_macro,
    'Macro-Average Recall': recall_macro,
    'Macro-Average F1-score': f1_macro,
    'Hamming Loss': hamming,
    'Jaccard Similarity': jaccard,
    'Log Loss': logloss
}

# Use PrettyTable to display test metrics
pretty_print_metrics('Test Metrics', test_metrics)



Test Metrics
+-------------------------+--------+
|          Metric         | Value  |
+-------------------------+--------+
|        Test Loss        | 0.3620 |
|      Test Accuracy      | 0.4274 |
| Micro-Average Precision | 0.4264 |
|   Micro-Average Recall  | 0.9113 |
|  Micro-Average F1-score | 0.5810 |
| Macro-Average Precision | 0.5084 |
|   Macro-Average Recall  | 0.7271 |
|  Macro-Average F1-score | 0.5557 |
|       Hamming Loss      | 0.0164 |
|    Jaccard Similarity   | 0.6036 |
|         Log Loss        | 2.0545 |
+-------------------------+--------+


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Plotting Training and Validation Loss

In this section, we use Plotly to create a plot that depicts the training and validation loss over epochs. Each point on the plot represents the loss at the end of an epoch, for both training and validation phases. This visualization helps in understanding how the model is learning and whether it's overfitting (if the validation loss starts increasing).

In [16]:
# Create a figure using Plotly's go.Figure()
fig = go.Figure()

# Add a trace for training loss values over epochs
fig.add_trace(
    go.Scatter(
        x=list(range(len(train_loss_values))),  # X-axis: Epoch number
        y=train_loss_values,  # Y-axis: Training loss value
        mode='lines+markers',  # Mode: Lines + Markers for individual data points
        name='Training Loss'  # Trace name
    )
)

# Add a trace for validation loss values over epochs
fig.add_trace(
    go.Scatter(
        x=list(range(len(val_loss_values))),  # X-axis: Epoch number
        y=val_loss_values,  # Y-axis: Validation loss value
        mode='lines+markers',  # Mode: Lines + Markers for individual data points
        name='Validation Loss'  # Trace name
    )
)

# Update the layout of the figure to include title and axis labels
fig.update_layout(
    title='Training and Validation Loss',  # Title of the plot
    xaxis_title='Epoch',  # X-axis label
    yaxis_title='Loss'  # Y-axis label
)

# Display the figure
fig.show()


## Predicting on Sample Texts

In this section, we create a function called predict_sample_texts to preprocess the provided text samples, run them through the model for predictions, and interpret the results to obtain human-readable labels. This function demonstrates how to use a pretrained transformer model and a classifier to predict on new data.

In [17]:
# Function to preprocess and predict on new text data
def predict_sample_texts(texts, model, classifier, tokenizer, device, threshold=0.7):
    results = []  # List to store the results
    
    # Get the column headers except for the first one as a list
    headers = df.columns.tolist()[1:]
    
    for text in texts:
        # Step 1: Preprocess
        # Tokenize the text and add special tokens
        input_id = tokenizer.encode(text, add_special_tokens=True)
        # Pad or truncate the token ids to a fixed length
        input_id = pad_sequences([input_id], maxlen=128, dtype="long", value=0, truncating="post", padding="post")
        
        # Create attention mask
        attention_mask = [[float(i > 0) for i in seq] for seq in input_id]
        
        # Convert to PyTorch tensors and move to the given device
        input_id = torch.tensor(input_id).to(device)
        attention_mask = torch.tensor(attention_mask).to(device)
        
        # Step 2: Model Prediction
        # Set the model and classifier to evaluation mode
        model.eval()
        classifier.eval()
        
        # No gradient computation to save memory
        with torch.no_grad():
            # Obtain model outputs
            output = model(input_id, attention_mask=attention_mask)
            logits_from_model = output.logits
            logits = classifier(logits_from_model)
            # Convert logits to probabilities
            probabilities = torch.sigmoid(logits).cpu().numpy()
            
        # Step 3: Interpret the Result
        # Binarize probabilities based on the threshold
        binary_output = (probabilities >= threshold).astype(int)
        
        # Store the result in a dictionary and append to results list
        results.append({
            'text': text,
            'probabilities': probabilities,
            'binary_output': binary_output,
            # Get the predicted labels
            'predicted_labels': [label for label, output in zip(headers, binary_output[0]) if output == 1]
        })
        
    return results  # Return the results list

# Array of sample texts along with HPO terms for context
# Headache (HP:0002315)
# Dry skin (HP:0000958)
# Nocturia (HP:0000017)
sample_texts = [
    "Still thirsty, still constantly in the bathroom, and now I'm getting headaches that are making it tough to focus. Probably from all the lost fluids. Doctor says my sodium levels are still high. It's like my body's a malfunctioning tap.",
    "I thought things couldn't get worse, but now my skin's as dry as the Sahara. Between this and everything else, I'm feeling pretty miserable.",
    "Been waking up in the middle of the night to use the bathroom. It's affecting my sleep now, and I've started to feel groggy all day long."
]

# Get prediction
results = predict_sample_texts(sample_texts, model, classifier, tokenizer, device)

# Output the results
for i, result in enumerate(results):
    print(f"Sample Text {i+1}: {result['text']}")
    print(f"Predicted Labels: {result['predicted_labels']}")
    print("=" * 50)  # Print a separator


Sample Text 1: Still thirsty, still constantly in the bathroom, and now I'm getting headaches that are making it tough to focus. Probably from all the lost fluids. Doctor says my sodium levels are still high. It's like my body's a malfunctioning tap.
Predicted Labels: ['HP:0006959']
Sample Text 2: I thought things couldn't get worse, but now my skin's as dry as the Sahara. Between this and everything else, I'm feeling pretty miserable.
Predicted Labels: []
Sample Text 3: Been waking up in the middle of the night to use the bathroom. It's affecting my sleep now, and I've started to feel groggy all day long.
Predicted Labels: ['HP:0002451']


## Hyperparameter Tuning for BERT Sequence Classification

In this snippet, we define a training routine for a BERT-based sequence classification task. We utilize the hyperopt library to search for the optimal hyperparameters for training.

In [None]:
from transformers import BertForSequenceClassification
from torch.utils.data import DataLoader, TensorDataset
from torch.optim import Adam, SGD, lr_scheduler
import torch
import numpy as np
from hyperopt import fmin, tpe, hp, Trials, STATUS_OK

# Assume the definition of Focal Loss is available
# from your_focal_loss import FocalLoss

# Assume train_data, val_data, train_labels, n_neg, and n_pos are pre-defined TensorDatasets
# They can also be passed as parameters if needed

def training_function(params):
    # Extract parameters from the params dictionary
    batch_size = params['batch_size']
    learning_rate = params['lr']
    num_epochs = params['num_epochs']
    optimizer_type = params['optimizer']
    step_size = params['step_size']
    step_gamma = params['step_gamma']
    
    # Initialize metric storage
    train_loss_values = []
    val_loss_values = []
    
    # Initialize data loaders
    train_dataloader = DataLoader(train_data, batch_size=batch_size)
    val_dataloader = DataLoader(val_data, batch_size=batch_size)
    
    # Initialize model
    model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=train_labels.shape[1])
    classifier = torch.nn.Linear(train_labels.shape[1], train_labels.shape[1])

    # Send model to device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    classifier.to(device)
    
    # Initialize optimizer and scheduler
    if optimizer_type == 'adam':
        optimizer = Adam(list(model.parameters()) + list(classifier.parameters()), lr=learning_rate)
    else:
        optimizer = SGD(list(model.parameters()) + list(classifier.parameters()), lr=learning_rate)
    
    scheduler = lr_scheduler.StepLR(optimizer, step_size=step_size, gamma=step_gamma)
    
    # Initialize loss function
    pos_weights = (n_neg + 1e-5) / (n_pos + 1e-5)
    pos_weights = torch.tensor(pos_weights).to(device)
    criterion = torch.nn.BCEWithLogitsLoss(pos_weight=pos_weights)
    
    # Training loop
    for epoch in range(num_epochs):
        model.train()
        avg_train_loss = 0
        avg_val_loss = 0
        
        for i, batch in enumerate(train_dataloader):
            inputs, masks, labels = tuple(t.to(device) for t in batch)
            labels = labels.float()  # Ensure labels are float if required
            outputs = model(inputs, attention_mask=masks)
            logits_from_model = outputs.logits
            logits = classifier(logits_from_model)
            loss = criterion(logits, labels)

            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            
            avg_train_loss += loss.item() / len(train_dataloader)
        
        scheduler.step()
        
        # Validation loop
        model.eval()
        with torch.no_grad():
            for batch in val_dataloader:
                inputs, masks, labels = tuple(t.to(device) for t in batch)
                labels = labels.float()  # Ensure labels are float if required
                outputs = model(inputs, attention_mask=masks)
                logits_from_model = outputs.logits
                logits = classifier(logits_from_model)
                loss = criterion(logits, labels)
                
                avg_val_loss += loss.item() / len(val_dataloader)
                
        train_loss_values.append(avg_train_loss)
        val_loss_values.append(avg_val_loss)
        
    return {'loss': avg_val_loss, 'status': STATUS_OK}

# Define the space of hyperparameters to search
space = {
    'lr': hp.loguniform('lr', -7, 0),
    'batch_size': hp.choice('batch_size', [8, 16, 32, 64]),
    'num_epochs': hp.choice('num_epochs', [2, 5, 10]),
    'step_size': hp.choice('step_size', [5, 10, 20]),
    'step_gamma': hp.uniform('step_gamma', 0.5, 1),
    'optimizer': hp.choice('optimizer', ['adam', 'sgd']),
}

# Objective function is the function to optimize
trials = Trials()
best = fmin(training_function,
            space=space,
            algo=tpe.suggest,
            max_evals=7,
            trials=trials)

print(f"Best hyperparameters: {best}")