In [1]:
import pandas as pd
import matplotlib.pyplot as plt

# Load the training dataset
train_df = pd.read_csv("Train.csv")
disease_desc_df = pd.read_csv("Disease_Description.csv")

print(train_df.head, disease_desc_df.head)

<bound method NDFrame.head of         Id                                        description  \
0        1   Patient had a recurrent left arm pain after h...   
1        2   The patient is an 84-year-old female presente...   
2        3                                   Hand dermatitis.   
3        4   Recurrent degenerative spondylolisthesis and ...   
4        5   Chiropractic IME with old files review.  Deta...   
...    ...                                                ...   
3379  3380   Morbid obesity.  Laparoscopic Roux-en-Y gastr...   
3380  3381   Lateral and plantar condylectomy, fifth left ...   
3381  3382   Right heart and left heart catheterization by...   
3382  3383   Left axillary lymph node excisional biopsy.  ...   
3383  3384   Aspiration pneumonia and chronic obstructive ...   

                medical_specialty                             sample_name  \
0      Cardiovascular / Pulmonary                       Angina - Consult    
1      Cardiovascular / Pulmonary  

In [2]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Download necessary NLTK resources (only needed once)
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

# Define a text cleaning function
def clean_text(text, remove_stopwords=True, perform_lemmatization=True):
    """
    Clean the input text:
    - Convert to lowercase
    - Remove punctuation and special characters
    - Remove extra whitespace
    - Tokenize and remove stopwords
    - Perform lemmatization
    """
    # Lowercase conversion
    text = text.lower()
    
    # Remove punctuation and special characters (keeping only alphanumeric and whitespace)
    text = re.sub(r'[^a-z0-9\s]', '', text)
    
    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    
    # Tokenize text
    tokens = nltk.word_tokenize(text)
    
    # Remove stopwords
    if remove_stopwords:
        stop_words = set(stopwords.words('english'))
        tokens = [word for word in tokens if word not in stop_words]
    
    # Perform lemmatization
    if perform_lemmatization:
        lemmatizer = WordNetLemmatizer()
        tokens = [lemmatizer.lemmatize(word) for word in tokens]
    
    # Join tokens back to a single string
    cleaned_text = " ".join(tokens)
    return cleaned_text

[nltk_data] Downloading package stopwords to /Users/dj/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/dj/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/dj/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
# Handle missing values
train_df['transcription'] = train_df['transcription'].fillna('')

# Apply text cleaning
train_df['cleaned_description'] = train_df['description'].apply(lambda x: clean_text(x))
train_df['cleaned_transcription'] = train_df['transcription'].apply(lambda x: clean_text(x))

# For Disease_Description.csv, clean the "Description" column
disease_desc_df['cleaned_Description'] = disease_desc_df['Description'].apply(lambda x: clean_text(x))

# Check sample outputs
print("Sample from Train.csv:")
print(train_df[['description', 'cleaned_description']].head())
print(train_df[['transcription', 'cleaned_transcription']].head())

print("\nSample from Disease_Description.csv:")
print(disease_desc_df[['Description', 'cleaned_Description']].head())

Sample from Train.csv:
                                         description  \
0   Patient had a recurrent left arm pain after h...   
1   The patient is an 84-year-old female presente...   
2                                   Hand dermatitis.   
3   Recurrent degenerative spondylolisthesis and ...   
4   Chiropractic IME with old files review.  Deta...   

                                 cleaned_description  
0  patient recurrent left arm pain stent three da...  
1  patient 84yearold female presented emergency r...  
2                                    hand dermatitis  
3  recurrent degenerative spondylolisthesis steno...  
4  chiropractic ime old file review detailed thor...  
                                       transcription  \
0  HISTORY OF PRESENT ILLNESS: , The patient is a...   
1  REASON FOR CONSULTATION:,  Pericardial effusio...   
2  SUBJECTIVE:,  This is a 29-year-old Vietnamese...   
3  PREOPERATIVE DIAGNOSIS: , Recurrent degenerati...   
4  DATE OF INJURY : October 4,

## Medical Specialty Classification(Use the cleaned_description for LogisticRegression as baseline model)

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer

In [23]:
# Split the data into training and testing sets.
X = train_df['cleaned_description']
y = train_df['medical_specialty']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize a TF-IDF vectorizer and transform the text data.
tfidf_vectorizer = TfidfVectorizer(max_features=1000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Train a Logistic Regression classifier as a baseline.
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train_tfidf, y_train)

# Predict on the test set and print a classification report.
y_pred = clf.predict(X_test_tfidf)
print("=== Medical Specialty Classification Report ===")
print(classification_report(y_test, y_pred, zero_division=0))

=== Medical Specialty Classification Report ===
                                precision    recall  f1-score   support

                    Bariatrics       0.00      0.00      0.00         5
    Cardiovascular / Pulmonary       0.35      0.29      0.31        59
                  Chiropractic       0.00      0.00      0.00         3
    Consult - History and Phy.       0.22      0.35      0.27        62
    Cosmetic / Plastic Surgery       0.00      0.00      0.00         2
                     Dentistry       0.00      0.00      0.00         7
                   Dermatology       0.00      0.00      0.00         5
             Discharge Summary       0.33      0.07      0.11        15
          ENT - Otolaryngology       0.17      0.08      0.11        13
                 Endocrinology       0.00      0.00      0.00         3
              Gastroenterology       0.12      0.10      0.11        29
              General Medicine       0.17      0.11      0.13        47
         Hemato

In [24]:
# Train a Logistic Regression classifier with class weight balenced as a baseline.
clf_balanced = LogisticRegression(max_iter=1000, class_weight='balanced')
clf_balanced.fit(X_train_tfidf, y_train)

y_pred_balanced = clf_balanced.predict(X_test_tfidf)
print("=== Medical Specialty Classification Report Balenced ===")
print(classification_report(y_test, y_pred_balanced, zero_division=0))

=== Medical Specialty Classification Report Balenced ===
                                precision    recall  f1-score   support

          Allergy / Immunology       0.00      0.00      0.00         0
                    Bariatrics       0.62      1.00      0.77         5
    Cardiovascular / Pulmonary       0.47      0.46      0.46        59
                  Chiropractic       0.09      0.33      0.14         3
    Consult - History and Phy.       0.06      0.02      0.03        62
    Cosmetic / Plastic Surgery       0.00      0.00      0.00         2
                     Dentistry       0.58      1.00      0.74         7
                   Dermatology       0.27      0.60      0.38         5
             Discharge Summary       0.11      0.13      0.12        15
          ENT - Otolaryngology       0.15      0.31      0.21        13
                 Endocrinology       0.25      0.67      0.36         3
              Gastroenterology       0.36      0.69      0.47        29
      

In [25]:
import pandas as pd
import numpy as np
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score
from sklearn.utils.class_weight import compute_class_weight
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments, AutoConfig
from datasets import Dataset, DatasetDict
import torch.nn as nn
import torch.nn.functional as F

In [27]:
# Load Data and Prepare Labels
labels = train_df['medical_specialty'].unique()
label2id = {label: i for i, label in enumerate(labels)}
id2label = {i: label for label, i in label2id.items()}
train_df['label'] = train_df['medical_specialty'].map(label2id)

print(train_df.head)

<bound method NDFrame.head of         Id                                        description  \
0        1   Patient had a recurrent left arm pain after h...   
1        2   The patient is an 84-year-old female presente...   
2        3                                   Hand dermatitis.   
3        4   Recurrent degenerative spondylolisthesis and ...   
4        5   Chiropractic IME with old files review.  Deta...   
...    ...                                                ...   
3379  3380   Morbid obesity.  Laparoscopic Roux-en-Y gastr...   
3380  3381   Lateral and plantar condylectomy, fifth left ...   
3381  3382   Right heart and left heart catheterization by...   
3382  3383   Left axillary lymph node excisional biopsy.  ...   
3383  3384   Aspiration pneumonia and chronic obstructive ...   

                medical_specialty                             sample_name  \
0      Cardiovascular / Pulmonary                       Angina - Consult    
1      Cardiovascular / Pulmonary  

In [29]:
label_counts = train_df['label'].value_counts()
print(label_counts)

low_freq_labels = label_counts[label_counts < 2].index.tolist()
print("label counts less then 2:", low_freq_labels)

label
2     788
8     310
0     309
5     294
9     225
6     189
4     188
11    166
20    130
21     79
18     75
16     73
10     72
12     69
22     64
7      55
13     43
14     35
1      23
27     22
25     21
24     19
23     18
26     17
29     17
17     14
3      13
19     12
33     11
30      9
15      8
32      8
31      5
28      2
34      1
Name: count, dtype: int64
label counts less then 2: [34]


In [30]:
# Filter low_freq_labels out
train_df_filtered = train_df[~train_df['label'].isin(low_freq_labels)]
print(len(train_df_filtered))

3383


In [44]:
# Split Data and Convert to Hugging Face Dataset
train_data, test_data = train_test_split(
    train_df_filtered, 
    test_size=0.2, 
    random_state=42, 
    stratify=train_df_filtered['label']
)

train_dataset = Dataset.from_pandas(train_data)
test_dataset = Dataset.from_pandas(test_data)
dataset = DatasetDict({"train": train_dataset, "test": test_dataset})

In [45]:
# Tokenization using Bio_ClinicalBERT Tokenizer
tokenizer = BertTokenizer.from_pretrained('emilyalsentzer/Bio_ClinicalBERT')

def tokenize_function(example):
    return tokenizer(example['cleaned_transcription'], padding='max_length', truncation=True, max_length=128)

tokenized_datasets = dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/2706 [00:00<?, ? examples/s]

Map:   0%|          | 0/677 [00:00<?, ? examples/s]

In [47]:
# Compute Class Weights to address imbalance
class_weights = compute_class_weight('balanced', classes=np.unique(train_df['label']), y=train_df['label'])
class_weights = torch.tensor(class_weights, dtype=torch.float)
print("Class weights:", class_weights)

Class weights: tensor([ 0.3129,  4.2037,  0.1227,  7.4374,  0.5143,  0.3289,  0.5116,  1.7579,
         0.3119,  0.4297,  1.3429,  0.5824,  1.4012,  2.2485,  2.7624, 12.0857,
         1.3245,  6.9061,  1.2891,  8.0571,  0.7437,  1.2239,  1.5107,  5.3714,
         5.0887,  4.6041,  5.6874,  4.3948, 48.3429,  5.6874, 10.7429, 19.3371,
        12.0857,  8.7896, 96.6857])


In [48]:
# Define a Custom Model that Incorporates Class Weights
class WeightedBertForSequenceClassification(BertForSequenceClassification):
    def __init__(self, config, class_weights):
        super().__init__(config)
        self.class_weights = class_weights

    def forward(self, input_ids, attention_mask=None, token_type_ids=None, labels=None):
        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids
        )
        pooled_output = outputs[1]
        logits = self.classifier(pooled_output)
        
        loss = None
        if labels is not None:
            # Use weighted CrossEntropyLoss to handle class imbalance
            loss_fct = nn.CrossEntropyLoss(weight=self.class_weights.to(input_ids.device))
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
        return {'loss': loss, 'logits': logits} if loss is not None else logits

In [49]:
# Initialize the Model with the Custom Loss Function
config = AutoConfig.from_pretrained(
    'emilyalsentzer/Bio_ClinicalBERT', 
    num_labels=len(labels), 
    id2label=id2label, 
    label2id=label2id
)
model = WeightedBertForSequenceClassification.from_pretrained(
    'emilyalsentzer/Bio_ClinicalBERT', 
    config=config, 
    class_weights=class_weights
)

Some weights of WeightedBertForSequenceClassification were not initialized from the model checkpoint at emilyalsentzer/Bio_ClinicalBERT and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [50]:
# Define Evaluation Metrics
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=1)
    accuracy = accuracy_score(labels, preds)
    f1 = f1_score(labels, preds, average='macro')
    precision = precision_score(labels, preds, average='macro')
    recall = recall_score(labels, preds, average='macro')
    
    # For ROC-AUC, compute probabilities via softmax
    try:
        probs = torch.softmax(torch.tensor(logits), dim=1).numpy()
        roc_auc = roc_auc_score(labels, probs, multi_class='ovr')
    except Exception as e:
        roc_auc = None
    return {
        'accuracy': accuracy, 
        'f1': f1, 
        'precision': precision, 
        'recall': recall, 
        'roc_auc': roc_auc
    }

In [51]:
# Set Training Arguments and Initialize Trainer
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=2e-5,
    weight_decay=0.01,
    logging_steps=50,
    load_best_model_at_end=True,
    metric_for_best_model="f1"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    compute_metrics=compute_metrics
)

In [52]:
# Train and Evaluate the Model
trainer.train()

metrics = trainer.evaluate()
print("Evaluation Metrics:")
print(metrics)

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,Roc Auc
1,3.2607,3.084475,0.27031,0.092048,0.118946,0.115542,
2,2.7013,2.62066,0.302806,0.198543,0.247159,0.229519,
3,2.3475,2.439146,0.311669,0.213972,0.240775,0.259677,


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Evaluation Metrics:
{'eval_loss': 2.439145803451538, 'eval_accuracy': 0.31166912850812406, 'eval_f1': 0.21397215960652202, 'eval_precision': 0.24077525683952933, 'eval_recall': 0.259677136379215, 'eval_roc_auc': None, 'eval_runtime': 4.5822, 'eval_samples_per_second': 147.745, 'eval_steps_per_second': 18.55, 'epoch': 3.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
