In [1]:
import pandas as pd

# Load your dataset
file_path = r'C:\Users\jites\Desktop\new_folder\synthetic_patient_data_with_lab_results.xlsx'
df = pd.read_excel(file_path)
print(df.shape)
df.head(3)

(1000, 6)


Unnamed: 0,Age,Gender,Symptoms,Lab Test Results,Illness History,Diagnosis Suggested
0,84,Female,"Hot Flashes, Muscle Cramps","HR: 97 bpm, BP: 103/78 mmHg, RR: 13, O2 Sat: 9...","Leg Weakness, Strong Urine Odor",Chronic Obstructive Pulmonary Disease
1,31,Male,"Bleeding after Injury, Hoarse Voice, Dehydration","HR: 75 bpm, RR: 23, Temp: 37.3°C, BP: 146/119 ...","Blue or Purple Fingers, Dehydration",Cancer
2,51,Male,"Leg Pain, Sore Throat, Dry Eyes, Swollen Ankle...","HR: 93 bpm, BP: 98/113 mmHg, Temp: 38.9°C, Amy...",,Acute kidney injury


In [2]:
# Prepare the dataset
# Combine all relevant information into a single input text
df['input_text'] = df.apply(lambda x: f"Age: {x['Age']} Gender: {x['Gender']} Symptoms: {x['Symptoms']} History: {x['Illness History']} LabTests: {x['Lab Test Results']}", axis=1)
df['Diagnosis'] = df['Diagnosis Suggested']  # Ensure you have the correct column for labels

# Convert diagnosis to numerical codes
labels = df['Diagnosis'].astype('category').cat.codes

In [3]:
from sklearn.model_selection import train_test_split

train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['input_text'], labels, test_size=0.1, random_state=42
)

In [4]:
from datasets import Dataset

train_dataset = Dataset.from_dict({'input_text': train_texts, 'labels': train_labels})
val_dataset = Dataset.from_dict({'input_text': val_texts, 'labels': val_labels})


  from .autonotebook import tqdm as notebook_tqdm


In [5]:
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
import torch

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


# Load BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# Load BERT model
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(df['Diagnosis'].unique())).to(device)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
# Tokenize the data
def tokenize_function(examples):
    return tokenizer(examples['input_text'], truncation=True, padding='max_length', max_length=256)

# Preprocess the datasets
train_dataset = train_dataset.map(tokenize_function, batched=True)
val_dataset = val_dataset.map(tokenize_function, batched=True)

Map: 100%|██████████| 900/900 [00:00<00:00, 1387.89 examples/s]
Map: 100%|██████████| 100/100 [00:00<00:00, 1207.81 examples/s]


In [7]:
from transformers import get_linear_schedule_with_warmup
from torch.optim.lr_scheduler import CosineAnnealingLR, ReduceLROnPlateau
from torch.optim import AdamW
from transformers import EarlyStoppingCallback


# Training arguments
training_args = TrainingArguments(
    output_dir='./results',
    overwrite_output_dir=True,
    num_train_epochs=10,
    per_device_train_batch_size=8,      ## 2 ...   8   4
    gradient_accumulation_steps=4,  # Accumulate over 4 batches to simulate a batch size of 32
    per_device_eval_batch_size=8,
    eval_steps=200, ## 400,
    save_steps=300, ## 500,
    warmup_steps=300, ## 500,
    eval_strategy="epoch",
    save_strategy="epoch",
    # eval_strategy="steps",
    # save_strategy='steps',
    weight_decay=0.01,  ## L2 regularization
    logging_dir='./logs',
    logging_steps=100,
    load_best_model_at_end=True,  # Load best model at the end based on evaluation
    metric_for_best_model='eval_loss',
    greater_is_better=False,
    learning_rate=2e-4,
    save_total_limit=3,                    # Limit saved models
    eval_accumulation_steps=4,             # Accumulate eval gradients
)


In [8]:
from transformers.optimization import get_scheduler


optimizer = AdamW(model.parameters(), lr=training_args.learning_rate)
num_training_steps = len(train_dataset) // training_args.per_device_train_batch_size * training_args.num_train_epochs
num_warmup_steps = int(0.1 * num_training_steps)  # Example: 10% of training steps for warmup

scheduler = get_scheduler("linear", optimizer=optimizer, 
                            num_warmup_steps=num_warmup_steps, num_training_steps=num_training_steps)

In [9]:
trainer = Trainer(
    model=model,
    args=training_args,
    # compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    optimizers=(optimizer, scheduler),
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)

# Train model
trainer.train()

# Save the model
model.save_pretrained('./bert_diagnosis_model')
tokenizer.save_pretrained('./bert_diagnosis_tokenizer')

trainer.evaluate()

  attn_output = torch.nn.functional.scaled_dot_product_attention(
                                                
 10%|█         | 28/280 [05:19<47:15, 11.25s/it]

{'eval_loss': 3.268796682357788, 'eval_runtime': 3.0126, 'eval_samples_per_second': 33.194, 'eval_steps_per_second': 4.315, 'epoch': 0.99}


                                                
 20%|██        | 56/280 [10:32<41:07, 11.02s/it]

{'eval_loss': 3.2538602352142334, 'eval_runtime': 1.5886, 'eval_samples_per_second': 62.948, 'eval_steps_per_second': 8.183, 'epoch': 1.98}


                                                
 30%|███       | 84/280 [15:55<37:10, 11.38s/it]

{'eval_loss': 3.241973876953125, 'eval_runtime': 1.5932, 'eval_samples_per_second': 62.765, 'eval_steps_per_second': 8.159, 'epoch': 2.97}


 36%|███▌      | 100/280 [18:21<28:13,  9.41s/it]

{'loss': 3.2851, 'grad_norm': 2.805276393890381, 'learning_rate': 0.0001785714285714286, 'epoch': 3.54}


                                                 
 40%|████      | 113/280 [20:23<24:57,  8.96s/it]

{'eval_loss': 3.251154899597168, 'eval_runtime': 1.7478, 'eval_samples_per_second': 57.214, 'eval_steps_per_second': 7.438, 'epoch': 4.0}


                                                 
 50%|█████     | 141/280 [23:30<15:11,  6.56s/it]

{'eval_loss': 3.250839948654175, 'eval_runtime': 1.5778, 'eval_samples_per_second': 63.38, 'eval_steps_per_second': 8.239, 'epoch': 4.99}


 50%|█████     | 141/280 [23:32<23:12, 10.01s/it]


{'train_runtime': 1412.0879, 'train_samples_per_second': 6.374, 'train_steps_per_second': 0.198, 'train_loss': 3.285126273513686, 'epoch': 4.99}


100%|██████████| 13/13 [00:01<00:00,  8.17it/s]


{'eval_loss': 3.241973876953125,
 'eval_runtime': 1.6026,
 'eval_samples_per_second': 62.397,
 'eval_steps_per_second': 8.112,
 'epoch': 4.991150442477876}