In [11]:
!pip install transformers datasets scikit-learn pandas -q


In [2]:
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import classification_report
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from transformers import DataCollatorWithPadding
from datasets import Dataset
import torch


In [4]:
from google.colab import files
uploaded = files.upload()

df = pd.read_csv("radiology_reports_expanded.csv")
df.head()


Saving radiology_reports_expanded.csv to radiology_reports_expanded (1).csv


Unnamed: 0,study_id,report_text,label
0,2001,Cardiac silhouette is widened.,cardiomegaly
1,2002,There is prominent cardiomegaly.,cardiomegaly
2,2003,There is prominent cardiomegaly.,cardiomegaly
3,2004,Cardiac silhouette is widened.,cardiomegaly
4,2005,The heart is enlarged.,cardiomegaly


In [5]:
keywords = {
    "cardiomegaly": ["cardiomegaly", "enlarged heart", "heart is enlarged"],
    "edema": ["edema", "interstitial", "pulmonary edema"],
    "effusion": ["effusion", "pleural effusion"],
    "no finding": ["no finding", "clear lungs", "normal heart", "no evidence", "unremarkable"]
}

def rule_based_classifier(text):
    labels_found = []
    text = text.lower()
    for label, keys in keywords.items():
        for key in keys:
            if key in text:
                labels_found.append(label)
                break
    return ", ".join(labels_found) if labels_found else "uncertain"

df["rule_based_prediction"] = df["report_text"].apply(rule_based_classifier)
df[["report_text", "label", "rule_based_prediction"]]


Unnamed: 0,report_text,label,rule_based_prediction
0,Cardiac silhouette is widened.,cardiomegaly,uncertain
1,There is prominent cardiomegaly.,cardiomegaly,cardiomegaly
2,There is prominent cardiomegaly.,cardiomegaly,cardiomegaly
3,Cardiac silhouette is widened.,cardiomegaly,uncertain
4,The heart is enlarged.,cardiomegaly,cardiomegaly
5,There is prominent cardiomegaly.,cardiomegaly,cardiomegaly
6,Signs of cardiomegaly are present.,cardiomegaly,cardiomegaly
7,Signs of cardiomegaly are present.,cardiomegaly,cardiomegaly
8,Cardiac silhouette is widened.,cardiomegaly,uncertain
9,The cardiac contour is abnormal.,cardiomegaly,uncertain


In [6]:
true_labels = df["label"].apply(lambda x: [l.strip() for l in x.split(",")])
mlb = MultiLabelBinarizer()
df["labels"] = mlb.fit_transform(true_labels).tolist()
mlb.classes_


array(['cardiomegaly', 'edema', 'effusion', 'no finding'], dtype=object)

In [7]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

def tokenize(batch):
    return tokenizer(batch["report_text"], truncation=True)

df["labels"] = df["labels"].apply(lambda x: [float(i) for i in x])
dataset = Dataset.from_pandas(df[["report_text", "labels"]])
dataset = dataset.map(tokenize, batched=True)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Map:   0%|          | 0/50 [00:00<?, ? examples/s]

In [None]:
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=len(mlb.classes_), problem_type="multi_label_classification")
args = TrainingArguments("bert-finetuned", num_train_epochs=20, per_device_train_batch_size=2, logging_dir="./logs", logging_steps=10)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=dataset,
    eval_dataset=dataset,
    tokenizer=tokenizer,
    data_collator=data_collator
)

trainer.train()


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mmohammed-alzaanin[0m ([33mmohammed-alzaanin-islamic-university-of-gaza[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
10,0.6478
20,0.57
30,0.5293
40,0.4144
50,0.2931
60,0.2447
70,0.1612
80,0.1315
90,0.1272
100,0.0966


TrainOutput(global_step=250, training_loss=0.15970101284980773, metrics={'train_runtime': 474.9738, 'train_samples_per_second': 1.053, 'train_steps_per_second': 0.526, 'total_flos': 3047415187440.0, 'train_loss': 0.15970101284980773, 'epoch': 10.0})

In [9]:
preds = trainer.predict(dataset)
pred_labels = (torch.sigmoid(torch.tensor(preds.predictions)) > 0.5).int().numpy()
true_labels_bin = mlb.transform(true_labels)
print(classification_report(true_labels_bin, pred_labels, target_names=mlb.classes_))


              precision    recall  f1-score   support

cardiomegaly       1.00      1.00      1.00        14
       edema       1.00      0.92      0.96        13
    effusion       1.00      1.00      1.00        13
  no finding       1.00      1.00      1.00        12

   micro avg       1.00      0.98      0.99        52
   macro avg       1.00      0.98      0.99        52
weighted avg       1.00      0.98      0.99        52
 samples avg       1.00      0.99      0.99        52



In [None]:
text = "The heart is enlarged with bilateral pleural effusion."

inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)

outputs = model(**inputs)

import torch
pred_probs = torch.sigmoid(outputs.logits)

pred_labels = (pred_probs > 0.3).int().numpy()[0]

labels = ['cardiomegaly', 'edema', 'effusion', 'no finding']

predicted = [labels[i] for i, val in enumerate(pred_labels) if val == 1]
print("Predicted Labels:", predicted)


Predicted Labels: ['effusion']
