In [1]:

!pip install transformers
!pip install git+https://github.com/openai/whisper.git

from transformers import pipeline
import whisper


ner_pipeline = pipeline("ner", model="dslim/bert-base-NER", aggregation_strategy="simple")

def predict_entities_pipeline(text):
    """Run the Hugging Face NER pipeline on the input text."""
    return ner_pipeline(text)


whisper_model = whisper.load_model("base")

def transcribe_audio(audio_path):
    """Transcribe audio from a file using Whisper."""
    result = whisper_model.transcribe(audio_path)
    return result["text"]


clinical_terms = {
    "acute myocardial infarction": "DISEASE",
    "aspirin": "MEDICATION",

}

import re

def rule_based_clinical_ner(text):
    """A simple rule-based function to detect predefined clinical terms."""
    entities = []
    lower_text = text.lower()
    for term, label in clinical_terms.items():

        for match in re.finditer(re.escape(term), lower_text):
            start, end = match.span()

            entity_text = text[start:end]
            entities.append({
                "entity_group": label,
                "score": 1.0,
                "word": entity_text,
                "start": start,
                "end": end
            })
    return entities

def predict_entities_clinical(text):
    """
    First, run the Hugging Face NER pipeline.
    If no clinical entities are found, use a rule-based approach.
    """
    pipeline_entities = predict_entities_pipeline(text)

    # Check if any of the pipeline-detected entities match common clinical terms.
    clinical_detected = []
    for ent in pipeline_entities:
        if any(term in ent["word"].lower() for term in clinical_terms):
            clinical_detected.append(ent)

    # If pipeline returns nothing clinical, use rule-based detection.
    if not clinical_detected:
        return rule_based_clinical_ner(text)
    else:
        return clinical_detected

def process_text(text):
    print("Input Text:", text)
    entities = predict_entities_clinical(text)
    print("Extracted Entities:", entities)
    return entities

# Test with a clinical sentence.
clinical_test_text = "Patient diagnosed with acute myocardial infarction and prescribed aspirin."
process_text(clinical_test_text)



Collecting git+https://github.com/openai/whisper.git
  Cloning https://github.com/openai/whisper.git to /tmp/pip-req-build-b5o1lpfr
  Running command git clone --filter=blob:none --quiet https://github.com/openai/whisper.git /tmp/pip-req-build-b5o1lpfr
  Resolved https://github.com/openai/whisper.git to commit 517a43ecd132a2089d85f4ebc044728a71d49f6e
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting tiktoken (from openai-whisper==20240930)
  Downloading tiktoken-0.9.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch->openai-whisper==20240930)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch->openai-whisper==20240930)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/829 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/433M [00:00<?, ?B/s]

Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tokenizer_config.json:   0%|          | 0.00/59.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Device set to use cpu
100%|███████████████████████████████████████| 139M/139M [00:10<00:00, 14.1MiB/s]
  checkpoint = torch.load(fp, map_location=device)


Input Text: Patient diagnosed with acute myocardial infarction and prescribed aspirin.
Extracted Entities: [{'entity_group': 'DISEASE', 'score': 1.0, 'word': 'acute myocardial infarction', 'start': 23, 'end': 50}, {'entity_group': 'MEDICATION', 'score': 1.0, 'word': 'aspirin', 'start': 66, 'end': 73}]


[{'entity_group': 'DISEASE',
  'score': 1.0,
  'word': 'acute myocardial infarction',
  'start': 23,
  'end': 50},
 {'entity_group': 'MEDICATION',
  'score': 1.0,
  'word': 'aspirin',
  'start': 66,
  'end': 73}]

In [2]:
clinical_test_text = "Patient diagnosed with acute myocardial infarction and prescribed aspirin."
process_text(clinical_test_text)

Input Text: Patient diagnosed with acute myocardial infarction and prescribed aspirin.
Extracted Entities: [{'entity_group': 'DISEASE', 'score': 1.0, 'word': 'acute myocardial infarction', 'start': 23, 'end': 50}, {'entity_group': 'MEDICATION', 'score': 1.0, 'word': 'aspirin', 'start': 66, 'end': 73}]


[{'entity_group': 'DISEASE',
  'score': 1.0,
  'word': 'acute myocardial infarction',
  'start': 23,
  'end': 50},
 {'entity_group': 'MEDICATION',
  'score': 1.0,
  'word': 'aspirin',
  'start': 66,
  'end': 73}]

In [3]:

!pip install transformers datasets torch


from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import torch

data = {
    "text": [
        "Patient diagnosed with acute myocardial infarction and prescribed aspirin.",
        "Patient presents with symptoms of pneumonia and is given antibiotics.",
        "The patient shows no signs of cardiovascular disease.",
        "Patient diagnosed with diabetes and advised to follow a low-sugar diet.",
        "Patient has a history of asthma and requires inhalers."
    ],
    "label": [0, 1, 2, 3, 1]  # Labels corresponding to the classes defined above.
}


label2name = {0: "Cardiovascular", 1: "Respiratory", 2: "None", 3: "Metabolic"}


dataset = Dataset.from_dict(data)


split_dataset = dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = split_dataset["train"]
eval_dataset = split_dataset["test"]


model_name = "emilyalsentzer/Bio_ClinicalBERT"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=len(label2name))

def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=128)

train_dataset = train_dataset.map(tokenize_function, batched=True)
eval_dataset = eval_dataset.map(tokenize_function, batched=True)

# Rename the label column to "labels" for Trainer compatibility.
train_dataset = train_dataset.rename_column("label", "labels")
eval_dataset = eval_dataset.rename_column("label", "labels")

# Set the format of the dataset to PyTorch tensors.
train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
eval_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])


training_args = TrainingArguments(
    output_dir="./clinical_classifier",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

print("Training the classifier...")
trainer.train()


def predict_disease(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128)
    outputs = model(**inputs)
    predicted_label = torch.argmax(outputs.logits, dim=1).item()
    return label2name[predicted_label]


test_text = "Patient diagnosed with acute myocardial infarction and prescribed aspirin."
print("Predicted Disease:", predict_disease(test_text))

Collecting datasets
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.3.2-py3-none-any.whl (485 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m485.4/485.4 kB[0m [31m16.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m12.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading multiprocess-0.70.16-py311-none-any.whl (143 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m14.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading

config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/436M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at emilyalsentzer/Bio_ClinicalBERT and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/4 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]



Training the classifier...


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mnamanchawla819[0m ([33mnamanchawla819-bits-pilani-dubai-campus[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss
1,No log,1.03629
2,No log,1.126096
3,No log,1.116457


Predicted Disease: Respiratory
