In [1]:
!pip install spacy transformers torch
!python -m spacy download en_core_web_sm

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [2]:
import json
import re
import spacy
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForTokenClassification,
    AutoModelForSequenceClassification,
    pipeline
)

class MedicalNLPProcessor:
    def __init__(self):
        # Load models during initialization
        self.nlp = spacy.load("en_core_web_sm")

        # Biomedical NER components
        self.ner_tokenizer = AutoTokenizer.from_pretrained("d4data/biomedical-ner-all")
        self.ner_model = AutoModelForTokenClassification.from_pretrained("d4data/biomedical-ner-all")
        self.ner_pipeline = pipeline(
            "ner",
            model=self.ner_model,
            tokenizer=self.ner_tokenizer,
            aggregation_strategy="average",
            device=0 if torch.cuda.is_available() else -1
        )

        # Clinical sentiment analysis
        self.sentiment_tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
        self.sentiment_model = AutoModelForSequenceClassification.from_pretrained(
            "bvanaken/clinical-assertion-negation-bert",
            num_labels=3
        )
        self.sentiment_model = self.sentiment_model.to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))

        # Medical entity patterns
        self.entity_patterns = {
            'Symptoms': r'\b(pain|discomfort|stiffness|tenderness|headache|nausea|dizziness)\b',
            'Treatment': r'\b(physiotherapy|painkillers|surgery|medication|rehabilitation|exercise)\b',
            'Diagnosis': r'\b(whiplash|concussion|fracture|strain|sprain|herniation)\b',
            'Prognosis': r'\b(recovery|improvement|chronic|degeneration|rehabilitation)\b'
        }

    def extract_patient_info(self, text):
        """Extract structured medical information from transcript"""
        doc = self.nlp(text)
        patient_data = {
            'Patient_Name': self._extract_name(doc),
            'Medical_History': self._extract_medical_entities(text),
            'Sentiment': self.analyze_sentiment(text),
            'Intent': self.detect_intent(text),
            'SOAP_Note': self.generate_soap_note(text)
        }
        return patient_data

    def _extract_name(self, doc):
        """Extract patient name using spaCy NER"""
        for ent in doc.ents:
            if ent.label_ == "PERSON" and "doctor" not in ent.text.lower():
                return ent.text
        return "Unknown"

    def _extract_medical_entities(self, text):
        """Hybrid entity extraction using both model and patterns"""
        entities = {k: set() for k in self.entity_patterns.keys()}

        # Model-based extraction
        ner_results = self.ner_pipeline(text)
        for entity in ner_results:
            entity_type = self._map_entity_type(entity['entity_group'])
            if entity_type:
                entities[entity_type].add(entity['word'].strip())

        # Pattern-based extraction
        for category, pattern in self.entity_patterns.items():
            matches = re.findall(pattern, text, flags=re.IGNORECASE)
            entities[category].update(matches)

        return {k: list(v) if v else ["Not documented"] for k, v in entities.items()}

    def _map_entity_type(self, entity_group):
        """Map model entities to our categories"""
        mapping = {
            'DISO': 'Symptoms',
            'CHEM': 'Treatment',
            'PROC': 'Treatment',
            'DIAG': 'Diagnosis'
        }
        return mapping.get(entity_group)

    def analyze_sentiment(self, text):
        """Clinical sentiment classification"""
        inputs = self.sentiment_tokenizer(
            text,
            return_tensors="pt",
            truncation=True,
            max_length=512
        ).to(self.sentiment_model.device)

        with torch.no_grad():
            outputs = self.sentiment_model(**inputs)

        sentiment_map = {
            0: "Anxious",
            1: "Neutral",
            2: "Reassured"
        }
        return sentiment_map[torch.argmax(outputs.logits).item()]

    def detect_intent(self, text):
        """Rule-based intent detection"""
        intent_rules = {
            'Seeking reassurance': r'\b(worried|concerned|hope|unsure|wonder)\b',
            'Reporting symptoms': r'\b(pain|discomfort|hurt|ache|tenderness)\b',
            'Requesting information': r'\b(what|how|why|explain|mean|clarify)\b'
        }
        for intent, pattern in intent_rules.items():
            if re.search(pattern, text, re.IGNORECASE):
                return intent
        return 'Other'

    def generate_soap_note(self, text):
        """Generate structured SOAP note"""
        entities = self._extract_medical_entities(text)
        sections = {
            'Subjective': {
                'Chief_Complaint': entities['Symptoms'],
                'History_of_Present_Illness': self._extract_history(text)
            },
            'Objective': {
                'Physical_Exam': self._extract_exam_findings(text),
                'Diagnostic_Results': entities['Diagnosis']
            },
            'Assessment': {
                'Clinical_Impression': entities['Diagnosis'],
                'Prognosis': entities['Prognosis']
            },
            'Plan': {
                'Treatment_Plan': entities['Treatment'],
                'Follow_Up': 'Schedule follow-up in 6 weeks'
            }
        }
        return sections

    def _extract_history(self, text):
        """Extract historical information from patient statements"""
        patient_statements = [line for line in text.split('\n') if line.startswith('Patient:')]
        return ' '.join([s.split('Patient: ')[1] for s in patient_statements[:3]])

    def _extract_exam_findings(self, text):
        """Extract physical exam findings from physician statements"""
        exam_findings = []
        in_exam = False
        for line in text.split('\n'):
            if '[Physical Examination Conducted]' in line:
                in_exam = True
                continue
            if in_exam and line.startswith('Physician:'):
                finding = line.split('Physician: ')[1]
                exam_findings.append(finding)
        return exam_findings[-1] if exam_findings else 'No exam findings documented'

def main():
    # Initialize processor
    processor = MedicalNLPProcessor()

    # Example transcript
    transcript = """
    Physician: Good morning, Ms. Jones. How are you feeling today?
    Patient: Good morning, doctor. I’m doing better, but I still have some discomfort now and then.
    Physician: I understand you were in a car accident last September. Can you walk me through what happened?
    Patient: Yes, it was on September 1st, around 12:30 in the afternoon. I was driving from Cheadle Hulme to Manchester when I had to stop in traffic. Out of nowhere, another car hit me from behind, which pushed my car into the one in front.
    Physician: That sounds like a strong impact. Were you wearing your seatbelt?
    Patient: Yes, I always do.
    Physician: What did you feel immediately after the accident?
    Patient: At first, I was just shocked. But then I realized I had hit my head on the steering wheel, and I could feel pain in my neck and back almost right away.
    Physician: Did you seek medical attention at that time?
    Patient: Yes, I went to Moss Bank Accident and Emergency. They checked me over and said it was a whiplash injury, but they didn’t do any X-rays. They just gave me some advice and sent me home.
    Physician: How did things progress after that?
    Patient: The first four weeks were rough. My neck and back pain were really bad—I had trouble sleeping and had to take painkillers regularly. It started improving after that, but I had to go through ten sessions of physiotherapy to help with the stiffness and discomfort.
    Physician: That makes sense. Are you still experiencing pain now?
    Patient: It’s not constant, but I do get occasional backaches. It’s nothing like before, though.
    Physician: That’s good to hear. Have you noticed any other effects, like anxiety while driving or difficulty concentrating?
    Patient: No, nothing like that. I don’t feel nervous driving, and I haven’t had any emotional issues from the accident.
    Physician: And how has this impacted your daily life? Work, hobbies, anything like that?
    Patient: I had to take a week off work, but after that, I was back to my usual routine. It hasn’t really stopped me from doing anything.
    Physician: That’s encouraging. Let’s go ahead and do a physical examination to check your mobility and any lingering pain.
    [Physical Examination Conducted]
    Physician: Everything looks good. Your neck and back have a full range of movement, and there’s no tenderness or signs of lasting damage. Your muscles and spine seem to be in good condition.
    Patient: That’s a relief!
    Physician: Yes, your recovery so far has been quite positive. Given your progress, I’d expect you to make a full recovery within six months of the accident. There are no signs of long-term damage or degeneration.
    Patient: That’s great to hear. So, I don’t need to worry about this affecting me in the future?
    Physician: That’s right. I don’t foresee any long-term impact on your work or daily life. If anything changes or you experience worsening symptoms, you can always come back for a follow-up. But at this point, you’re on track for a full recovery.
    Patient: Thank you, doctor. I appreciate it.
    Physician: You’re very welcome, Ms. Jones. Take care, and don’t hesitate to reach out if you need anything.
    """

    # Process transcript
    result = processor.extract_patient_info(transcript)

    # Format output
    structured_output = {
        "Patient_Name": result['Patient_Name'],
        "Symptoms": result['Medical_History']['Symptoms'],
        "Diagnosis": result['Medical_History']['Diagnosis'][0],
        "Treatment": result['Medical_History']['Treatment'],
        "Prognosis": result['Medical_History']['Prognosis'][0],
        "Sentiment": result['Sentiment'],
        "Intent": result['Intent'],
        "SOAP_Note": result['SOAP_Note']
    }

    print(json.dumps(structured_output, indent=4, ensure_ascii=False))

if __name__ == "__main__":
    main()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/373 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/5.00k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/266M [00:00<?, ?B/s]

Device set to use cpu


config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/819 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/433M [00:00<?, ?B/s]

{
    "Patient_Name": "Jones",
    "Symptoms": [
        "stiffness",
        "pain",
        "discomfort",
        "tenderness"
    ],
    "Diagnosis": "whiplash",
    "Treatment": [
        "physiotherapy",
        "painkillers"
    ],
    "Prognosis": "degeneration",
    "Sentiment": "Anxious",
    "Intent": "Reporting symptoms",
    "SOAP_Note": {
        "Subjective": {
            "Chief_Complaint": [
                "stiffness",
                "pain",
                "discomfort",
                "tenderness"
            ],
            "History_of_Present_Illness": ""
        },
        "Objective": {
            "Physical_Exam": "No exam findings documented",
            "Diagnostic_Results": [
                "whiplash"
            ]
        },
        "Assessment": {
            "Clinical_Impression": [
                "whiplash"
            ],
            "Prognosis": [
                "degeneration",
                "recovery"
            ]
        },
        "Plan": {
   