# Imports

In [1]:
from datasets import load_dataset
from transformers import pipeline
import spacy

# Load SDOH-NLI dataset

In [2]:
dataset = load_dataset("tasksource/SDOH-NLI")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/604 [00:00<?, ?B/s]

sdoh_nli_train.csv: 0.00B [00:00, ?B/s]

sdoh_nli_validation.csv: 0.00B [00:00, ?B/s]

sdoh_nli_test.csv: 0.00B [00:00, ?B/s]

Generating train split:   0%|          | 0/21090 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/4033 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/4212 [00:00<?, ? examples/s]

# Load Transformer models
### NLI model (SDoH classification)

In [12]:
# NLI model for SDoH classification
sdoh_nli = pipeline(
    "text-classification",
    model="roberta-large-mnli"
)

Loading weights:   0%|          | 0/393 [00:00<?, ?it/s]

RobertaForSequenceClassification LOAD REPORT from: roberta-large-mnli
Key                         | Status     |  | 
----------------------------+------------+--+-
roberta.pooler.dense.weight | UNEXPECTED |  | 
roberta.pooler.dense.bias   | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


### Biomedical NER model

In [4]:
# Biomedical NER for medications / diagnoses
ner = pipeline(
    "ner",
    model="d4data/biomedical-ner-all",
    aggregation_strategy="simple"
)

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/266M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/102 [00:00<?, ?it/s]

tokenizer_config.json:   0%|          | 0.00/373 [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

### spaCy model

In [5]:
# spaCy for sentence splitting (optional but realistic)
nlp = spacy.load("en_core_web_sm")

# SDoH hypotheses

In [6]:
# SDoH hypotheses
SDOH_HYPOTHESES = {
    "tobacco_use": "The patient uses tobacco.",
    "alcohol_use": "The patient consumes alcohol.",
    "employment": "The patient is unemployed.",
    "housing": "The patient has housing insecurity.",
    "social_support": "The patient lacks social support."
}

# Sentence splitting helper

In [8]:
def sentence_split(text):
    return [sent.text.strip() for sent in nlp(text).sents]

# SDoH classification function & Medical entity extraction

In [None]:
def classify_sdoh(premise):
    """Run NLI for each SDoH category"""
    results = {}

    for category, hypothesis in SDOH_HYPOTHESES.items():
        nli_input = f"{premise} </s></s> {hypothesis}"
        prediction = sdoh_nli(nli_input)[0]

        results[category] = {
            "label": prediction["label"],
            "score": round(prediction["score"], 3)
        }

    return results

def extract_medical_entities(text):
    """Extract medications / diagnoses using NER"""
    entities = ner(text)

    return [
        {
            "text": ent["word"],
            "label": ent["entity_group"],
            "score": round(ent["score"], 3)
        }
        for ent in entities
    ]

# Run the pipeline on real data

In [9]:
# ---------------------------
# Run pipeline on sample data
# ---------------------------

example = dataset["train"][0]
clinical_note = example["premise"]

sentences = sentence_split(clinical_note)

output = {
    "clinical_note": clinical_note,
    "sdoh": classify_sdoh(clinical_note),
    "medical_entities": extract_medical_entities(clinical_note)
}



{'clinical_note': '1-2 packs of cigarettes per day.', 'sdoh': {'tobacco_use': {'label': 'ENTAILMENT', 'score': 0.895}, 'alcohol_use': {'label': 'CONTRADICTION', 'score': 0.984}, 'employment': {'label': 'CONTRADICTION', 'score': 0.662}, 'housing': {'label': 'NEUTRAL', 'score': 0.889}, 'social_support': {'label': 'NEUTRAL', 'score': 0.918}}, 'medical_entities': [{'text': '1', 'label': 'Dosage', 'score': np.float32(0.557)}, {'text': '2', 'label': 'Dosage', 'score': np.float32(0.516)}, {'text': 'packs', 'label': 'Detailed_description', 'score': np.float32(0.492)}, {'text': 'cigarettes', 'label': 'Detailed_description', 'score': np.float32(0.396)}]}


# Final Result

In [10]:
print(output)

{'clinical_note': '1-2 packs of cigarettes per day.', 'sdoh': {'tobacco_use': {'label': 'ENTAILMENT', 'score': 0.895}, 'alcohol_use': {'label': 'CONTRADICTION', 'score': 0.984}, 'employment': {'label': 'CONTRADICTION', 'score': 0.662}, 'housing': {'label': 'NEUTRAL', 'score': 0.889}, 'social_support': {'label': 'NEUTRAL', 'score': 0.918}}, 'medical_entities': [{'text': '1', 'label': 'Dosage', 'score': np.float32(0.557)}, {'text': '2', 'label': 'Dosage', 'score': np.float32(0.516)}, {'text': 'packs', 'label': 'Detailed_description', 'score': np.float32(0.492)}, {'text': 'cigarettes', 'label': 'Detailed_description', 'score': np.float32(0.396)}]}
