In [None]:
import pandas as pd
from tqdm import tqdm
import os
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import torch
import numpy as np
from concurrent.futures import ThreadPoolExecutor
import multiprocessing as mp
from google.colab import drive


# SDoH Keywords Dictionary adapted from [Yu et al.](https://pmc.ncbi.nlm.nih.gov/articles/PMC11141428/) (2024)

Notes on Modifications & Domain Exclusions:

- Some domains (marital status, language, gender, race, ethnicity, living supplies, frequency of socializing, and ICD codes) were excluded because they lack clear keywords to search for in the notes.

- Merged similar categories for clarity (e.g., substance use).

Reference:
Adapted directly from the SDoH domains from Yu et al. (2024), available [here](https://pmc.ncbi.nlm.nih.gov/articles/PMC11141428/).

In [None]:
# SDoH keyword dictionary adapted from Yu et al. (2024).
# Keys are the SDoH category labels; values are lowercase keyword phrases for
# that category. Key order matters: the keys are later joined, in this order,
# into the category list shown to the models.
sdoh_keywords_Yu = {
    "financial-constraint": [
        "income", "financial", "poverty", "socioeconomic",
        "occupation", "indigence", "retired", "worked in",
        "working in", "work as", "work", "working at", "cost", "bill", "difficulty paying"
    ],
    "living-condition": [
        "house", "housing", "homeless", "shelter", "lives with", "living with",
        # "resides with" is the phrase that actually occurs in text; the
        # space-less "resideswith" is kept so existing behavior is a subset.
        "resides with", "resideswith", "accompanied by", "here with", "live with"
    ],
    "transportation": [
        "transportation need", "transportation", "ride", "miss appointment", "commute"
    ],
    "education": [
        "education", "school", "diploma", "educational attainment"
    ],
    "employment-status": [
        "employment", "unemployment", "employed", "unemployed", "occupation", "retired"
    ],
    "social-connection": [
        "social worker", "social history", "community", "neighborhood", "neighbourhood"
    ],
    "violence-abuse": [
        "violence", "crime", "physical abuse", "sexual abuse", "abuse"
    ],
    "disability-status": [
        "disability", "disabled"
    ],
    "physical-activity": [
        "physical activity", "exercise", "running", "currently not exercising"
    ],
    "substance-use": [
        "tobacco", "smoking", "smoker", "cigarette", "alcohol", "drug use", "cocaine",
        "substance abuse"
    ],
    "sexual-activity": [
        "sexual activity", "partner", "protection"
    ]

}


# Loading (Patient Comments)

In [None]:
# Mount Google Drive at /content/drive so files stored there can be read by path.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Location of the input CSV on the mounted Drive.
file_path_final = '/content/drive/MyDrive/LLama_SDoH_Final.csv'

# Read the whole CSV into a DataFrame; later cells work on `df_final`.
df_final = pd.read_csv(file_path_final)

  warn(msg)


In [None]:
# Display the last rows as a quick sanity check of the loaded data.
df_final.tail()

Unnamed: 0,Hospital,Type,Comment,Year,Valence,Unit,Code
126291,Woodstock General Hospital,General,You need peel & stick envelopes\nfor your survey,2021,Negative,Day\nSurgery\n(Mail),General Comment
126292,Woodstock General Hospital,General,You should have given me a\ncard with my follo...,2020,Negative,Day\nSurgery\n(Mail),"Access/Coord of Care,\nContinuity/Transition,\..."
126293,Woodstock General Hospital,Contact\nRequested,Your hospital and staff are\nabsolutely to be ...,2021,Positive,Emergency\n(Mail),"Admit/Registration,\nDoctor/Physician,\nHousek..."
126294,Woodstock General Hospital,General,"Your hospital is one of the cleanest, most org...",2016,Positive,Woodstock General Hospital; Day Surgery,"Access/Coord of Care, Housekeeping/Room"
126295,Woodstock General Hospital,General,Your nurses are wonderful Thank-you Dr XXXXXXX...,2017,Positive,Day\nSurgery\n(Mail),"Doctor/Physician,\nNurse/Nurse aide,\nPositive..."


# Phi-3 Zero shot

In [None]:
# Keep only rows that have a cleaned comment to classify. The index is reset so
# row labels run 0..n-1, which the chunked `.loc[start:end-1]` slicing relies on.
df_final = df_final.dropna(subset=["Gemma_Cleaned_Comment"]).reset_index(drop=True)

output_file = "LLama_SDoH_Final_phi3.csv"

if os.path.exists(output_file):
    print(f"Found existing {output_file}, resuming from previous progress...")
    final_results = pd.read_csv(output_file)
    # A checkpoint written before any label existed may lack the column entirely.
    if "phi3_label" not in final_results.columns:
        final_results["phi3_label"] = ""
    # read_csv parses empty cells as NaN, and `NaN != ""` is True, so unlabelled
    # rows would be counted as processed. Normalise them back to "" first, and
    # use object dtype so string labels can be written into the column later.
    final_results["phi3_label"] = final_results["phi3_label"].astype(object).fillna("")
    already_processed = int((final_results["phi3_label"] != "").sum())
    print(f"Already processed: {already_processed}/{len(final_results)} comments")
else:
    print("Starting fresh classification...")
    final_results = df_final.copy()
    final_results["phi3_label"] = ""

model_name = "microsoft/phi-3-mini-128k-instruct"
# NOTE(review): trust_remote_code executes model code downloaded from the Hub;
# consider pinning a `revision` so the executed code cannot change between runs.
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    trust_remote_code=True,
    device_map="auto",
    torch_dtype=torch.float16,
    attn_implementation="eager"
)

# Batched tokenization needs a pad token; fall back to EOS when none is defined.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Decoder-only models continue from the last position of each row, so batched
# generation must pad on the left; make that explicit instead of relying on
# the checkpoint's default.
tokenizer.padding_side = "left"


Starting fresh classification...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/306 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/599 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

configuration_phi3.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/phi-3-mini-128k-instruct:
- configuration_phi3.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_phi3.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/phi-3-mini-128k-instruct:
- modeling_phi3.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.67G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/181 [00:00<?, ?B/s]

In [None]:
# Category labels offered to the model: the keys of the keyword dictionary,
# in definition order.
sdoh_categories = [*sdoh_keywords_Yu]

def classify_sdoh_phi3_batch(comments, batch_size=32):
    """
    Classify patient comments into SDoH categories, one prompt batch at a time.

    Uses the notebook-level ``tokenizer``, ``model`` and ``sdoh_categories``.

    Args:
        comments: List of comment strings to classify.
        batch_size: Number of comments sent to the model per generate call.

    Returns:
        A list with exactly one label per input comment, in input order. Each
        label is a name from ``sdoh_categories``, "no_sdoh" when the model's
        answer does not normalise to a known category, or "error" when the
        whole batch failed.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        # range() would raise an opaque error for 0 and silently yield nothing
        # for negative steps, dropping every comment.
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    results = []
    categories_str = ', '.join(sdoh_categories)

    for i in range(0, len(comments), batch_size):
        batch_comments = comments[i:i+batch_size]
        batch_prompts = [
            f"""### Instruction:
Classify the following patient comment into one of these Social Determinants of Health (SDoH) categories:
{categories_str}.
If the comment does not relate to any social determinants of health, classify it as "no_sdoh".
### Patient Comment:
{comment}
### SDoH Category:""" for comment in batch_comments
        ]

        try:
            inputs = tokenizer(
                batch_prompts,
                return_tensors="pt",
                truncation=True,
                max_length=512,
                padding=True
            )
            inputs = {k: v.to(model.device) for k, v in inputs.items()}

            with torch.no_grad():
                outputs = model.generate(
                    **inputs,
                    max_new_tokens=50,
                    do_sample=False,
                    pad_token_id=tokenizer.eos_token_id,
                    eos_token_id=tokenizer.eos_token_id,
                    use_cache=False
                )

            # Every row is padded to the same prompt length, so the generated
            # continuation starts at the same column for the whole batch.
            prompt_length = inputs['input_ids'].shape[1]
            responses = tokenizer.batch_decode(
                outputs[:, prompt_length:], skip_special_tokens=True
            )

            # Collect labels for the batch first and publish them only once the
            # whole batch succeeded; otherwise a failure half-way through would
            # leave partial labels AND a full set of "error" labels in results.
            batch_labels = []
            for response in responses:
                category = response.strip().split('\n')[0].lower()
                category = category.replace(":", "").replace(".", "")
                # Tolerate harmless formatting around the label: quotes,
                # backticks, markdown bold, and spaces/underscores used in
                # place of the hyphens in the category names.
                category = category.strip(" \t\"'`*")
                category = "-".join(category.replace("_", " ").split())

                batch_labels.append(category if category in sdoh_categories else "no_sdoh")

            results.extend(batch_labels)

        except Exception as e:
            print(f"Error processing batch starting at index {i}: {e}")
            results.extend(["error"] * len(batch_comments))

    return results



Processing chunk 1/23 (rows 0 to 4999)




In [None]:
batch_size = 16
save_every = 5000
start_idx = 0  # kept for compatibility; resuming is decided per chunk below

chunk_size = save_every
num_chunks = (len(df_final) + chunk_size - 1) // chunk_size

# Labels reloaded from a checkpoint CSV can arrive as NaN in a float column;
# object dtype lets string labels be written into it.
final_results["phi3_label"] = final_results["phi3_label"].astype(object)

for chunk_idx in range(num_chunks):
    start = chunk_idx * chunk_size
    end = min(start + chunk_size, len(df_final))

    # Resume support: the checkpoint is written once per chunk, so a chunk whose
    # rows all carry a non-empty label was completed by an earlier run.
    existing_labels = final_results.loc[start:end-1, "phi3_label"]
    chunk_done = (
        len(existing_labels) == end - start
        and existing_labels.notna().all()
        and (existing_labels.astype(str).str.strip() != "").all()
    )
    if chunk_done:
        print(f"Skipping chunk {chunk_idx + 1}/{num_chunks} (rows {start} to {end-1}): already labelled")
        continue

    print(f"Processing chunk {chunk_idx + 1}/{num_chunks} (rows {start} to {end-1})")

    # .loc slicing is label-based and inclusive, hence end-1.
    chunk_comments = df_final.loc[start:end-1, "Gemma_Cleaned_Comment"].tolist()

    chunk_labels = classify_sdoh_phi3_batch(chunk_comments, batch_size)

    # One vectorised write per chunk instead of one .loc call per row.
    final_results.loc[start:end-1, "phi3_label"] = chunk_labels

    # Checkpoint after every chunk so an interrupted session loses at most one chunk.
    final_results.to_csv(output_file, index=False)
    print(f"Autosaved after processing {end} rows")


final_results.to_csv(output_file, index=False)
print(f"Results saved to {output_file}")



# Gemma-2B-it Zero shot

In [None]:
# Results for the Gemma run go to their own file and their own label column.
output_file = "LLama_SDoH_Final_gemma.csv"
final_results = df_final.copy()
final_results["gemma_label"] = ""

model_name = "google/gemma-2b-it"

# `device` was used below without ever being defined, which raises NameError on
# a fresh kernel. Pick the GPU when one is available, otherwise the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

tokenizer = AutoTokenizer.from_pretrained(model_name)
# Decoder-only models continue from the last position of each row, so batched
# generation must pad on the left; make that explicit.
tokenizer.padding_side = "left"

model = AutoModelForCausalLM.from_pretrained(
    model_name, device_map=device, torch_dtype=torch.float16
)


In [None]:
# Category labels offered to the model (recomputed in this cell): the keys of
# the keyword dictionary, in definition order.
sdoh_categories = [*sdoh_keywords_Yu]

def classify_sdoh_gemma_batch(comments, batch_size=32):
    """
    Classify patient comments into SDoH categories, one prompt batch at a time.

    Uses the notebook-level ``tokenizer``, ``model`` and ``sdoh_categories``.

    Args:
        comments: List of comment strings to classify.
        batch_size: Number of comments sent to the model per generate call.

    Returns:
        A list with exactly one label per input comment, in input order. Each
        label is a name from ``sdoh_categories``, "no_sdoh" when the model's
        answer does not normalise to a known category, or "error" when the
        whole batch failed.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        # range() would raise an opaque error for 0 and silently yield nothing
        # for negative steps, dropping every comment.
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    results = []
    categories_str = ', '.join(sdoh_categories)

    for i in range(0, len(comments), batch_size):
        batch_comments = comments[i:i+batch_size]
        batch_prompts = [
            f"""<start_of_turn>user
Classify the following patient comment into one of these Social Determinants of Health (SDoH) categories:
{categories_str}.
If none match, classify as "no_sdoh".

Patient Comment:
{comment}

SDoH Category:<end_of_turn>
<start_of_turn>model""" for comment in batch_comments
        ]

        try:
            inputs = tokenizer(
                batch_prompts,
                return_tensors="pt",
                truncation=True,
                max_length=512,
                padding=True
            )
            # Move tensors to wherever the model lives rather than to a global
            # `device`: that name was undefined, and because the lookup happened
            # inside this try block the NameError was swallowed and every
            # comment was labelled "error".
            inputs = {k: v.to(model.device) for k, v in inputs.items()}

            with torch.no_grad():
                outputs = model.generate(
                    **inputs,
                    max_new_tokens=50,
                    do_sample=False,
                    pad_token_id=tokenizer.eos_token_id,
                    use_cache=False
                )

            # Every row is padded to the same prompt length, so the generated
            # continuation starts at the same column for the whole batch.
            prompt_length = inputs['input_ids'].shape[1]
            responses = tokenizer.batch_decode(
                outputs[:, prompt_length:], skip_special_tokens=True
            )

            # Collect labels for the batch first and publish them only once the
            # whole batch succeeded; otherwise a failure half-way through would
            # leave partial labels AND a full set of "error" labels in results.
            batch_labels = []
            for response in responses:
                category = response.strip().split("\n")[0].lower()
                category = category.replace(":", "").replace(".", "")
                # Tolerate harmless formatting around the label: quotes,
                # backticks, markdown bold, and spaces/underscores used in
                # place of the hyphens in the category names.
                category = category.strip(" \t\"'`*")
                category = "-".join(category.replace("_", " ").split())

                batch_labels.append(category if category in sdoh_categories else "no_sdoh")

            results.extend(batch_labels)

        except Exception as e:
            print(f"Batch error at index {i}: {e}")
            results.extend(["error"] * len(batch_comments))

    return results


In [None]:
# Run configuration: comments per generate call, and rows per checkpoint.
batch_size = 16
save_every = 5000
chunk_size = save_every
num_chunks = -(-len(df_final) // chunk_size)  # ceiling division

for chunk_idx in tqdm(range(num_chunks), desc="Gemma Chunks"):
    start = chunk_size * chunk_idx
    end = min(len(df_final), start + chunk_size)

    # .loc slices by label and includes the stop label, hence end-1.
    comment_slice = df_final.loc[start:end-1, "Gemma_Cleaned_Comment"]
    chunk_comments = comment_slice.tolist()
    chunk_labels = classify_sdoh_gemma_batch(chunk_comments, batch_size=batch_size)

    final_results.loc[start:end-1, "gemma_label"] = chunk_labels

    # Checkpoint the full frame after each chunk.
    final_results.to_csv(output_file, index=False)
    print(f" Autosaved at row {end}")


# Final write once every chunk has been processed.
final_results.to_csv(output_file, index=False)
print(f" Results saved to {output_file}")
