# SDoH Extraction from MIMIC-III Annotated Dataset

In [10]:
import os
import torch
from transformers import AutoTokenizer, AutoModel
import transformers

In [16]:
# Use shared cache
os.environ['HF_HOME'] = '/data/resource/huggingface'
os.environ['TRANSFORMERS_CACHE'] = '/data/resource/huggingface/hub'
os.environ['TRANSFORMERS_OFFLINE'] = '1'  # Force offline mode

# What models are available
cache_dir = "/data/resource/huggingface/hub"
available_models = []

if os.path.exists(cache_dir):
    for item in os.listdir(cache_dir):
        if item.startswith("models--"):
            # Convert models--org--name to org/name format
            model_name = item.replace("models--", "").replace("--", "/")
            available_models.append(model_name)

print("Available cached models:")
for model in sorted(available_models):
    print(f"  {model}")


Available cached models:
  CohereForAI/aya-23-35B
  CohereForAI/aya-23-8B
  CohereForAI/aya-vision-8b
  HuggingFaceTB/SmolLM-135M-Instruct
  LLaMAX/LLaMAX3-8B-Alpaca
  Qwen/Qwen2.5-1.5B
  Qwen/Qwen2.5-3B
  Qwen/Qwen2.5-72B-Instruct
  Qwen/Qwen2.5-7B
  Qwen/Qwen2.5-7B-Instruct
  Qwen/Qwen2.5-7B-instruct
  Qwen/Qwen2.5-VL-7B-Instruct
  Unbabel/wmt20-comet-qe-da
  Unbabel/wmt22-comet-da
  bert-base-uncased
  bert-large-uncased
  cardiffnlp/twitter-roberta-base-sentiment
  clairebarale/refugee_cases_ner
  deepseek-ai/DeepSeek-R1-Distill-Llama-70B
  deepseek-ai/DeepSeek-R1-Distill-Llama-8B
  deepseek-ai/DeepSeek-R1-Distill-Qwen-14B
  deepseek-ai/DeepSeek-R1-Distill-Qwen-32B
  deepseek-ai/DeepSeek-R1-Distill-Qwen-7B
  facebook/nllb-200-3.3B
  facebook/nllb-200-distilled-1.3B
  facebook/nllb-200-distilled-600M
  gpt2
  gpt2-medium
  gpt2-xl
  hfl/chinese-electra-180g-small-discriminator
  hfl/chinese-legal-electra-base-discriminator
  hfl/chinese-legal-electra-small-discriminator
  hfl/chines

In [12]:
print(f"Transformers version: {transformers.__version__}")
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")

Transformers version: 4.52.3
PyTorch version: 2.6.0
CUDA available: True


In [None]:
# Test text with SDoH factors
text = """
Person is a 45-year-old unemployed male who lives alone. 
He has a history of alcohol abuse and is currently homeless.
Person reports feeling socially isolated and has limited social support.
He completed high school education but has no college degree.
"""

print(f"\nSample clinical text: {text.strip()}")


Sample clinical text: Person is a 45-year-old unemployed male who lives alone. 
He has a history of alcohol abuse and is currently homeless.
Person reports feeling socially isolated and has limited social support.
He completed high school education but has no college degree.


In [14]:
# Test BERT
print("\n" + "="*50)
print("TESTING BERT")
print("="*50)

try:
    print("Loading BERT tokenizer...")
    bert_tokenizer = AutoTokenizer.from_pretrained(
        "bert-large-uncased", 
        local_files_only=True,
        cache_dir="/data/resource/huggingface/hub"
    )
    print("✓ BERT tokenizer loaded!")
    
    print("Loading BERT model...")
    bert_model = AutoModel.from_pretrained(
        "bert-large-uncased", 
        local_files_only=True,
        cache_dir="/data/resource/huggingface/hub"
    )
    print("✓ BERT model loaded!")
    
    # Test tokenization
    bert_tokens = bert_tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    print(f"Number of BERT tokens: {len(bert_tokens['input_ids'][0])}")
    
    # Show some tokens
    decoded_tokens = bert_tokenizer.convert_ids_to_tokens(bert_tokens['input_ids'][0])
    print(f"First 10 BERT tokens: {decoded_tokens[:10]}")
    
    # Get embeddings
    with torch.no_grad():
        bert_outputs = bert_model(**bert_tokens)
        bert_embeddings = bert_outputs.last_hidden_state
    
    print(f"BERT embeddings shape: {bert_embeddings.shape}")
    print(f"BERT hidden size: {bert_embeddings.shape[-1]}")
    
    # Get [CLS] token embedding (sentence representation)
    bert_cls = bert_embeddings[0, 0, :]  # [CLS] is first token
    print(f"BERT [CLS] embedding shape: {bert_cls.shape}")
    
except Exception as e:
    print(f"✗ BERT failed: {e}")


TESTING BERT
Loading BERT tokenizer...
✓ BERT tokenizer loaded!
Loading BERT model...
✓ BERT model loaded!
Number of BERT tokens: 50
First 10 BERT tokens: ['[CLS]', 'person', 'is', 'a', '45', '-', 'year', '-', 'old', 'unemployed']
BERT embeddings shape: torch.Size([1, 50, 1024])
BERT hidden size: 1024
BERT [CLS] embedding shape: torch.Size([1024])


In [15]:
# Test RoBERTa
print("\n" + "="*50)
print("TESTING ROBERTA")
print("="*50)

try:
    print("Loading RoBERTa tokenizer...")
    roberta_tokenizer = AutoTokenizer.from_pretrained(
        "roberta-base", 
        local_files_only=True,
        cache_dir="/data/resource/huggingface/hub"
    )
    print("✓ RoBERTa tokenizer loaded!")
    
    print("Loading RoBERTa model...")
    roberta_model = AutoModel.from_pretrained(
        "roberta-base", 
        local_files_only=True,
        cache_dir="/data/resource/huggingface/hub"
    )
    print("✓ RoBERTa model loaded!")
    
    # Test tokenization
    roberta_tokens = roberta_tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    print(f"Number of RoBERTa tokens: {len(roberta_tokens['input_ids'][0])}")
    
    # Show some tokens
    decoded_tokens = roberta_tokenizer.convert_ids_to_tokens(roberta_tokens['input_ids'][0])
    print(f"First 10 RoBERTa tokens: {decoded_tokens[:10]}")
    
    # Get embeddings
    with torch.no_grad():
        roberta_outputs = roberta_model(**roberta_tokens)
        roberta_embeddings = roberta_outputs.last_hidden_state
    
    print(f"RoBERTa embeddings shape: {roberta_embeddings.shape}")
    print(f"RoBERTa hidden size: {roberta_embeddings.shape[-1]}")
    
    # Get <s> token embedding (sentence representation)
    roberta_cls = roberta_embeddings[0, 0, :]  # <s> is first token
    print(f"RoBERTa <s> embedding shape: {roberta_cls.shape}")
    
except Exception as e:
    print(f"✗ RoBERTa failed: {e}")


TESTING ROBERTA
Loading RoBERTa tokenizer...
✓ RoBERTa tokenizer loaded!
Loading RoBERTa model...


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


✓ RoBERTa model loaded!
Number of RoBERTa tokens: 56
First 10 RoBERTa tokens: ['<s>', 'Ċ', 'Person', 'Ġis', 'Ġa', 'Ġ45', '-', 'year', '-', 'old']
RoBERTa embeddings shape: torch.Size([1, 56, 768])
RoBERTa hidden size: 768
RoBERTa <s> embedding shape: torch.Size([768])


In [None]:
# SDoH keyword detection
print("\n" + "="*50)
print("SDOH KEYWORD ANALYSIS")
print("="*50)

sdoh_keywords = [
    "unemployed", "homeless", "alcohol", "abuse", "isolated", 
    "support", "alone", "education", "school", "college", "male"
]

text_lower = text.lower()
found_keywords = []
for keyword in sdoh_keywords:
    if keyword in text_lower:
        found_keywords.append(keyword)
        print(f"✓ Found SDoH indicator: {keyword}")