# SDoH Sample Text with [LLMs4SDoH](https://github.com/BIDS-Xu-Lab/LLMs4SDoH)

In [None]:
# Fetch the LLMs4SDoH repository and make it the working directory for the cells below.
# NOTE(review): re-running this cell in the same runtime will find the checkout already
# present — presumably the clone is refused while the `%cd` still succeeds; confirm.
!git clone https://github.com/BIDS-Xu-Lab/LLMs4SDoH.git
%cd LLMs4SDoH



Cloning into 'LLMs4SDoH'...
remote: Enumerating objects: 27, done.[K
remote: Counting objects: 100% (27/27), done.[K
remote: Compressing objects: 100% (18/18), done.[K
remote: Total 27 (delta 10), reused 20 (delta 6), pack-reused 0 (from 0)[K
Receiving objects: 100% (27/27), 10.46 KiB | 10.46 MiB/s, done.
Resolving deltas: 100% (10/10), done.
/content/LLMs4SDoH


In [None]:
# Install the inference dependencies. `%pip` (instead of `!pip`) installs into the
# environment of the running kernel, so the packages are importable from this notebook.
%pip install -q transformers torch

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m117.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m102.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m62.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m11.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m42.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m19.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, LlamaForCausalLM, LlamaTokenizer
import numpy as np

# Checkpoint identifier handed to `from_pretrained` for both tokenizer and model.
model_name = "YBXL/SDoH-llama-L1"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# The inference loop tokenizes prompts in padded batches and slices the generated
# continuation off at the padded input length. For a decoder-only model this is only
# correct with left padding: every prompt must end immediately before the first
# generated token, with pad tokens placed in front of it.
tokenizer.padding_side = "left"

# `padding=True` raises if the tokenizer has no pad token; reuse EOS when none is set.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto")

# Prefer the GPU when one is available; switch to eval mode for inference.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device).eval()

# Instruction text that opens every prompt sent to the model; the inference loop appends
# the "### Input:" and "### Response:" sections after it.
# The label names listed in this text are the same ones `sdoh_categories` scans for when
# parsing responses, so the two must be kept in sync.
system_prompt = """
### Instruction:
Given a sentence output all SDoH factors that can be inferred from that sentence from
the following list: AdverseChildhood, Alcohol, BirthSex, Drug, EducationLevel,
EmploymentStatus, EnvironExposure, FinancialIssues, FoodInsecurity, GenderIdentity,
Insurance, Isolation, LivingStatus, LocationBornRaised, MaritalStatus, PhysicalActivity,
PhysSexAbuse, Race, SexualOrientation, Smoking, and SocialSupport.
If the sentence does not mention any SDoH factor then output - nonSDoH.
"""

# Label vocabulary scanned for when parsing model output: the 21 SDoH factors named in
# the instruction prompt, followed by the "nonSDoH" fallback label.
sdoh_categories = [
    "AdverseChildhood",
    "Alcohol",
    "BirthSex",
    "Drug",
    "EducationLevel",
    "EmploymentStatus",
    "EnvironExposure",
    "FinancialIssues",
    "FoodInsecurity",
    "GenderIdentity",
    "Insurance",
    "Isolation",
    "LivingStatus",
    "LocationBornRaised",
    "MaritalStatus",
    "PhysicalActivity",
    "PhysSexAbuse",
    "Race",
    "SexualOrientation",
    "Smoking",
    "SocialSupport",
    "nonSDoH",
]

In [None]:
def parse_sdoh_response(response_text, categories=None):
    """Extract SDoH labels from the model's free-text response.

    Matching is a case-insensitive substring search: a label is reported whenever its
    name occurs anywhere in the response.

    Args:
        response_text: Decoded model output. ``None`` is treated as an empty response.
        categories: Optional label vocabulary to search for. When omitted, the
            notebook-level ``sdoh_categories`` list is used.

    Returns:
        list[str]: Matched labels, in vocabulary order. ``["nonSDoH"]`` is returned only
        when no real SDoH label is present; if the response names both real labels and
        "nonSDoH", the real labels are kept and "nonSDoH" is dropped.
    """
    if categories is None:
        categories = sdoh_categories

    normalized = str(response_text or "").strip().lower()

    # "nonSDoH" is the fallback, never a match in its own right. The previous version
    # returned ["nonSDoH"] as soon as that word appeared anywhere in the response, which
    # discarded real labels and left the "remove nonSDoH if other categories are found"
    # step unreachable.
    matched = [
        label
        for label in categories
        if label.lower() != "nonsdoh" and label.lower() in normalized
    ]

    return matched if matched else ["nonSDoH"]

In [None]:
from tqdm import tqdm
import torch
import re

# --- Run configuration ---
batch_size = 8      # number of prompts tokenized and generated together
save_every = 5000   # row interval the inference loop uses for checkpoint CSVs

# One string per row of the cleaned-comment column. `astype(str)` stringifies every
# value, so missing entries are kept as text rather than dropped.
texts = df_final['Gemma_Cleaned_Comment'].astype(str).tolist()

# Parsed label strings, appended by the inference loop in the same order as `texts`.
output_preds = []

# Tab-separated log of every prediction. The handle is left open on purpose: the
# inference loop writes to it and the final export cell closes it.
pred_file = open("model_predictions_log.txt", "w", encoding="utf-8")
pred_file.write('TEXT\tPREDICTED_SDOH\tRAW_RESPONSE\n')

# Clear GPU cache before starting
torch.cuda.empty_cache()

In [None]:
# Imported once, before the loop: the checkpoint branch below and the final export cell
# both use `files`, so it must be bound even if the loop stops before its first checkpoint.
from google.colab import files

# Classify `texts` in batches of `batch_size`. Each batch is prompted, generated greedily,
# parsed into labels, logged to `pred_file`, and periodically checkpointed to CSV.
for i in tqdm(range(0, len(texts), batch_size), desc="Classifying batched responses"):
    try:
        batch_texts = texts[i:i + batch_size]

        # Instruction + input + an open response section for the model to complete.
        prompts = [
            f"{system_prompt}\n\n### Input:\n{text}\n\n### Response:\n"
            for text in batch_texts
        ]

        # NOTE(review): truncation is applied to the whole prompt. If the tokenizer
        # truncates on the right (presumably the default — confirm), a very long comment
        # would lose the trailing "### Response:" marker.
        inputs = tokenizer(
            prompts,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=512
        ).to(device)

        with torch.no_grad():
            # Greedy decoding (`do_sample=False`). `temperature` only affects sampling,
            # so it is not passed: it had no effect here and only triggered a warning.
            outputs = model.generate(
                **inputs,
                max_new_tokens=50,
                do_sample=False,
                pad_token_id=tokenizer.pad_token_id,
                eos_token_id=tokenizer.eos_token_id,
                repetition_penalty=1.1
            )

        # Decode only the new tokens (response part): everything past the padded
        # prompt length.
        input_length = inputs.input_ids.shape[1]
        response_tokens = outputs[:, input_length:]
        responses = tokenizer.batch_decode(response_tokens, skip_special_tokens=True)

        for text, response in zip(batch_texts, responses):
            pred_label = ";".join(parse_sdoh_response(response))
            output_preds.append(pred_label)

            # Collapse all whitespace runs to single spaces so embedded tabs or newlines
            # in the comment or the response cannot split a record in the TSV log.
            log_text = " ".join(text.split())
            log_response = " ".join(response.split())
            pred_file.write(f"{log_text}\t{pred_label}\t{log_response}\n")

        # Release the batch tensors before the next iteration.
        del inputs, outputs, response_tokens
        torch.cuda.empty_cache()

        # Debug: Print some examples from the first batch
        if i == 0:
            print("\n=== Sample Responses ===")
            for j in range(min(3, len(responses))):
                print(f"Input: {batch_texts[j][:100]}...")
                print(f"Raw Response: {responses[j]}")
                print(f"Parsed SDoH: {output_preds[j]}")
                print("-" * 50)

        # Checkpoint whenever the processed-row count crosses a multiple of `save_every`,
        # and once more after the last batch.
        if (i + batch_size) % save_every < batch_size or (i + batch_size) >= len(texts):
            # Push buffered log rows to disk so the log matches the checkpoint.
            pred_file.flush()

            # Predictions are appended in row order, so the first len(output_preds)
            # rows of df_final line up with them positionally.
            df_partial = df_final.iloc[:len(output_preds)].copy()
            df_partial['LLama_SDoH_Labels'] = output_preds
            filename = f"sdoH_partial_output_{len(output_preds)}.csv"
            df_partial.to_csv(filename, index=False)
            files.download(filename)
            print(f" Saved and downloaded: {filename}")

    except RuntimeError as e:
        # Stop on the first runtime failure; rows already processed stay in
        # `output_preds` and in the log file.
        if "out of memory" in str(e).lower():
            print(f" Out of memory at batch {i}. Try reducing batch_size to 4 or 2.")
            torch.cuda.empty_cache()
            break
        else:
            print(f" Error at batch {i}: {e}")
            break


In [None]:
# Bind `files` in this cell. Previously the name only existed if the inference loop had
# reached a checkpoint (its import lived inside that branch), so an early failure in the
# loop turned the downloads below into a NameError.
from google.colab import files

# Close the log before anything else so every buffered row is on disk when it is
# downloaded, whichever branch runs below.
pred_file.close()

if len(output_preds) == len(texts):
    # Every row was classified: attach the labels and export the full table.
    df_final["LLama_SDoH_Labels"] = output_preds
    final_csv = "LLama_SDoH_Final.csv"
    df_final.to_csv(final_csv, index=False)
    files.download(final_csv)
    print("Final inference complete!")
else:
    # The loop stopped early; only report how far it got.
    print(f" Partial results: {len(output_preds)}/{len(texts)} processed")

files.download("model_predictions_log.txt")

# SDoH Sample Text with [SODA](https://github.com/uf-hobi-informatics-lab/SODA_Docker)

In [None]:
# Start from a clean checkout: delete any earlier clone, clone again, and enter the repo.
!rm -rf SODA_Docker
!git clone https://github.com/uf-hobi-informatics-lab/SODA_Docker.git
%cd SODA_Docker
# Switch to the SDoH_pipeline branch and fetch its git submodules.
!git checkout SDoH_pipeline
!git submodule init
!git submodule update


In [None]:
# NOTE(review): this repeats the clone/checkout of the previous cell, without the
# `rm -rf`. Run directly after that cell, the working directory is already SODA_Docker,
# so this clones into and enters a nested SODA_Docker/SODA_Docker; the later `%cd ..`
# cell then steps back out to the outer checkout. Changing this cell on its own shifts
# every relative path used afterwards — consolidate the two clone cells and the
# `%cd ..` together.
!git clone https://github.com/uf-hobi-informatics-lab/SODA_Docker.git
%cd SODA_Docker
!git checkout SDoH_pipeline
!git submodule init
!git submodule update

# Python packages installed for the SODA run.
!pip install seqeval scikit-learn torch pandas


In [None]:

# Remove and reinstall transformers and tokenizers (no version is pinned).
# NOTE(review): transformers was already imported earlier in this notebook; presumably
# the runtime has to be restarted before the reinstalled copy is the one in use — confirm.
!pip uninstall -y transformers tokenizers

!pip install transformers tokenizers

In [None]:
# Smoke test: confirm that transformers imports after the reinstall.
# NOTE(review): the printed message mentions ALBERT, but only the import of the generic
# Auto* classes is checked here — nothing loads or exercises an ALBERT model.
from transformers import AutoModelForTokenClassification as load_model, AutoTokenizer
print("transformers is installed and supports ALBERT.")


In [None]:
# Step up one directory level.
# NOTE(review): where this lands depends on which of the clone cells above were run
# (each does its own `%cd SODA_Docker`). The relative paths in the following cells
# (encoded_text/, config.yml, run.sh) are resolved from here.
%cd ..

In [None]:
# Create pretrained_models/SDOH_bert_final under the current directory (no error if it exists).
# NOTE(review): nothing visible in this notebook writes into this directory, and
# config.yml sets ner_model.path to bert-base-cased — confirm where the weights come from.
!mkdir -p pretrained_models/SDOH_bert_final

In [None]:
import os

# Write the whole cleaned-comment column (missing values included) as a one-column CSV.
df_final['Gemma_Cleaned_Comment'].to_csv("soda_input.csv", index=False)

# Per-comment text files go into this folder, created on demand.
output_folder = "encoded_text"
os.makedirs(output_folder, exist_ok=True)

# Only rows that actually have a comment get a text file.
has_comment = df_final['Gemma_Cleaned_Comment'].notna()
df_subset = df_final[has_comment]

# One file per remaining row, named from the row's index label plus one.
for idx, comment in df_subset['Gemma_Cleaned_Comment'].items():
    target_path = os.path.join(output_folder, f"comment_{idx+1}.txt")
    with open(target_path, 'w', encoding='utf-8') as handle:
        handle.write(str(comment))

print(f"Saved {len(df_subset)} comments to encoded_text/ and soda_input.csv")


In [None]:
%%writefile config.yml
# Configuration file passed to run.sh via `-c config.yml`.
# NOTE(review): the meaning of each key is defined by the SODA pipeline, not by this
# notebook — check the repository's documentation before changing values.
gpu_node: 0
# NOTE(review): absolute path. The clone cells clone into whatever the current directory
# is at the time — confirm the checkout really lives at /content/SODA_Docker.
root_dir: /content/SODA_Docker
raw_data_dir: ""
generate_bio: False
# Presumably tells the pipeline to read the per-comment files written to encoded_text/ — verify.
encoded_text: True
ner_model:
  type: bert
  # NOTE(review): this is the generic bert-base-cased identifier, while an earlier cell
  # creates pretrained_models/SDOH_bert_final — confirm which model is intended.
  path: bert-base-cased


In [None]:
# Sanity check: show the current directory and its contents before invoking run.sh.
!pwd
!ls

In [None]:
# Make run.sh executable; it is looked up in the current directory.
!chmod +x run.sh


In [None]:
# Run the pipeline script with the config written above. stdout and stderr are both
# redirected to soda_log.txt, so this cell shows no output — inspect the log instead.
!./run.sh -c config.yml -n 0 > soda_log.txt 2>&1

In [None]:
# Confirm that the run produced a log file.
!ls soda_log.txt

In [None]:
# List every CSV file under the current directory to locate the pipeline's output.
!find . -type f -name "*.csv"

In [None]:
# Show the last 50 lines of the run log.
!tail -n 50 soda_log.txt


In [None]:
import pandas as pd

# Load the pipeline's output table and preview it.
# NOTE(review): the path is relative to the current directory — confirm it matches where
# run.sh actually writes its results.
try:
    results = pd.read_csv("results/sdoh_output.csv")
except FileNotFoundError:
    print("Output not found. Check soda_log.txt for errors")
else:
    # Jupyter only auto-displays the last top-level expression of a cell; a bare
    # `results.head()` inside the `try` block was evaluated and silently discarded.
    display(results.head())
