![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/healthcare-nlp/27.0.Social_Determinant_of_Health_Models.ipynb)

# Social Determinant Of Health Models

This notebook describes the different kinds of pretrained models for extracting terminology related to Social Determinants of Health (SDOH) from various kinds of biomedical documents, together with examples of each type of model.

In [0]:
from johnsnowlabs import nlp, medical

# After uploading your license, run this to install all licensed Python wheels and pre-download the JARs for the Spark session JVM
#nlp.install()

In [0]:
import os
import json
import string
import numpy as np
import pandas as pd

import sparknlp
import sparknlp_jsl
from sparknlp.base import *
from sparknlp.util import *
from sparknlp.annotator import *
from sparknlp_jsl.annotator import *
from sparknlp.pretrained import ResourceDownloader

from pyspark.sql import functions as F
from pyspark.ml import Pipeline, PipelineModel
import pyspark.sql.types as T
import pyspark.sql as SQL
from pyspark import keyword_only

# Notebook-wide pandas display settings: wider text columns, more visible
# columns, and no wrapping of wide frames across several lines.
for option_name, option_value in (
    ('max_colwidth', 100),
    ('display.max_columns', 100),
    ('display.expand_frame_repr', False),
):
    pd.set_option(option_name, option_value)

# Report the installed library versions so the captured outputs can be reproduced.
for version_label, library in (
    ("Spark NLP Version :", sparknlp),
    ("Spark NLP_JSL Version :", sparknlp_jsl),
):
    print(version_label, library.version())

# Bare last expression: show the active Spark session.
# NOTE(review): `spark` is not created anywhere in this notebook - presumably
# it is injected by the runtime; confirm before running elsewhere.
spark



Spark NLP Version : 5.2.2
Spark NLP_JSL Version : 5.2.1


## List of Pretrained Models

In [0]:
# Build a table of the available pretrained SDOH models, one column per
# annotator type. Columns are collected first and the frame is created once,
# instead of growing it with pd.concat from an empty DataFrame inside the loop.
model_columns = {}
for model_type in ['MedicalNerModel', 'GenericClassifierModel']:
    # Each entry returned by the downloader is indexable; element 0 is the model name.
    model_list = sorted({
        model[0]
        for model in medical.InternalResourceDownloader.returnPrivateModels(model_type)
        if 'sdoh' in model[0]
    })
    if len(model_list) > 0:
        if model_type == "MedicalNerModel":
            # Work-in-progress NER models are left out of the listing.
            model_list = [name for name in model_list if "wip" not in name]
        model_columns[model_type] = pd.Series(model_list, dtype=object)

# Series of different lengths are aligned on their positional index; the
# shorter column is padded with NaN, which is blanked out for display below.
df = pd.DataFrame(model_columns)

df.fillna('')

Unnamed: 0,MedicalNerModel,GenericClassifierModel
0,ner_sdoh,genericclassifier_sdoh_alcohol_usage_binary_sbiobert_cased_mli
1,ner_sdoh_access_to_healthcare,genericclassifier_sdoh_alcohol_usage_sbiobert_cased_mli
2,ner_sdoh_community_condition,genericclassifier_sdoh_economics_binary_sbiobert_cased_mli
3,ner_sdoh_demographics,genericclassifier_sdoh_financial_insecurity_mpnet
4,ner_sdoh_health_behaviours_problems,genericclassifier_sdoh_food_insecurity_mpnet
5,ner_sdoh_income_social_status,genericclassifier_sdoh_housing_insecurity_sbiobert_cased_mli
6,ner_sdoh_langtest,genericclassifier_sdoh_insurance_coverage_sbiobert_cased_mli
7,ner_sdoh_mentions,genericclassifier_sdoh_insurance_status_sbiobert_cased_mli
8,ner_sdoh_mentions_test,genericclassifier_sdoh_insurance_type_sbiobert_cased_mli
9,ner_sdoh_social_environment,genericclassifier_sdoh_mental_health_clinical


## NER Models

The NER models from the list include different entity groups and levels of granularity.

In [0]:
# Shared preprocessing stages: raw text -> document -> sentences -> tokens -> embeddings.
document_assembler = nlp.DocumentAssembler()\
    .setInputCol("text")\
    .setOutputCol("document")

sentence_detector = nlp.SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare","en","clinical/models")\
    .setInputCols(["document"])\
    .setOutputCol("sentence")

# The second split character is a backslash followed by a slash. It is written
# with an escaped backslash because "\/" is not a valid Python escape sequence
# and triggers a DeprecationWarning/SyntaxWarning; the resulting string is unchanged.
tokenizer = nlp.Tokenizer()\
    .setInputCols(["sentence"])\
    .setOutputCol("token")\
    .setSplitChars(["-", "\\/"])

word_embeddings = nlp.WordEmbeddingsModel.pretrained("embeddings_clinical","en","clinical/models")\
    .setInputCols(["sentence","token"])\
    .setOutputCol("embeddings")

## ner_sdoh
ner_sdoh = medical.NerModel.pretrained("ner_sdoh", "en", "clinical/models") \
    .setInputCols(["sentence", "token", "embeddings"]) \
    .setOutputCol("ner_sdoh")

ner_sdoh_converter = medical.NerConverterInternal() \
    .setInputCols(["sentence", "token", "ner_sdoh"]) \
    .setOutputCol("ner_chunk_sdoh")

## ner_social_environment
ner_social_environment = medical.NerModel.pretrained("ner_sdoh_social_environment_wip", "en", "clinical/models") \
    .setInputCols(["sentence", "token", "embeddings"]) \
    .setOutputCol("ner_social_environment")

ner_social_environment_converter = medical.NerConverterInternal() \
    .setInputCols(["sentence", "token", "ner_social_environment"]) \
    .setOutputCol("ner_chunk_social_environment")

## ner_sdoh_mentions
ner_sdoh_mentions = medical.NerModel.pretrained("ner_sdoh_mentions", "en", "clinical/models") \
    .setInputCols(["sentence", "token", "embeddings"]) \
    .setOutputCol("ner_sdoh_mentions")

ner_sdoh_mentions_converter = medical.NerConverterInternal() \
    .setInputCols(["sentence", "token", "ner_sdoh_mentions"]) \
    .setOutputCol("ner_chunk_ner_sdoh_mentions")

# All three NER models read the same sentence/token/embeddings columns, so
# they run side by side in a single pipeline, each followed by its own chunk converter.
ner_stages = [document_assembler,
    sentence_detector,
    tokenizer,
    word_embeddings,
    ner_sdoh,
    ner_sdoh_converter,
    ner_social_environment,
    ner_social_environment_converter,
    ner_sdoh_mentions,
    ner_sdoh_mentions_converter]

ner_pipeline = nlp.Pipeline(stages=ner_stages)

# Every stage is pretrained, so fitting on a one-row empty frame only
# assembles the PipelineModel.
empty_data = spark.createDataFrame([[""]]).toDF("text")

ner_model = ner_pipeline.fit(empty_data)

sentence_detector_dl_healthcare download started this may take some time.
Approximate size to download 367.3 KB
[ | ][OK!]
embeddings_clinical download started this may take some time.
Approximate size to download 1.6 GB
[ | ][OK!]
ner_sdoh download started this may take some time.
[ | ][ / ][OK!]
ner_sdoh_social_environment_wip download started this may take some time.
[ | ][ / ][OK!]
ner_sdoh_mentions download started this may take some time.
[ | ][ / ][OK!]


In [0]:
# Entity types predicted by `ner_sdoh`: skip the outside tag 'O', keep the
# text after the last '-' of every class name, then de-duplicate and sort.
ner_sdoh_labels = sorted({
    tag.split('-')[-1]
    for tag in ner_sdoh.getClasses()
    if tag != 'O'
})

len(ner_sdoh_labels)

Out[5]: 46

In [0]:
# Lay the sorted `ner_sdoh` labels out as a compact table with at most
# LABELS_PER_COLUMN rows per column.
LABELS_PER_COLUMN = 10

# Stepping through the list in strides yields exactly ceil(n / LABELS_PER_COLUMN)
# columns. The previous `n // 10 + 1` count added a spurious empty column
# whenever the number of labels was an exact multiple of 10.
label_chunks = [
    pd.DataFrame(ner_sdoh_labels[start:start + LABELS_PER_COLUMN], columns=[''])
    for start in range(0, len(ner_sdoh_labels), LABELS_PER_COLUMN)
]

# A single concat replaces growing the frame inside the loop; with no labels
# at all, fall back to an empty single-column frame (pd.concat rejects an empty list).
label_df = pd.concat(label_chunks, axis=1) if label_chunks else pd.DataFrame(columns=[''])

# The last column is usually shorter; blank out its NaN padding for display.
label_df.fillna('')

Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5
0,Access_To_Care,Employment,Hyperlipidemia,Other_SDoH_Keywords,Substance_Duration
1,Age,Environmental_Condition,Hypertension,Population_Group,Substance_Frequency
2,Alcohol,Exercise,Income,Quality_Of_Life,Substance_Quantity
3,Chidhood_Event,Family_Member,Insurance_Status,Race_Ethnicity,Substance_Use
4,Communicable_Disease,Financial_Status,Language,Sexual_Activity,Transportation
5,Community_Safety,Food_Insecurity,Legal_Issues,Sexual_Orientation,Violence_Or_Abuse
6,Diet,Gender,Marital_Status,Smoking,
7,Disability,Geographic_Entity,Mental_Health,Social_Exclusion,
8,Eating_Disorder,Healthcare_Institution,Obesity,Social_Support,
9,Education,Housing,Other_Disease,Spiritual_Beliefs,


In [0]:
# Entity types predicted by the social-environment model: skip 'O', keep the
# text after the last '-' of every class name, then de-duplicate and sort.
ner_social_environment_labels = sorted({
    tag.split('-')[-1]
    for tag in ner_social_environment.getClasses()
    if tag != 'O'
})

print(ner_social_environment_labels)

['Chidhood_Event', 'Social_Exclusion', 'Social_Support', 'Violence_Abuse_Legal']


In [0]:
# Entity types predicted by `ner_sdoh_mentions`: skip 'O', keep the text
# after the last '-' of every class name, then de-duplicate and sort.
ner_sdoh_mentions_labels = sorted({
    tag.split('-')[-1]
    for tag in ner_sdoh_mentions.getClasses()
    if tag != 'O'
})

print(ner_sdoh_mentions_labels)

['behavior_alcohol', 'behavior_drug', 'behavior_tobacco', 'sdoh_community', 'sdoh_economics', 'sdoh_education', 'sdoh_environment']


In [0]:
# Sample note 1: a long social history, visualised below with the `ner_sdoh` chunks.
# The text is model input and is kept verbatim (including its spelling).
sample_text_1 = '''Smith is 55 years old, living in New York, a divorced Mexcian American woman with financial problems. She speaks Spanish and Portuguese. She lives in an apartment. She has been struggling with diabetes for the past 10 years and has recently been experiencing frequent hospitalizations due to uncontrolled blood sugar levels. Smith works as a cleaning assistant and cannot access health insurance or paid sick leave. She has a son, a student at college. Pt with likely long-standing depression. She is aware she needs rehab. Pt reports having her catholic faith as a means of support as well.  She has a long history of etoh abuse, beginning in her teens. She reports she has been a daily drinker for 30 years, most recently drinking beer daily. She smokes a pack of cigarettes a day. She had DUI in April and was due to court this week.'''

# Sample note 2: a single sentence, visualised below with the social-environment chunks.
sample_text_2 = '''Medical history: Jane was born in a low - income household and experienced significant trauma during her childhood, including physical and emotional abuse.'''

# Sample note 3: a short history, visualised below with the `ner_sdoh_mentions` chunks.
sample_text_3 = '''Mr. Known lastname 9880 is a pleasant, cooperative gentleman with a long standing history (20 years) diverticulitis. He is married and has 3 children. He works in a bank. He denies any alcohol or intravenous drug use. He has been smoking for many years.'''

In [0]:
# Put the three sample notes into a single-column ("text") Spark DataFrame,
# going through pandas to build the rows.
sample_notes_pdf = pd.DataFrame({'text': [sample_text_1, sample_text_2, sample_text_3]})
data = spark.createDataFrame(sample_notes_pdf)

In [0]:
# Run the fitted NER pipeline over the sample notes and bring the annotated
# rows back to the driver as a list (one row per note, in input order).
annotated_notes = ner_model.transform(data)
results = annotated_notes.collect()

In [0]:
# One NER visualiser instance is reused for all of the displays below.
visualiser = nlp.viz.NerVisualizer()

In [0]:
# First sample note, highlighted with the chunks produced by `ner_sdoh`.
visualiser.display(results[0], label_col='ner_chunk_sdoh')

In [0]:
# Third sample note, highlighted with the chunks produced by `ner_sdoh_mentions`.
visualiser.display(results[2], label_col='ner_chunk_ner_sdoh_mentions')

In [0]:
# Second sample note, highlighted with the chunks produced by the social-environment model.
visualiser.display(results[1], label_col='ner_chunk_social_environment')

## Assertion model

In [0]:
# Assertion pipeline: text -> sentences -> tokens -> embeddings -> SDOH chunks
# -> assertion status for every kept chunk.
document_assembler = nlp.DocumentAssembler()\
    .setInputCol("text")\
    .setOutputCol("document")

sentence_detector = nlp.SentenceDetectorDLModel.pretrained("sentence_detector_dl", "en")\
    .setInputCols(["document"])\
    .setOutputCol("sentence")

tokenizer = nlp.Tokenizer()\
    .setInputCols(["sentence"])\
    .setOutputCol("token")

clinical_embeddings = nlp.WordEmbeddingsModel.pretrained('embeddings_clinical', "en", "clinical/models")\
    .setInputCols(["sentence", "token"])\
    .setOutputCol("embeddings")

# Named `sdoh_ner_model` rather than `ner_model`: the latter already holds the
# fitted NER PipelineModel used by the earlier `ner_model.transform(data)` cell,
# and rebinding it here would break that cell when it is re-run.
sdoh_ner_model = medical.NerModel.pretrained("ner_sdoh", "en", "clinical/models")\
    .setInputCols(["sentence", "token","embeddings"])\
    .setOutputCol("ner")

ner_converter = medical.NerConverterInternal()\
    .setInputCols(['sentence', 'token', 'ner'])\
    .setOutputCol('ner_chunk')\
    .setBlackList(['Age','Gender','Language','Healthcare_Institution'])   # entity types for which no assertion status is needed

assertion = medical.AssertionDLModel.pretrained("assertion_sdoh_wip", "en", "clinical/models") \
    .setInputCols(["sentence", "ner_chunk", "embeddings"]) \
    .setOutputCol("assertion")

pipeline = nlp.Pipeline(
    stages=[
        document_assembler,
        sentence_detector,
        tokenizer,
        clinical_embeddings,
        sdoh_ner_model,
        ner_converter,
        assertion
])

# Every stage is pretrained, so fitting on a one-row empty frame only
# assembles the PipelineModel.
empty_data = spark.createDataFrame([[""]]).toDF("text")
model = pipeline.fit(empty_data)

sentence_detector_dl download started this may take some time.
Approximate size to download 354.6 KB
[ | ][OK!]
embeddings_clinical download started this may take some time.
Approximate size to download 1.6 GB
[ | ][OK!]
ner_sdoh download started this may take some time.
[ | ][OK!]
assertion_sdoh_wip download started this may take some time.
[ | ][ / ][OK!]


In [0]:
# List the assertion statuses the pretrained assertion model can assign.
assertion.getClasses()

Out[17]: ['Absent', 'Present', 'Someone_Else', 'Past', 'Hypothetical', 'Possible']

In [0]:
# Input for the assertion demo: a list holding one multi-line note.
# The text is model input and is kept verbatim.
sample_texts= [
"""Smith works as a cleaning assistant and does not have access to health insurance or paid sick leave.
But she has generally housing problems. She lives in a apartment now.  She has long history of EtOH abuse, beginning in her teens.
She is aware she needs to attend Rehab Programs. She had DUI back in April and was due to be in court this week.
Her partner is an alcoholic and a drug abuser for the last 5 years.
She also mentioned feeling socially isolated and lack of a strong support system """
]

In [0]:
# Wrap the fitted assertion pipeline in a LightPipeline and annotate the sample
# texts; the result is a list with one dict of annotations per input text.
# NOTE: `model` is rebound by the classification section further down, so this
# cell must run right after the assertion pipeline has been fitted.
light_model = nlp.LightPipeline(model)
light_result = light_model.fullAnnotate(sample_texts)

In [0]:
# Output columns available for the first (and only) annotated text.
light_result[0].keys()

Out[20]: dict_keys(['document', 'ner_chunk', 'assertion', 'token', 'ner', 'embeddings', 'sentence'])

In [0]:
# Raw chunk annotations: character offsets, chunk text and metadata
# (chunk id, confidence, entity type, sentence index).
light_result[0]['ner_chunk']

Out[21]: [Annotation(chunk, 17, 34, cleaning assistant, {'chunk': '0', 'confidence': '0.76975', 'ner_source': 'ner_chunk', 'entity': 'Employment', 'sentence': '0'}, []),
 Annotation(chunk, 64, 79, health insurance, {'chunk': '1', 'confidence': '0.6325', 'ner_source': 'ner_chunk', 'entity': 'Insurance_Status', 'sentence': '0'}, []),
 Annotation(chunk, 156, 164, apartment, {'chunk': '2', 'confidence': '0.9575', 'ner_source': 'ner_chunk', 'entity': 'Housing', 'sentence': '2'}, []),
 Annotation(chunk, 196, 205, EtOH abuse, {'chunk': '3', 'confidence': '0.8286', 'ner_source': 'ner_chunk', 'entity': 'Alcohol', 'sentence': '3'}, []),
 Annotation(chunk, 265, 278, Rehab Programs, {'chunk': '4', 'confidence': '0.6292', 'ner_source': 'ner_chunk', 'entity': 'Access_To_Care', 'sentence': '4'}, []),
 Annotation(chunk, 289, 291, DUI, {'chunk': '5', 'confidence': '0.9603', 'ner_source': 'ner_chunk', 'entity': 'Legal_Issues', 'sentence': '5'}, []),
 Annotation(chunk, 363, 371, alcoholic, {'chunk': '6',

In [0]:
# Tabulate, for the first annotated text, every assertion together with the
# NER chunk it refers to (matched through the "chunk" id in their metadata).
first_result = light_result[0]

# Index the chunks by id once instead of rescanning the whole chunk list for
# every assertion. Ids map to lists so that repeated ids, if any, still yield
# one row per matching chunk, in the original chunk order.
chunks_by_id = {}
for chunk_row in first_result["ner_chunk"]:
    chunks_by_id.setdefault(chunk_row.metadata["chunk"], []).append(chunk_row)

assertion_records = [
    {
        'chunks': chunk_row.result,
        'entities': chunk_row.metadata['entity'],
        'assertion': assertion_row.result,
        'confidence': assertion_row.metadata['confidence'],
    }
    for assertion_row in first_result["assertion"]
    for chunk_row in chunks_by_id.get(assertion_row.metadata["chunk"], [])
]

# Explicit columns keep the table shape stable even when nothing matched.
df = pd.DataFrame(assertion_records, columns=['chunks', 'entities', 'assertion', 'confidence'])

df

Unnamed: 0,chunks,entities,assertion,confidence
0,cleaning assistant,Employment,Present,0.7926
1,health insurance,Insurance_Status,Absent,0.5072
2,apartment,Housing,Present,0.9956
3,EtOH abuse,Alcohol,Past,0.6054
4,Rehab Programs,Access_To_Care,Hypothetical,0.5861
5,DUI,Legal_Issues,Past,0.5037
6,alcoholic,Alcohol,Someone_Else,0.9868
7,drug abuser,Substance_Use,Someone_Else,0.9996
8,last 5 years,Substance_Duration,Someone_Else,0.9951
9,socially isolated,Social_Exclusion,Present,0.9699


In [0]:
# Highlight each chunk together with its assertion status. The visualiser is
# taken from `nlp.viz`, like the NER visualiser earlier in the notebook, so no
# extra import is needed in the middle of the notebook.
vis = nlp.viz.AssertionVisualizer()

# Positional arguments: the chunk column, then the assertion column.
vis.display(light_result[0], 'ner_chunk', 'assertion')

## Classification Model

In [0]:
# Document-level classification: text -> document -> sentence embeddings
# -> feature vector -> pretrained generic classifier.
document_assembler = (
    nlp.DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

# The embeddings are computed on the whole document column.
sentence_embeddings = (
    nlp.BertSentenceEmbeddings.pretrained("sbiobert_base_cased_mli", 'en', 'clinical/models')
    .setInputCols(["document"])
    .setOutputCol("sentence_embeddings")
)

features_asm = (
    medical.FeaturesAssembler()
    .setInputCols(["sentence_embeddings"])
    .setOutputCol("features")
)

generic_classifier = (
    medical.GenericClassifierModel.pretrained(
        "genericclassifier_sdoh_economics_binary_sbiobert_cased_mli", 'en', 'clinical/models'
    )
    .setInputCols(["features"])
    .setOutputCol("classes")
)

classification_stages = [
    document_assembler,
    sentence_embeddings,
    features_asm,
    generic_classifier,
]
pipeline = nlp.Pipeline(stages=classification_stages)

# Every stage is pretrained, so fitting on a one-row empty frame only
# assembles the PipelineModel.
empty_data = spark.createDataFrame([[""]]).toDF("text")
model = pipeline.fit(empty_data)


sbiobert_base_cased_mli download started this may take some time.
Approximate size to download 384.3 MB
[ | ][ / ][ — ][ \ ][ | ][ / ][ — ][ \ ][ | ][ / ][ — ][ \ ][ | ][ / ][ — ][ \ ][ | ][ / ][ — ][ \ ][ | ][ / ][ — ][ \ ][ | ][ / ][ — ][OK!]
genericclassifier_sdoh_economics_binary_sbiobert_cased_mli download started this may take some time.
[ | ][ / ][OK!]


In [0]:
# Short social-history note used as input for the classifier.
# The text is model input and is kept verbatim.
sample_text = '''Patient works as a building inspector and remodeler. Married with 2 children. He is a current smoker, 1PPD for 25years. He drinks to beers/night, but has not had any alcohol in past 4 days. No IVDU.'''

In [0]:
# Put the sample note into a single-column ("text") Spark DataFrame,
# going through pandas to build the row.
classification_pdf = pd.DataFrame({'text': [sample_text]})
classification_data = spark.createDataFrame(classification_pdf)

In [0]:
# Apply the fitted classification pipeline to the sample note.
# NOTE: `model` must be the classification pipeline model fitted above; the
# same name is used for the assertion pipeline earlier in the notebook.
classification_results = model.transform(classification_data)

In [0]:
# Show the input text next to the predicted class, without truncating the text column.
classification_results.select("text", "classes.result").show(truncate=False)

+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------+
|text                                                                                                                                                                                                  |result|
+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------+
|Patient works as a building inspector and remodeler. Married with 2 children. He is a current smoker, 1PPD for 25years. He drinks to beers/night, but has not had any alcohol in past 4 days. No IVDU.|[True]|
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------

## Pretrained NER Profiling Pipelines

We can use pretrained NER profiling pipelines for exploring all the available pretrained NER models at once.

- `ner_profiling_sdoh`: Returns results for the SDOH NER models.

For more examples, please check [this notebook](https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Healthcare/11.2.Pretrained_NER_Profiling_Pipelines.ipynb).





<center><b>NER Profiling SDOH Model List</b>

| | | | |
|--------------|-----------------|-----------------|-----------------|
| ner_sdoh | ner_sdoh_social_environment_wip | ner_sdoh_mentions | |
| ner_sdoh_demographics_wip | ner_sdoh_community_condition_wip | ner_sdoh_substance_usage_wip | ner_sdoh_access_to_healthcare_wip |
| ner_sdoh_health_behaviours_problems_wip | ner_sdoh_income_social_status_wip | | |



</center>

In [0]:
# Load the pretrained SDOH NER profiling pipeline. `PretrainedPipeline` is
# reached through the `nlp` namespace, so no separate import is required.

sdoh_profiling_pipeline = nlp.PretrainedPipeline("ner_profiling_sdoh", "en", "clinical/models")

ner_profiling_sdoh download started this may take some time.
Approx size to download 1.6 GB
[ | ][ / ][ — ][ \ ][ | ][ / ][ — ][ \ ][ | ][ / ][ — ][ \ ][ | ][OK!]


In [0]:
# Input note for the profiling pipeline. The text is model input and is kept verbatim.
text = """Smith is 55 years old, living in New York, a divorced Mexcian American woman with financial problems. She speaks Spanish and Portuguese. She lives in an apartment. She has been struggling with diabetes for the past 10 years and has recently been experiencing frequent hospitalizations due to uncontrolled blood sugar levels. Smith works as a cleaning assistant and cannot access health insurance or paid sick leave. She has a son, a student at college. Pt with likely long-standing depression. She is aware she needs rehab. Pt reports having her catholic faith as a means of support as well.  She has a long history of etoh abuse, beginning in her teens. She reports she has been a daily drinker for 30 years, most recently drinking beer daily. She smokes a pack of cigarettes a day. She had DUI in April and was due to court this week."""

In [0]:
# Annotate the note with every model in the profiling pipeline. `fullAnnotate`
# is indexed with [0] to take the result for this single input text.
sdoh_result = sdoh_profiling_pipeline.fullAnnotate(text)[0]
# Output columns produced by the pipeline (token-level NER tags and chunk columns).
sdoh_result.keys()

Out[31]: dict_keys(['ner_chunk_ner_sdoh_mentions', 'ner_chunk_sdoh_health_behaviours_problems', 'document', 'ner_chunk_social_environment', 'ner_chunk_sdoh_access_to_healthcare', 'ner_chunk_sdoh_substance_usage', 'ner_sdoh_mentions', 'ner_sdoh_community_condition', 'ner_chunk_ner_demographics', 'ner_sdoh_demographics', 'ner_chunk_sdoh_community_condition', 'ner_sdoh', 'ner_chunk_sdoh_income_social_status', 'ner_social_environment', 'ner_sdoh_access_to_healthcare', 'ner_sdoh_health_behaviours_problems', 'token', 'ner_sdoh_substance_usage', 'embeddings', 'ner_sdoh_income_social_status', 'ner_chunk_sdoh', 'sentence'])

In [0]:
def get_token_results(light_result):
    """Build a token-level table of the labels predicted by the SDOH NER models.

    Parameters
    ----------
    light_result : dict
        Maps an output column name to a list of annotation objects, as in one
        element of the list returned by ``fullAnnotate``. It must contain a
        ``"token"`` entry whose annotations expose ``result``, ``begin``,
        ``end`` and ``metadata["sentence"]``; the other entries only need
        ``result``.

    Returns
    -------
    pandas.DataFrame
        One row per token with the columns ``sentence``, ``begin``, ``end``
        and ``token``, followed by one column per key of ``light_result``
        whose name starts with ``'ner_sdoh'`` (in key order), holding that
        model's label for each token.
    """
    token_annotations = light_result["token"]

    df = pd.DataFrame({
        'sentence': [annotation.metadata["sentence"] for annotation in token_annotations],
        'begin': [annotation.begin for annotation in token_annotations],
        'end': [annotation.end for annotation in token_annotations],
        'token': [annotation.result for annotation in token_annotations],
    })

    # Only the columns that are actually returned are materialised. Previously
    # every other output (document, embeddings, chunk columns, ...) was wrapped
    # in a DataFrame first, which raised an IndexError as soon as one of them
    # was an empty list, e.g. a chunk column for a model that found nothing.
    label_columns = [
        pd.Series(
            [annotation.result for annotation in light_result[model_name]],
            name=model_name,
            dtype=object,
        )
        for model_name in light_result.keys()
        if model_name.startswith('ner_sdoh')
    ]

    # Positional alignment: label columns line up with the token rows by index.
    return pd.concat([df] + label_columns, axis=1)

In [0]:
# Token-by-token comparison of the labels assigned by the profiled SDOH NER models.
get_token_results(sdoh_result)

Unnamed: 0,sentence,begin,end,token,ner_sdoh_mentions,ner_sdoh_community_condition,ner_sdoh_demographics,ner_sdoh,ner_sdoh_access_to_healthcare,ner_sdoh_health_behaviours_problems,ner_sdoh_substance_usage,ner_sdoh_income_social_status
0,0,0,4,Smith,B-sdoh_community,O,O,O,O,O,O,O
1,0,6,7,is,O,O,O,O,O,O,O,O
2,0,9,10,55,O,O,B-Age,B-Age,O,O,O,O
3,0,12,16,years,O,O,I-Age,I-Age,O,O,O,O
4,0,18,20,old,O,O,I-Age,I-Age,O,O,O,O
...,...,...,...,...,...,...,...,...,...,...,...,...
159,12,817,818,to,O,O,O,O,O,O,O,O
160,12,820,824,court,O,O,O,O,O,O,O,O
161,12,826,829,this,O,O,O,O,O,O,O,O
162,12,831,834,week,O,O,O,O,O,O,O,O
