In [None]:
import itertools
import sys
from copy import deepcopy

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats as stats
from scipy.stats import chi2, chi2_contingency, contingency

# make the project-local `functions` package importable
sys.path.insert(0, '/User/Symptom_Documentation/')
from functions import helpers as hf
from functions import helpers_plot as hp


In [None]:
# Load the raw inputs. Every location is derived from one project root.

projectfolder = '/User/Symptom_Documentation/'
datafolder, figfolder = (projectfolder + sub for sub in ('data/', 'figures/'))

# df_survey: survey responses; df_demog: demographic info
df_survey, df_demog = (
    pd.read_csv(datafolder + fname) for fname in ('by_person.csv', 'data.csv')
)

In [None]:
# Show every column when previewing frames.
pd.set_option('display.max_columns', None)

# Quick look at both inputs, then the full list of survey column names.
display(df_survey.head(), df_demog.head())

for column_name in df_survey.columns:
    print(column_name)

# Parse the string date columns into datetime, keeping the originals untouched.
for raw_col, parsed_col in (('date', 'date_v2'), ('date_nurse', 'date_nurse_v2')):
    df_survey[parsed_col] = pd.to_datetime(df_survey[raw_col])

In [None]:
#### get ethnicity (Hispanic vs. not); stored later under the column name 'race'

# only participants with a medical record number are considered
info_filt = df_demog['mrn'].notna()

hispanic_filt = df_demog['hispanic'] == 1
study_ids = df_demog.loc[info_filt, 'studyid']
study_id_hisp = df_demog.loc[info_filt & hispanic_filt, 'studyid'].to_list()
# NOTE(review): `~hispanic_filt` is True for every value other than 1, including
# missing values, so participants with an unknown ethnicity land in this list.
study_id_other = df_demog.loc[info_filt & ~hispanic_filt, 'studyid'].to_list()

print('Hispanic:', len(study_id_hisp), ', Non-Hispanic:', len(study_id_other))

## gender
# 0, Female | 1, Male | 2, Not reported

male_filt = df_demog['gender'] == 1
female_filt = df_demog['gender'] == 0
study_id_male = df_demog.loc[info_filt & male_filt, 'studyid'].to_list()
study_id_female = df_demog.loc[info_filt & female_filt, 'studyid'].to_list()

print('Male:', len(study_id_male), ', Female:', len(study_id_female))

# clean survey to contain only relevant columns

symptom_list = ['cramp', 'fatigue', 'musclesore', 'dryskin', 'itching']
pt_list = symptom_list                                  # patient-reported columns
doc_list = [s + '_doc' for s in symptom_list]           # physician-reported columns
nurse_list = [s + '_nurse' for s in symptom_list]       # nurse-reported columns
severity_list = [s + 'severity' for s in symptom_list]  # patient severity ratings

keep_cols = ['studyid', 'date', 'date_v2', 'date_nurse_v2'] + pt_list + nurse_list + doc_list + severity_list
# Explicit copy: the next statement writes into this frame, and writing into a
# selection of df_survey would otherwise raise SettingWithCopyWarning.
df_survey_clean = df_survey[keep_cols].copy()

# convert severity rating to a 0-4 level by dividing by 25 and rounding
# (assumes the raw rating is on a 0-100 scale -- TODO confirm);
# values stay floating point so missing ratings remain NaN

df_survey_clean[severity_list] = np.round(df_survey_clean[severity_list] / 25)

print('# rows:', len(df_survey_clean))
display(df_survey_clean.head())

In [None]:
# create dictionary of symptoms as reported by patient, doctor, nurse, and NLP

symptom_list = ['cramp', 'fatigue', 'musclesore', 'dryskin', 'itching']

# One nurse survey per patient is drawn at random below; a fixed seed keeps the
# draw (and therefore every downstream count and p-value) identical across runs.
NURSE_SAMPLE_SEED = 0
nurse_sample_rng = np.random.RandomState(NURSE_SAMPLE_SEED)

uniq_sids = df_survey_clean['studyid'].unique()

# severity_dict[symptom][severity level 0-4] -> list of study IDs whose maximum
# patient-reported severity for that symptom equals the level
severity_dict = {sym: {level: [] for level in range(0, 5)} for sym in symptom_list}

for sid in uniq_sids:
    sid_filt = df_survey_clean['studyid'] == sid
    for col in pt_list:
        if np.sum(df_survey_clean.loc[sid_filt, col]) > 0:  # patient has reported symptom
            severity_label = col.split('_')[0] + 'severity'
            severity = np.max(df_survey_clean.loc[sid_filt, severity_label])  # get patient-reported severity
            if not pd.isna(severity):
                # severity is a float after rounding; cast so the key is the int level
                severity_dict[col][int(severity)].append(sid)
            else:
                # symptom reported without any severity rating: file under level 0
                severity_dict[col][0].append(sid)

nurse_filt = df_survey_clean['date_nurse_v2'].notna()
pt_filt = df_survey_clean['date_v2'].notna()

# symptom_dict['<symptom>_<rater>'] -> list of study IDs flagged by that rater
symptom_dict = {}
for sym in symptom_list:
    symptom_dict[sym + '_actual'] = []
    symptom_dict[sym + '_nurse'] = []
    symptom_dict[sym + '_doc'] = []

for sid in uniq_sids:
    sid_filt = df_survey_clean['studyid'] == sid

    nurse_rows = df_survey_clean.loc[nurse_filt & sid_filt, :]
    if not nurse_rows.empty:
        # sample a random nurse survey (sampling an empty frame would raise,
        # so patients without any nurse survey are skipped for nurse/patient)
        df_nurse_rand = nurse_rows.sample(random_state=nurse_sample_rng)

        # patient survey rows recorded on the same date as the sampled nurse survey
        date_nurse = df_nurse_rand['date_nurse_v2'].iloc[0]
        date_filt = df_survey_clean['date_v2'] == date_nurse
        df_pt_rand = df_survey_clean.loc[pt_filt & sid_filt & date_filt, :]

        for sym in symptom_list:
            if np.sum(df_nurse_rand[sym + '_nurse']) > 0:
                symptom_dict[sym + '_nurse'].append(sid)
            if np.sum(df_pt_rand[sym]) > 0:
                symptom_dict[sym + '_actual'].append(sid)

    # physician report: flagged if set on any of the patient's rows
    for col in doc_list:
        if np.sum(df_survey_clean.loc[sid_filt, col]) > 0:
            symptom_dict[col].append(sid)

# NLP results copied and pasted from NLP code

symptom_dict['cramp_nlp'] = [1, 3, 9, 11, 16, 18, 19, 20, 34, 36, 38, 50, 51, 54, 56, 57, 60, 65, 69, 70, 73, 76, 81, 82, 84, 100]
symptom_dict['fatigue_nlp'] = [8, 19, 33, 39, 42, 49, 61, 62, 75, 78, 79, 81, 89, 93]
symptom_dict['itching_nlp'] = [2, 3, 6, 9, 10, 11, 16, 31, 37, 38, 47, 54, 60, 62, 66, 75, 77, 83, 97, 100]
symptom_dict['musclesore_nlp'] = [1, 42, 66, 73, 76, 86, 88, 94, 95]
symptom_dict['dryskin_nlp'] = []

# display labels for symptoms and raters
symptom_label_map = {'cramp': 'Cramp', 'fatigue': 'Fatigue', 'musclesore': 'Muscle Soreness', \
                     'itching': 'Itching', 'dryskin': 'Dry Skin'}
rater_label_map = {'nurse': 'Nurse', 'doc': 'Physician', 'nlp': 'NLP'}



In [None]:
# Build df_sym: one row per study ID, one boolean column per symptom/rater pair,
# plus cohort labels (gender, race, severity).

all_ids = df_survey_clean['studyid'].unique()
df_sym = pd.DataFrame({'Study ID': all_ids})

sev_cutoff = 0
raters = ['doc', 'nurse', 'nlp']
for sym, ids_by_level in severity_dict.items():
    # patient-reported: union of the ID lists of every severity level >= cutoff
    kept_lists = (ids for level, ids in ids_by_level.items() if level >= sev_cutoff)
    actual_ids = list(set(itertools.chain.from_iterable(kept_lists)))
    df_sym[f'{sym}_actual'] = df_sym['Study ID'].isin(actual_ids).to_list()

    # one column per non-patient rater, looked up in symptom_dict
    for r in raters:
        rater_key = f'{sym}_{r}'
        rater_ids = symptom_dict[rater_key]
        df_sym[rater_key] = df_sym['Study ID'].isin(rater_ids).to_list()

# NOTE(review): both labels below use a default for every ID that is not in the
# listed group, so IDs absent from study_id_male become 'female' and IDs absent
# from study_id_other become 'hispanic' -- confirm this is intended for
# participants with unreported or missing demographics.
df_sym['gender'] = df_sym['Study ID'].isin(study_id_male).map({True: 'male', False: 'female'})
df_sym['race'] = df_sym['Study ID'].isin(study_id_other).map({True: 'nonhispanic', False: 'hispanic'})

# 'high' severity: the ID appears at level sev_min or above for at least one symptom
sev_min = 3
highsev_ids = np.unique([
    sid
    for levels in severity_dict.values()
    for level in range(sev_min, 5)
    for sid in levels[level]
])
df_sym['severity'] = df_sym['Study ID'].isin(highsev_ids).map({True: 'high', False: 'low'})

display(df_sym.head())

In [None]:
# Add a pooled clinician column per symptom: True when either the nurse or the
# physician column is True for that row.
for sym in symptom_list:
    flagged_by_nurse = df_sym[f'{sym}_nurse'].eq(True)
    flagged_by_doc = df_sym[f'{sym}_doc'].eq(True)
    df_sym[f'{sym}_doc+nurse'] = flagged_by_nurse | flagged_by_doc

df_sym.head()

In [None]:
# Grouped bar chart: number of patients reported with each symptom, by rater source.

fig, ax = plt.subplots(figsize=[9, 4])

# set width of bar
barWidth = 0.14

keys_all = list(symptom_dict.keys())

rater_order = ['actual', 'doc', 'nurse', 'nlp']  # left-to-right order inside each group
counts_byrater = {rater: [] for rater in rater_order}
rater_label_map = {'actual': 'Patient Survey', 'nurse': 'Nurse Survey', \
                   'doc': 'Physician Survey', 'nlp': 'NLP of EHR'}
color_map = {'actual': '#292787', 'nurse': '#68686b', 'doc': '#a35149', 'nlp': '#d1c886'}
symptom_label_map = {'cramp': 'Cramp', 'fatigue': 'Fatigue', 'musclesore': 'Muscle Soreness', \
                     'itching': 'Itching', 'dryskin': 'Dry Skin'}
symptom_order = ['fatigue', 'cramp', 'dryskin', 'musclesore', 'itching']

# Collect one count per (rater, symptom); each rater's list follows symptom_order.
for sym in symptom_order:
    # patient counts come from the survey matched to the sampled nurse survey
    counts_byrater['actual'].append(len(symptom_dict[sym + '_actual']))

    # every other rater present in symptom_dict for this symptom; match on the
    # exact '<symptom>_' prefix rather than a substring
    sym_keys = [k for k in keys_all if k.startswith(sym + '_') and not k.endswith('_actual')]
    for k in sym_keys:
        rater = k.split('_', 1)[1]
        counts_byrater[rater].append(len(symptom_dict[k]))

# Draw one bar series per rater, each shifted right by one bar width.
group_start = np.arange(len(symptom_order))
for i, rater in enumerate(rater_order):
    ax.bar(group_start + i * barWidth, counts_byrater[rater], width=barWidth,
           label=rater_label_map[rater], color=color_map[rater], edgecolor='black')

# Axis labels; ticks sit at the centre of each group of len(rater_order) bars.
ax.set_xlabel('Symptom', fontweight='bold', fontsize=15)
ax.set_ylabel('# Reported Patients', fontweight='bold', fontsize=15)
group_center_offset = barWidth * (len(rater_order) - 1) / 2
ax.set_xticks(group_start + group_center_offset)
ax.set_xticklabels([symptom_label_map[sym] for sym in symptom_order])

# hide the top and right frame lines
ax.spines['top'].set_color('none')
ax.spines['right'].set_color('none')
ax.legend()

fig.savefig(figfolder + 'fig2_v2.png', dpi=200, bbox_inches='tight')
plt.show()


In [None]:
import warnings
warnings.filterwarnings('ignore')

# For each symptom, build a rater-by-rater table of the values returned by
# hf.get_mcnemars (used here as McNemar p-values), display it and save it as CSV.

raters = ['actual', 'doc', 'nurse', 'nlp', 'doc+nurse']
rater_label_map = {'actual': 'Patient', 'nurse': 'Nurse', 'doc': 'Physician', 'nlp': 'NLP', 'doc+nurse': "Nurse+Physician"}

for sym in ['fatigue', 'cramp', 'dryskin', 'musclesore', 'itching']:
    # one dict per row rater, keyed by the column rater's display label
    rows = [
        {
            rater_label_map[col_rater]: np.round(
                hf.get_mcnemars(df_sym, sym + '_' + row_rater, sym + '_' + col_rater,
                                alpha=0.01, display=False),
                4,
            )
            for col_rater in raters
        }
        for row_rater in raters
    ]

    df = pd.DataFrame(rows, index=[rater_label_map[r] for r in raters])
    print(sym)
    display(df)

    df.to_csv(datafolder + 'mcnemar_' + sym + '_v2.csv')

In [None]:
# Run the Pearson chi-square helper once for every (cohort, rater) combination,
# iterating raters fastest within each cohort grouping.
raters = ['nurse', 'doc', 'nlp']
groups = ['race', 'gender', 'severity']

for g, r in itertools.product(groups, raters):
    hf.pearson_chi2_v2(df_sym, r, g, 'spec')