In [1]:
# PGHOST = 'haoma-db.c2ifgz0co6no.us-west-2.rds.amazonaws.com'
# PGDATABASE = 'haoma-db'
# PGUSER = 'postgres'
# PGPASSWORD = ''

In [2]:
from pipeline import *
from process_pdf import *
! python process_pdf.py

## Load the Data

#### Necessary Input from Discharge
1. Zipcode
2. bool_services: a list of booleans, one flag per service, indicating whether the patient needs nursing, physical therapy, occupational therapy, speech pathology, social services, or a home health aide

In [3]:
# Discharge inputs: the patient's zipcode and one needs-flag per service.
zipcode = 94044
bool_services = [True for _ in range(6)]

In [4]:
# load_df comes from the star imports above; presumably it returns the agency
# table for the given zipcode -- TODO confirm against its definition.
df_cal = load_df(zipcode)

### Remove Stop Words & Get List of Words

In [21]:
# Spot check: every processed token that contains "breath".
# NOTE(review): this cell sits above the cell that assigns `word_list`, so on a
# fresh Restart & Run All it will fail unless `word_list` is provided elsewhere.
[token for token in word_list if 'breath' in token]

['breath', 'breath', 'breath', 'breath', 'breath']

In [5]:
# NOTE(review): `text` is not assigned anywhere in the visible notebook; it
# presumably arrives via `from process_pdf import *` -- confirm and make explicit.
word_list = text_process(text)

In [6]:
# NOTE(review): this overwrites `df_cal` with the output of renamed_qcols, so the
# cell's input is replaced by its own output; whether re-running is safe depends
# on renamed_qcols (defined outside this notebook) -- confirm.
df_cal = renamed_qcols(df_cal)

In [7]:
# Create keyword lists
# One list per quality question (Q<n>); a question is flagged later when any of
# its keywords also appears in `word_list`.
q3_keys = ["falling", "fall", "fell", "tripped", "trip", "tumble"]
q4_keys = ["depression", "mental", "bipolar", "sad", "upset", "trauma", "traumatic"]
q5_keys = ["flu"]
q6_keys = ["pneumonia", "pneumococcal"]
q7_keys = ["diabetes", "foot"]
q8_keys = ["walk", "walking", "moving"]
q9_keys = ["bed"]
q10_keys = ["bathing", "bathe", "bath"]
# NOTE(review): "shortness of breath" can only match if word_list may contain
# multi-word entries -- confirm against text_process.
q11_keys = ["breath", "shortness of breath", "shortness", "breathing", "breathe", "oxygen"]
# Duplicate "puncture" / "slash" entries removed; brackets already allow line
# continuation, so no trailing backslashes are needed.
q12_keys = [
    "wound", "wounds", "cut", "slice", "lesion",
    "gash", "puncture", "slash", "laceration", "tear",
    "rent", "sore", "graze", "scratch",
    "scrape", "abrasion", "bruise", "contusion", "trauma",
]
q16_keys = ["ulcer", "pressure", "skin", "swelling", "post-acute"]

In [86]:
# Keyword lists in question order (Q3..Q12, then Q16).
q_keys = [
    q3_keys,
    q4_keys,
    q5_keys,
    q6_keys,
    q7_keys,
    q8_keys,
    q9_keys,
    q10_keys,
    q11_keys,
    q12_keys,
    q16_keys,
]

In [166]:
# Question labels, in the same order as the keyword lists in `q_keys`.
keys = [f'Q{i}' for i in [3,4,5,6,7,8,9,10,11,12,16]]

# zip() silently truncates on a length mismatch, which would mislabel or drop
# topics; fail loudly instead if the two lists ever drift apart.
assert len(keys) == len(q_keys), "keys and q_keys must stay aligned"

# Build the vocabulary set once rather than once per topic.
words_seen = set(word_list)

# A topic is flagged when at least one of its keywords occurs in the document.
flags = [not words_seen.isdisjoint(x) for x in q_keys]

flagged_qtopic = dict(zip(keys, flags))

In [167]:
flagged_qtopic

{'Q3': False,
 'Q4': False,
 'Q5': False,
 'Q6': False,
 'Q7': False,
 'Q8': False,
 'Q9': False,
 'Q10': False,
 'Q11': True,
 'Q12': False,
 'Q16': True}

## Pipeline Inputs

In [168]:
# Questions that the keyword matching flagged, in dictionary order.
Q_flagged = [question for question, was_flagged in flagged_qtopic.items() if was_flagged]

In [169]:
# Human-readable topic name for each keyword-matched question.
Q_dict = {
    'Q3': 'Falling',
    'Q4': 'Depression',
    'Q5': 'Flu',
    'Q6': 'Pneumonia',
    'Q7': 'Diabetes',
    'Q8': 'Moving Around',
    'Q9': 'Getting in Bed',
    'Q10': 'Bathing',
    'Q11': 'Breathing',
    'Q12': 'Wounds',
    'Q16': 'Skin Integrity',
}

### Recommend() 
1. Currently, this is naively sorting by the order of the questions that were flagged
2. We should weight the questions somehow
3. We could also take into account PPR and DTC (Potentially Preventable Post-Discharge Readmission, Discharge to Community)

In [187]:
# Preparation pipeline with three named steps: drop columns, rename columns,
# then filter on the services requested in `bool_services`.
# Pipeline, Drop, Rename and FilterByService come from the star imports at the
# top of the notebook; their exact behaviour is defined outside this file.
pipe_prep = Pipeline([('Drop Unnecessary Columns', Drop()),
                      ('Rename Columns', Rename()),
                      ('Filter Offered Services', FilterByService(bool_services))])

# The pipeline is given a copy of df_cal rather than df_cal itself.
df = pipe_prep.fit_transform(df_cal.copy())

This step filters out agencies that do not offer the services the patient needs:
1. **nurse**: 'Offers Nursing Care Services'
2. **pt**: 'Offers Physical Therapy Services'
3. **ot**: 'Offers Occupational Therapy Services',
4. **speech**: 'Offers Speech Pathology Services',
5. **social**: 'Offers Medical Social Services',
6. **aide**: 'Offers Home Health Aide Services',

______
### Weighted Columns (3-Tier)
1. Tier 1: Star_columns
2. Tier 2: Filtered_columns
3. Tier 3: Else

In [188]:
# flagged_qtopic['Q11'] = False
# flagged_qtopic['Q8'] = True
# flagged_qtopic['Q16'] = False
# flagged_qtopic['Q3'] = True



In [189]:
# Columns that feed the score: every column whose name contains 'Q', plus the
# 'ppr' and 'dtc' measures. Each starts with a weight of 1.
q_columns = [name for name in df.columns if 'Q' in name]
q_columns.extend(['ppr', 'dtc'])
weight = dict.fromkeys(q_columns, 1)

In [190]:
# Per-row count of missing score inputs, taken before any imputation below.
df['missing'] = df[q_columns].isna().sum(axis='columns')


In [191]:
# In the notebook as shown, star_cols is only referenced by the disabled
# star-boost loop in the weighting cell.
star_cols = ['Q1', 'Q13', 'Q8', 'Q9', 'Q10', 'Q11', 'Q15']
# Questions whose keywords were found in the document.
filtered_qs = [question for question, is_flagged in flagged_qtopic.items() if is_flagged]

In [192]:
# Rebuild the base weights here so this cell is idempotent: the `+= 2` below
# would otherwise compound (3 -> 5 -> 7 ...) every time the cell is re-run.
weight = {key: 1 for key in q_columns}

# Star-column boost is currently disabled.
# for sc in star_cols:
#     weight[sc] += 1

# Questions flagged by the keyword matching get +2 on top of the base weight.
for fc in filtered_qs:
    weight[fc] += 2

# The ppr and dtc measures always carry a fixed weight of 2.
weight['ppr'] = 2
weight['dtc'] = 2

In [193]:
weight

{'Q1': 1,
 'Q2': 1,
 'Q3': 1,
 'Q4': 1,
 'Q5': 1,
 'Q6': 1,
 'Q7': 1,
 'Q8': 1,
 'Q9': 1,
 'Q10': 1,
 'Q11': 3,
 'Q12': 1,
 'Q13': 1,
 'Q14': 1,
 'Q15': 1,
 'Q16': 3,
 'Q17': 1,
 'ppr': 2,
 'dtc': 2}

In [194]:
def custom_imputer(col_values, threshold=50):
    '''
    Orient a 0-100 score column so that higher is better, then fill missing
    entries with the worst (lowest) oriented value in the column.

    A column whose mean (ignoring NaN) is above ``threshold`` is treated as
    already "higher is better"; otherwise it is flipped to ``100 - value``.
    Note that a flipped column is returned flipped, not just imputed.

    Parameters
    ----------
    col_values : pandas.Series
        One numeric column, possibly containing NaN.
    threshold : float, default 50
        Mean value above which the column counts as "higher is better".

    Returns
    -------
    pandas.Series
        Oriented column with NaNs replaced by its minimum. An empty or
        all-NaN column is returned as an unchanged copy, since there is
        nothing to orient or impute from.
    '''
    # Guard: np.nanmean / np.nanmin warn on all-NaN input and np.nanmin raises
    # on empty input, so handle both cases before touching them.
    if col_values.isna().all():
        return col_values.copy()

    high_is_better = np.nanmean(col_values) > threshold
    if not high_is_better:
        col_values = 100 - col_values
    # After orientation the minimum is the worst observed value.
    return col_values.fillna(np.nanmin(col_values))

In [195]:
df.columns

Index(['ccn', 'name', 'address', 'city', ' zip', 'phone', 'date', 'star', 'Q1',
       'Q2', 'Q3', 'Q4', 'Q5', 'Q6', 'Q7', 'Q8', 'Q9', 'Q10', 'Q11', 'Q12',
       'Q13', 'Q14', 'Q15', 'Q16', 'Q17', 'dtc', 'dtc_cat', 'ppr', 'ppr_cat',
       'missing'],
      dtype='object')

In [196]:
# Run custom_imputer on each scoring column and write the result back into df.
# Besides filling NaNs, custom_imputer flips "lower is better" columns to
# 100 - value, so df no longer holds the raw values after this cell.
df[q_columns] = df[q_columns].apply(custom_imputer)
# df['star'] = df['star'].fillna(np.nanmin(df['star'])) * 20

In [197]:
df[q_columns]

Unnamed: 0,Q1,Q2,Q3,Q4,Q5,Q6,Q7,Q8,Q9,Q10,Q11,Q12,Q13,Q14,Q15,Q16,Q17,ppr,dtc
13,87.3,97.9,99.8,97.8,82.4,98.2,94.1,76.4,76.3,75.9,85.7,89.1,73.0,80.6,82.3,99.87,91.0,87.39,64.87
21,96.9,99.8,99.7,97.6,89.3,92.2,96.2,79.5,82.3,85.2,88.4,94.3,81.1,85.1,86.7,99.88,92.2,97.18,85.14
163,99.3,99.9,99.5,92.6,93.8,91.8,92.3,88.4,89.2,94.7,98.7,99.6,86.2,86.9,87.9,100.0,93.9,97.51,79.72
181,96.4,98.9,98.7,100.0,84.2,91.6,100.0,81.0,81.6,83.5,86.5,95.1,71.5,86.5,87.3,99.85,99.0,98.49,92.21
253,99.7,99.9,100.0,99.8,89.8,89.0,98.8,84.6,86.1,86.3,90.3,89.8,82.4,86.9,88.5,100.0,61.2,96.55,86.17
267,81.6,98.1,99.8,99.6,83.2,88.2,98.0,76.5,79.0,79.1,87.9,93.0,72.4,83.9,87.4,100.0,98.0,97.38,81.28
299,97.6,99.7,99.3,98.9,74.9,87.0,98.8,76.0,82.9,82.7,78.0,75.5,67.3,85.6,87.5,99.86,98.7,97.34,80.86
308,97.2,97.7,99.7,96.5,82.9,87.6,94.6,75.4,79.0,79.7,88.2,96.3,71.0,86.5,86.5,99.92,95.7,97.75,84.9
438,99.5,98.7,99.9,99.5,67.0,97.6,99.7,76.0,86.4,80.0,91.7,75.5,70.5,87.5,85.0,99.97,99.9,87.39,81.44
516,99.7,99.9,100.0,100.0,90.9,96.9,99.6,82.1,82.3,89.6,91.0,87.2,83.6,86.9,85.7,100.0,100.0,96.5,84.81


________
## Rec 

In [198]:
# Scale every weighted column that exists in df by its weight.
# Caution: this writes back into df, so re-running the cell multiplies again.
for col, factor in weight.items():
    if col in df.columns:
        df[col] = df[col] * factor

In [199]:
# Total score per agency: row-wise sum of the (already weighted) scoring columns.
df['score'] = df[q_columns].sum(axis='columns')

In [200]:
# Agency names ordered from highest to lowest score.
hh_1 = df.sort_values(by='score', ascending=False)['name'].values

In [206]:
# Question-only column list for inspection. Build a NEW list: the previous
# `temp_cols = q_columns` was an alias, so the .remove() calls also stripped
# 'ppr' and 'dtc' from q_columns (changing later scores) and raised ValueError
# when the cell was run a second time.
temp_cols = [col for col in q_columns if col not in ('ppr', 'dtc')]

In [207]:
# Raw (pre-pipeline) question values for the two rows labelled 1089 and 1185.
df_cal.loc[[1089, 1185], temp_cols]

Unnamed: 0,Q1,Q2,Q3,Q4,Q5,Q6,Q7,Q8,Q9,Q10,Q11,Q12,Q13,Q14,Q15,Q16,Q17
1089,100.0,100.0,100.0,98.7,97.5,99.2,100.0,93.9,92.8,98.1,95.5,,80.3,12.3,12.1,0.0,99.6
1185,99.8,98.8,99.7,99.7,84.7,91.2,94.7,100.0,95.6,98.8,100.0,96.4,100.0,11.9,14.5,0.03,99.0


In [208]:
# The same two rows after imputation and weighting, for comparison.
df.loc[[1089, 1185], temp_cols]

Unnamed: 0,Q1,Q2,Q3,Q4,Q5,Q6,Q7,Q8,Q9,Q10,Q11,Q12,Q13,Q14,Q15,Q16,Q17
1089,100.0,100.0,100.0,98.7,97.5,99.2,100.0,93.9,92.8,98.1,286.5,75.5,80.3,87.7,87.9,300.0,99.6
1185,99.8,98.8,99.7,99.7,84.7,91.2,94.7,100.0,95.6,98.8,300.0,96.4,100.0,88.1,85.5,299.91,99.0


In [203]:
# Ranking with the missing-value count alongside the score. The stray ''
# column label was removed: no such column exists, so it raised KeyError.
df.sort_values('score', ascending=False)[['score', 'missing']]

Unnamed: 0,score,missing
1185,2395.87,0
1089,2353.38,1
163,2346.56,0
1028,2331.81,0
516,2320.02,0
1347,2314.19,1
1026,2310.75,0
1165,2307.83,0
575,2304.82,0
784,2295.8,1


In [201]:
hh_1

array(['21ST CENTURY HOME HEALTH SERVICES INC.',
       'HELPING HANDS HOME HEALTH CARE & HOSPICE, INC.',
       'AMEDISYS HOME HEALTH CARE', 'CARE IN TOUCH HOME HEALTH AGENCY',
       'WARM SPRINGS HOME HEALTH, INC', 'HOME HEALTH BAY AREA INC.',
       'HEALTHFLEX HOME HEALTH SERVICES',
       'ADVANCED HEALTHCARE SERVICES, LLC',
       'HEALTH LINK HOME HEALTH AGENCY', 'ALERT HOME HEALTH CARE',
       'PROFESSIONAL HOME CARE ASSOCIATES',
       'PATHWAYS HOME HEALTH AND HOSPICE', 'HEALTHY LIVING AT HOME, LLC',
       'ASIAN AMERICAN HOME HEALTH', 'BLIZE HEALTHCARE',
       'ASTRA HEALTH CARE HOME HEALTH AGENCY', 'HARMONY HOME HEALTH',
       'CARELINK HOME HEALTH AGENCY',
       'SUTTER VISITING NURSE ASSOCIATION & HOSPICE', 'TRUEMED, INC',
       'ANX HOME HEALTHCARE', 'CROSSROADS HOME HEALTH & HOSPICE',
       'ASSIST ON CALL PROFESSIONAL IN-HOME CARE SERVICES',
       'AMERICAN CAREQUEST', 'CVH CARE',
       'ELITE CARE HOME HEALTH AGENCY, INC.',
       'NEW HAVEN HOME HEALTH SERV

In [181]:
# Same ranking expression as hh_1; presumably captured under a different
# weighting so the two orderings can be compared below -- confirm.
hh_2 = df.sort_values(by='score', ascending=False)['name'].values

In [185]:
# Scores in descending order.
df.sort_values(by='score', ascending=False)['score']

1185    2395.87
1089    2353.38
163     2346.56
1028    2331.81
516     2320.02
1347    2314.19
1026    2310.75
1165    2307.83
575     2304.82
784     2295.80
181     2295.75
21      2287.58
612     2279.89
253     2279.14
852     2275.24
1171    2273.52
995     2271.56
867     2261.22
308     2255.96
851     2250.70
624     2249.16
267     2238.82
1132    2238.68
438     2235.37
1037    2211.30
1237    2204.14
299     2202.38
650     2192.38
1293    2185.35
1307    2185.19
989     2166.09
13      2163.33
1176    2144.27
879     1962.67
Name: score, dtype: float64

In [184]:
# Print the two rankings side by side, one blank line between positions.
for first_pick, second_pick in zip(hh_1, hh_2):
    print(first_pick, second_pick, sep="\n", end="\n\n")

21ST CENTURY HOME HEALTH SERVICES INC.
21ST CENTURY HOME HEALTH SERVICES INC.

HELPING HANDS HOME HEALTH CARE & HOSPICE, INC.
HELPING HANDS HOME HEALTH CARE & HOSPICE, INC.

AMEDISYS HOME HEALTH CARE
AMEDISYS HOME HEALTH CARE

HEALTH LINK HOME HEALTH AGENCY
CARE IN TOUCH HOME HEALTH AGENCY

WARM SPRINGS HOME HEALTH, INC
WARM SPRINGS HOME HEALTH, INC

HOME HEALTH BAY AREA INC.
HOME HEALTH BAY AREA INC.

HEALTHFLEX HOME HEALTH SERVICES
HEALTHFLEX HOME HEALTH SERVICES

ASTRA HEALTH CARE HOME HEALTH AGENCY
ADVANCED HEALTHCARE SERVICES, LLC

CARE IN TOUCH HOME HEALTH AGENCY
HEALTH LINK HOME HEALTH AGENCY

PROFESSIONAL HOME CARE ASSOCIATES
ALERT HOME HEALTH CARE

ADVANCED HEALTHCARE SERVICES, LLC
PROFESSIONAL HOME CARE ASSOCIATES

PATHWAYS HOME HEALTH AND HOSPICE
PATHWAYS HOME HEALTH AND HOSPICE

ASIAN AMERICAN HOME HEALTH
HEALTHY LIVING AT HOME, LLC

HEALTHY LIVING AT HOME, LLC
ASIAN AMERICAN HOME HEALTH

TRUEMED, INC
BLIZE HEALTHCARE

CARELINK HOME HEALTH AGENCY
ASTRA HEALTH CARE HOME HEAL

In [183]:
hh_2

array(['21ST CENTURY HOME HEALTH SERVICES INC.',
       'HELPING HANDS HOME HEALTH CARE & HOSPICE, INC.',
       'AMEDISYS HOME HEALTH CARE', 'CARE IN TOUCH HOME HEALTH AGENCY',
       'WARM SPRINGS HOME HEALTH, INC', 'HOME HEALTH BAY AREA INC.',
       'HEALTHFLEX HOME HEALTH SERVICES',
       'ADVANCED HEALTHCARE SERVICES, LLC',
       'HEALTH LINK HOME HEALTH AGENCY', 'ALERT HOME HEALTH CARE',
       'PROFESSIONAL HOME CARE ASSOCIATES',
       'PATHWAYS HOME HEALTH AND HOSPICE', 'HEALTHY LIVING AT HOME, LLC',
       'ASIAN AMERICAN HOME HEALTH', 'BLIZE HEALTHCARE',
       'ASTRA HEALTH CARE HOME HEALTH AGENCY', 'HARMONY HOME HEALTH',
       'CARELINK HOME HEALTH AGENCY',
       'SUTTER VISITING NURSE ASSOCIATION & HOSPICE', 'TRUEMED, INC',
       'ANX HOME HEALTHCARE', 'CROSSROADS HOME HEALTH & HOSPICE',
       'ASSIST ON CALL PROFESSIONAL IN-HOME CARE SERVICES',
       'AMERICAN CAREQUEST', 'CVH CARE',
       'ELITE CARE HOME HEALTH AGENCY, INC.',
       'NEW HAVEN HOME HEALTH SERV

In [124]:
# Ranked agencies with their star rating and total score.
df.sort_values(by='score', ascending=False)[['name', 'star', 'score']]

Unnamed: 0,name,star,score
1185,21ST CENTURY HOME HEALTH SERVICES INC.,5.0,2875.6
1089,"HELPING HANDS HOME HEALTH CARE & HOSPICE, INC.",5.0,2806.38
163,AMEDISYS HOME HEALTH CARE,5.0,2792.26
1026,HEALTHFLEX HOME HEALTH SERVICES,5.0,2779.86
575,HEALTH LINK HOME HEALTH AGENCY,5.0,2762.42
1028,CARE IN TOUCH HOME HEALTH AGENCY,4.5,2760.66
1347,HOME HEALTH BAY AREA INC.,4.5,2753.84
516,"WARM SPRINGS HOME HEALTH, INC",4.5,2743.02
1171,ASTRA HEALTH CARE HOME HEALTH AGENCY,4.5,2726.72
1165,"ADVANCED HEALTHCARE SERVICES, LLC",3.5,2711.84


In [74]:
# Ranked agencies with the star rating and every weighted scoring column.
df.sort_values(by='score', ascending=False)[['star', *q_columns]]

Unnamed: 0,star,Q1,Q2,Q3,Q4,Q5,Q6,Q7,Q8,Q9,Q10,Q11,Q12,Q13,Q14,Q15,Q16,Q17,ppr,dtc
1185,5.0,199.6,98.8,99.7,99.7,84.7,91.2,94.7,200.0,191.2,197.6,300.0,96.4,200.0,88.1,171.0,199.94,99.0,194.6,169.36
1089,5.0,200.0,100.0,100.0,98.7,97.5,99.2,100.0,187.8,185.6,196.2,286.5,75.5,160.6,87.7,175.8,200.0,99.6,195.56,160.12
163,5.0,198.6,99.9,99.5,92.6,93.8,91.8,92.3,176.8,178.4,189.4,296.1,99.6,172.4,86.9,175.8,200.0,93.9,195.02,159.44
1026,5.0,199.6,98.8,99.9,99.8,76.2,91.1,97.5,187.0,191.8,194.6,300.0,95.6,194.0,85.7,171.2,199.98,50.8,194.34,151.94
575,5.0,199.8,99.8,99.8,100.0,67.6,87.2,97.9,185.4,184.2,190.0,263.4,99.3,185.0,86.6,170.8,200.0,99.6,194.88,151.14
1028,4.5,198.0,100.0,100.0,100.0,98.8,99.3,96.4,145.4,187.8,186.6,278.4,86.7,167.4,83.6,172.2,199.7,95.9,197.8,166.66
1347,4.5,198.0,98.1,100.0,100.0,93.8,100.0,100.0,175.0,167.8,183.4,282.3,75.5,180.4,85.7,174.4,199.7,100.0,189.18,150.56
516,4.5,199.4,99.9,100.0,100.0,90.9,96.9,99.6,164.2,164.6,179.2,273.0,87.2,167.2,86.9,171.4,200.0,100.0,193.0,169.62
1171,4.5,188.4,96.2,100.0,100.0,79.5,86.5,100.0,196.2,200.0,190.4,266.1,75.5,154.8,88.8,176.6,200.0,100.0,174.78,152.94
1165,3.5,189.8,99.8,99.2,99.8,94.7,98.0,98.6,155.8,161.6,170.0,276.9,93.2,155.6,86.5,175.0,199.78,95.9,194.42,167.24


In [37]:
# NOTE(review): "Star Rating" is hard-coded here, but the star-column boost is
# commented out in the weighting cell while 'ppr'/'dtc' are weighted instead --
# confirm this message still describes what the score prioritizes.
prioritized = [Q_dict[flag] for flag in Q_flagged] + ["Star Rating"]
print(f'The recommendation prioritized {prioritized} in that order')

The recommendation prioritized ['Breathing', 'Skin Integrity', 'Star Rating'] in that order
