This the code for walking through the lesson examples for your reference.

## Code for Building Synthetic Dataset

In [1]:
import pandas as pd
import numpy as np

In [2]:
#build synthetic line level example
NUMBER_RECORDS = 100000
NUMBER_ENCOUNTERS = 7800
NUMBER_PATIENTS = 1000

In [3]:
# Create random list of code sets for diagnosis, procedure, medication, and lab codes
dx_code_list = ["dx_code_" + str(x) for x in np.arange(1,100000)]
procedure_code_list =["procedure_code_" + str(x) for x in np.arange(0,73000)]
medication_code_list = ["medication_code_" + str(x) for x in np.arange(0,10000)]
lab_code_list = ["lab_code_" + str(x) for x in np.arange(0,10000)]

In [4]:
patient_id_list = ["udacity_health_patient_id_" + str(x) for x in np.arange(1, NUMBER_PATIENTS +1)]
encounter_id_list = ["udacity_health_encounter_id_" + str(x) for x in np.arange(1, NUMBER_ENCOUNTERS +1)]

In [5]:
def random_value_selection(field_value_list, number_records):
    #build normal probability distribution 
    field_prob_dist = np.random.dirichlet(np.ones(len(field_value_list)), size=1)[0] 
    #build random value list for field
    field_random_values = np.random.choice(field_value_list, number_records, p=field_prob_dist)
    return field_random_values

In [6]:
#patient_values = random_value_selection(patient_id_list, NUMBER_RECORDS)
encounter_values = random_value_selection(encounter_id_list, NUMBER_RECORDS)

In [7]:
encounter_patient_mapping = dict(zip(encounter_id_list,   random_value_selection(patient_id_list, NUMBER_ENCOUNTERS)))
patient_values = [encounter_patient_mapping[x] for x in encounter_values]

In [8]:
dx_value_mapping = dict(zip(encounter_id_list, random_value_selection(dx_code_list, NUMBER_ENCOUNTERS) ))
dx_values = [dx_value_mapping[x] for x in encounter_values ]

In [9]:
procedure_values = random_value_selection(procedure_code_list, NUMBER_RECORDS)
medication_values = random_value_selection(medication_code_list, NUMBER_RECORDS)
lab_values = random_value_selection(lab_code_list, NUMBER_RECORDS)

In [10]:
triplet_prob_choice = np.random.choice([0, 1, 2], NUMBER_RECORDS, p= np.random.dirichlet(np.ones(3), size=1)[0] )
line_triplet_values = list(zip(procedure_values, medication_values, lab_values, triplet_prob_choice))
selected_procedure_values = [x[0] if x[3] == 0 else np.nan for x in line_triplet_values ]
selected_medication_values = [x[1] if x[3] == 1 else np.nan for x in line_triplet_values]
selected_lab_values = [x[2] if x[3] == 2 else np.nan for x in line_triplet_values]

In [11]:
#add label
patient_label_mapping = dict(zip( patient_id_list, np.random.choice([0, 1], NUMBER_PATIENTS, replace=True, 
                                                                    p=[0.88, 0.12]) ))
label_values = [patient_label_mapping[x] for x in patient_values]

In [12]:
line_df = pd.DataFrame({ "ENCOUNTER_ID": encounter_values,
                        "PATIENT_ID": patient_values,
                        "PRINCIPAL_DIAGNOSIS_CODE": dx_values,
                        "PROCEDURE_CODE": selected_procedure_values,
                        "MEDICATION_CODE": selected_medication_values,
                        "LAB_CODE": selected_lab_values,
                        "LABEL": label_values
                       })

In [13]:
#line_df.to_csv("./data/SYNTHETIC_EHR_DATASET.csv", index=False)

## 1. Converting Line to Encounter Representation

### Load Synthetic EHR Line Dataset

In [14]:
ehr_line_df = pd.read_csv("./data/SYNTHETIC_EHR_DATASET.csv")

In [15]:
len(ehr_line_df)

100000

In [16]:
ehr_line_df.head()

Unnamed: 0,ENCOUNTER_ID,PATIENT_ID,PRINCIPAL_DIAGNOSIS_CODE,PROCEDURE_CODE,MEDICATION_CODE,LAB_CODE,LABEL
0,udacity_health_encounter_id_1933,udacity_health_patient_id_817,dx_code_56522,procedure_code_20005,,,0
1,udacity_health_encounter_id_5664,udacity_health_patient_id_594,dx_code_39264,,medication_code_7471,,0
2,udacity_health_encounter_id_1946,udacity_health_patient_id_169,dx_code_83619,,,lab_code_5311,0
3,udacity_health_encounter_id_1528,udacity_health_patient_id_870,dx_code_97434,procedure_code_31111,,,0
4,udacity_health_encounter_id_7185,udacity_health_patient_id_722,dx_code_68924,,medication_code_2250,,0


In [17]:
len(ehr_line_df['PATIENT_ID'].unique())

878

In [18]:
len(ehr_line_df['ENCOUNTER_ID'].unique())

7222

In [19]:
len(ehr_line_df['PRINCIPAL_DIAGNOSIS_CODE'].unique())

6752

In [20]:
ehr_line_df[ehr_line_df['ENCOUNTER_ID']=='udacity_health_encounter_id_100']

Unnamed: 0,ENCOUNTER_ID,PATIENT_ID,PRINCIPAL_DIAGNOSIS_CODE,PROCEDURE_CODE,MEDICATION_CODE,LAB_CODE,LABEL
14286,udacity_health_encounter_id_100,udacity_health_patient_id_585,dx_code_71465,,,lab_code_4198,0
19091,udacity_health_encounter_id_100,udacity_health_patient_id_585,dx_code_71465,,medication_code_7982,,0
29530,udacity_health_encounter_id_100,udacity_health_patient_id_585,dx_code_71465,,,lab_code_6603,0
34583,udacity_health_encounter_id_100,udacity_health_patient_id_585,dx_code_71465,,,lab_code_332,0
62325,udacity_health_encounter_id_100,udacity_health_patient_id_585,dx_code_71465,,medication_code_2452,,0
64967,udacity_health_encounter_id_100,udacity_health_patient_id_585,dx_code_71465,,,lab_code_1431,0
94636,udacity_health_encounter_id_100,udacity_health_patient_id_585,dx_code_71465,,medication_code_279,,0


In [21]:
ehr_line_df[ehr_line_df['PATIENT_ID']=='udacity_health_patient_id_585']

Unnamed: 0,ENCOUNTER_ID,PATIENT_ID,PRINCIPAL_DIAGNOSIS_CODE,PROCEDURE_CODE,MEDICATION_CODE,LAB_CODE,LABEL
161,udacity_health_encounter_id_983,udacity_health_patient_id_585,dx_code_27700,,medication_code_5201,,0
184,udacity_health_encounter_id_7155,udacity_health_patient_id_585,dx_code_75500,,medication_code_6633,,0
841,udacity_health_encounter_id_7155,udacity_health_patient_id_585,dx_code_75500,procedure_code_31792,,,0
1565,udacity_health_encounter_id_983,udacity_health_patient_id_585,dx_code_27700,,medication_code_6972,,0
2141,udacity_health_encounter_id_7155,udacity_health_patient_id_585,dx_code_75500,,medication_code_5212,,0
...,...,...,...,...,...,...,...
94636,udacity_health_encounter_id_100,udacity_health_patient_id_585,dx_code_71465,,medication_code_279,,0
96140,udacity_health_encounter_id_1970,udacity_health_patient_id_585,dx_code_75302,,medication_code_8919,,0
99020,udacity_health_encounter_id_6528,udacity_health_patient_id_585,dx_code_87709,,medication_code_4594,,0
99096,udacity_health_encounter_id_7155,udacity_health_patient_id_585,dx_code_75500,procedure_code_32548,,,0


In [22]:
ehr_line_df[ehr_line_df['ENCOUNTER_ID']=='udacity_health_encounter_id_1528']

Unnamed: 0,ENCOUNTER_ID,PATIENT_ID,PRINCIPAL_DIAGNOSIS_CODE,PROCEDURE_CODE,MEDICATION_CODE,LAB_CODE,LABEL
3,udacity_health_encounter_id_1528,udacity_health_patient_id_870,dx_code_97434,procedure_code_31111,,,0
758,udacity_health_encounter_id_1528,udacity_health_patient_id_870,dx_code_97434,,medication_code_485,,0
869,udacity_health_encounter_id_1528,udacity_health_patient_id_870,dx_code_97434,procedure_code_34881,,,0
2169,udacity_health_encounter_id_1528,udacity_health_patient_id_870,dx_code_97434,procedure_code_56384,,,0
2471,udacity_health_encounter_id_1528,udacity_health_patient_id_870,dx_code_97434,procedure_code_41089,,,0
7698,udacity_health_encounter_id_1528,udacity_health_patient_id_870,dx_code_97434,,medication_code_2212,,0
8044,udacity_health_encounter_id_1528,udacity_health_patient_id_870,dx_code_97434,procedure_code_5661,,,0
9615,udacity_health_encounter_id_1528,udacity_health_patient_id_870,dx_code_97434,,medication_code_402,,0
13074,udacity_health_encounter_id_1528,udacity_health_patient_id_870,dx_code_97434,procedure_code_31821,,,0
13578,udacity_health_encounter_id_1528,udacity_health_patient_id_870,dx_code_97434,,medication_code_1905,,0


In [23]:
ehr_line_df[ehr_line_df['PRINCIPAL_DIAGNOSIS_CODE']=='dx_code_83619']

Unnamed: 0,ENCOUNTER_ID,PATIENT_ID,PRINCIPAL_DIAGNOSIS_CODE,PROCEDURE_CODE,MEDICATION_CODE,LAB_CODE,LABEL
2,udacity_health_encounter_id_1946,udacity_health_patient_id_169,dx_code_83619,,,lab_code_5311,0
3567,udacity_health_encounter_id_1946,udacity_health_patient_id_169,dx_code_83619,,medication_code_9497,,0
15702,udacity_health_encounter_id_1946,udacity_health_patient_id_169,dx_code_83619,,,lab_code_6255,0
20296,udacity_health_encounter_id_1946,udacity_health_patient_id_169,dx_code_83619,,medication_code_305,,0
30880,udacity_health_encounter_id_1946,udacity_health_patient_id_169,dx_code_83619,,medication_code_7887,,0
32591,udacity_health_encounter_id_1946,udacity_health_patient_id_169,dx_code_83619,,,lab_code_8727,0
35079,udacity_health_encounter_id_1946,udacity_health_patient_id_169,dx_code_83619,,medication_code_120,,0
39971,udacity_health_encounter_id_1946,udacity_health_patient_id_169,dx_code_83619,procedure_code_61459,,,0
43610,udacity_health_encounter_id_1946,udacity_health_patient_id_169,dx_code_83619,,medication_code_2135,,0
46020,udacity_health_encounter_id_1946,udacity_health_patient_id_169,dx_code_83619,,medication_code_8827,,0


In [24]:
#note that this is for illustrative purposes only and for practicing key skills, 
# the actual data representation and combinations of codes not indicative of real thing

### Convert Line to Encounter Representation

In [25]:
# grouping fields 
grouping_field_list = ['ENCOUNTER_ID', 'PATIENT_ID', 'PRINCIPAL_DIAGNOSIS_CODE']
non_grouped_field_list = [c for c in ehr_line_df.columns if c not in grouping_field_list]

In [26]:
encounter_df = ehr_line_df.groupby(grouping_field_list)[non_grouped_field_list].agg(lambda x: 
                                                        list([y for y in x if y is not np.nan ] ) ).reset_index()

In [27]:
encounter_df[0:5]

Unnamed: 0,ENCOUNTER_ID,PATIENT_ID,PRINCIPAL_DIAGNOSIS_CODE,PROCEDURE_CODE,MEDICATION_CODE,LAB_CODE,LABEL
0,udacity_health_encounter_id_1,udacity_health_patient_id_186,dx_code_15406,"[procedure_code_58552, procedure_code_39776, p...","[medication_code_2350, medication_code_8630, m...","[lab_code_8835, lab_code_9859, lab_code_9032, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,udacity_health_encounter_id_10,udacity_health_patient_id_188,dx_code_74047,[],"[medication_code_7789, medication_code_3560, m...",[],"[1, 1, 1, 1]"
2,udacity_health_encounter_id_100,udacity_health_patient_id_585,dx_code_71465,[],"[medication_code_7982, medication_code_2452, m...","[lab_code_4198, lab_code_6603, lab_code_332, l...","[0, 0, 0, 0, 0, 0, 0]"
3,udacity_health_encounter_id_1000,udacity_health_patient_id_525,dx_code_61569,[],[medication_code_4036],[],[0]
4,udacity_health_encounter_id_1001,udacity_health_patient_id_950,dx_code_90172,[procedure_code_30555],"[medication_code_6755, medication_code_5045]",[lab_code_9112],"[0, 0, 0, 0]"


In [28]:
len(encounter_df)

7222

In [29]:
len(encounter_df['PATIENT_ID'].unique())

878

In [62]:
len(encounter_df['PRINCIPAL_DIAGNOSIS_CODE'].unique())

6752

In [30]:
ehr_line_df[ehr_line_df['ENCOUNTER_ID']=='udacity_health_encounter_id_1']

Unnamed: 0,ENCOUNTER_ID,PATIENT_ID,PRINCIPAL_DIAGNOSIS_CODE,PROCEDURE_CODE,MEDICATION_CODE,LAB_CODE,LABEL
3246,udacity_health_encounter_id_1,udacity_health_patient_id_186,dx_code_15406,,medication_code_2350,,0
7901,udacity_health_encounter_id_1,udacity_health_patient_id_186,dx_code_15406,,medication_code_8630,,0
11765,udacity_health_encounter_id_1,udacity_health_patient_id_186,dx_code_15406,,,lab_code_8835,0
11950,udacity_health_encounter_id_1,udacity_health_patient_id_186,dx_code_15406,procedure_code_58552,,,0
16057,udacity_health_encounter_id_1,udacity_health_patient_id_186,dx_code_15406,,medication_code_4030,,0
17961,udacity_health_encounter_id_1,udacity_health_patient_id_186,dx_code_15406,procedure_code_39776,,,0
24877,udacity_health_encounter_id_1,udacity_health_patient_id_186,dx_code_15406,,medication_code_431,,0
26235,udacity_health_encounter_id_1,udacity_health_patient_id_186,dx_code_15406,,medication_code_7435,,0
35490,udacity_health_encounter_id_1,udacity_health_patient_id_186,dx_code_15406,,,lab_code_9859,0
42855,udacity_health_encounter_id_1,udacity_health_patient_id_186,dx_code_15406,,medication_code_4338,,0


In [31]:
encounter_df[encounter_df['ENCOUNTER_ID']=='udacity_health_encounter_id_1']

Unnamed: 0,ENCOUNTER_ID,PATIENT_ID,PRINCIPAL_DIAGNOSIS_CODE,PROCEDURE_CODE,MEDICATION_CODE,LAB_CODE,LABEL
0,udacity_health_encounter_id_1,udacity_health_patient_id_186,dx_code_15406,"[procedure_code_58552, procedure_code_39776, p...","[medication_code_2350, medication_code_8630, m...","[lab_code_8835, lab_code_9859, lab_code_9032, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


## 2. Converting Encounter to Longitudinal Representation

In [32]:
encounter_df.head()

Unnamed: 0,ENCOUNTER_ID,PATIENT_ID,PRINCIPAL_DIAGNOSIS_CODE,PROCEDURE_CODE,MEDICATION_CODE,LAB_CODE,LABEL
0,udacity_health_encounter_id_1,udacity_health_patient_id_186,dx_code_15406,"[procedure_code_58552, procedure_code_39776, p...","[medication_code_2350, medication_code_8630, m...","[lab_code_8835, lab_code_9859, lab_code_9032, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,udacity_health_encounter_id_10,udacity_health_patient_id_188,dx_code_74047,[],"[medication_code_7789, medication_code_3560, m...",[],"[1, 1, 1, 1]"
2,udacity_health_encounter_id_100,udacity_health_patient_id_585,dx_code_71465,[],"[medication_code_7982, medication_code_2452, m...","[lab_code_4198, lab_code_6603, lab_code_332, l...","[0, 0, 0, 0, 0, 0, 0]"
3,udacity_health_encounter_id_1000,udacity_health_patient_id_525,dx_code_61569,[],[medication_code_4036],[],[0]
4,udacity_health_encounter_id_1001,udacity_health_patient_id_950,dx_code_90172,[procedure_code_30555],"[medication_code_6755, medication_code_5045]",[lab_code_9112],"[0, 0, 0, 0]"


In [33]:
patient_grouping_field_list = ["PATIENT_ID"]
non_patient_agg_field_list = [c for c in encounter_df.columns if c not in patient_grouping_field_list]

In [34]:
long_df = encounter_df.groupby(patient_grouping_field_list)[non_patient_agg_field_list].agg(lambda x: 
                                                        list([y for y in x if y is not np.nan ] ) ).reset_index()

In [35]:
long_df.head()

Unnamed: 0,PATIENT_ID,ENCOUNTER_ID,PRINCIPAL_DIAGNOSIS_CODE,PROCEDURE_CODE,MEDICATION_CODE,LAB_CODE,LABEL
0,udacity_health_patient_id_1,"[udacity_health_encounter_id_1038, udacity_hea...","[dx_code_36196, dx_code_63471, dx_code_29114, ...","[[procedure_code_36285, procedure_code_21124, ...","[[medication_code_3772, medication_code_9214, ...","[[lab_code_3982, lab_code_306], [], [lab_code_...","[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,..."
1,udacity_health_patient_id_10,"[udacity_health_encounter_id_1110, udacity_hea...","[dx_code_29609, dx_code_268, dx_code_26932, dx...","[[procedure_code_9379], [procedure_code_3052, ...","[[medication_code_7371, medication_code_2104, ...","[[lab_code_6457], [lab_code_2180, lab_code_693...","[[0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, ..."
2,udacity_health_patient_id_100,"[udacity_health_encounter_id_1205, udacity_hea...","[dx_code_32095, dx_code_45376, dx_code_48998, ...","[[procedure_code_71055, procedure_code_29744, ...","[[medication_code_2399, medication_code_966, m...","[[lab_code_4928, lab_code_6524, lab_code_1713,...","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."
3,udacity_health_patient_id_1000,"[udacity_health_encounter_id_1105, udacity_hea...","[dx_code_53764, dx_code_50924, dx_code_80218, ...","[[], [procedure_code_7870, procedure_code_2169...","[[medication_code_6580, medication_code_3007],...","[[], [lab_code_5468], [lab_code_7607], [lab_co...","[[1, 1], [1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1..."
4,udacity_health_patient_id_101,"[udacity_health_encounter_id_2058, udacity_hea...","[dx_code_13590, dx_code_29551]","[[procedure_code_45319, procedure_code_67294],...","[[medication_code_2532], [medication_code_4311...","[[lab_code_1206, lab_code_9967], []]","[[0, 0, 0, 0, 0], [0, 0, 0, 0, 0]]"


In [36]:
example_patient_history = long_df[long_df['PATIENT_ID']=='udacity_health_patient_id_310']

In [37]:
example_patient_history

Unnamed: 0,PATIENT_ID,ENCOUNTER_ID,PRINCIPAL_DIAGNOSIS_CODE,PROCEDURE_CODE,MEDICATION_CODE,LAB_CODE,LABEL
211,udacity_health_patient_id_310,"[udacity_health_encounter_id_4647, udacity_hea...","[dx_code_74153, dx_code_95836, dx_code_39465, ...","[[procedure_code_40521, procedure_code_52188, ...","[[medication_code_7251, medication_code_2765, ...","[[lab_code_1372], [lab_code_1794], [lab_code_1...","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."


In [38]:
list(example_patient_history['ENCOUNTER_ID'].values)

[['udacity_health_encounter_id_4647',
  'udacity_health_encounter_id_5010',
  'udacity_health_encounter_id_551',
  'udacity_health_encounter_id_7210',
  'udacity_health_encounter_id_7331']]

In [39]:
list(example_patient_history['PRINCIPAL_DIAGNOSIS_CODE'].values)

[['dx_code_74153',
  'dx_code_95836',
  'dx_code_39465',
  'dx_code_66358',
  'dx_code_99743']]

In [40]:
list(example_patient_history['PROCEDURE_CODE'].values)

[[['procedure_code_40521',
   'procedure_code_52188',
   'procedure_code_57020',
   'procedure_code_11784',
   'procedure_code_50478'],
  ['procedure_code_12696',
   'procedure_code_2873',
   'procedure_code_28392',
   'procedure_code_1398'],
  ['procedure_code_49962'],
  [],
  ['procedure_code_31171']]]

## 3. How to Split Dataset at Patient Level

#### ***Objective:*** 
- Split dataset at patient level into train and test partitions
- Validate that split was done correctly

#### Dataset Splitting Tests
- Patient data in only one partition
- Total unique number of patients across all partitions = total number unique patients in original full dataset
- Total number of rows original dataset = sum of rows across splits

In [41]:
PATIENT_ID_FIELD = 'PATIENT_ID'
TEST_PERCENTAGE = 0.2

In [42]:
def split_dataset_patient_level(df, key, test_percentage=0.2):
    df = df.iloc[np.random.permutation(len(df))]
    unique_values = df[key].unique()
    total_values = len(unique_values)
    sample_size = round(total_values * (1 - test_percentage ))
    train = df[df[key].isin(unique_values[:sample_size])].reset_index(drop=True)
    test = df[df[key].isin(unique_values[sample_size:])].reset_index(drop=True)
    return train, test

In [43]:
train_df, test_df = split_dataset_patient_level(encounter_df, PATIENT_ID_FIELD, TEST_PERCENTAGE)

In [44]:
assert len(set(train_df[PATIENT_ID_FIELD].unique()).intersection(set(test_df[PATIENT_ID_FIELD].unique()))) == 0
print("Test passed for patient data in only one partition")

Test passed for patient data in only one partition


In [45]:
assert (train_df[PATIENT_ID_FIELD].nunique()  + test_df[PATIENT_ID_FIELD].nunique()) == encounter_df[PATIENT_ID_FIELD].nunique()
print("Test passed for number of unique patients being equal!")

Test passed for number of unique patients being equal!


In [46]:
assert len(train_df)  + len(test_df) == len(encounter_df)
print("Test passed for number of total rows equal!")

Test passed for number of total rows equal!


In [47]:
train_df.head(20)

Unnamed: 0,ENCOUNTER_ID,PATIENT_ID,PRINCIPAL_DIAGNOSIS_CODE,PROCEDURE_CODE,MEDICATION_CODE,LAB_CODE,LABEL
0,udacity_health_encounter_id_2010,udacity_health_patient_id_172,dx_code_10638,[],"[medication_code_817, medication_code_5962, me...",[lab_code_9708],"[0, 0, 0, 0, 0, 0, 0, 0, 0]"
1,udacity_health_encounter_id_7038,udacity_health_patient_id_289,dx_code_49274,[procedure_code_70189],"[medication_code_9434, medication_code_9238, m...","[lab_code_839, lab_code_7725, lab_code_5320]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
2,udacity_health_encounter_id_2319,udacity_health_patient_id_400,dx_code_91110,"[procedure_code_50692, procedure_code_2524, pr...","[medication_code_3671, medication_code_3535, m...","[lab_code_7284, lab_code_5391, lab_code_1054, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,udacity_health_encounter_id_7443,udacity_health_patient_id_148,dx_code_59713,"[procedure_code_19226, procedure_code_53352, p...","[medication_code_7470, medication_code_7005, m...","[lab_code_1957, lab_code_2142]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,udacity_health_encounter_id_4236,udacity_health_patient_id_626,dx_code_55026,"[procedure_code_42102, procedure_code_67416, p...","[medication_code_6580, medication_code_1583, m...","[lab_code_8475, lab_code_9146, lab_code_8006, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
5,udacity_health_encounter_id_2895,udacity_health_patient_id_862,dx_code_26031,"[procedure_code_55594, procedure_code_40851, p...","[medication_code_8046, medication_code_3948, m...","[lab_code_7662, lab_code_4258]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
6,udacity_health_encounter_id_7579,udacity_health_patient_id_306,dx_code_9956,[procedure_code_33739],[],[],[1]
7,udacity_health_encounter_id_1257,udacity_health_patient_id_279,dx_code_61314,[procedure_code_9173],[],[],[0]
8,udacity_health_encounter_id_4931,udacity_health_patient_id_724,dx_code_60074,[procedure_code_16755],"[medication_code_7889, medication_code_6375, m...","[lab_code_7954, lab_code_4161, lab_code_1913, ...","[0, 0, 0, 0, 0, 0, 0, 0]"
9,udacity_health_encounter_id_490,udacity_health_patient_id_311,dx_code_80559,"[procedure_code_63278, procedure_code_64740, p...","[medication_code_6823, medication_code_410, me...",[lab_code_2887],"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"


## 4. ETL with TF Dataset API and Pandas

NOTE: In some cases you may need to preprocess Pandas Dataframe to removed mixed types. In particular, remove null values and impute or remove rows (we will later impute with zero for numerical features).

In [48]:
import tensorflow as tf

In [49]:
swiss_dataset_path = "./data/processed_swiss.csv"
swiss_df = pd.read_csv(swiss_dataset_path)
selected_col_list = ['age', 'thalach', 'cp', 'num_label']
subset_swiss_df = swiss_df[selected_col_list]

In [50]:
swiss_df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num_label
0,32,1,1,95,0,?,0,127,0,.7,1,?,?,1
1,34,1,4,115,0,?,?,154,0,.2,1,?,?,1
2,35,1,4,?,0,?,0,130,1,?,?,?,7,3
3,36,1,4,110,0,?,0,125,1,1,2,?,6,1
4,38,0,4,105,0,?,0,166,0,2.8,1,?,?,2


In [51]:
subset_swiss_df.head()

Unnamed: 0,age,thalach,cp,num_label
0,32,127,1,1
1,34,154,4,1
2,35,130,4,3
3,36,125,4,1
4,38,166,4,2


In [52]:
#adapted from https://www.tensorflow.org/tutorials/structured_data/feature_columns
def df_to_dataset(df, predictor,  batch_size=32):
    df = df.copy()
    labels = df.pop(predictor)
    ds = tf.data.Dataset.from_tensor_slices((dict(df), labels))
    ds = ds.shuffle(buffer_size=len(df))
    ds = ds.batch(batch_size)
    return ds

In [53]:
BATCH_SIZE = 64
PREDICTOR_FIELD = 'num_label'
sample_tf_ds = df_to_dataset(subset_swiss_df, PREDICTOR_FIELD, batch_size=BATCH_SIZE)

In [54]:
sample_feature_batch = next(iter(sample_tf_ds))[0]
sample_feature_batch

{'age': <tf.Tensor: shape=(64,), dtype=int32, numpy=
 array([67, 63, 57, 38, 68, 62, 62, 55, 69, 61, 50, 43, 54, 53, 41, 45, 61,
        59, 56, 60, 53, 52, 54, 46, 57, 50, 38, 65, 53, 56, 47, 42, 53, 73,
        62, 58, 51, 66, 54, 34, 63, 58, 59, 64, 47, 38, 57, 61, 50, 51, 61,
        57, 56, 56, 51, 61, 66, 65, 61, 53, 61, 57, 56, 53], dtype=int32)>,
 'thalach': <tf.Tensor: shape=(64,), dtype=string, numpy=
 array([b'125', b'86', b'148', b'156', b'120', b'123', b'143', b'150',
        b'?', b'113', b'156', b'145', b'155', b'141', b'176', b'138',
        b'77', b'115', b'97', b'110', b'120', b'120', b'150', b'113',
        b'182', b'110', b'179', b'67', b'122', b'100', b'149', b'99',
        b'135', b'121', b'128', b'138', b'170', b'108', b'110', b'154',
        b'98', b'105', b'115', b'145', b'118', b'128', b'98', b'70',
        b'139', b'127', b'110', b'131', b'98', b'99', b'60', b'145', b'90',
        b'154', b'105', b'95', b'117', b'100', b'103', b'120'],
       dtype=object)>,


In [55]:
sample_label_batch = next(iter(sample_tf_ds))[1]
sample_label_batch

<tf.Tensor: shape=(64,), dtype=int32, numpy=
array([2, 3, 3, 1, 0, 2, 4, 1, 2, 2, 2, 1, 1, 1, 2, 1, 3, 4, 1, 1, 1, 1,
       2, 1, 2, 2, 2, 1, 3, 2, 2, 3, 2, 3, 2, 3, 1, 1, 1, 2, 1, 3, 1, 3,
       2, 1, 3, 0, 2, 2, 3, 1, 1, 1, 3, 3, 4, 2, 3, 2, 1, 3, 3, 1],
      dtype=int32)>

## 5. Building Numerical Feature with TF Feature Column API

In [56]:
subset_swiss_df.head()

Unnamed: 0,age,thalach,cp,num_label
0,32,127,1,1
1,34,154,4,1
2,35,130,4,3
3,36,125,4,1
4,38,166,4,2


In [57]:
age_mean = subset_swiss_df['age'].describe()['mean']
age_std = subset_swiss_df['age'].describe()['std']
print("Mean age:{}\nStandard Deviation Age:{}".format(age_mean, age_std))

Mean age:55.31707317073171
Standard Deviation Age:9.032107639562039


In [58]:
import functools
def normalize_numeric_with_zscore(col, mean, std):
    return (col - mean)/std

def create_tf_numeric_feature(col, MEAN, STD,   default_value=0):
    normalizer = functools.partial(normalize_numeric_with_zscore, mean=MEAN, std=STD)
    return tf.feature_column.numeric_column(
    key=col, default_value = default_value, normalizer_fn=normalizer, dtype=tf.float64)

In [59]:
age_tf_feature = create_tf_numeric_feature('age', age_mean, age_std)

In [60]:
def demo(feature_column, example_batch):
    feature_layer = tf.keras.layers.DenseFeatures(feature_column)
    print(feature_layer(example_batch))

In [61]:
print("Example continuous field:\n{}\n".format(age_tf_feature))
demo(age_tf_feature, sample_feature_batch)

Example continuous field:
NumericColumn(key='age', shape=(1,), default_value=(0,), dtype=tf.float64, normalizer_fn=functools.partial(<function normalize_numeric_with_zscore at 0x7fbbec1a0e60>, mean=55.31707317073171, std=9.032107639562039))

tf.Tensor(
[[ 1.3333334 ]
 [ 0.8888889 ]
 [ 0.22222222]
 [-1.8888888 ]
 [ 1.4444444 ]
 [ 0.7777778 ]
 [ 0.7777778 ]
 [ 0.        ]
 [ 1.5555556 ]
 [ 0.6666667 ]
 [-0.5555556 ]
 [-1.3333334 ]
 [-0.11111111]
 [-0.22222222]
 [-1.5555556 ]
 [-1.1111112 ]
 [ 0.6666667 ]
 [ 0.44444445]
 [ 0.11111111]
 [ 0.5555556 ]
 [-0.22222222]
 [-0.33333334]
 [-0.11111111]
 [-1.        ]
 [ 0.22222222]
 [-0.5555556 ]
 [-1.8888888 ]
 [ 1.1111112 ]
 [-0.22222222]
 [ 0.11111111]
 [-0.8888889 ]
 [-1.4444444 ]
 [-0.22222222]
 [ 2.        ]
 [ 0.7777778 ]
 [ 0.33333334]
 [-0.44444445]
 [ 1.2222222 ]
 [-0.11111111]
 [-2.3333333 ]
 [ 0.8888889 ]
 [ 0.33333334]
 [ 0.44444445]
 [ 1.        ]
 [-0.8888889 ]
 [-1.8888888 ]
 [ 0.22222222]
 [ 0.6666667 ]
 [-0.5555556 ]
 [-0.4444444

## 6. Building Categorical Features with TF Feature Column API

In [63]:
categorical_example_df = encounter_df[['ENCOUNTER_ID', 'PRINCIPAL_DIAGNOSIS_CODE', 'LABEL']] 

In [64]:
categorical_example_df.head()

Unnamed: 0,ENCOUNTER_ID,PRINCIPAL_DIAGNOSIS_CODE,LABEL
0,udacity_health_encounter_id_1,dx_code_15406,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,udacity_health_encounter_id_10,dx_code_74047,"[1, 1, 1, 1]"
2,udacity_health_encounter_id_100,dx_code_71465,"[0, 0, 0, 0, 0, 0, 0]"
3,udacity_health_encounter_id_1000,dx_code_61569,[0]
4,udacity_health_encounter_id_1001,dx_code_90172,"[0, 0, 0, 0]"


In [65]:
len(categorical_example_df)

7222

In [66]:
# for this task need to convert label from array to scalar value
categorical_example_df['LABEL'] = categorical_example_df['LABEL'].apply(lambda x: np.unique(x)[0])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [67]:
categorical_example_df.head()

Unnamed: 0,ENCOUNTER_ID,PRINCIPAL_DIAGNOSIS_CODE,LABEL
0,udacity_health_encounter_id_1,dx_code_15406,0
1,udacity_health_encounter_id_10,dx_code_74047,1
2,udacity_health_encounter_id_100,dx_code_71465,0
3,udacity_health_encounter_id_1000,dx_code_61569,0
4,udacity_health_encounter_id_1001,dx_code_90172,0


### High Cardinality for Principal Diagnosis Code

In [68]:
categorical_example_df['PRINCIPAL_DIAGNOSIS_CODE'].nunique()

6752

### Generate Vocabulary File

In [69]:
#make vocab dir
import os
#os.mkdir("./vocab/")

In [70]:
# build vocab for categorical features
def write_vocabulary_file(vocab_list, field_name, default_value, vocab_dir='./vocab/'):
    output_file_path = os.path.join(vocab_dir, str(field_name) + "_vocab.txt")
    # put default value in first row as TF requires
    vocab_list = np.insert(vocab_list, 0, default_value, axis=0) 
    df = pd.DataFrame(vocab_list).to_csv(output_file_path, index=None, header=None)
    return output_file_path

def build_vocab_files(df, categorical_column_list, default_value='00'):
    vocab_files_list = []
    for c in categorical_column_list:
        v_file = write_vocabulary_file(df[c].unique(), c, default_value)
        vocab_files_list.append(v_file)
    return vocab_files_list

In [71]:
categorical_field_list = ["PRINCIPAL_DIAGNOSIS_CODE"]
vocab_files_list = build_vocab_files(categorical_example_df, categorical_field_list)

In [72]:
vocab_files_list

['./vocab/PRINCIPAL_DIAGNOSIS_CODE_vocab.txt']

In [85]:
f = open('./vocab/PRINCIPAL_DIAGNOSIS_CODE_vocab.txt', "r")
print(f.read())

00
dx_code_15406
dx_code_74047
dx_code_71465
dx_code_61569
dx_code_90172
dx_code_172
dx_code_92070
dx_code_22577
dx_code_44044
dx_code_46709
dx_code_5376
dx_code_70391
dx_code_3298
dx_code_65616
dx_code_50292
dx_code_79217
dx_code_44016
dx_code_30372
dx_code_68764
dx_code_28503
dx_code_40768
dx_code_47143
dx_code_40899
dx_code_10216
dx_code_71194
dx_code_25850
dx_code_28317
dx_code_94424
dx_code_97503
dx_code_23867
dx_code_79754
dx_code_78530
dx_code_55940
dx_code_97680
dx_code_88169
dx_code_11585
dx_code_56843
dx_code_19465
dx_code_57675
dx_code_5575
dx_code_36196
dx_code_23996
dx_code_66713
dx_code_59277
dx_code_38753
dx_code_72922
dx_code_86803
dx_code_72714
dx_code_5017
dx_code_42888
dx_code_47354
dx_code_92884
dx_code_10455
dx_code_54354
dx_code_38007
dx_code_36510
dx_code_87110
dx_code_68078
dx_code_94687
dx_code_98044
dx_code_71588
dx_code_93865
dx_code_42060
dx_code_63476
dx_code_74715
dx_code_28445
dx_code_73501
dx_code_2256
dx_code_81892
dx_code_42660
dx_code_70264
dx_code_89

In [86]:
def file_lengthy(fname):
        with open(fname) as f:
                for i, l in enumerate(f):
                        pass
        return i + 1
print("Number of lines in the file: ",file_lengthy('./vocab/PRINCIPAL_DIAGNOSIS_CODE_vocab.txt'))

Number of lines in the file:  6753


### Build TF Dataset from Pandas Dataframe

In [73]:
BATCH_SIZE = 64
PREDICTOR_FIELD = 'LABEL'
categorical_tf_ds = df_to_dataset(categorical_example_df, PREDICTOR_FIELD, batch_size=BATCH_SIZE)

In [88]:
categorical_tf_ds

<BatchDataset shapes: ({ENCOUNTER_ID: (None,), PRINCIPAL_DIAGNOSIS_CODE: (None,)}, (None,)), types: ({ENCOUNTER_ID: tf.string, PRINCIPAL_DIAGNOSIS_CODE: tf.string}, tf.int32)>

### Use TF Feature Column API to read from vocab file

In [74]:
vocab_files_list[0]

'./vocab/PRINCIPAL_DIAGNOSIS_CODE_vocab.txt'

In [75]:
principal_diagnosis_vocab = tf.feature_column.categorical_column_with_vocabulary_file(
            key="PRINCIPAL_DIAGNOSIS_CODE", vocabulary_file = vocab_files_list[0], num_oov_buckets=1)

INFO:tensorflow:vocabulary_size = 6753 in PRINCIPAL_DIAGNOSIS_CODE is inferred from the number of elements in the vocabulary_file ./vocab/PRINCIPAL_DIAGNOSIS_CODE_vocab.txt.


### Create one-hot encoding  from vocab column feature function

In [76]:
one_hot_principal_diagnosis_feature = tf.feature_column.indicator_column(principal_diagnosis_vocab)

In [77]:
categorical_tf_ds_batch = next(iter(categorical_tf_ds))[0]

In [84]:
categorical_tf_ds_batch

{'ENCOUNTER_ID': <tf.Tensor: shape=(64,), dtype=string, numpy=
 array([b'udacity_health_encounter_id_3638',
        b'udacity_health_encounter_id_2393',
        b'udacity_health_encounter_id_5163',
        b'udacity_health_encounter_id_5180',
        b'udacity_health_encounter_id_3543',
        b'udacity_health_encounter_id_707',
        b'udacity_health_encounter_id_4746',
        b'udacity_health_encounter_id_2375',
        b'udacity_health_encounter_id_2042',
        b'udacity_health_encounter_id_1575',
        b'udacity_health_encounter_id_4491',
        b'udacity_health_encounter_id_6985',
        b'udacity_health_encounter_id_3002',
        b'udacity_health_encounter_id_6977',
        b'udacity_health_encounter_id_6983',
        b'udacity_health_encounter_id_5536',
        b'udacity_health_encounter_id_1208',
        b'udacity_health_encounter_id_3882',
        b'udacity_health_encounter_id_2361',
        b'udacity_health_encounter_id_4903',
        b'udacity_health_encounter_id_

In [83]:
demo(one_hot_principal_diagnosis_feature, categorical_tf_ds_batch)

tf.Tensor(
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]], shape=(64, 6754), dtype=float32)
