In [1]:
import json
import os
from pprint import pprint

import numpy as np
import pandas as pd

In [2]:
# Notebook display configuration.
# `display` and `HTML` are imported from the public `IPython.display` module
# rather than the internal `IPython.core.display` path.
from IPython.display import display, HTML

# Let the notebook container use the full browser width.
display(HTML("<style>.container { width:100% !important; }</style>"))
# Widen pandas' text output so wide frames are not wrapped early.
pd.set_option('display.width', 1000)

In [3]:
# NOTE: 'ICD_list' entries carry no timestamp and some of them date from before
# the 18th birthday, so that table has to be re-built (presumably from
# 'ICD_for_Enc') instead of being used as-is.
table_names = [
    'all_encounter_data',
    'demographics',
    'encounters',
    'family_hist_for_Enc',
    'family_hist_list',
    'ICD_for_Enc',
    'ICD_list',
    'macula_findings_for_Enc',
    'SL_Lens_for_Enc',
    'SNOMED_problem_list',
    'systemic_disease_for_Enc',
    'systemic_disease_list',
]

# Tables grouped as person-level data.
person_data = [
    'demographics',
    'family_hist_list',
    'systemic_disease_list',
    'SNOMED_problem_list',
]

# Tables grouped as encounter-level data.
encounter_data = [
    'all_encounter_data',
    'encounters',
    'family_hist_for_Enc',
    'ICD_for_Enc',
    'macula_findings_for_Enc',
    'SL_Lens_for_Enc',
    'systemic_disease_for_Enc',
]

In [4]:
# Directory that holds the original pickled tables.
# The historical location stays the default; set the ICO_PICKLE_DIR environment
# variable to point the notebook at another copy of the data without editing it.
# An empty value is treated as "not set".
path = (os.environ.get('ICO_PICKLE_DIR')
        or 'E:\\anil\\IIT Sop\\Term02\\MATH497\\ICO_data\\original_pickle\\')
# File names are built by plain concatenation (path + name + '.pickle'),
# so the directory must end with a path separator.
if not path.endswith(('\\', '/')):
    path += os.sep

In [5]:
# Load each table's pickle into a DataFrame, in the order given by table_names.
# 'ICD_list' is deliberately not read; a None placeholder is stored instead so
# that dfs stays aligned index-for-index with table_names.
# NOTE: pd.read_pickle unpickles arbitrary objects - only use it on trusted files.
dfs = []
for name in table_names:
    if name == 'ICD_list':
        dfs.append(None)
    else:
        dfs.append(pd.read_pickle(path + name + '.pickle'))

In [6]:
# Normalise the column names of every loaded table to text, to avoid unicode
# decode errors later on.
# Byte-string names are decoded as UTF-8 with a leading BOM stripped
# ("utf-8-sig"). Names that are not byte strings are kept unchanged, so the
# cell can be re-run and does not call .decode() on text that has no such method.
for df in dfs:
    if df is not None:
        df.columns = [col.decode("utf-8-sig") if isinstance(col, bytes) else col
                      for col in df.columns]

In [8]:
# Print the column names of every loaded table, to see which tables carry the
# encounter number that is used as the aggregation key in the cells below.
encounter_key = 'Enc_Nbr'
for df in dfs:
    # the placeholder for the table that was not loaded is None
    if df is not None:
        print(df.columns.values)

[u'Enc_ID' u'Enc_Nbr' u'Enc_Date' u'Person_ID' u'Person_Nbr'
 u'Primary_Payer' u'Smoking_Status' u'BMI' u'BP' u'Glucose' u'A1C'
 u'MR_OD_SPH' u'MR_OD_CYL' u'MR_OD_AXIS' u'MR_OD_DVA' u'MR_OD_NVA'
 u'MR_OS_SPH' u'MR_OS_CYL' u'MR_OS_AXIS' u'MR_OS_DVA' u'MR_OS_NVA'
 u'BB_OD_SPH' u'BB_OD_CYL' u'BB_OD_AXIS' u'BB_OD_DVA' u'BB_OD_NVA'
 u'BB_OS_SPH' u'BB_OS_CYL' u'BB_OS_AXIS' u'BB_OS_DVA' u'BB_OS_NVA'
 u'CYCLO_OD_SPH' u'CYCLO_OD_CYL' u'CYCLO_OD_AXIS' u'CYCLO_OD_DVA'
 u'CYCLO_OD_NVA' u'CYCLO_OS_SPH' u'CYCLO_OS_CYL' u'CYCLO_OS_AXIS'
 u'CYCLO_OS_DVA' u'CYCLO_OS_NVA']
[u'Person_ID' u'Person_Nbr' u'DOB' u'Gender' u'Race' u'Ethnicity' u'Zip'
 u'Age_Censored']
[u'Person_ID' u'Person_Nbr' u'Enc_ID' u'Enc_Nbr' u'Enc_Timestamp']
[u'Person_ID' u'Person_Nbr' u'Enc_ID' u'Enc_Nbr' u'Enc_Date' u'Code'
 u'Code_System' u'Family_History' u'Relation']
[u'Person_ID' u'Person_Nbr' u'Date_Created' u'Code' u'Code_System'
 u'Family_History' u'Relation']
[u'Person_ID' u'Person_Nbr' u'Enc_ID' u'Enc_Nbr' u'Enc_Timestamp'

#### Grouping all encounter numbers under their respective person number

In [18]:
# Build {person number -> set of encounter numbers} from every table that
# carries both key columns.
encounter_key = 'Enc_Nbr'
person_key = 'Person_Nbr'
encounters_by_person = {}
for table in dfs:
    # skip the placeholder of the table that was not loaded
    if table is None:
        continue
    available = set(table.columns.values)
    if not (encounter_key in available and person_key in available):
        continue
    for _, record in table.iterrows():
        fields = dict(record)
        owner = fields[person_key]
        if owner not in encounters_by_person:
            encounters_by_person[owner] = set()
        encounters_by_person[owner].add(fields[encounter_key])

In [21]:
# Spot check: pretty-print the encounter set of the first person found that
# has more than five encounters, then stop.
for encounter_nbrs in encounters_by_person.values():
    if len(encounter_nbrs) > 5:
        pprint(encounter_nbrs)
        break

set([3689952L,
     9254247L,
     10967288L,
     11247025L,
     11311562L,
     11328070L,
     13085096L])


#### Now grouping other measurements and properties under encounter_nbrs

In [23]:
def _row_to_record(dfrow):
    """Turn one DataFrame row into a plain dict.

    Timestamp values are replaced by their proleptic Gregorian ordinal
    (``Timestamp.toordinal()``); every other value is kept as-is.
    """
    record = dict(dfrow)
    # iterate over a snapshot of the items because values are replaced in the loop
    for column, value in list(record.items()):
        if isinstance(value, pd.Timestamp):
            record[column] = value.toordinal()
    return record


# Index the rows of every table by encounter number.
#   data_by_encounters[table][enc_nbr]  -> one record dict      (type 'single')
#                                       -> list of record dicts (type 'list')
#   data_by_encounters_type[table]      -> 'single' or 'list'
# Tables that were not loaded, or that have no encounter column, get an empty
# dict in data_by_encounters and no entry in data_by_encounters_type.
encounter_key = 'Enc_Nbr'
data_by_encounters = {}
data_by_encounters_type = {}
for df_name, df in zip(table_names, dfs):
    print(df_name)
    data_by_encounters[df_name] = {}
    if df is None or encounter_key not in set(df.columns.values):
        continue

    # the encounter number acts as a primary key when no value repeats
    is_primary_key = len(df) == len(df[encounter_key].unique())
    data_by_encounters_type[df_name] = 'single' if is_primary_key else 'list'

    records_by_encounter = data_by_encounters[df_name]
    for _, dfrow in df.iterrows():
        record = _row_to_record(dfrow)
        encounter_nbr = record[encounter_key]
        if is_primary_key:
            records_by_encounter[encounter_nbr] = record
        else:
            records_by_encounter.setdefault(encounter_nbr, []).append(record)

all_encounter_data
demographics
encounters
family_hist_for_Enc
family_hist_list
ICD_for_Enc
ICD_list
macula_findings_for_Enc
SL_Lens_for_Enc
SNOMED_problem_list
systemic_disease_for_Enc
systemic_disease_list


## Aggregating encounter entities under respective person entity

In [65]:
# Assemble one object per person. Each holds a list of encounter objects, and
# each encounter object collects that encounter's data from every indexed table.
all_persons = []
for person_nbr, enc_nbrs in encounters_by_person.items():
    encounter_objects = []
    for enc_nbr in enc_nbrs:
        encounter_object = {encounter_key: enc_nbr}
        for df_name, table_kind in data_by_encounters_type.items():
            rows_by_encounter = data_by_encounters[df_name]
            if enc_nbr not in rows_by_encounter:
                continue
            payload = rows_by_encounter[enc_nbr]
            encounter_object[df_name] = payload
            # tables with several rows per encounter also record the row count
            if table_kind != "single":
                encounter_object[df_name+"_count"] = len(payload)
        encounter_objects.append(encounter_object)

    all_persons.append({person_key: person_nbr, 'encounter_objects': encounter_objects})

In [29]:
def _check_record(record, person_nbr, enc_nbr, where):
    """Print a message for each key of ``record`` that disagrees with its parent.

    record     -- dict of one nested table row
    person_nbr -- person number of the enclosing person object
    enc_nbr    -- encounter number of the enclosing encounter object
    where      -- tuple of values appended to the message to locate the record
    """
    checks = (("Person nbr does not match", person_key, person_nbr),
              ("Encounter nbr does not match", encounter_key, enc_nbr))
    for message, key, expected in checks:
        if key in record and expected != record[key]:
            # a single pre-formatted string prints the same on Python 2 and 3
            print("%s %s" % (message, " ".join("%s" % part for part in where)))


# Checking for aggregation consistency: every nested record that still carries
# a person / encounter number must agree with the object it is nested under.
for person in all_persons:
    person_nbr = person[person_key]
    for enc_obj in person['encounter_objects']:
        enc_nbr = enc_obj[encounter_key]
        for df_name, table_kind in data_by_encounters_type.items():
            if df_name not in enc_obj:
                continue
            nested = enc_obj[df_name]
            if table_kind == "single":
                _check_record(nested, person_nbr, enc_nbr,
                              (person_nbr, enc_nbr, df_name))
            else:
                for rp_index, repeated_property in enumerate(nested):
                    _check_record(repeated_property, person_nbr, enc_nbr,
                                  (person_nbr, enc_nbr, df_name, rp_index))

## Dropping duplicated columns, then dropping all-NaN rows, across tables

In [66]:
# Snapshot of the aggregated person/encounter objects as they are at this point.
pre_drop_json = '20170224_encounter_objects_before_duplicate_fields_drop.json'
with open(pre_drop_json, 'w') as out_file:
    json.dump(all_persons, out_file)

In [67]:
# Drop repeated key/date columns from the nested records of every table except
# "encounters", which keeps them.
columns_to_drop = ['Enc_ID', 'Enc_Nbr', 'Enc_Date', 'Person_ID', 'Person_Nbr','Date_Created', 'Enc_Timestamp']

for person in all_persons:
    for enc_obj in person['encounter_objects']:
        for df_name, table_kind in data_by_encounters_type.items():
            if df_name == 'encounters' or df_name not in enc_obj:
                continue
            nested = enc_obj[df_name]
            # treat a 'single' record as a one-element list so both kinds share the loop
            records = [nested] if table_kind == "single" else nested
            for record in records:
                # a record may no longer be a dict (e.g. replaced by NaN on a re-run)
                if not isinstance(record, dict):
                    continue
                for column_to_drop in columns_to_drop:
                    # pop with a default ignores absent columns without hiding other errors
                    record.pop(column_to_drop, None)

In [68]:
def _is_all_null(record):
    """Return True when every value of the dict ``record`` is null per ``pd.isnull``.

    An empty dict counts as all-null.
    """
    # pass a real list: pd.isnull works element-wise on lists, and
    # dict.values() is only a list on Python 2
    return all(pd.isnull(list(record.values())))


# Drop nested records whose values are all null:
#   'single' tables - the record is replaced by NaN
#   'list' tables   - the record is removed from the list (in place)
# NOTE(review): the original author remarked that this step "does not seem to be
# working". One possible cause is placeholder strings such as 'NULL', which
# pd.isnull does not treat as missing - confirm against the data.
# NOTE(review): the "<table>_count" fields written during aggregation are not
# updated here, so they can exceed the remaining list length - confirm intent.
for person in all_persons:
    for enc_obj in person['encounter_objects']:
        for df_name, table_kind in data_by_encounters_type.items():
            if df_name not in enc_obj:
                continue
            nested = enc_obj[df_name]
            if table_kind == "single":
                # skip entries already replaced by NaN so the cell can be re-run
                if isinstance(nested, dict) and _is_all_null(nested):
                    enc_obj[df_name] = float('nan')
            elif isinstance(nested, list):
                # slice assignment keeps the same list object, as in-place deletion did
                nested[:] = [row for row in nested if not _is_all_null(row)]

In [69]:
# Write the aggregated person/encounter objects, in their current state, to JSON.
encounter_objects_json = '20170224_encounter_objects.json'
with open(encounter_objects_json, 'w') as out_file:
    json.dump(all_persons, out_file)

In [70]:
# Build a DataFrame from the aggregated data: one row per
# (person number, encounter number) pair, one column per encounter-object key.
rows_by_person_encounter = {}
for person_obj in all_persons:
    for enc_obj in person_obj['encounter_objects']:
        row_key = (person_obj[person_key], enc_obj[encounter_key])
        rows_by_person_encounter[row_key] = enc_obj

combined_ecounters_df = pd.DataFrame.from_dict(rows_by_person_encounter, orient='index')

In [76]:
# Preview the first 10 rows of the combined (person number, encounter number) table.
combined_ecounters_df.head(10)

Unnamed: 0,Unnamed: 1,all_encounter_data_count,Enc_Nbr,macula_findings_for_Enc,SL_Lens_for_Enc,all_encounter_data,ICD_for_Enc_count,encounters,ICD_for_Enc,systemic_disease_for_Enc_count,systemic_disease_for_Enc,family_hist_for_Enc,family_hist_for_Enc_count
33,123227,2.0,123227,"{u'OD_Macula2_Modifer': nan, u'OD_Macula3_Find...","{u'OS_SL_LENS': u'1+ NS 1-2+ ACC', u'OD_SL_LEN...","[{u'MR_OD_DVA': u'20/20', u'BB_OD_SPH': nan, u...",3.0,{u'Person_ID': u'18405351-AC64-46A2-A003-8F788...,"[{u'Description': u'Presbyopia', u'Diagnosis_C...",3.0,"[{u'Snomed_Code': u'44054006', u'Systemic_Dise...","[{u'Code': u'160347007', u'Code_System': u'SNO...",6.0
89,233159,1.0,233159,,,"[{u'MR_OD_DVA': nan, u'BB_OD_SPH': nan, u'CYCL...",,{u'Person_ID': u'4ACD51E8-4A9B-4AA6-B635-166AD...,,,,,
89,455661,1.0,455661,,{u'OS_SL_LENS': u'clear lens capsule cortex an...,"[{u'MR_OD_DVA': nan, u'BB_OD_SPH': nan, u'CYCL...",3.0,{u'Person_ID': u'4ACD51E8-4A9B-4AA6-B635-166AD...,[{u'Description': u'Anatomical narrow angle bo...,1.0,"[{u'Snomed_Code': u'NULL', u'Systemic_Diseases...","[{u'Code': u'160267000', u'Code_System': u'SNO...",5.0
89,4126172,1.0,4126172,"{u'OD_Macula2_Modifer': nan, u'OD_Macula3_Find...","{u'OS_SL_LENS': u'nuclear sclerosis 1+', u'OD_...","[{u'MR_OD_DVA': nan, u'BB_OD_SPH': nan, u'CYCL...",3.0,{u'Person_ID': u'4ACD51E8-4A9B-4AA6-B635-166AD...,"[{u'Description': u'Retinal hemorrhage', u'Dia...",,,,
89,9217142,1.0,9217142,"{u'OD_Macula2_Modifer': nan, u'OD_Macula3_Find...","{u'OS_SL_LENS': u'nuclear sclerosis 1+', u'OD_...","[{u'MR_OD_DVA': nan, u'BB_OD_SPH': nan, u'CYCL...",1.0,{u'Person_ID': u'4ACD51E8-4A9B-4AA6-B635-166AD...,[{u'Description': u'Anatomical narrow angle bo...,,,,
89,10658304,1.0,10658304,"{u'OD_Macula2_Modifer': nan, u'OD_Macula3_Find...",{u'OS_SL_LENS': u'clear lens capsule cortex an...,"[{u'MR_OD_DVA': u'20/20', u'BB_OD_SPH': nan, u...",3.0,{u'Person_ID': u'4ACD51E8-4A9B-4AA6-B635-166AD...,[{u'Description': u'Anatomical narrow angle bi...,,,,
89,12870648,1.0,12870648,"{u'OD_Macula2_Modifer': nan, u'OD_Macula3_Find...",{u'OS_SL_LENS': u'nuclear sclerosis 1+ with tr...,"[{u'MR_OD_DVA': nan, u'BB_OD_SPH': nan, u'CYCL...",2.0,{u'Person_ID': u'4ACD51E8-4A9B-4AA6-B635-166AD...,[{u'Description': u'Anatomical narrow angle bo...,,,,
89,13815981,1.0,13815981,,,"[{u'MR_OD_DVA': nan, u'BB_OD_SPH': nan, u'CYCL...",,{u'Person_ID': u'4ACD51E8-4A9B-4AA6-B635-166AD...,,,,,
89,14537805,1.0,14537805,,{u'OS_SL_LENS': u'clear lens capsule cortex an...,"[{u'MR_OD_DVA': nan, u'BB_OD_SPH': nan, u'CYCL...",1.0,{u'Person_ID': u'4ACD51E8-4A9B-4AA6-B635-166AD...,[{u'Description': u'Primary angle closure with...,,,,
89,14968450,1.0,14968450,,,"[{u'MR_OD_DVA': nan, u'BB_OD_SPH': nan, u'CYCL...",4.0,{u'Person_ID': u'4ACD51E8-4A9B-4AA6-B635-166AD...,[{u'Description': u'Diabetes Mellitus Type 2 U...,,,,


In [75]:
# Select person 89 on the first index level and show the
# 'family_hist_for_Enc' column for each of that person's encounters.
combined_ecounters_df.loc[89,'family_hist_for_Enc']

233159                                                    NaN
455661      [{u'Code': u'160267000', u'Code_System': u'SNO...
4126172                                                   NaN
9217142                                                   NaN
10658304                                                  NaN
12870648                                                  NaN
13815981                                                  NaN
14537805                                                  NaN
14968450                                                  NaN
Name: family_hist_for_Enc, dtype: object