In [41]:
import pandas as pd
from pprint import pprint
import json
import numpy as np

In [35]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))
pd.set_option('display.width', 1000)

In [2]:
# ICD_list table must be re-built from, presumably, ICD_for_Enc due to some entries being
# pre-18th birthday.  ICD_list entries are not timestamped!
table_names = ['all_encounter_data', 'demographics', 'encounters', 'family_hist_for_Enc',
               'family_hist_list', 'ICD_for_Enc', 'ICD_list', 'macula_findings_for_Enc',
               'SL_Lens_for_Enc', 'SNOMED_problem_list', 'systemic_disease_for_Enc', 'systemic_disease_list']

person_data = ['demographics','family_hist_list', 'systemic_disease_list', 'SNOMED_problem_list']

encounter_data = ['all_encounter_data', 'encounters', 'family_hist_for_Enc', 'ICD_for_Enc', 'macula_findings_for_Enc',
                   'SL_Lens_for_Enc', 'systemic_disease_for_Enc']

In [3]:
path = 'E:\\anil\\IIT Sop\\Term02\\MATH497\\ICO_data\\original_pickle\\'

In [18]:
# read tables into dataframes
dfs = [ pd.read_pickle(path + name + '.pickle') if name != 'ICD_list' else None
        for name in table_names ]




In [19]:
# rename columns in all dataframes to avoid unicode decode error
for df in dfs:
    if df is not None:
        df.columns = [col.decode("utf-8-sig") for col in df.columns]
        

In [37]:
# list of lists of columns to normalize
normalized_tbls = ['encounterIds', 'personIds', 'snomed', 'icd']
normalize_lists = {
    'encounterIds' : {'tables': ['all_encounter_data' , 'encounters', 'family_hist_for_Enc', 'ICD_for_Enc', 'macula_findings_for_Enc', 'SL_Lens_for_Enc', 'systemic_disease_for_Enc'],
                    'columns': ['Enc_ID', 'Enc_Nbr', 'Enc_Date', 'Enc_Timestamp']},
    'personIds' : {'tables': ['all_encounter_data', 'demographics', 'encounters', 'family_hist_for_Enc',
               'family_hist_list', 'ICD_for_Enc', 'macula_findings_for_Enc',
               'SL_Lens_for_Enc', 'SNOMED_problem_list', 'systemic_disease_for_Enc', 'systemic_disease_list'],
                 'columns':[u'Person_ID', u'Person_Nbr']},
    'snomed' : {'tables': ['SNOMED_problem_list', 'systemic_disease_for_Enc', 'systemic_disease_list'],
                 'columns':[u'Concept_ID', 'Description','Snomed_Code', 'Systemic_Diseases']},
    'icd' : {'tables': ['ICD_for_Enc'],
             'columns': ['Diagnosis_Code_ID', 'Description']}
    
}

In [42]:
normalized_dfs = []
for normalize_list_key in normalized_tbls:
#     print normalize_list_key
    tables = normalize_lists[normalize_list_key]['tables']
    columns_norm = normalize_lists[normalize_list_key]['columns']
#     print columns_norm
    norm_df_rows = set()
    for tableName in tables:
#         print tableName, ",",
        df_index = table_names.index(tableName)
        df = dfs[df_index]
        columnsInDf = set(df.columns.values)
        for i, dfrow in df.iterrows():
            rowdict = dict(dfrow)
            norm_df_row = []
            for column in columns_norm:
                if column in columnsInDf:
                    val = rowdict[column]
                    if pd.isnull(val) or val=='NULL':
                        norm_df_row.append(np.nan)
                    else:
                        norm_df_row.append(val)
                else:
                    norm_df_row.append(np.nan)
        
            norm_df_rows.add(tuple(norm_df_row))
#     print ""
    normalized_dfs.append(pd.DataFrame([list(row) for row in norm_df_rows], columns=columns_norm))


In [32]:
for i, normalize_list_key in enumerate(normalized_tbls):
    normalized_dfs[i].to_pickle(normalize_list_key+"_normalized.pickle")


In [43]:
for i, normalize_list_key in enumerate(normalized_tbls):
    print normalized_dfs[i].head(10)

    Concept_ID                               Description Snomed_Code           Systemic_Diseases
0          NaN                                       NaN         NaN  vertigo (migraine-induced)
1  246938006.0                            Corneal dellen         NaN                         NaN
2          NaN                                       NaN         NaN                 Cholesterol
3          NaN                                       NaN         NaN                Endometrosis
4  193489006.0                           Diabetic iritis         NaN                         NaN
5          NaN                                       NaN         NaN          Blood clot in lung
6          NaN                                       NaN         NaN    Quadruple Bypass Surgery
7          NaN                                       NaN         NaN                         AKI
8          NaN                                       NaN         NaN           Urinary Frequency
9  366066006.0  Suppression (b