# **Descriptive Analysis**

In [1]:
# Import Necessary Packages
import pandas as pd
import numpy as np

In [None]:
# Set Working Directory 
import os 
os.chdir("/home/nico/BMIDS_FP")
os.getcwd()

## Table: **chartevents**

In [10]:
# Load csv
df_chartevents = pd.read_csv("chartevents.csv")

In [7]:
# Create Copy 
df_chartevents2 = df_chartevents.copy()

In [11]:
# Convert timestamps to date time 
df_chartevents['charttime'] = pd.to_datetime(df_chartevents['charttime'])
df_chartevents['storetime']= pd.to_datetime(df_chartevents['storetime'])

In [None]:
# Distinct-value count for every column (NaN is not counted), followed by a flag
# per column that is True when the distinct count equals the number of rows,
# i.e. when no value in that column repeats.
unique_vals = df_chartevents.nunique().tolist()
unique_vals_count = [n_distinct == len(df_chartevents) for n_distinct in unique_vals]
print(unique_vals, unique_vals_count, sep="\n")

In [None]:
# Tally the nulls in each column: one row per column, holding its name and null count
df_chartevents_null = df_chartevents.isna().sum().reset_index().rename(columns={'index': 'name', 0: 'null_count'})

# Turn every null count into a share of the table's rows, rounded to one decimal and suffixed with "%"
df_chartevents_null['percent_null'] = df_chartevents_null['null_count'].map(lambda n_null: f"{round(n_null/df_chartevents.shape[0]*100, 1)}%")
null_counts = df_chartevents_null['null_count'].tolist()
percent_nulls = df_chartevents_null['percent_null'].tolist()
# Preview the first five entries of each list
print(null_counts[:5])
print(percent_nulls[:5])

In [None]:
# Get data types for each column
dtype_vals = pd.DataFrame(df_chartevents.dtypes)[0].tolist()
dtype_vals

In [None]:
# Calculate mode
modes = df_chartevents.mode().iloc[0,]
modes

In [None]:
# Calculate median
# Text columns have no median and are reported as "NA". Both object columns and
# pandas' dedicated string dtypes count as text: a bare `dtype == "O"` comparison
# does not match the latter, and .median() raises TypeError on them.
medians = [
    "NA"
    if (pd.api.types.is_object_dtype(df_chartevents[col].dtype)
        or pd.api.types.is_string_dtype(df_chartevents[col].dtype))
    else df_chartevents[col].median()
    for col in df_chartevents.columns
]
medians

In [18]:
# Create extra columns with table information
file_name = ['chartevents.csv']*len(df_chartevents.columns)
column_name = df_chartevents.columns
row_count = [df_chartevents.shape[0]]*len(df_chartevents.columns)
logical_datatypes = dtype_vals

In [None]:
# Create a DataFrame with metadata columns: one row per column of chartevents.
# `modes` is a Series labelled by column name, so pandas uses those labels as the
# index of this frame; the list inputs are matched to it by position.
# Every key appears exactly once (a repeated key in a dict literal silently
# overwrites the earlier entry, which hides copy-paste mistakes).
df_chartevents_final = pd.DataFrame({
    'File Name': file_name,
    'Column Name': column_name,
    'Row Count': row_count,
    'Unique Values': unique_vals,
    'Null Counts': null_counts,
    'Null Percentage': percent_nulls,
    'Logical Data Type': logical_datatypes,
    'Mode': modes,
    'Median': medians,
})

df_chartevents_final

In [None]:
# Get descriptive statistics 
df_chartevents_describe = df_chartevents.describe().transpose()
df_chartevents_describe

In [22]:
# Reset Index of Description table
df_chartevents_describe = df_chartevents_describe.reset_index(drop=False)

In [None]:
# Add 'Column Name' to Description table
df_chartevents_describe.columns= ['Column Name'] + list(df_chartevents_describe.columns[1:])
df_chartevents_describe

In [25]:
# Derive three spread measures from the describe() summary columns:
#   Range = max - min, Interquartile_Range = 75% - 25%, Variance = std squared
df_chartevents_describe["Range"] = df_chartevents_describe['max'].sub(df_chartevents_describe['min'])
df_chartevents_describe["Interquartile_Range"] = df_chartevents_describe['75%'].sub(df_chartevents_describe['25%'])
df_chartevents_describe["Variance"] = df_chartevents_describe['std'].pow(2)

In [None]:
# Merge
df_chartevents_final = df_chartevents_final.merge(df_chartevents_describe, how = 'left', on = 'Column Name')
df_chartevents_final

In [27]:
# Save
df_chartevents_final.to_csv("chartevents_descriptive.csv")

## Table: **admissions**

In [4]:
# Load csv
df_admissions = pd.read_csv("admissions.csv")

In [42]:
# Create Copy 
df_admissions2 = df_admissions.copy()

In [5]:
# Convert timestamps to date time 
df_admissions['admittime'] = pd.to_datetime(df_admissions['admittime'])
df_admissions['dischtime']= pd.to_datetime(df_admissions['dischtime'])
df_admissions['deathtime'] = pd.to_datetime(df_admissions['deathtime'])
df_admissions['edregtime']= pd.to_datetime(df_admissions['edregtime'])
df_admissions['edouttime']= pd.to_datetime(df_admissions['edouttime'])

In [None]:
# Distinct-value count for every column (NaN is not counted), followed by a flag
# per column that is True when the distinct count equals the number of rows,
# i.e. when no value in that column repeats.
unique_vals = df_admissions.nunique().tolist()
unique_vals_count = [n_distinct == len(df_admissions) for n_distinct in unique_vals]
print(unique_vals, unique_vals_count, sep="\n")

In [None]:
# Tally the nulls in each column: one row per column, holding its name and null count
df_admissions_null = df_admissions.isna().sum().reset_index().rename(columns={'index': 'name', 0: 'null_count'})

# Turn every null count into a share of the table's rows, rounded to one decimal and suffixed with "%"
df_admissions_null['percent_null'] = df_admissions_null['null_count'].map(lambda n_null: f"{round(n_null/df_admissions.shape[0]*100, 1)}%")
null_counts = df_admissions_null['null_count'].tolist()
percent_nulls = df_admissions_null['percent_null'].tolist()
# Preview the first five entries of each list
print(null_counts[:5])
print(percent_nulls[:5])

In [None]:
# Get data types for each column
dtype_vals = pd.DataFrame(df_admissions.dtypes)[0].tolist()
dtype_vals

In [None]:
# Calculate mode
modes = df_admissions.mode().iloc[0,]
modes

In [None]:
# Calculate median
# Text columns have no median and are reported as "NA". Both object columns and
# pandas' dedicated string dtypes count as text: a bare `dtype == "O"` comparison
# does not match the latter, and .median() raises TypeError on them.
medians = [
    "NA"
    if (pd.api.types.is_object_dtype(df_admissions[col].dtype)
        or pd.api.types.is_string_dtype(df_admissions[col].dtype))
    else df_admissions[col].median()
    for col in df_admissions.columns
]
medians

In [9]:
# Create extra columns with table information
file_name = ['admissions.csv']*len(df_admissions.columns)
column_name = df_admissions.columns
row_count = [df_admissions.shape[0]]*len(df_admissions.columns)
logical_datatypes = dtype_vals

In [None]:
# Create a DataFrame with metadata columns
df_admissions_final = pd.DataFrame({'File Name':file_name, 'Column Name':column_name, 'Row Count':row_count, 
                        'Unique Values':unique_vals, 'Null Counts':null_counts, 'Null Percentage':percent_nulls,
                        'Logical Data Type':logical_datatypes, 'Mode':modes, 'Median':medians})

df_admissions_final

In [None]:
# Get descriptive statistics 
df_admissions_describe = df_admissions.describe().transpose()
df_admissions_describe

In [98]:
# Reset Index of Description table
df_admissions_describe = df_admissions_describe.reset_index(drop=False)

In [None]:
# Add 'Column Name' to Description Table
df_admissions_describe.columns= ['Column Name'] + list(df_admissions_describe.columns[1:])
df_admissions_describe

In [100]:
# Derive three spread measures from the describe() summary columns:
#   Range = max - min, Interquartile_Range = 75% - 25%, Variance = std squared
df_admissions_describe["Range"] = df_admissions_describe['max'].sub(df_admissions_describe['min'])
df_admissions_describe["Interquartile_Range"] = df_admissions_describe['75%'].sub(df_admissions_describe['25%'])
df_admissions_describe["Variance"] = df_admissions_describe['std'].pow(2)

In [None]:
# Merge
df_admissions_final = df_admissions_final.merge(df_admissions_describe, how = 'left', on = 'Column Name')
df_admissions_final

In [105]:
# Save
df_admissions_final.to_csv("admissions_descriptive.csv")

### Quality Check: admittime, dischtime 

In [None]:
# Initial check: is_valid is True when no admission has an admittime later than
# its dischtime. The two columns are compared in one vectorised step instead of
# row by row, so the check does not rely on the index being 0..n-1.
# Rows with a missing timestamp compare as False and are not counted as violations.
is_valid = not (df_admissions['admittime'] > df_admissions['dischtime']).any()
print(is_valid)

In [None]:
# Add column to df recording whether each row's admittime/dischtime pair is consistent:
#   True  -> admittime is not after dischtime
#   False -> admittime is after dischtime (the erroneous rows)
# NOTE: despite the column name, the erroneous rows carry False; this polarity is
# kept so that existing `== False` filters on the column keep selecting them.
# One vectorised assignment is used because element-wise chained assignment
# (df[col][x] = ...) triggers SettingWithCopyWarning and does not write back to
# the frame when pandas copy-on-write is active.
df_admissions['admittime_error'] = ~(df_admissions['admittime'] > df_admissions['dischtime'])

In [None]:
# Keep the rows whose admittime_error flag is False and print their hadm_id
# values in full (no row truncation while printing)
filtered_df = df_admissions.loc[df_admissions['admittime_error'].eq(False)]
with pd.option_context('display.max_rows', None):
    print(filtered_df['hadm_id'])

## Table: **diagnoses_icd**

In [3]:
# Load csv
df_diagnoses_icd = pd.read_csv("diagnoses_icd.csv")

In [4]:
# Create Copy 
df_diagnoses_icd2 = df_diagnoses_icd.copy()

In [None]:
# Distinct-value count for every column (NaN is not counted), followed by a flag
# per column that is True when the distinct count equals the number of rows,
# i.e. when no value in that column repeats.
unique_vals = df_diagnoses_icd.nunique().tolist()
unique_vals_count = [n_distinct == len(df_diagnoses_icd) for n_distinct in unique_vals]
print(unique_vals, unique_vals_count, sep="\n")

In [None]:
# Tally the nulls in each column: one row per column, holding its name and null count
df_diagnoses_icd_null = df_diagnoses_icd.isna().sum().reset_index().rename(columns={'index': 'name', 0: 'null_count'})

# Turn every null count into a share of the table's rows, rounded to one decimal and suffixed with "%"
df_diagnoses_icd_null['percent_null'] = df_diagnoses_icd_null['null_count'].map(lambda n_null: f"{round(n_null/df_diagnoses_icd.shape[0]*100, 1)}%")
null_counts = df_diagnoses_icd_null['null_count'].tolist()
percent_nulls = df_diagnoses_icd_null['percent_null'].tolist()
# Preview the first five entries of each list
print(null_counts[:5])
print(percent_nulls[:5])

In [None]:
# Get data types for each column
dtype_vals = pd.DataFrame(df_diagnoses_icd.dtypes)[0].tolist()
dtype_vals

In [None]:
# Calculate mode
modes = df_diagnoses_icd.mode().iloc[0,]
modes

In [None]:
# Calculate median
# Text columns have no median and are reported as "NA". Both object columns and
# pandas' dedicated string dtypes count as text: a bare `dtype == "O"` comparison
# does not match the latter, and .median() raises TypeError on them.
medians = [
    "NA"
    if (pd.api.types.is_object_dtype(df_diagnoses_icd[col].dtype)
        or pd.api.types.is_string_dtype(df_diagnoses_icd[col].dtype))
    else df_diagnoses_icd[col].median()
    for col in df_diagnoses_icd.columns
]
medians

In [10]:
# Create extra columns with table information
file_name = ['diagnoses_icd.csv']*len(df_diagnoses_icd.columns)
column_name = df_diagnoses_icd.columns
row_count = [df_diagnoses_icd.shape[0]]*len(df_diagnoses_icd.columns)
logical_datatypes = dtype_vals

In [None]:
# Create a DataFrame with metadata columns
df_diagnoses_icd_final = pd.DataFrame({'File Name':file_name, 'Column Name':column_name, 'Row Count':row_count, 
                        'Unique Values':unique_vals, 'Null Counts':null_counts, 'Null Percentage':percent_nulls,
                        'Logical Data Type':logical_datatypes, 'Mode':modes, 'Median':medians})

df_diagnoses_icd_final

In [None]:
# Get descriptive statistics 
df_diagnoses_icd_describe = df_diagnoses_icd.describe().transpose()
df_diagnoses_icd_describe

In [13]:
# Reset Index #
df_diagnoses_icd_describe = df_diagnoses_icd_describe.reset_index(drop=False)

In [None]:
# Change to "Column Name"
df_diagnoses_icd_describe.columns= ['Column Name'] + list(df_diagnoses_icd_describe.columns[1:])
df_diagnoses_icd_describe

In [15]:
# Derive three spread measures from the describe() summary columns:
#   Range = max - min, Interquartile_Range = 75% - 25%, Variance = std squared
df_diagnoses_icd_describe["Range"] = df_diagnoses_icd_describe['max'].sub(df_diagnoses_icd_describe['min'])
df_diagnoses_icd_describe["Interquartile_Range"] = df_diagnoses_icd_describe['75%'].sub(df_diagnoses_icd_describe['25%'])
df_diagnoses_icd_describe["Variance"] = df_diagnoses_icd_describe['std'].pow(2)

In [None]:
# Merge
df_diagnoses_icd_final = df_diagnoses_icd_final.merge(df_diagnoses_icd_describe, how = 'left', on = 'Column Name')
df_diagnoses_icd_final

In [17]:
df_diagnoses_icd_final.to_csv("diagnoses_icd_descriptive.csv")

## Table: **d_items**

In [124]:
# Load csv
df_d_items = pd.read_csv("d_items.csv")

In [125]:
# Create Copy 
df_d_items2 = df_d_items.copy()

In [None]:
# Distinct-value count for every column (NaN is not counted), followed by a flag
# per column that is True when the distinct count equals the number of rows,
# i.e. when no value in that column repeats.
unique_vals = df_d_items.nunique().tolist()
unique_vals_count = [n_distinct == len(df_d_items) for n_distinct in unique_vals]
print(unique_vals, unique_vals_count, sep="\n")

In [None]:
# Tally the nulls in each column: one row per column, holding its name and null count
df_d_items_null = df_d_items.isna().sum().reset_index().rename(columns={'index': 'name', 0: 'null_count'})

# Turn every null count into a share of the table's rows, rounded to one decimal and suffixed with "%"
df_d_items_null['percent_null'] = df_d_items_null['null_count'].map(lambda n_null: f"{round(n_null/df_d_items.shape[0]*100, 1)}%")
null_counts = df_d_items_null['null_count'].tolist()
percent_nulls = df_d_items_null['percent_null'].tolist()

In [None]:
# Get data types for each column
dtype_vals = pd.DataFrame(df_d_items.dtypes)[0].tolist()
dtype_vals

In [None]:
# Calculate mode
modes = df_d_items.mode().iloc[0,]
modes

In [None]:
# Calculate median
# Text columns have no median and are reported as "NA". Both object columns and
# pandas' dedicated string dtypes count as text: a bare `dtype == "O"` comparison
# does not match the latter, and .median() raises TypeError on them.
medians = [
    "NA"
    if (pd.api.types.is_object_dtype(df_d_items[col].dtype)
        or pd.api.types.is_string_dtype(df_d_items[col].dtype))
    else df_d_items[col].median()
    for col in df_d_items.columns
]
medians

In [131]:
# Create extra columns with table information
file_name = ['d_items.csv']*len(df_d_items.columns)
column_name = df_d_items.columns
row_count = [df_d_items.shape[0]]*len(df_d_items.columns)
logical_datatypes = dtype_vals

In [None]:
# Create a DataFrame with metadata columns
df_d_items_final = pd.DataFrame({'File Name':file_name, 'Column Name':column_name, 'Row Count':row_count, 
                        'Unique Values':unique_vals, 'Null Counts':null_counts, 'Null Percentage':percent_nulls,
                        'Logical Data Type':logical_datatypes, 'Mode':modes, 'Median':medians})

df_d_items_final

In [None]:
# Get descriptive statistics 
df_d_items_describe = df_d_items.describe().transpose()
df_d_items_describe

In [134]:
# Reset Index of Description table
df_d_items_describe = df_d_items_describe.reset_index(drop=False)

In [None]:
# Add 'Column Name' to description table 
df_d_items_describe.columns= ['Column Name'] + list(df_d_items_describe.columns[1:])
df_d_items_describe

In [136]:
# Derive three spread measures from the describe() summary columns:
#   Range = max - min, Interquartile_Range = 75% - 25%, Variance = std squared
df_d_items_describe["Range"] = df_d_items_describe['max'].sub(df_d_items_describe['min'])
df_d_items_describe["Interquartile_Range"] = df_d_items_describe['75%'].sub(df_d_items_describe['25%'])
df_d_items_describe["Variance"] = df_d_items_describe['std'].pow(2)

In [None]:
# Merge
df_d_items_final = df_d_items_final.merge(df_d_items_describe, how = 'left', on = 'Column Name')
df_d_items_final

In [138]:
df_d_items_final.to_csv("d_items_descriptive.csv")

## Table: **icustays**

In [4]:
# Load csv
df_icustays = pd.read_csv("icustays.csv")

In [5]:
# Create Copy 
df_icustays2 = df_icustays.copy()

In [6]:
# Convert timestamps to date time 
df_icustays['intime'] = pd.to_datetime(df_icustays['intime'])
df_icustays['outtime']= pd.to_datetime(df_icustays['outtime'])


In [None]:
# Distinct-value count for every column (NaN is not counted), followed by a flag
# per column that is True when the distinct count equals the number of rows,
# i.e. when no value in that column repeats.
unique_vals = df_icustays.nunique().tolist()
unique_vals_count = [n_distinct == len(df_icustays) for n_distinct in unique_vals]
print(unique_vals, unique_vals_count, sep="\n")

In [None]:
# Tally the nulls in each column: one row per column, holding its name and null count
df_icustays_null = df_icustays.isna().sum().reset_index().rename(columns={'index': 'name', 0: 'null_count'})

# Turn every null count into a share of the table's rows, rounded to one decimal and suffixed with "%"
df_icustays_null['percent_null'] = df_icustays_null['null_count'].map(lambda n_null: f"{round(n_null/df_icustays.shape[0]*100, 1)}%")
null_counts = df_icustays_null['null_count'].tolist()
percent_nulls = df_icustays_null['percent_null'].tolist()
# Preview the first five entries of each list
print(null_counts[:5])
print(percent_nulls[:5])

In [None]:
# Get data types for each column
dtype_vals = pd.DataFrame(df_icustays.dtypes)[0].tolist()
dtype_vals

In [None]:
# Calculate mode
modes = df_icustays.mode().iloc[0,]
modes

In [None]:
# Calculate median
# Text columns have no median and are reported as "NA". Both object columns and
# pandas' dedicated string dtypes count as text: a bare `dtype == "O"` comparison
# does not match the latter, and .median() raises TypeError on them.
medians = [
    "NA"
    if (pd.api.types.is_object_dtype(df_icustays[col].dtype)
        or pd.api.types.is_string_dtype(df_icustays[col].dtype))
    else df_icustays[col].median()
    for col in df_icustays.columns
]
medians

In [12]:
# Create extra columns with table information
file_name = ['icustays.csv']*len(df_icustays.columns)
column_name = df_icustays.columns
row_count = [df_icustays.shape[0]]*len(df_icustays.columns)
logical_datatypes = dtype_vals

In [None]:
# Create a DataFrame with metadata columns
df_icustays_final = pd.DataFrame({'File Name':file_name, 'Column Name':column_name, 'Row Count':row_count, 
                        'Unique Values':unique_vals, 'Null Counts':null_counts, 'Null Percentage':percent_nulls,
                        'Logical Data Type':logical_datatypes, 'Mode':modes, 'Median':medians})

df_icustays_final

In [None]:
# Get descriptive statistics 
df_icustays_describe = df_icustays.describe().transpose()
df_icustays_describe

In [15]:
# Reset Index of Description table
df_icustays_describe = df_icustays_describe.reset_index(drop=False)

In [None]:
# Add 'Column Name' to Description table
df_icustays_describe.columns= ['Column Name'] + list(df_icustays_describe.columns[1:])
df_icustays_describe

In [17]:
# Derive three spread measures from the describe() summary columns:
#   Range = max - min, Interquartile_Range = 75% - 25%, Variance = std squared
df_icustays_describe["Range"] = df_icustays_describe['max'].sub(df_icustays_describe['min'])
df_icustays_describe["Interquartile_Range"] = df_icustays_describe['75%'].sub(df_icustays_describe['25%'])
df_icustays_describe["Variance"] = df_icustays_describe['std'].pow(2)

In [None]:
# Merge
df_icustays_final = df_icustays_final.merge(df_icustays_describe, how = 'left', on = 'Column Name')
df_icustays_final

In [19]:
df_icustays_final.to_csv("icustays_descriptive.csv")

## Table: **phecode_icd9** - Secondary Dataset

In [58]:
# Load csv
df_phecode_icd9 = pd.read_csv("phecode_icd9.csv")

In [59]:
# Create Copy
# Keep an untouched backup under its own name, as done for the other tables;
# assigning the copy back to df_phecode_icd9 would leave no separate backup.
df_phecode_icd9_2 = df_phecode_icd9.copy()

In [None]:
# Distinct-value count for every column (NaN is not counted), followed by a flag
# per column that is True when the distinct count equals the number of rows,
# i.e. when no value in that column repeats.
unique_vals = df_phecode_icd9.nunique().tolist()
unique_vals_count = [n_distinct == len(df_phecode_icd9) for n_distinct in unique_vals]
print(unique_vals, unique_vals_count, sep="\n")

In [None]:
# Tally the nulls in each column: one row per column, holding its name and null count
df_phecode_icd9_null = df_phecode_icd9.isna().sum().reset_index().rename(columns={'index': 'name', 0: 'null_count'})

# Turn every null count into a share of the table's rows, rounded to one decimal and suffixed with "%"
df_phecode_icd9_null['percent_null'] = df_phecode_icd9_null['null_count'].map(lambda n_null: f"{round(n_null/df_phecode_icd9.shape[0]*100, 1)}%")
null_counts = df_phecode_icd9_null['null_count'].tolist()
percent_nulls = df_phecode_icd9_null['percent_null'].tolist()
# Preview the first five entries of each list
print(null_counts[:5])
print(percent_nulls[:5])

In [None]:
# Get data types for each column
dtype_vals = pd.DataFrame(df_phecode_icd9.dtypes)[0].tolist()
dtype_vals

In [None]:
# Calculate mode
modes = df_phecode_icd9.mode().iloc[0,]
modes

In [None]:
# Calculate median
# Text columns have no median and are reported as "NA". Both object columns and
# pandas' dedicated string dtypes count as text: a bare `dtype == "O"` comparison
# does not match the latter, and .median() raises TypeError on them.
medians = [
    "NA"
    if (pd.api.types.is_object_dtype(df_phecode_icd9[col].dtype)
        or pd.api.types.is_string_dtype(df_phecode_icd9[col].dtype))
    else df_phecode_icd9[col].median()
    for col in df_phecode_icd9.columns
]
medians

In [65]:
# Create extra columns with table information
file_name = ['phecode_icd9.csv']*len(df_phecode_icd9.columns)
column_name = df_phecode_icd9.columns
row_count = [df_phecode_icd9.shape[0]]*len(df_phecode_icd9.columns)
logical_datatypes = dtype_vals

In [None]:
# Create a DataFrame with metadata columns
df_phecode_icd9_final = pd.DataFrame({'File Name':file_name, 'Column Name':column_name, 'Row Count':row_count, 
                        'Unique Values':unique_vals, 'Null Counts':null_counts, 'Null Percentage':percent_nulls,
                        'Logical Data Type':logical_datatypes, 'Mode':modes, 'Median':medians})

df_phecode_icd9_final

In [None]:
# Get descriptive statistics 
df_phecode_icd9_describe = df_phecode_icd9.describe().transpose()
df_phecode_icd9_describe

In [68]:
# Reset Index of Description table
df_phecode_icd9_describe = df_phecode_icd9_describe.reset_index(drop=False)

In [None]:
# Add 'Column Name' to Description table
df_phecode_icd9_describe.columns= ['Column Name'] + list(df_phecode_icd9_describe.columns[1:])
df_phecode_icd9_describe

In [70]:
# Derive three spread measures from the describe() summary columns:
#   Range = max - min, Interquartile_Range = 75% - 25%, Variance = std squared
df_phecode_icd9_describe["Range"] = df_phecode_icd9_describe['max'].sub(df_phecode_icd9_describe['min'])
df_phecode_icd9_describe["Interquartile_Range"] = df_phecode_icd9_describe['75%'].sub(df_phecode_icd9_describe['25%'])
df_phecode_icd9_describe["Variance"] = df_phecode_icd9_describe['std'].pow(2)

In [None]:
# Merge
df_phecode_icd9_final = df_phecode_icd9_final.merge(df_phecode_icd9_describe, how = 'left', on = 'Column Name')
df_phecode_icd9_final

In [72]:
df_phecode_icd9_final.to_csv("phecode_icd9_descriptive.csv")

## Table: **phecode_icd10** - Secondary Dataset

In [43]:
# Load csv
df_phecode_icd10 = pd.read_csv("phecode_icd10.csv")

In [None]:
# Distinct-value count for every column (NaN is not counted), followed by a flag
# per column that is True when the distinct count equals the number of rows,
# i.e. when no value in that column repeats.
unique_vals = df_phecode_icd10.nunique().tolist()
unique_vals_count = [n_distinct == len(df_phecode_icd10) for n_distinct in unique_vals]
print(unique_vals, unique_vals_count, sep="\n")

In [None]:
# Tally the nulls in each column: one row per column, holding its name and null count
df_phecode_icd10_null = df_phecode_icd10.isna().sum().reset_index().rename(columns={'index': 'name', 0: 'null_count'})

# Turn every null count into a share of the table's rows, rounded to one decimal and suffixed with "%"
df_phecode_icd10_null['percent_null'] = df_phecode_icd10_null['null_count'].map(lambda n_null: f"{round(n_null/df_phecode_icd10.shape[0]*100, 1)}%")
null_counts = df_phecode_icd10_null['null_count'].tolist()
percent_nulls = df_phecode_icd10_null['percent_null'].tolist()
# Preview the first five entries of each list
print(null_counts[:5])
print(percent_nulls[:5])

In [None]:
# Get data types for each column
dtype_vals = pd.DataFrame(df_phecode_icd10.dtypes)[0].tolist()
dtype_vals

In [None]:
# Calculate mode
modes = df_phecode_icd10.mode().iloc[0,]
modes

In [None]:
# Calculate median
# Text columns have no median and are reported as "NA". Both object columns and
# pandas' dedicated string dtypes count as text: a bare `dtype == "O"` comparison
# does not match the latter, and .median() raises TypeError on them.
medians = [
    "NA"
    if (pd.api.types.is_object_dtype(df_phecode_icd10[col].dtype)
        or pd.api.types.is_string_dtype(df_phecode_icd10[col].dtype))
    else df_phecode_icd10[col].median()
    for col in df_phecode_icd10.columns
]
medians

In [49]:
# Create extra columns with table information
# The file label is the name of the CSV this table was loaded from
# ("phecode_icd10.csv"), repeated once per column of the table.
file_name = ['phecode_icd10.csv']*len(df_phecode_icd10.columns)
column_name = df_phecode_icd10.columns
# Total row count of the table, repeated once per column
row_count = [df_phecode_icd10.shape[0]]*len(df_phecode_icd10.columns)
logical_datatypes = dtype_vals

In [None]:
# Create a DataFrame with metadata columns
df_phecode_icd10_final = pd.DataFrame({'File Name':file_name, 'Column Name':column_name, 'Row Count':row_count, 
                        'Unique Values':unique_vals, 'Null Counts':null_counts, 'Null Percentage':percent_nulls,
                        'Logical Data Type':logical_datatypes, 'Mode':modes, 'Median':medians})

df_phecode_icd10_final

In [None]:
# Get descriptive statistics 
df_phecode_icd10_describe = df_phecode_icd10.describe().transpose()
df_phecode_icd10_describe

In [52]:
# Reset Index of Description table
df_phecode_icd10_describe = df_phecode_icd10_describe.reset_index(drop=False)

In [None]:
# Add 'Column Name' to Description table
df_phecode_icd10_describe.columns= ['Column Name'] + list(df_phecode_icd10_describe.columns[1:])
df_phecode_icd10_describe

In [54]:
# Derive three spread measures from the describe() summary columns:
#   Range = max - min, Interquartile_Range = 75% - 25%, Variance = std squared
df_phecode_icd10_describe["Range"] = df_phecode_icd10_describe['max'].sub(df_phecode_icd10_describe['min'])
df_phecode_icd10_describe["Interquartile_Range"] = df_phecode_icd10_describe['75%'].sub(df_phecode_icd10_describe['25%'])
df_phecode_icd10_describe["Variance"] = df_phecode_icd10_describe['std'].pow(2)

In [None]:
# Merge
df_phecode_icd10_final = df_phecode_icd10_final.merge(df_phecode_icd10_describe, how = 'left', on = 'Column Name')
df_phecode_icd10_final

In [57]:
df_phecode_icd10_final.to_csv("phecode_icd10_descriptive.csv")