In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt


%load_ext autoreload
%autoreload 2



In [52]:
states = ['Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California', 'Colorado', 'Connecticut', 'Delaware', 'District of Columbia', 'Florida', 'Georgia', 'Hawaii', 'Idaho', 'Illinois', 'Indiana', 'Iowa', 'Kansas', 'Kentucky', 'Louisiana', 'Maine', 'Maryland', 'Massachusetts', 'Michigan', 'Minnesota', 'Mississippi', 'Missouri', 'Montana', 'Nebraska', 'Nevada', 'New Hampshire', 'New Jersey', 'New Mexico', 'New York', 'North Carolina', 'North Dakota', 'Ohio', 'Oklahoma', 'Oregon', 'Pennsylvania', 'Puerto Rico', 'Rhode Island', 'South Carolina', 'South Dakota', 'Tennessee', 'Texas', 'Utah', 'Vermont', 'Virginia', 'Washington', 'West Virginia', 'Wisconsin', 'Wyoming']

#list of all human ehtnicities
ethnicity = ['Asian', 'Black/African American', "American Indian/Alaska native", "Pacific islander", 'Hispanic', 'White']

data = pd.DataFrame(index = range(10000))
data = (
    data
    .assign(
            Sex = np.random.choice(['male', 'female'], size=data.shape[0]),
            age = np.random.randint(low = 18, high = 85, size=data.shape[0]),
            Ethnicity = np.random.choice(ethnicity, size=data.shape[0]),
            state = np.random.choice(states, size=data.shape[0]),
            height_cm = np.random.normal(170, scale = 5, size=data.shape[0]).round(0),
            weight_lb = np.random.normal(180, scale = 5, size=data.shape[0]).round(0),
            systolic_bp = np.random.normal(120, scale = 10, size=data.shape[0]).round(0),
            diastolic_bp = np.random.normal(80, scale = 5, size=data.shape[0]).round(0),
            resting_heart_rate = np.random.normal(75, scale = 8, size=data.shape[0]).round(0),
            fasting_blood_glucose = np.random.normal(100, scale = 10, size=data.shape[0]).round(0),
            fasting_triglycerides = np.random.normal(150, scale = 10, size=data.shape[0]).round(0),
            Alzeimers = np.random.choice(['yes','no', 'no'], size=data.shape[0]),
            PD_risk_score = np.random.normal(0, scale = 1, size=data.shape[0]).round(1),
            hours_of_sleep = np.random.randint(4,10, size=data.shape[0]),
            stress_level = np.random.choice(['low','slightly elevated', 'average', 'high', 'over whelming'], size=data.shape[0])
            )
    .rename(columns = {"PD_risk_score": "PD risk score", "hours_of_sleep": "hours of sleep", "stress_level": "stress level"})
    .to_csv('/Users/michaelschaid/GitHub/CNC_pandas_workshop/data/fake_clinical_data.csv', index=False)

    
    
    
)

In [77]:
df = pd.read_csv('/Users/michaelschaid/GitHub/CNC_pandas_workshop/data/fake_clinical_data.csv')
df.head()

Unnamed: 0,Sex,age,Ethnicity,state,height_cm,weight_lb,systolic_bp,diastolic_bp,resting_heart_rate,fasting_blood_glucose,fasting_triglycerides,Alzeimers,PD risk score,hours of sleep,stress level
0,male,62,Hispanic,Texas,167.0,173.0,117.0,83.0,82.0,87.0,138.0,no,0.1,8,slightly elevated
1,male,68,Black/African American,Puerto Rico,175.0,178.0,111.0,94.0,80.0,111.0,138.0,no,1.1,7,high
2,male,44,Black/African American,Maryland,167.0,185.0,116.0,81.0,83.0,102.0,142.0,no,1.0,6,high
3,female,49,Asian,New Hampshire,163.0,182.0,116.0,80.0,76.0,84.0,131.0,yes,-0.2,9,slightly elevated
4,female,67,American Indian/Alaska native,Louisiana,172.0,185.0,115.0,87.0,92.0,111.0,162.0,yes,-1.4,7,average


In [80]:

    
def clean_data(df:pd.DataFrame) -> pd.DataFrame:
    stress_cat =  pd.CategoricalDtype(categories=['low','slightly elevated', 'average', 'high', 'over whelming'], ordered=True) 
    bp_cat = pd.CategoricalDtype(categories=['normal', 'prehypertension', 'hypertension', 'hypertensive crisis'], ordered=True)
    
    def calc_bmi(df):
        return df.weight_kg / ((df.height_cm/100)**2)

    return (df
    .rename(columns = lambda c: c.lower().replace(' ', '_'))
    .assign(sex = lambda df_: df_.sex.astype('category'),
            ethnicity = lambda df_: df_.ethnicity.astype('category'), 
            state = lambda df_: df_.state.astype('category'),
            age = lambda df_: df_.age.astype('int32'),
            height_cm = lambda df_: df_.height_cm.astype('float16'),
            weight_kg = lambda df_: df_.weight_lb.astype('float16')/2.204,
            bmi = lambda df_: calc_bmi(df_),
            systolic_bp = lambda df_: df_.systolic_bp.astype('int32'),
            diastolic_bp = lambda df_: df_.diastolic_bp.astype('int32'),
            hypertension_category = lambda df_: pd.cut(df_.systolic_bp,
                                                       bins = [0, 120, 139, 140, 800],
                                                       labels = ['normal', 'prehypertension', 'hypertension', 'hypertensive crisis']).astype(bp_cat),
            resting_heart_rate = lambda df_: df_.resting_heart_rate.astype('int8'),
            fasting_blood_glucose = lambda df_: df_.fasting_blood_glucose.astype('int32'),
            fasting_triglycerides = lambda df_: df_.fasting_triglycerides.astype('int32'),
            alzeimers = lambda df_: df_.alzeimers.replace({'yes': True, 'no': False}).astype('bool'), 
            hours_of_sleep = lambda df_: df_.hours_of_sleep.astype('int32'),
            stress_level = lambda df_: df_.stress_level.astype(stress_cat)
            )
    .drop(columns = ['weight_lb'])
    )
data = clean_data(df)
data

Unnamed: 0,sex,age,ethnicity,state,height_cm,systolic_bp,diastolic_bp,resting_heart_rate,fasting_blood_glucose,fasting_triglycerides,alzeimers,pd_risk_score,hours_of_sleep,stress_level,weight_kg,bmi,hypertension_category
0,male,62,Hispanic,Texas,167.0,117,83,82,87,138,False,0.1,8,slightly elevated,78.5000,28.140625,normal
1,male,68,Black/African American,Puerto Rico,175.0,111,94,80,111,138,False,1.1,7,high,80.8125,26.390625,normal
2,male,44,Black/African American,Maryland,167.0,116,81,83,102,142,False,1.0,6,high,84.0000,30.125000,normal
3,female,49,Asian,New Hampshire,163.0,116,80,76,84,131,True,-0.2,9,slightly elevated,82.6250,31.109375,normal
4,female,67,American Indian/Alaska native,Louisiana,172.0,115,87,92,111,162,True,-1.4,7,average,84.0000,28.406250,normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,male,64,Hispanic,Arizona,159.0,130,81,75,95,146,True,-0.4,6,average,79.4375,31.437500,prehypertension
9996,male,67,White,Minnesota,170.0,108,78,66,83,137,False,1.1,9,slightly elevated,84.4375,29.203125,normal
9997,female,47,Black/African American,Vermont,177.0,117,76,79,100,155,False,-1.3,5,slightly elevated,80.3125,25.656250,normal
9998,male,78,Pacific islander,Tennessee,165.0,141,69,69,98,165,False,-1.9,6,over whelming,84.8750,31.156250,hypertensive crisis


In [82]:
init_mem = df.memory_usage(deep=True).sum() 
clean_mem = data.memory_usage(deep=True).sum() 

print(f'initial memory usage: {init_mem/1e6} MB')
print(f'cleaned memory usage: {clean_mem/1e6} MB')

print(f'percent of raw data memory:{clean_mem/init_mem *100:.2f}%')

initial memory usage: 4.040229 MB
cleaned memory usage: 0.457433 MB
percent of raw data memory:11.32%
