In [1]:
import numpy as np
import pandas as pd
from equiflow import *

In [2]:
# Synthetic cohort used throughout the notebook. The global NumPy seed is
# fixed, and the columns are drawn in a fixed order (age, sofa, race, sex,
# english, var1..var10) so the generated frame is reproducible.
np.random.seed(42)
n = 100000

data = {}
data['age'] = np.random.randint(10, 80, size=n)
# NaN is one of the candidate values, so some SOFA scores are missing.
data['sofa'] = np.random.choice(
    [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 15, np.nan],
    size=n,
)
# None is one of the candidate values, so some race entries are missing.
data['race'] = np.random.choice(
    ['White', 'Black', 'Asian', 'Hispanic', None],
    size=n,
)
data['sex'] = np.random.choice(['Male', 'Female'], size=n)
# Two of the four candidate values (NaN and None) are nulls.
data['english'] = np.random.choice(
    ['Fluent', 'Limited', np.nan, None],
    size=n,
)

# Ten extra columns of standard-normal draws: var1 .. var10.
data.update({f'var{i}': np.random.randn(n) for i in range(1, 11)})

df = pd.DataFrame(data)

In [7]:
# Build the flow object on the full synthetic frame, declaring one
# categorical, one normal and one non-normal variable plus display names.
eqfl = EquiFlow(
    df,
    initial_cohort_label='MIMIC-IV',
    categorical=['english'],
    normal=['age'],
    nonnormal=['sofa'],
    rename={'sofa': 'SOFA', 'age': 'Age', 'english': 'English Proficiency'},
    missingness=True,
)

# Exclusion 1: passed as an already-filtered frame via `new_cohort`.
# (This cell also uses the boolean `mask=` form; see exclusion 2.)
has_english = df['english'].notna()
eqfl.add_exclusion(
    new_cohort=df.loc[has_english],
    exclusion_reason='missing English Proficiency',
    new_cohort_label='with English Proficiency data',
)

# Exclusion 2: passed as a boolean mask on age.
# NOTE(review): the mask is computed on the full `df`, not on the cohort left
# after exclusion 1 -- presumably equiflow aligns it by index; confirm.
is_adult = df['age'] >= 18
eqfl.add_exclusion(
    mask=is_adult,
    exclusion_reason='age < 18',
    new_cohort_label='adults only',
)

eqfl.plot_flows(legend=True)

In [4]:
# Display the cohort-flow table (initial / removed / resulting counts per
# step, as in the recorded output).
# NOTE(review): the recorded output shows a single "0 to 1" step although two
# exclusions are registered on `eqfl`, and the execution counts are out of
# order -- the output looks stale; re-run the notebook top to bottom.
eqfl.view_table_flows()

Cohort Flow,0 to 1
,
"Initial, n",100000.0
"Removed, n",49978.0
"Result, n",50022.0


In [5]:
# Display the per-cohort characteristics table for the variables configured
# on `eqfl`.
# NOTE(review): the recorded output lists "Race and Ethnicity", which the
# EquiFlow constructor in this notebook does not pass in `categorical` or
# `rename` -- the output appears to come from an earlier configuration.
eqfl.view_table_characteristics()

Unnamed: 0_level_0,Unnamed: 1_level_0,Cohort,Cohort
Unnamed: 0_level_1,Unnamed: 1_level_1,0,1
Variable,Value,Unnamed: 2_level_2,Unnamed: 3_level_2
Overall,,100000,50022
"Race and Ethnicity, N (%)",Hispanic,"20,024 (20.0)","10,011 (20.0)"
"Race and Ethnicity, N (%)",Asian,"19,952 (20.0)","9,918 (19.8)"
"Race and Ethnicity, N (%)",Black,"20,066 (20.1)","10,088 (20.2)"
"Race and Ethnicity, N (%)",White,"19,931 (19.9)","9,978 (19.9)"
"Race and Ethnicity, N (%)",Missing,"20,027 (20.0)","10,027 (20.0)"
"English Proficiency, N (%)",Fluent,"25,134 (25.1)","25,134 (50.2)"
"English Proficiency, N (%)",Limited,"24,888 (24.9)","24,888 (49.8)"
"English Proficiency, N (%)",Missing,"49,978 (50.0)",0 (0.0)


In [11]:
# Display the drift table between consecutive cohorts.
# NOTE(review): the exact effect of `drifts_by_class=False` is not visible
# from this notebook -- confirm against the equiflow documentation. The
# execution count (In [11]) is also out of sequence with neighbouring cells.
eqfl.view_table_drifts(drifts_by_class=False)

Unnamed: 0_level_0,Cohort Flow,0 to 1
Variable,Value,Unnamed: 2_level_1
Overall,,
Race and Ethnicity,Hispanic,0.0
Race and Ethnicity,Asian,0.0
Race and Ethnicity,Black,0.0
Race and Ethnicity,White,0.0
English Proficiency,Fluent,0.5
English Proficiency,Limited,0.5


In [None]:
# Three nested cohorts built by successive null filters:
#   data_0 -> copy of the full frame
#   data_1 -> rows of data_0 with a non-null `english`
#   data_2 -> rows of data_1 with a non-null `sofa`
data_0 = df.copy()

english_known = data_0['english'].notna()
data_1 = data_0[english_known]

sofa_known = data_1['sofa'].notna()
data_2 = data_1[sofa_known]

# Left disabled in the original cell:
# ef = EquiFlow(dfs = [data_0, data_1, data_2])



: 

: 

: 

: 

: 

: 

In [None]:
# Flow table across the three nested cohorts, built directly from the list
# of frames. The last expression displays the resulting table.
flow_summary = TableFlows(
    dfs=[data_0, data_1, data_2],
    label_suffix=True,
    thousands_sep=False,
)
flow_summary.table

Cohort Flow,0 to 1,1 to 2
,,
"Inital, n",100000.0,50022.0
"Removed, n",49978.0,3874.0
"Result, n",50022.0,46148.0


: 

: 

: 

: 

: 

: 

In [None]:
# Characteristics table across the three nested cohorts, covering all five
# demographic/clinical variables with display names applied via `rename`.
# Per the original author's note, an empty `categorical` list is currently
# not supported.
characteristics = TableCharacteristics(
    dfs=[data_0, data_1, data_2],
    categorical=['race', 'sex', 'english'],
    nonnormal=['sofa'],
    normal=['age'],
    format_cat='N (%)',
    format_normal='Mean',
    format_nonnormal='Median [IQR]',
    missingness=True,
    decimals=1,
    label_suffix=True,
    thousands_sep=True,
    rename={
        'race': 'Race and Ethnicity',
        'english': 'English Proficiency',
        'sex': 'Sex',
        'sofa': 'SOFA',
        'age': 'Age',
    },
)
characteristics.table

Unnamed: 0_level_0,Unnamed: 1_level_0,Cohort,Cohort,Cohort
Unnamed: 0_level_1,Unnamed: 1_level_1,0,1,2
Variable,Value,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Overall,,100000,50022,46148
"Race and Ethnicity, N (%)",Hispanic,"20,024 (20.0)","10,011 (20.0)","9,197 (19.9)"
"Race and Ethnicity, N (%)",Asian,"19,952 (20.0)","9,918 (19.8)","9,145 (19.8)"
"Race and Ethnicity, N (%)",Black,"20,066 (20.1)","10,088 (20.2)","9,323 (20.2)"
"Race and Ethnicity, N (%)",White,"19,931 (19.9)","9,978 (19.9)","9,218 (20.0)"
"Race and Ethnicity, N (%)",Missing,"20,027 (20.0)","10,027 (20.0)","9,265 (20.1)"
"Sex, N (%)",Male,"50,052 (50.1)","25,049 (50.1)","23,089 (50.0)"
"Sex, N (%)",Female,"49,948 (49.9)","24,973 (49.9)","23,059 (50.0)"
"Sex, N (%)",Missing,0 (0.0),0 (0.0),0 (0.0)
"English Proficiency, N (%)",Fluent,"25,134 (25.1)","25,134 (50.2)","23,223 (50.3)"


: 

: 

: 

: 

: 

: 

In [None]:
# Drift table across the three nested cohorts, restricted to the categorical
# variables: the continuous lists are passed empty. The `rename` mapping
# still carries the SOFA and Age entries, as in the original call.
drift_summary = TableDrifts(
    dfs=[data_0, data_1, data_2],
    categorical=['race', 'sex', 'english'],
    nonnormal=[],
    normal=[],
    decimals=3,
    rename={
        'race': 'Race and Ethnicity',
        'english': 'English Proficiency',
        'sex': 'Sex',
        'sofa': 'SOFA',
        'age': 'Age',
    },
)
drift_summary.table

Unnamed: 0_level_0,Cohort Flow,0 to 1,1 to 2
Variable,Value,Unnamed: 2_level_1,Unnamed: 3_level_1
Overall,,,
Race and Ethnicity,Hispanic,0.0,0.002
Race and Ethnicity,Asian,0.003,0.0
Race and Ethnicity,Black,0.003,0.001
Race and Ethnicity,White,0.0,0.001
Sex,Male,0.0,0.001
Sex,Female,0.0,0.001
English Proficiency,Fluent,0.537,0.002
English Proficiency,Limited,0.532,0.002


: 

: 

: 

: 

: 

: 

: 

: 

: 

: 

: 

: 