In [1]:
import math
import pandas as pd

# Conectando ao Google Drive
from google.colab import drive
drive.mount('/content/drive', force_remount = True)

PATH = '/content/drive/MyDrive/MC936/P2/data/processed'
CLUSTERING = ['hierarchical', 'kmeans']

Mounted at /content/drive


# **Definição de Funções**

In [2]:
# Monta um DataFrame (DF) para cada arquivo CSV gerado com o resultado de uma
# clusterização. Cada DF contém somente as features de interesse para as análises 
# que serão realizadas a seguir. Também são gerados DFs transpostos para cada um 
# dos DFs citados.
#
# Parâmetros: parte do caminho para o arquivo CSV do cluster (path_scenario), 
# dicionário para armazenar os DFs das features de interesse (datasets) e  
# dicionário para armazenar os DFs transpostos (datasets_t).

def create_DFs(path_scenario, datasets, datasets_t):
    # Importação do conjunto de dados
    for cluster in CLUSTERING:
        csv_path = f'{path_scenario}_{cluster}_clustering.csv'
        datasets[cluster] = pd.read_csv(csv_path)
        
        # Remoção das features que não serão analisadas
        columns = ['PATIENT', 'RACE', 'ETHNICITY', 'GENDER', 'CITY', 
                  'STATE', 'AGE', 'COVID DIAGNOSIS DATE', 
                  'LAST ENCOUNTERS OR DIED DATE', 'TIME (DAYS)']

        if cluster == 'kmeans':
            columns.append('Silhouette')

        datasets[cluster] = datasets[cluster].drop(columns = columns)

        # Criação de um DF transposto
        datasets_t[cluster] = datasets[cluster].T
        datasets_t[cluster] = datasets_t[cluster].drop(index = 'Cluster')

        datasets_t[cluster] = datasets_t[cluster][[0, 1]]
        datasets_t[cluster][0] = 0
        datasets_t[cluster][1] = 0

In [3]:
# Preenche a frequência absoluta de cada valor booleano das features de 
# interesse contidas nos DFs transpostos.
#
# Parâmetros: dicionário com os DFs das features de interesse (datasets) e  
# dicionário com os DFs transpostos (datasets_t).

def fill_transposed_DF(datasets, datasets_t):
    for cluster in CLUSTERING:
        # Itera sobre cada uma das features de interesse
        for col in datasets[cluster].columns:
            if col != 'Cluster':
                df_cluster = datasets[cluster]
                indexes = df_cluster[col].value_counts().index.tolist()
                values = df_cluster[col].value_counts().values.tolist()

                # Preenche a contagem dos valores no DF transposto
                for i in range(len(indexes)):
                    index = math.floor(float(indexes[i]))
                    value = math.floor(float(values[i]))

                    datasets_t[cluster].loc[col, index] = value

In [4]:
# Monta um DF para cada um dos clusters de cada clusterização e também gera DFs
# transpostos para cada um deles.
#
# Parâmetros: dicionário com os DFs das features de interesse (datasets), 
# dicionários com os DFs de cada cluster das clusterizações (dfs_clusters_h e 
# dfs_clusters_k) e dicionários com os DFs transpostos de cada cluster das 
# clusterizações (dfs_clusters_h_t e dfs_clusters_k_t)

def create_DFs_for_clusters(datasets, dfs_clusters_h, dfs_clusters_h_t, 
                            dfs_clusters_k, dfs_clusters_k_t):
    for cluster in CLUSTERING:
        if cluster == 'kmeans':
            dfs = dfs_clusters_k
            dfs_t = dfs_clusters_k_t
        else:
            dfs = dfs_clusters_h
            dfs_t = dfs_clusters_h_t

        names = sorted(datasets[cluster]['Cluster'].unique().tolist())

        # Itera sobre cada um dos clusters da clusterização em questão
        for name in names:
            dfs[name] = datasets[cluster].query(f'Cluster == "{name}"')
            dfs[name] = dfs[name].reset_index(drop = True)

            # Criação de um DF transposto
            dfs_t[name] = dfs[name].T
            dfs_t[name] = dfs_t[name].drop(index = 'Cluster')
            
            dfs_t[name] = dfs_t[name][[0]]
            dfs_t[name][0] = 0
            dfs_t[name][1] = 0

In [5]:
# Preenche a frequência absoluta de cada valor booleano das features de 
# interesse contidas nos DFs transpostos de cada um dos clusters de cada
# clusterização.
#
# Parâmetros: dicionário com os DFs transpostos das features de interesse 
# (datasets_t), dicionários com os DFs de cada cluster das clusterizações 
# (dfs_clusters_h e dfs_clusters_k) e dicionários com os DFs transpostos de 
# cada cluster das clusterizações (dfs_clusters_h_t e dfs_clusters_k_t).

def fill_transposed_DFs_for_clusters(datasets_t, dfs_clusters_h, dfs_clusters_h_t, 
                                     dfs_clusters_k, dfs_clusters_k_t):
    for cluster in CLUSTERING:
        if cluster == 'kmeans':
            dfs = dfs_clusters_k
            dfs_t = dfs_clusters_k_t
        else:
            dfs = dfs_clusters_h
            dfs_t = dfs_clusters_h_t

        # Itera sobre cada um dos clusters da clusterização em questão
        for name in dfs.keys():
            for col in dfs[name].columns:
                if col != 'Cluster':
                    indexes = dfs[name][col].value_counts().index.tolist()
                    values = dfs[name][col].value_counts().values.tolist()

                    # Preenche a porcentagem dos valores em relação ao total no DF transposto
                    for i in range(len(indexes)):
                        index = math.floor(float(indexes[i]))
                        value = math.floor(float(values[i]))
                        value = (value / datasets_t[cluster].loc[col, index]) * 100

                        dfs_t[name].loc[col, index] = round(value, 2)

In [6]:
# Faz o merge dos DFs transpostos de cada cluster de uma clusterização 
# com o DF transposto das features de interesse da clusterização em questão.
#
# Parâmetros: dicionário com os DFs transpostos das features de interesse 
# (datasets_t), dicionários com os DFs transpostos de cada cluster das 
# clusterizações (dfs_clusters_h_t e dfs_clusters_k_t) e dicionário para
# armazenar os DFs resultantes dos merges.

def merge_transposed_DFs(datasets_t, dfs_clusters_h_t, dfs_clusters_k_t, dfs_features):
    for cluster in CLUSTERING:
        df_features = datasets_t[cluster]
        df_features = df_features.reset_index()
        df_features.columns = ['feature', '0', '1']

        if cluster == 'kmeans':
            dfs_t = dfs_clusters_k_t
        else:
            dfs_t = dfs_clusters_h_t

        # Itera sobre os clusters gerados pela clusterização em questão
        for name in dfs_t.keys():
            df_cluster = dfs_t[name]
            df_cluster = df_cluster.reset_index()
            df_cluster.columns = ['feature', f'[{name}] 0(%)', f'[{name}] 1(%)']
            df_features = df_features.merge(df_cluster, how = 'left', on = 'feature')

        dfs_features[cluster] = df_features

In [7]:
def main(scenario):
    path_csv = f'{PATH}/{scenario}'

    datasets = dict()
    datasets_t = dict()

    create_DFs(path_csv, datasets, datasets_t)
    fill_transposed_DF(datasets, datasets_t)

    # Clusterização Hierárquica
    dfs_clusters_h = dict()
    dfs_clusters_h_t = dict()

    # Clusterização K-means
    dfs_clusters_k = dict()
    dfs_clusters_k_t = dict()

    create_DFs_for_clusters(datasets, dfs_clusters_h, dfs_clusters_h_t, 
                            dfs_clusters_k, dfs_clusters_k_t)

    fill_transposed_DFs_for_clusters(datasets_t, dfs_clusters_h, dfs_clusters_h_t, 
                                     dfs_clusters_k, dfs_clusters_k_t)

    dfs_features = dict()

    merge_transposed_DFs(datasets_t, dfs_clusters_h_t, dfs_clusters_k_t, dfs_features)

    return dfs_features

# **Features de Interesse para as Análises Relacionadas ao Grupo de Risco**

In [8]:
risk_group = list()

# COVID Related
risk_group.append('DIED IN ONE MONTH')
risk_group.append('COVID-19')
risk_group.append('Face mask (physical object)')
risk_group.append('SARS-COV-2 (COVID-19) vaccine  mRNA  spike protein  LNP  preservative free  30 mcg/0.3mL dose')
risk_group.append('SARS-COV-2 (COVID-19) vaccine  mRNA  spike protein  LNP  preservative free  100 mcg/0.5mL dose')
risk_group.append('SARS-COV-2 (COVID-19) vaccine  vector non-replicating  recombinant spike protein-Ad26  preservative free  0.5 mL')
risk_group.append('Suspected COVID-19')

# High blood pressure
risk_group.append('Hypertension')

# Heart problems
risk_group.append('Cardiac Arrest')
risk_group.append('Coronary Heart Disease')
risk_group.append('Heart failure (disorder)')
risk_group.append('Heart failure education (procedure)')
risk_group.append('History of cardiac arrest (situation)')
risk_group.append('History of myocardial infarction (situation)')
risk_group.append('Injury of heart (disorder)')
risk_group.append('Myocardial Infarction')

# Lung problems
risk_group.append('Acute bronchitis (disorder)')
risk_group.append('Acute pulmonary embolism (disorder)')
risk_group.append('Acute respiratory distress syndrome (disorder)')
risk_group.append('Acute respiratory failure (disorder)')
risk_group.append('Childhood asthma')
risk_group.append('Chronic obstructive bronchitis (disorder)')
risk_group.append('Lung volume reduction surgery (procedure)')
risk_group.append('Oxygen administration by mask (procedure)')
risk_group.append('Pulmonary emphysema (disorder)')
risk_group.append('Pulmonary rehabilitation (regime/therapy)')
risk_group.append('Respiratory distress (finding)')

# Diabetes
risk_group.append('Diabetes')
risk_group.append('Diabetic renal disease (disorder)')
risk_group.append('Diabetic retinopathy associated with type II diabetes mellitus (disorder)')
risk_group.append('Hyperglycemia (disorder)')
risk_group.append('Prediabetes')

# Obesity
risk_group.append('Body mass index 30+ - obesity (finding)')
risk_group.append('Body mass index 40+ - severely obese (finding)')

# Cancer
risk_group.append('Acute myeloid leukemia  disease (disorder)')
risk_group.append('Carcinoma in situ of prostate (disorder)')
risk_group.append('Chemotherapy (procedure)')
risk_group.append('Febrile neutropenia (disorder)')
risk_group.append('Malignant neoplasm of breast (disorder)')
risk_group.append('Malignant tumor of colon')
risk_group.append('Neoplasm of prostate')
risk_group.append('Neutropenia (disorder)')
risk_group.append('Overlapping malignant neoplasm of colon')

# **Clusterização da Base de Dados *scenario01 + scenario02***

In [9]:
scenario = 'scenario01+02'

dfs_features = main(scenario)
dfs_rg_features = dict()       # Risk group features

# Exportação dos resultados
for cluster in CLUSTERING:
    df_features = dfs_features[cluster]
    df_features.to_csv(f'{PATH}/{scenario}_{cluster}_features.csv', 
                       encoding = 'utf-8-sig', index = False)

    df_rg_features = df_features.query(f'feature in {risk_group}')
    df_rg_features = df_features.query(f'feature in {risk_group}')
    df_rg_features = df_rg_features.reset_index(drop = True)
    df_rg_features.to_csv(f'{PATH}/{scenario}_{cluster}_risk_group_features.csv', 
                          encoding = 'utf-8-sig', index = False)
    dfs_rg_features[cluster] = df_rg_features

## Clusterização Hierárquica

In [10]:
cluster = 'hierarchical'

### Visualização das Proporções das Features por Cluster

In [11]:
dfs_features[cluster]

Unnamed: 0,feature,0,1,[C1] 0(%),[C1] 1(%),[C2] 0(%),[C2] 1(%),[C3] 0(%),[C3] 1(%),[C4] 0(%),...,[C5] 0(%),[C5] 1(%),[C6] 0(%),[C6] 1(%),[C7] 0(%),[C7] 1(%),[C8] 0(%),[C8] 1(%),[C9] 0(%),[C9] 1(%)
0,Acquired coagulation disorder (disorder),178,4,0.56,0.00,17.42,0.00,0.00,50.00,1.12,...,0.56,0.00,23.60,0.00,55.62,50.00,0.56,0.00,0.56,0.00
1,Acute allergic reaction,179,3,0.56,0.00,17.32,0.00,1.12,0.00,1.12,...,0.56,0.00,21.79,100.00,56.42,0.00,0.56,0.00,0.56,0.00
2,Acute bacterial sinusitis (disorder),172,10,0.58,0.00,17.44,10.00,1.16,0.00,1.16,...,0.58,0.00,23.84,10.00,54.07,80.00,0.58,0.00,0.58,0.00
3,Acute bronchitis (disorder),107,75,0.93,0.00,17.76,16.00,0.93,1.33,1.87,...,0.00,1.33,20.56,26.67,57.01,53.33,0.93,0.00,0.00,1.33
4,Acute deep venous thrombosis (disorder),173,9,0.58,0.00,17.34,11.11,0.58,11.11,1.16,...,0.58,0.00,23.12,22.22,55.49,55.56,0.58,0.00,0.58,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
278,Vaccination for diphtheria pertussis and tet...,148,34,0.00,2.94,0.00,91.18,1.35,0.00,1.35,...,0.68,0.00,28.38,0.00,67.57,2.94,0.00,2.94,0.68,0.00
279,X-ray or wrist,177,5,0.56,0.00,17.51,0.00,1.13,0.00,1.13,...,0.00,20.00,22.60,40.00,55.93,40.00,0.56,0.00,0.56,0.00
280,Tdap,159,23,0.63,0.00,18.87,4.35,1.26,0.00,1.26,...,0.63,0.00,14.47,82.61,61.64,13.04,0.63,0.00,0.63,0.00
281,Td (adult) preservative free,50,132,0.00,0.76,2.00,22.73,2.00,0.76,2.00,...,0.00,0.76,84.00,0.00,10.00,72.73,0.00,0.76,0.00,0.76


### Visualização das Proporções das Features Relacionadas ao Grupo de Risco por Cluster

In [12]:
dfs_rg_features[cluster]

Unnamed: 0,feature,0,1,[C1] 0(%),[C1] 1(%),[C2] 0(%),[C2] 1(%),[C3] 0(%),[C3] 1(%),[C4] 0(%),...,[C5] 0(%),[C5] 1(%),[C6] 0(%),[C6] 1(%),[C7] 0(%),[C7] 1(%),[C8] 0(%),[C8] 1(%),[C9] 0(%),[C9] 1(%)
0,Acute bronchitis (disorder),107,75,0.93,0.0,17.76,16.0,0.93,1.33,1.87,...,0.0,1.33,20.56,26.67,57.01,53.33,0.93,0.0,0.0,1.33
1,Acute myeloid leukemia disease (disorder),161,21,0.62,0.0,16.15,23.81,1.24,0.0,1.24,...,0.62,0.0,16.77,71.43,62.11,4.76,0.62,0.0,0.62,0.0
2,Acute pulmonary embolism (disorder),164,18,0.61,0.0,17.68,11.11,0.61,5.56,0.0,...,0.61,0.0,25.0,5.56,54.27,66.67,0.61,0.0,0.61,0.0
3,Acute respiratory distress syndrome (disorder),177,5,0.56,0.0,17.51,0.0,0.56,20.0,0.0,...,0.56,0.0,23.73,0.0,55.93,40.0,0.56,0.0,0.56,0.0
4,Acute respiratory failure (disorder),164,18,0.61,0.0,18.29,5.56,0.0,11.11,0.0,...,0.61,0.0,24.39,11.11,54.88,61.11,0.61,0.0,0.61,0.0
5,Body mass index 30+ - obesity (finding),110,72,0.91,0.0,18.18,15.28,0.0,2.78,0.91,...,0.91,0.0,35.45,4.17,41.82,76.39,0.91,0.0,0.91,0.0
6,Body mass index 40+ - severely obese (finding),178,4,0.56,0.0,16.29,50.0,1.12,0.0,1.12,...,0.56,0.0,23.6,0.0,55.62,50.0,0.56,0.0,0.56,0.0
7,Cardiac Arrest,176,6,0.57,0.0,16.48,33.33,1.14,0.0,0.57,...,0.57,0.0,23.86,0.0,55.68,50.0,0.57,0.0,0.57,0.0
8,Childhood asthma,175,7,0.57,0.0,17.71,0.0,1.14,0.0,1.14,...,0.57,0.0,21.14,71.43,56.57,28.57,0.57,0.0,0.57,0.0
9,Chronic obstructive bronchitis (disorder),179,3,0.0,33.33,16.76,33.33,1.12,0.0,1.12,...,0.56,0.0,23.46,0.0,55.87,33.33,0.56,0.0,0.56,0.0


## Visualização dos Resultados - Clusterização K-means

### Visualização das Proporções das Features por Cluster

In [13]:
dfs_features[cluster]

Unnamed: 0,feature,0,1,[C1] 0(%),[C1] 1(%),[C2] 0(%),[C2] 1(%),[C3] 0(%),[C3] 1(%),[C4] 0(%),...,[C5] 0(%),[C5] 1(%),[C6] 0(%),[C6] 1(%),[C7] 0(%),[C7] 1(%),[C8] 0(%),[C8] 1(%),[C9] 0(%),[C9] 1(%)
0,Acquired coagulation disorder (disorder),178,4,0.56,0.00,17.42,0.00,0.00,50.00,1.12,...,0.56,0.00,23.60,0.00,55.62,50.00,0.56,0.00,0.56,0.00
1,Acute allergic reaction,179,3,0.56,0.00,17.32,0.00,1.12,0.00,1.12,...,0.56,0.00,21.79,100.00,56.42,0.00,0.56,0.00,0.56,0.00
2,Acute bacterial sinusitis (disorder),172,10,0.58,0.00,17.44,10.00,1.16,0.00,1.16,...,0.58,0.00,23.84,10.00,54.07,80.00,0.58,0.00,0.58,0.00
3,Acute bronchitis (disorder),107,75,0.93,0.00,17.76,16.00,0.93,1.33,1.87,...,0.00,1.33,20.56,26.67,57.01,53.33,0.93,0.00,0.00,1.33
4,Acute deep venous thrombosis (disorder),173,9,0.58,0.00,17.34,11.11,0.58,11.11,1.16,...,0.58,0.00,23.12,22.22,55.49,55.56,0.58,0.00,0.58,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
278,Vaccination for diphtheria pertussis and tet...,148,34,0.00,2.94,0.00,91.18,1.35,0.00,1.35,...,0.68,0.00,28.38,0.00,67.57,2.94,0.00,2.94,0.68,0.00
279,X-ray or wrist,177,5,0.56,0.00,17.51,0.00,1.13,0.00,1.13,...,0.00,20.00,22.60,40.00,55.93,40.00,0.56,0.00,0.56,0.00
280,Tdap,159,23,0.63,0.00,18.87,4.35,1.26,0.00,1.26,...,0.63,0.00,14.47,82.61,61.64,13.04,0.63,0.00,0.63,0.00
281,Td (adult) preservative free,50,132,0.00,0.76,2.00,22.73,2.00,0.76,2.00,...,0.00,0.76,84.00,0.00,10.00,72.73,0.00,0.76,0.00,0.76


### Visualização das Proporções das Features Relacionadas ao Grupo de Risco por Cluster

In [14]:
dfs_rg_features[cluster]

Unnamed: 0,feature,0,1,[C1] 0(%),[C1] 1(%),[C2] 0(%),[C2] 1(%),[C3] 0(%),[C3] 1(%),[C4] 0(%),...,[C5] 0(%),[C5] 1(%),[C6] 0(%),[C6] 1(%),[C7] 0(%),[C7] 1(%),[C8] 0(%),[C8] 1(%),[C9] 0(%),[C9] 1(%)
0,Acute bronchitis (disorder),107,75,0.93,0.0,17.76,16.0,0.93,1.33,1.87,...,0.0,1.33,20.56,26.67,57.01,53.33,0.93,0.0,0.0,1.33
1,Acute myeloid leukemia disease (disorder),161,21,0.62,0.0,16.15,23.81,1.24,0.0,1.24,...,0.62,0.0,16.77,71.43,62.11,4.76,0.62,0.0,0.62,0.0
2,Acute pulmonary embolism (disorder),164,18,0.61,0.0,17.68,11.11,0.61,5.56,0.0,...,0.61,0.0,25.0,5.56,54.27,66.67,0.61,0.0,0.61,0.0
3,Acute respiratory distress syndrome (disorder),177,5,0.56,0.0,17.51,0.0,0.56,20.0,0.0,...,0.56,0.0,23.73,0.0,55.93,40.0,0.56,0.0,0.56,0.0
4,Acute respiratory failure (disorder),164,18,0.61,0.0,18.29,5.56,0.0,11.11,0.0,...,0.61,0.0,24.39,11.11,54.88,61.11,0.61,0.0,0.61,0.0
5,Body mass index 30+ - obesity (finding),110,72,0.91,0.0,18.18,15.28,0.0,2.78,0.91,...,0.91,0.0,35.45,4.17,41.82,76.39,0.91,0.0,0.91,0.0
6,Body mass index 40+ - severely obese (finding),178,4,0.56,0.0,16.29,50.0,1.12,0.0,1.12,...,0.56,0.0,23.6,0.0,55.62,50.0,0.56,0.0,0.56,0.0
7,Cardiac Arrest,176,6,0.57,0.0,16.48,33.33,1.14,0.0,0.57,...,0.57,0.0,23.86,0.0,55.68,50.0,0.57,0.0,0.57,0.0
8,Childhood asthma,175,7,0.57,0.0,17.71,0.0,1.14,0.0,1.14,...,0.57,0.0,21.14,71.43,56.57,28.57,0.57,0.0,0.57,0.0
9,Chronic obstructive bronchitis (disorder),179,3,0.0,33.33,16.76,33.33,1.12,0.0,1.12,...,0.56,0.0,23.46,0.0,55.87,33.33,0.56,0.0,0.56,0.0


# **Clusterização da Base de Dados *scenario03***

In [15]:
scenario = 'scenario03'

dfs_features = main(scenario)
dfs_rg_features = dict()       # Risk group features

# Exportação dos resultados
for cluster in CLUSTERING:
    df_features = dfs_features[cluster]
    df_features.to_csv(f'{PATH}/{scenario}_{cluster}_features.csv', 
                       encoding = 'utf-8-sig', index = False)

    df_rg_features = df_features.query(f'feature in {risk_group}')
    df_rg_features = df_features.query(f'feature in {risk_group}')
    df_rg_features = df_rg_features.reset_index(drop = True)
    df_rg_features.to_csv(f'{PATH}/{scenario}_{cluster}_risk_group_features.csv', 
                          encoding = 'utf-8-sig', index = False)
    dfs_rg_features[cluster] = df_rg_features

## Clusterização Hierárquica

In [16]:
cluster = 'hierarchical'

### Visualização das Proporções das Features por Cluster

In [17]:
dfs_features[cluster]

Unnamed: 0,feature,0,1,[C1] 0(%),[C1] 1(%),[C2] 0(%),[C2] 1(%),[C3] 0(%),[C3] 1(%),[C4] 0(%),[C4] 1(%)
0,Acquired coagulation disorder (disorder),936,14,16.45,7.14,0.11,7.14,0.11,0.0,83.33,85.71
1,Acute allergic reaction,937,13,16.22,23.08,0.21,0.00,0.11,0.0,83.46,76.92
2,Acute bacterial sinusitis (disorder),888,62,15.88,22.58,0.11,1.61,0.11,0.0,83.90,75.81
3,Acute bronchitis (disorder),570,380,17.19,15.00,0.35,0.00,0.18,0.0,82.28,85.00
4,Acute deep venous thrombosis (disorder),880,70,16.59,12.86,0.23,0.00,0.11,0.0,83.07,87.14
...,...,...,...,...,...,...,...,...,...,...,...
424,Weaning from mechanically assisted ventilation...,949,1,16.23,100.00,0.21,0.00,0.11,0.0,83.46,0.00
425,X-ray or wrist,923,27,16.58,7.41,0.11,3.70,0.11,0.0,83.21,88.89
426,negative screening for depression on phq9,949,1,16.33,0.00,0.21,0.00,0.11,0.0,83.35,100.00
427,positive screening for PHQ-9,949,1,16.33,0.00,0.21,0.00,0.11,0.0,83.35,100.00


### Visualização das Proporções das Features Relacionadas ao Grupo de Risco por Cluster

In [18]:
dfs_rg_features[cluster]

Unnamed: 0,feature,0,1,[C1] 0(%),[C1] 1(%),[C2] 0(%),[C2] 1(%),[C3] 0(%),[C3] 1(%),[C4] 0(%),[C4] 1(%)
0,Acute bronchitis (disorder),570,380,17.19,15.0,0.35,0.0,0.18,0.0,82.28,85.0
1,Acute myeloid leukemia disease (disorder),818,132,17.24,10.61,0.24,0.0,0.12,0.0,82.4,89.39
2,Acute pulmonary embolism (disorder),898,52,16.04,21.15,0.22,0.0,0.11,0.0,83.63,78.85
3,Acute respiratory distress syndrome (disorder),928,22,16.49,9.09,0.11,4.55,0.11,0.0,83.3,86.36
4,Acute respiratory failure (disorder),855,95,16.61,13.68,0.0,2.11,0.12,0.0,83.27,84.21
5,Body mass index 30+ - obesity (finding),633,317,17.69,13.56,0.32,0.0,0.0,0.32,81.99,86.12
6,Body mass index 40+ - severely obese (finding),941,9,16.15,33.33,0.21,0.0,0.11,0.0,83.53,66.67
7,Carcinoma in situ of prostate (disorder),937,13,16.54,0.0,0.21,0.0,0.11,0.0,83.14,100.0
8,Cardiac Arrest,926,24,16.52,8.33,0.22,0.0,0.11,0.0,83.15,91.67
9,Childhood asthma,923,27,16.14,22.22,0.22,0.0,0.11,0.0,83.53,77.78


## Clusterização K-means

In [19]:
cluster = 'kmeans'

### Visualização das Proporções das Features por Cluster

In [20]:
dfs_features[cluster]

Unnamed: 0,feature,0,1,[C1] 0(%),[C1] 1(%),[C2] 0(%),[C2] 1(%),[C3] 0(%),[C3] 1(%),[C4] 0(%),[C4] 1(%)
0,Acquired coagulation disorder (disorder),936,14,44.76,0.00,26.82,92.86,12.71,0.00,15.71,7.14
1,Acute allergic reaction,937,13,44.18,38.46,28.07,7.69,12.38,23.08,15.37,30.77
2,Acute bacterial sinusitis (disorder),888,62,44.03,45.16,27.93,25.81,12.50,12.90,15.54,16.13
3,Acute bronchitis (disorder),570,380,42.28,46.84,29.30,25.53,11.40,14.21,17.02,13.42
4,Acute deep venous thrombosis (disorder),880,70,44.77,35.71,26.70,41.43,12.73,10.00,15.80,12.86
...,...,...,...,...,...,...,...,...,...,...,...
424,Weaning from mechanically assisted ventilation...,949,1,44.15,0.00,27.82,0.00,12.43,100.00,15.60,0.00
425,X-ray or wrist,923,27,43.99,48.15,28.06,18.52,12.46,14.81,15.49,18.52
426,negative screening for depression on phq9,949,1,44.05,100.00,27.82,0.00,12.54,0.00,15.60,0.00
427,positive screening for PHQ-9,949,1,44.05,100.00,27.82,0.00,12.54,0.00,15.60,0.00


### Visualização das Proporções das Features Relacionadas ao Grupo de Risco por Cluster

In [21]:
dfs_rg_features[cluster]

Unnamed: 0,feature,0,1,[C1] 0(%),[C1] 1(%),[C2] 0(%),[C2] 1(%),[C3] 0(%),[C3] 1(%),[C4] 0(%),[C4] 1(%)
0,Acute bronchitis (disorder),570,380,42.28,46.84,29.3,25.53,11.4,14.21,17.02,13.42
1,Acute myeloid leukemia disease (disorder),818,132,43.77,46.21,29.1,19.7,11.98,15.91,15.16,18.18
2,Acute pulmonary embolism (disorder),898,52,44.77,32.69,27.06,40.38,12.69,9.62,15.48,17.31
3,Acute respiratory distress syndrome (disorder),928,22,45.15,0.0,26.4,86.36,12.61,9.09,15.84,4.55
4,Acute respiratory failure (disorder),855,95,45.96,27.37,25.26,50.53,12.75,10.53,16.02,11.58
5,Body mass index 30+ - obesity (finding),633,317,43.6,45.11,25.91,31.55,14.53,8.52,15.96,14.83
6,Body mass index 40+ - severely obese (finding),941,9,44.31,22.22,27.74,33.33,12.43,22.22,15.52,22.22
7,Carcinoma in situ of prostate (disorder),937,13,44.08,46.15,28.07,7.69,12.49,15.38,15.37,30.77
8,Cardiac Arrest,926,24,44.71,20.83,27.65,33.33,12.74,4.17,14.9,41.67
9,Childhood asthma,923,27,43.45,66.67,28.28,11.11,12.57,11.11,15.71,11.11
