## Data statistics for the population and each cluster

In [7]:
# Include and load packages, config files

import numpy as np
import simlr_ad
import pandas as pd
from utils.data_utils import load_all_data
from utils.utils import compute_simlr, feat_ranking, estimate_number_clusters
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Parameters of the procedure
clusters = 3                                            # Number of clusters to load / compute
rd_seed = 1714                                          # Random seed for experiment replication

# Apply the seed so that recomputing the clustering is repeatable.
# NOTE(review): this only has an effect if compute_simlr draws from NumPy's
# global random state -- confirm against utils.utils.
np.random.seed(rd_seed)

# Paths
existing_cluster = True                               # True: load a saved clustering; False: recompute it
cluster_path = "results/extendeddata_cluster/"   # Directory of the saved clustering (must end with "/")
covariate_path = "data/useddata_homo_abeta_plasma_meta.csv"                 # Path of the covariate data frame (.csv)
feature_path = "data/UCSDVOL.csv"                     # Path of the feature data frame (.csv)

covariate_data, cov_names, feature_data, feature_names = load_all_data(covariate_path, feature_path)
# Copy the baseline diagnosis into the feature table. Using .values makes the
# assignment positional, so both tables must have the same number of rows.
feature_data['DX'] = covariate_data.DX_bl.values

if existing_cluster:
    # Load the saved cluster assignment (column 'C' is read by later cells)
    c_data = pd.read_csv(cluster_path + 'cluster_data.csv')
    # Load the saved S, ydata and F matrices
    S = np.load(cluster_path + 'S_matrix.npy')
    ydata = np.load(cluster_path + 'ydata_matrix.npy')
    F = np.load(cluster_path + 'F_matrix.npy')
else:
    # Compute base clustering
    y_b, S, F, ydata, alpha = compute_simlr(
        np.array(covariate_data[cov_names]), clusters)
    # Later cells read c_data['C']; build it here so they also work when the
    # clustering is recomputed instead of loaded.
    # NOTE(review): the per-cluster cell iterates over labels 1..clusters;
    # confirm that y_b uses the same label numbering as the saved CSV.
    c_data = pd.DataFrame({'C': np.asarray(y_b)})


Compute the basic summary for each of the diagnostic groups

In [8]:
# Population overview: diagnosis counts, cohort size and summary statistics,
# followed by the same two ratios for every diagnostic group.
dx_values, dx_counts = np.unique(covariate_data.DX_bl.values, return_counts=True)
print(dict(zip(dx_values, dx_counts)))
n_subjects = len(covariate_data)
print(n_subjects)

labels = ["MMSE", "PTEDUCAT", "AGE"]
otherlabels = ["PTGENDER", "APOE4"]
dx = ["CN", "LMCI", "AD"]
print(covariate_data[labels].describe())

# Fraction of rows with PTGENDER == 0
# (the notebook reports this as the female share -- coding not visible here)
print(np.count_nonzero(covariate_data.PTGENDER.values == 0) / n_subjects)
# Fraction of rows with APOE4 > 0
print(np.count_nonzero(covariate_data.APOE4.values > 0) / n_subjects)

# Same two fractions, restricted to each baseline diagnosis
for diagnosis in dx:
    print(diagnosis)
    group = covariate_data.loc[covariate_data['DX_bl'].values == diagnosis]
    group_size = len(group)
    # Fraction of rows with PTGENDER == 0
    print(np.count_nonzero(group.PTGENDER.values == 0) / group_size)
    # Fraction of rows with APOE4 > 0
    print(np.count_nonzero(group.APOE4.values > 0) / group_size)


{'AD': 85, 'CN': 52, 'LMCI': 161}
298
             MMSE    PTEDUCAT         AGE
count  298.000000  298.000000  298.000000
mean    26.325503   15.687919   74.447651
std      2.620449    2.924399    7.392439
min     20.000000    6.000000   55.100000
25%     25.000000   14.000000   70.350000
50%     26.000000   16.000000   74.450000
75%     29.000000   18.000000   79.600000
max     30.000000   20.000000   89.600000
0.3926174496644295
0.4899328859060403
CN
0.4423076923076923
0.09615384615384616
LMCI
0.33540372670807456
0.5217391304347826
AD
0.47058823529411764
0.6705882352941176


In [14]:
# Per-cluster report: size, diagnosis breakdown, the PTGENDER == 0 and
# APOE4 > 0 fractions, and summary statistics of the `labels` columns.
cluster_of_row = c_data['C'].values   # used as a positional mask on covariate_data
for cluster_id in range(1, clusters + 1):
    print(f"Cluster {cluster_id}")
    members = covariate_data.loc[cluster_of_row == cluster_id]
    n_members = len(members)
    print(n_members)

    diag_values, diag_counts = np.unique(members.DX_bl.values, return_counts=True)
    print("Number of each diagnostic")
    print(dict(zip(diag_values, diag_counts)))

    # Fraction of members with PTGENDER == 0 (reported as the female share)
    female_share = np.count_nonzero(members.PTGENDER.values == 0) / n_members
    print(f"Gender ratio (females): {female_share}")

    # Fraction of members with APOE4 > 0
    apoe_share = np.count_nonzero(members.APOE4.values > 0) / n_members
    print(f"Apoe (+): {apoe_share}")

    print(members[labels].describe())

    
    

Cluster 1
112
Number of each diagnostic
{'AD': 20, 'CN': 24, 'LMCI': 68}
Gender ratio (females): 0.07142857142857142
Apoe (+): 0.42857142857142855
             MMSE    PTEDUCAT         AGE
count  112.000000  112.000000  112.000000
mean    26.687500   15.857143   76.241071
std      2.529221    2.900317    6.175782
min     20.000000    6.000000   57.700000
25%     25.000000   14.000000   72.475000
50%     27.000000   16.000000   76.200000
75%     29.000000   18.000000   80.400000
max     30.000000   20.000000   88.300000
Cluster 2
130
Number of each diagnostic
{'AD': 36, 'CN': 19, 'LMCI': 75}
Gender ratio (females): 0.6846153846153846
Apoe (+): 0.5461538461538461
             MMSE    PTEDUCAT         AGE
count  130.000000  130.000000  130.000000
mean    26.507692   15.469231   72.693846
std      2.386156    3.035165    7.481458
min     21.000000    8.000000   55.100000
25%     25.000000   13.000000   68.125000
50%     26.000000   16.000000   72.450000
75%     29.000000   18.000000   77.6

In [5]:
# Data statistics for important features
# Column names singled out for a per-cluster summary.
important_names = [
    "Beta-2-Microglobulin (B2M) (ug/mL)",
    "Cystatin-C (ng/ml)",
    "T-Cell-Specific Protein RANTES (RANTES) (ng/mL)",
    "Brain-Derived Neurotrophic Factor (BDNF) (ng/mL)",
    "Platelet-Derived Growth Factor BB (PDGF- (pg/ml)",
    "Growth-Regulated alpha protein (GRO-alph (pg/mL)",
    "CD40 Ligand (CD40-L) (ng/mL)",
    "Epidermal Growth Factor (EGF) (pg/mL)",
    "Follicle-Stimulating Hormone (FSH) (mIU/mL)",
    "Trefoil Factor 3 (TFF3) (ug/ml)",
    "Tissue Inhibitor of Metalloproteinases 1 (ng/mL)",
    "Epithelial-Derived Neutrophil-Activating (ng/mL)",
    "Alpha-1-Microglobulin (A1Micro) (ug/ml)",
    "Luteinizing Hormone  (LH) (mIU/mL)",
    "Kynurenine",
    "Apolipoprotein H (Apo H) (ug/mL)",
]

# The original loop had no body, which made the whole cell a syntax error.
# Print the descriptive statistics of every listed column, split by cluster.
# NOTE(review): the names are assumed to be columns of covariate_data --
# confirm; missing ones are reported and skipped instead of raising KeyError.
cluster_of_row = c_data['C'].values   # positional grouping key, one label per row
for feat in important_names:
    if feat not in covariate_data.columns:
        print(feat + ": column not found in covariate_data, skipped")
        continue
    print(feat)
    print(covariate_data.groupby(cluster_of_row)[feat].describe())


In [None]:
# Data statistics for important features in feature space
# The original loop body was a bare `feature_data` expression, which neither
# displays nor computes anything. Print per-cluster descriptive statistics
# for every feature column instead.
# NOTE(review): assumes feature_data rows are in the same order as c_data
# rows (the grouping key is positional) -- confirm.
cluster_of_row = c_data['C'].values

for feat in feature_names:
    print(feat)
    print(feature_data.groupby(cluster_of_row)[feat].describe())