## Data statistics for the population and each cluster

In [None]:
# Include and load packages, config files
import numpy as np
import pandas as pd
import sys
sys.path.append('..')
from utils.data_utils import load_all_data
from utils.utils import compute_simlr, feat_ranking, estimate_number_clusters
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# ---------------------------------------------------------------------------
# Experiment parameters
# ---------------------------------------------------------------------------
clusters = 4      # passed to compute_simlr and used as the cluster loop bound
rd_seed = 1714    # random seed for experiment replication
                  # (not referenced anywhere in the cells visible here)

# ---------------------------------------------------------------------------
# Input / output locations
# ---------------------------------------------------------------------------
# True  -> reuse the clustering stored under cluster_path
# False -> run compute_simlr again
existing_cluster = True
# Folder of the existing clustering; the name tables below are written here too
cluster_path = "results/cimlr4/"
# Covariate table (.csv)
covariate_path = "data/useddata_homo_abeta_plasma_meta.csv"
# Feature table (.csv)
feature_path = "data/stats_tables/aseg_volume.csv"

# Load both tables twice: once with the loader's default settings and once
# with normalize=False. The second call rebinds cov_names / feature_names.
(covariate_data, cov_names,
 feature_data, feature_names) = load_all_data(covariate_path, feature_path)
(covariate_data_nonorm, cov_names,
 feature_data_nonorm, feature_names) = load_all_data(
    covariate_path, feature_path, normalize=False)

# Copy the DX_bl covariate column into the feature table under the name 'DX'
feature_data['DX'] = covariate_data.DX_bl.values

if not existing_cluster:
    # Compute the base clustering from the covariates.
    # NOTE(review): this branch never defines c_data, which later cells read
    # (c_data['C']); it presumably has to be built from y_b - confirm the
    # label format returned by compute_simlr before relying on this path.
    y_b, S, F, ydata, alpha = compute_simlr(
        np.array(covariate_data[cov_names]), clusters)
else:
    # Reuse the stored clustering: assignments plus the S, ydata and F matrices
    c_data = pd.read_csv(cluster_path + 'cluster_data.csv')
    S = np.load(cluster_path + 'S_matrix.npy')
    ydata = np.load(cluster_path + 'ydata_matrix.npy')
    F = np.load(cluster_path + 'F_matrix.npy')

# Export the covariate and feature names as one-column tables
for column, values, filename in (
        ("cov_names", cov_names, 'cov_names.csv'),
        ("feat_names", feature_names, 'feat_names.csv')):
    names = pd.DataFrame(data={column: values})
    names.to_csv(cluster_path + filename)

# Export the subject ids, each wrapped in '$...$'
ptid_latex = ['$' + ptid + '$' for ptid in covariate_data.PTID.values]
names = pd.DataFrame(data={"id": ptid_latex})
names.to_csv(cluster_path + 'ptid.csv')

Compute basic summary statistics for the whole population and for each diagnostic group.

In [None]:
# Population overview: diagnosis counts, sample size and descriptive
# statistics, followed by the same ratios and statistics per diagnostic group.
labels = ["MMSE", "PTEDUCAT", "AGE"]
otherlabels = ["PTGENDER", "APOE4"]
dx = ["CN", "LMCI", "AD"]


def _share(frame, mask):
    """Return the fraction of rows of `frame` selected by the boolean `mask`."""
    return len(frame[mask]) / len(frame)


# Number of subjects per DX_bl value, then the total number of rows
unique, counts = np.unique(covariate_data.DX_bl.values, return_counts=True)
print(dict(zip(unique, counts)))
print(len(covariate_data))
print(covariate_data[labels].describe())

# Share of rows with PTGENDER == 0 (the original notes call this the
# ratio of females)
g_ratio = _share(covariate_data, covariate_data.PTGENDER.values == 0)
print(g_ratio)
# Share of rows with APOE4 > 0 (APOE4-positive ratio)
a_ratio = _share(covariate_data, covariate_data.APOE4.values > 0)
print(a_ratio)

# Same report restricted to each diagnostic group
for diagnosis in dx:
    print(diagnosis)
    group = covariate_data[covariate_data['DX_bl'].values == diagnosis]

    g_ratio = _share(group, group.PTGENDER.values == 0)
    print(g_ratio)

    a_ratio = _share(group, group.APOE4.values > 0)
    print(a_ratio)

    print(group[labels].describe())


In [None]:
# Build a summary table with one column for the whole population ("Total")
# and one column per cluster, then write it to cluster_stats.csv.
#
# NOTE(review): covariate_data comes from load_all_data with its default
# settings while a normalize=False copy (covariate_data_nonorm) also exists;
# confirm which of the two these descriptive statistics are meant to use.
labels = ["MMSE", "PTEDUCAT", "AGE"]
otherlabels = ["PTGENDER", "APOE4"]
dx = ["CN", "LMCI", "AD"]

# Rows of the summary table
df_results = pd.DataFrame(index=["Nº subj","CN","MCI","AD","Age","Gender","Education","APoe4", "MMSE"])


def _percent(frame, mask):
    """Return the truncated integer percentage of rows of `frame` selected by `mask`.

    An empty frame yields 0 instead of raising ZeroDivisionError.
    """
    if len(frame) == 0:
        return 0
    return int((len(frame[mask]) / len(frame)) * 100)


def _mean_std(stats, column):
    """Format the 'mean' and 'std' rows of a describe() table as 'mean+-std'."""
    return ("{:.2f}".format(stats.loc["mean", column]) + '+-'
            + "{:.2f}".format(stats.loc["std", column]))


def _summary_column(frame, stats):
    """Return the values of one summary column, ordered like df_results.index.

    frame: covariate rows of the group being summarised.
    stats: frame[labels].describe() for the same group.

    Diagnosis counts are looked up by label, so a diagnosis that is absent
    from the group is reported as 0. Indexing the np.unique output by position
    would instead raise IndexError or shift counts into the wrong row.
    """
    dx_values, dx_freqs = np.unique(frame.DX_bl.values, return_counts=True)
    per_dx = dict(zip(dx_values, dx_freqs))
    return [
        sum(dx_freqs),
        per_dx.get("CN", 0),
        per_dx.get("LMCI", 0),   # stored in the "MCI" row
        per_dx.get("AD", 0),
        _mean_std(stats, "AGE"),
        # Rows with PTGENDER == 0 (the original notes call this % of females)
        str(_percent(frame, frame.PTGENDER.values == 0)) + '%',
        _mean_std(stats, "PTEDUCAT"),
        # Rows with APOE4 > 0
        str(_percent(frame, frame.APOE4.values > 0)) + '%',
        _mean_std(stats, "MMSE"),
    ]


# Whole population
statistics = covariate_data[labels].describe()
print(statistics)
df_results["Total"] = pd.Series(_summary_column(covariate_data, statistics),
                                index=df_results.index)

# One column per cluster (cluster labels in c_data['C'] are taken as 1..clusters)
for c in range(1, clusters+1):
    print('Cluster ' + str(c))
    c1 = covariate_data[c_data['C'].values == c]
    statistics = c1[labels].describe()
    df_results[c] = pd.Series(_summary_column(c1, statistics),
                              index=df_results.index)

df_results.to_csv(cluster_path+'cluster_stats.csv')

In [None]:
# For cluster 1, report the statistics of each diagnostic group.
# Relies on `dx` and `labels` defined in an earlier cell.
c = 1
print('Cluster ' + str(c))
c1 = covariate_data[c_data['C'].values == c]
print(len(c1))
for d in dx:
    print(d)
    c2 = c1[c1['DX_bl'].values == d]
    if len(c2) == 0:
        # The ratios below divide by the group size; a diagnosis that does not
        # occur in this cluster would raise ZeroDivisionError.
        print('No subjects with this diagnosis in the cluster')
        continue
    # Share of rows with PTGENDER == 0 (the original notes call this the
    # ratio of females)
    g_ratio = len(c2[c2.PTGENDER.values == 0]) / len(c2)
    print("Gender:" + str(g_ratio))
    # Share of rows with APOE4 > 0 (APOE4-positive ratio)
    a_ratio = len(c2[c2.APOE4.values > 0]) / len(c2)
    print(a_ratio)
    print(c2[labels].describe())


In [None]:
# Pairwise Mann-Whitney U tests between clusters on the columns listed in
# important_names, using the covariate table.
#
# NOTE(review): important_names is not defined in any cell visible here; it
# must exist before this cell runs - confirm where it comes from.
# NOTE(review): only clusters 1-3 are compared although `clusters` is 4 in
# the setup cell - confirm that leaving out cluster 4 is intended.

# Import the statistical tests (only mannwhitneyu is used below)
from scipy.stats import ttest_1samp, wilcoxon, ttest_ind, mannwhitneyu

cluster_pairs = [(1,2), (1,3), (2,3)]

for c in cluster_pairs:
    print(c)
    c1 = covariate_data[c_data['C'].values == c[0]]
    c2 = covariate_data[c_data['C'].values == c[1]]
    if len(c1) == 0 or len(c2) == 0:
        # mannwhitneyu cannot be evaluated on an empty sample
        print('skipped: empty cluster')
        continue
    for f in important_names:
        # Request the two-sided test explicitly so the reported p-value does
        # not depend on the default `alternative` of the installed SciPy.
        u, p_value = mannwhitneyu(c1[f].values, c2[f].values,
                                  alternative='two-sided')
        print(f + ': ' + '{:.2}'.format(p_value))


In [None]:
# Pairwise Mann-Whitney U tests between clusters for every feature in
# feature_names, computed separately within each diagnostic group.
#
# NOTE(review): only clusters 1-3 are compared although `clusters` is 4 in
# the setup cell - confirm that leaving out cluster 4 is intended.

# Import the statistical tests (only mannwhitneyu is used below)
from scipy.stats import ttest_1samp, wilcoxon, ttest_ind, mannwhitneyu

cluster_pairs = [(1,2), (1,3), (2,3)]
diagnostic = ["CN", "LMCI", "AD"]

for c in cluster_pairs:
    print(c)
    c1 = feature_data[c_data['C'].values == c[0]]
    c2 = feature_data[c_data['C'].values == c[1]]
    for d in diagnostic:
        print(d)
        # Rows of each cluster with this diagnosis ('DX' is the column added
        # to feature_data in the setup cell)
        d1 = c1[c1.DX.values == d]
        d2 = c2[c2.DX.values == d]
        if len(d1) == 0 or len(d2) == 0:
            # A cluster may contain no subject with this diagnosis, and
            # mannwhitneyu cannot be evaluated on an empty sample.
            print('skipped: empty group')
            continue
        for f in feature_names:
            # Request the two-sided test explicitly so the reported p-value
            # does not depend on the default `alternative` of the installed
            # SciPy.
            u, p_value = mannwhitneyu(d1[f].values, d2[f].values,
                                      alternative='two-sided')
            print(f + ': ' + '{:.2}'.format(p_value))
