In [1]:
from plotly.offline import init_notebook_mode, plot
# Initialise Plotly's notebook integration before any figure is drawn.
# NOTE(review): per Plotly's offline docs, connected=True loads plotly.js from
# the CDN instead of embedding it in the notebook, so rendering needs network
# access — confirm this is acceptable for offline use.
init_notebook_mode(connected=True)

In [2]:
# Outside imports
import pandas as pd
import numpy as np
# import lemur
import os, sys

In [3]:
# Put the directory three levels up on the import path so that the `lemur`
# imports below resolve (presumably the repository root — confirm).
# Guarded so that re-running this cell does not keep appending duplicate
# entries to sys.path.
if '../../..' not in sys.path:
    sys.path.append('../../..')
import lemur.datasets as lds
import lemur.metrics as lms
import lemur.plotters as lpl
import lemur.embedders as leb
import lemur.clustering as lcl

In [4]:
# --- Phenotypic dataset ------------------------------------------------------
# Wrap the cleaned HBN phenotypic CSV in a lemur CSVDataSet.
pheno_csv_path = "../../../data/pheno/hbn_cleaned.csv"
CDS = lds.CSVDataSet(pheno_csv_path, name="HBN Phenotypic")
# metadata = CDS.saveMetaData("data/pheno/metadata.json")

# Impute missing values using the "mean" strategy (modifies CDS itself).
CDS.imputeColumns("mean")

# --- Pairwise distances ------------------------------------------------------
# Distance matrix over the dataset, using lemur's VectorDifferenceNorm metric.
DM = lds.DistanceMatrix(CDS, lms.VectorDifferenceNorm)
distance_matrix_shape = DM.D.shape
print(distance_matrix_shape)

# --- MDS embedding -----------------------------------------------------------
# Embed the distance matrix with a 10-component MDS embedder.
MDSEmbedder = leb.MDSEmbedder(num_components=10)
HBN_Embedded = MDSEmbedder.embed(DM)

Dataset of size 965 samples 71 dimensions Loaded
Replacing all . with nan
Dataset of size 965 samples 71 dimensions Resulting
Removing column Identifiers because it is not numeric
Imputing column ACE,ACE_Score with value 1.7037037037
Imputing column APQ_P,APQ_P_Total with value 99.3910081744
Imputing column APQ_SR,APQ_SR_Total with value 118.921126761
Imputing column ARI_P,ARI_P_Total_Score with value 3.40868454662
Imputing column ARI_S,ARI_S_Total_Score with value 3.84046692607
Imputing column ASR,ASR_Total with value 68.3448275862
Imputing column ASSQ,ASSQ_Total with value 8.53448275862
Imputing column Audit,AUDIT_Total_Score with value 0.46835443038
Imputing column Barratt,Barratt_Total with value 45.6932926829
Imputing column CBCL,CBCL_Total with value 42.3391188251
Imputing column CBCL_Pre,CBCLPre_Total with value 45.0769230769
Imputing column CDI_P,CDI2P_Total with value 15.4077669903
Imputing column CDI_SR,CDI2_Total with value 11.4285714286
Imputing column CELF,CELF_CriterionSc

In [5]:
# Output locations for the plots: one directory for the raw phenotypic
# derivatives, one for the derivatives of the embedded data.
# NOTE(review): "pheno_embedded_deriatives" looks like a misspelling of
# "derivatives"; it is kept as-is because other tooling may already read
# from this directory name — confirm before renaming.
BASE = "data"
out_base = os.path.join(BASE, "pheno_derivatives")
out_emb_base = os.path.join(BASE, "pheno_embedded_deriatives")

# Each output directory gets an "agg" sub-directory; existing ones are reused.
for _plot_dir in (out_base, out_emb_base):
    os.makedirs(_plot_dir + "/agg", exist_ok=True)

In [None]:
# Build a lemur Heatmap plotter over the phenotypic dataset and draw it with
# the plotter's default mode.
cds_heatmap = lpl.Heatmap(CDS)
cds_heatmap.plot()
# Disabled variant for the MDS-embedded data:
# lpl.Heatmap(HBN_Embedded, mode="savediv", base_path=out_emb_base).plot()

In [None]:
# Build a lemur LocationLines plotter over the phenotypic dataset and draw it
# with the plotter's default mode.
cds_location_lines = lpl.LocationLines(CDS)
cds_location_lines.plot()
# Disabled variant for the MDS-embedded data:
# lpl.LocationLines(HBN_Embedded, mode="savediv", base_path = out_emb_base).plot()

In [None]:
# Build a lemur LocationHeatmap plotter over the phenotypic dataset and draw
# it with the plotter's default mode.
cds_location_heatmap = lpl.LocationHeatmap(CDS)
cds_location_heatmap.plot()
# Disabled variant for the MDS-embedded data:
# lpl.LocationHeatmap(HBN_Embedded, mode="savediv", base_path=out_emb_base).plot()

In [None]:
# Build a lemur HistogramHeatmap plotter over the phenotypic dataset and draw
# it with the plotter's default mode.
cds_histogram_heatmap = lpl.HistogramHeatmap(CDS)
cds_histogram_heatmap.plot()
# Disabled variant for the MDS-embedded data:
# lpl.HistogramHeatmap(HBN_Embedded, mode="savediv", base_path=out_emb_base).plot()

In [None]:
#lpl.CorrelationMatrix(CDS).plot()
#lpl.CorrelationMatrix(HBN_Embedded).plot()
#lpl.CorrelationMatrix(CDS, mode="savediv", base_path=out_base).plot()
#lpl.CorrelationMatrix(HBN_Embedded, mode="savediv", base_path=out_base).plot()

In [None]:
# Scree plots in "savediv" mode: one for the raw phenotypic dataset and one
# for the MDS-embedded data, each pointed at its own base_path.
# (Presumably "savediv" writes the plot under base_path — confirm in lemur.plotters.)
raw_scree = lpl.ScreePlotter(CDS, mode="savediv", base_path=out_base)
raw_scree.plot()
embedded_scree = lpl.ScreePlotter(HBN_Embedded, mode="savediv", base_path=out_emb_base)
embedded_scree.plot()

In [None]:
# Eigenvector heatmaps in "savediv" mode for the raw phenotypic dataset and
# for the MDS-embedded data.
# The embedded plot is pointed at out_emb_base (the directory created for the
# embedded derivatives, as the ScreePlotter cell does) so that it does not
# land in the raw-data output directory.
lpl.EigenvectorHeatmap(CDS, mode="savediv", base_path=out_base).plot()
lpl.EigenvectorHeatmap(HBN_Embedded, mode="savediv", base_path=out_emb_base).plot()

## Adding clustering

In [8]:
# Run lemur's HGMMClustering on the MDS-embedded data.
# NOTE(review): the meaning of the positional 4 is not visible here; it is
# presumably the number of hierarchy levels — confirm in lemur.clustering.
clustered = lcl.HGMMClustering(HBN_Embedded, 4)

In [22]:
# Inspect the HGMM result: number of entries in `clustered.clusters`, then the
# size of each entry, then the element type found at [3][0][0].
# The loop bound comes from the container itself rather than a literal, so
# the cell keeps working if the clustering is rebuilt with a different depth.
n_cluster_entries = len(clustered.clusters)
print(n_cluster_entries)
for i in range(n_cluster_entries):
    print('i ' + str(len(clustered.clusters[i])))
# NOTE(review): index 3 is still hard-coded and assumes at least four entries.
print(type(clustered.clusters[3][0][0]))

5
i 1
i 2
i 4
i 6
i 7
<class 'numpy.ndarray'>


In [6]:
# Run lemur's KMeans on the MDS-embedded data.
# NOTE(review): the positional 4 is presumably the number of clusters — confirm
# in lemur.clustering. No random seed is visible here; if the algorithm uses
# random initialisation, results may differ between runs.
kmeans = lcl.KMeans(HBN_Embedded, 4)

In [8]:
# Number of clusters stored under index `kmeans.levels` of the k-means result.
kmeans_level_clusters = kmeans.clusters[kmeans.levels]
print(len(kmeans_level_clusters))

4


In [31]:
# For every cluster stored under index `kmeans.levels`, keep the transposed
# cluster array and a label list that repeats the cluster's position once per
# row of the cluster array.
kmeans_level_clusters = kmeans.clusters[kmeans.levels]
samples = [cluster.T for cluster in kmeans_level_clusters]
labels = [
    [cluster_idx] * cluster.shape[0]
    for cluster_idx, cluster in enumerate(kmeans_level_clusters)
]

In [32]:
# Show the per-cluster label lists, then how many per-cluster arrays were
# collected.
# NOTE(review): `labels` holds one entry per cluster row, so the first print
# emits a very long line; a per-cluster length summary would be easier to read.
n_sample_groups = len(samples)
print(labels)
print(n_sample_groups)

[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 

In [33]:
# Dump the list of per-cluster arrays.
# NOTE(review): this prints every array's repr; consider printing only the
# shapes to keep the notebook output short.
print(samples)

[array([[ -1.44666708e+01,  -3.39889309e+01,   9.66594442e+00, ...,
          8.54184070e+01,   7.79089721e+01,   1.08689558e+02],
       [ -1.59212240e+01,  -5.71053256e+00,   1.25898609e+01, ...,
         -7.31322651e+01,  -7.34559056e+01,  -4.22559844e+01],
       [  1.35705795e+01,   1.56336575e+01,   8.37381980e+00, ...,
          1.13265646e+02,   1.11178828e+02,   1.24127724e+02],
       ..., 
       [ -2.61051406e+01,  -3.88835827e+01,   3.10554817e+01, ...,
          7.88092131e+01,   7.20045122e+01,   9.18967923e+01],
       [  2.99099981e+01,  -1.00242057e+01,  -6.35947220e+01, ...,
          1.00791294e+01,   1.17206698e+01,   1.20544996e-01],
       [ -1.14691121e+01,   5.22134151e+01,   1.46905589e+01, ...,
          3.56627984e+01,   4.81951617e+01,   4.51361110e+01]]), array([[-211.40666492, -185.4769897 , -182.0719198 , ..., -101.77111801,
         -80.83378262,  -94.78483691],
       [ 158.29924523,  138.67161481,   57.29393527, ...,   84.30693282,
          69.486329

In [36]:
# Concatenate the per-cluster arrays column-wise and keep only the first
# three rows, then report the resulting shape.
stacked_samples = np.hstack(samples)
new_samples = stacked_samples[:3, :]
print(new_samples.shape)

(3, 965)


In [23]:
# Flatten the per-cluster label lists into one array and report its length.
flat_labels = np.hstack(labels)
print(len(flat_labels))

40


In [23]:
# Insert a singleton axis at position 1 of new_samples.
new_samples2 = new_samples[:, np.newaxis]

In [17]:
# Shape check for new_samples.
print(new_samples.shape)

(3, 10)


In [28]:
# Tabulate new_samples2 as a DataFrame with one "Dim k" column per entry along
# its leading axis and one row per remaining element.
# new_samples2 carries an extra singleton axis at position 1, and
# pd.DataFrame only accepts 2-D data, so the array is first collapsed back to
# two dimensions (leading axis x everything else) before transposing.
coords_2d = new_samples2.reshape(new_samples2.shape[0], -1)
df = pd.DataFrame(
    coords_2d.T,
    columns=["Dim %d" % i for i in range(coords_2d.shape[0])],
)

In [7]:
# Run lemur's SpectralClustering directly on the distance matrix DM (not on
# the MDS embedding used by the other clusterings).
# NOTE(review): the positional 4 is presumably the number of clusters — confirm
# in lemur.clustering. `spectral` is not used by any later cell shown here.
spectral = lcl.SpectralClustering(DM, 4)

In [7]:
# Build a lemur ClusterPairsPlot for the k-means result and draw it with the
# plotter's default mode.
kmeans_pairs_plot = lpl.ClusterPairsPlot(kmeans)
kmeans_pairs_plot.plot()

In [8]:
# Build a lemur HGMMPairsPlot for the HGMM clustering result and draw it with
# the plotter's default mode.
hgmm_pairs_plot = lpl.HGMMPairsPlot(clustered)
hgmm_pairs_plot.plot()

In [14]:
# Build a lemur HGMMClusterMeansLevelLines plotter for the HGMM clustering
# result and draw it with the plotter's default mode.
hgmm_level_lines = lpl.HGMMClusterMeansLevelLines(clustered)
hgmm_level_lines.plot()