In [3]:
# Enable plotly rendering inside the notebook.
# Per the plotly.offline documentation, connected=True loads plotly.js from the
# online CDN instead of embedding the library in the notebook, so rendering
# presumably needs network access.
from plotly.offline import init_notebook_mode, plot
init_notebook_mode(connected=True)

In [4]:
# Outside imports
import pandas as pd
import numpy as np
# import lemur
import os, sys

In [5]:
# Extend the import search path three directories up before importing the
# lemur sub-packages (presumably that directory is the repository root that
# contains the `lemur` package — confirm).
# NOTE(review): the path is relative, so it is resolved against the kernel's
# current working directory; this assumes the kernel is started in the
# notebook's own folder.
sys.path.append('../../..')
import lemur.datasets as lds
import lemur.metrics as lms
import lemur.plotters as lpl
import lemur.embedders as leb
import lemur.clustering as lcl

In [6]:
# --- Phenotypic dataset ------------------------------------------------------
# Load the cleaned HBN phenotypic CSV into a lemur CSV dataset.
CDS = lds.CSVDataSet(
    "../../../data/hbn_cleaned.csv",
    name="HBN Phenotypic",
)
# Optional metadata export, currently disabled:
# metadata = CDS.saveMetaData("data/pheno/metadata.json")

# Fill missing values using the "mean" strategy. The return value is unused,
# so the imputation presumably updates CDS in place.
CDS.imputeColumns("mean")

# --- Pairwise distances ------------------------------------------------------
# Distance matrix over the dataset using the VectorDifferenceNorm metric;
# print its shape as a quick sanity check.
DM = lds.DistanceMatrix(CDS, lms.VectorDifferenceNorm)
print(DM.D.shape)

# --- MDS embedding -----------------------------------------------------------
# Embed the distance matrix with a 10-component MDS embedder.
MDSEmbedder = leb.MDSEmbedder(num_components=10)
HBN_Embedded = MDSEmbedder.embed(DM)

Dataset of size 965 samples 71 dimensions Loaded
Replacing all . with nan
Dataset of size 965 samples 71 dimensions Resulting
Removing column Identifiers because it is not numeric
Imputing column ACE,ACE_Score with value 1.7037037037
Imputing column APQ_P,APQ_P_Total with value 99.3910081744
Imputing column APQ_SR,APQ_SR_Total with value 118.921126761
Imputing column ARI_P,ARI_P_Total_Score with value 3.40868454662
Imputing column ARI_S,ARI_S_Total_Score with value 3.84046692607
Imputing column ASR,ASR_Total with value 68.3448275862
Imputing column ASSQ,ASSQ_Total with value 8.53448275862
Imputing column Audit,AUDIT_Total_Score with value 0.46835443038
Imputing column Barratt,Barratt_Total with value 45.6932926829
Imputing column CBCL,CBCL_Total with value 42.3391188251
Imputing column CBCL_Pre,CBCLPre_Total with value 45.0769230769
Imputing column CDI_P,CDI2P_Total with value 15.4077669903
Imputing column CDI_SR,CDI2_Total with value 11.4285714286
Imputing column CELF,CELF_CriterionSc

In [7]:
# Root folder for everything this notebook writes.
BASE = "data"

# One derivative folder for the raw phenotypic plots and one for the plots of
# the embedded data.
# NOTE(review): "deriatives" looks like a typo of "derivatives"; it is kept
# as-is because renaming it would change where output is written.
out_base, out_emb_base = (
    os.path.join(BASE, leaf)
    for leaf in ("pheno_derivatives", "pheno_embedded_deriatives")
)

# Make sure the "agg" sub-folder exists under each derivative folder
# (exist_ok=True keeps the cell safe to re-run).
os.makedirs("/".join((out_base, "agg")), exist_ok=True)
os.makedirs("/".join((out_emb_base, "agg")), exist_ok=True)

In [None]:
# Heatmap plot of the phenotypic dataset, using the plotter's default mode
# (no mode/base_path passed). The commented line is the disabled variant that
# would save the embedded-data plot under out_emb_base.
lpl.Heatmap(CDS).plot()
# lpl.Heatmap(HBN_Embedded, mode="savediv", base_path=out_emb_base).plot()

In [None]:
# LocationLines plot of the phenotypic dataset, using the plotter's default
# mode. The commented line is the disabled save-to-disk variant for the
# embedded data.
lpl.LocationLines(CDS).plot()
# lpl.LocationLines(HBN_Embedded, mode="savediv", base_path = out_emb_base).plot()

In [None]:
# LocationHeatmap plot of the phenotypic dataset, using the plotter's default
# mode. The commented line is the disabled save-to-disk variant for the
# embedded data.
lpl.LocationHeatmap(CDS).plot()
# lpl.LocationHeatmap(HBN_Embedded, mode="savediv", base_path=out_emb_base).plot()

In [None]:
# HistogramHeatmap plot of the phenotypic dataset, using the plotter's default
# mode. The commented line is the disabled save-to-disk variant for the
# embedded data.
lpl.HistogramHeatmap(CDS).plot()
# lpl.HistogramHeatmap(HBN_Embedded, mode="savediv", base_path=out_emb_base).plot()

In [None]:
#lpl.CorrelationMatrix(CDS).plot()
#lpl.CorrelationMatrix(HBN_Embedded).plot()
#lpl.CorrelationMatrix(CDS, mode="savediv", base_path=out_base).plot()
#lpl.CorrelationMatrix(HBN_Embedded, mode="savediv", base_path=out_base).plot()

In [None]:
# Scree plots for the raw dataset and for its MDS embedding. Each call passes
# its own output folder (out_base vs. out_emb_base) as base_path.
# NOTE(review): mode="savediv" presumably writes the plot to disk as an HTML
# div under base_path instead of showing it inline — confirm in lemur.plotters.
lpl.ScreePlotter(CDS, mode="savediv", base_path=out_base).plot()
lpl.ScreePlotter(HBN_Embedded, mode="savediv", base_path=out_emb_base).plot()

In [None]:
# Eigenvector heatmaps for the raw dataset and for its MDS embedding.
# The embedded plot is saved under out_emb_base, mirroring the ScreePlotter
# calls in this notebook; it previously used out_base, which put both plots in
# the raw-data folder (and may have let the second overwrite the first).
# NOTE(review): mode="savediv" presumably writes the plot to disk under
# base_path — confirm in lemur.plotters.
lpl.EigenvectorHeatmap(CDS, mode="savediv", base_path=out_base).plot()
lpl.EigenvectorHeatmap(HBN_Embedded, mode="savediv", base_path=out_emb_base).plot()

## Adding HGMM Clustering

In [8]:
# Set up HGMM clustering on the MDS-embedded data.
# NOTE(review): the meaning of the second argument (4) is not visible here —
# presumably a cluster count or hierarchy depth; confirm in lemur.clustering.
hgmm = lcl.HGMMClustering(HBN_Embedded, 4)

In [9]:
# Run the HGMM clustering. The return value is not captured, so the results
# are presumably stored on the hgmm object that the plotters below receive.
hgmm.cluster()

In [10]:
# Pairs plot of the HGMM clustering, using the plotter's default mode.
lpl.ClusterPairsPlot(hgmm).plot()

In [11]:
# Cluster-means level-lines plot of the HGMM clustering, default mode.
lpl.ClusterMeansLevelLines(hgmm).plot()

In [12]:
# Cluster-means level heatmap of the HGMM clustering, default mode.
lpl.ClusterMeansLevelHeatmap(hgmm).plot()

In [13]:
# Hierarchical stacked cluster-means heatmap of the HGMM clustering, default mode.
lpl.HierarchicalStackedClusterMeansHeatmap(hgmm).plot()

In [14]:
# Hierarchical cluster-means dendrogram of the HGMM clustering, default mode.
lpl.HierarchicalClusterMeansDendrogram(hgmm).plot()

### Test Plot

In [9]:
# Test plot: the same ClusterMeansLevelLines call on hgmm as in the HGMM
# section above. NOTE(review): likely kept only for comparison with the
# OrigClusterMeansLevelLines test cell — confirm, or drop the duplicate.
lpl.ClusterMeansLevelLines(hgmm).plot()

In [11]:
# Test plot: the "Orig" variant of the cluster-means level-lines plot for the
# same HGMM clustering.
# NOTE(review): how it differs from ClusterMeansLevelLines is not visible
# here — presumably it plots means in the original (un-embedded) feature
# space; confirm in lemur.plotters.
lpl.OrigClusterMeansLevelLines(hgmm).plot()

## Adding KMeans Clustering

In [15]:
# Set up KMeans clustering on the MDS-embedded data.
# NOTE(review): the second argument (4) is presumably the number of
# clusters — confirm in lemur.clustering.
kmeans = lcl.KMeans(HBN_Embedded, 4)

In [16]:
# Run the KMeans clustering. The return value is not captured, so the results
# are presumably stored on the kmeans object that the plotters below receive.
kmeans.cluster()

In [17]:
# Pairs plot of the KMeans clustering, using the plotter's default mode.
lpl.ClusterPairsPlot(kmeans).plot()

In [18]:
# Cluster-means level-lines plot of the KMeans clustering, default mode.
lpl.ClusterMeansLevelLines(kmeans).plot()

In [19]:
# Cluster-means level heatmap of the KMeans clustering, default mode.
lpl.ClusterMeansLevelHeatmap(kmeans).plot()

In [20]:
# Hierarchical stacked cluster-means heatmap of the KMeans clustering, default mode.
lpl.HierarchicalStackedClusterMeansHeatmap(kmeans).plot()

In [21]:
# Hierarchical cluster-means dendrogram of the KMeans clustering, default mode.
lpl.HierarchicalClusterMeansDendrogram(kmeans).plot()