In [1]:
import pandas as pd
from sklearn.cluster import SpectralClustering
import numpy as np
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
import seaborn as sns
import matplotlib.pyplot as plt

# Read the data and apply clustering techniques

In [2]:
# Load the table from CSV, using its first column as the row index.
# NOTE(review): the file name suggests log-transformed, CPM-filtered values — confirm.
data = pd.read_csv(
    filepath_or_buffer='./data/1_5_CPMcutoff_suffix_1_log.csv',
    index_col=0,
)
# Preview the first rows as the cell output.
data.head()

Unnamed: 0_level_0,C42_1,C42B_1,LNCAP_1,MR49F_1
gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ENSG00000000003,2.020308,5.802458,3.463763,3.090618
ENSG00000000419,4.22335,7.149923,5.323355,2.755359
ENSG00000000457,2.869919,1.596058,3.677716,4.178899
ENSG00000000460,1.751904,3.42174,3.052585,3.070865
ENSG00000001036,3.481197,5.683603,5.241517,4.872132


In [None]:
%%time
clustering = SpectralClustering(n_clusters=9,
        assign_labels='discretize',
        random_state=0).fit(data.values)

In [None]:
# Load the saved posterior table (file name indicates an npEM run) and turn it into
# hard labels: each row is assigned to the column holding its largest value.
posteriors = pd.read_csv("./results/posteriors_1_5_CPMcutoff_suffix_1_log_it_1000_npEM.csv",index_col=0)
# Both label vectors are later plotted against the rows of `data`, so the posterior
# table must have exactly one row per row of `data`.
# NOTE(review): this only checks the row count; confirm the row order matches as well.
assert len(posteriors) == len(data), "posteriors and data must have the same number of rows"
clusters_np = np.argmax(posteriors.values,axis = 1)
# Labels produced by the spectral clustering model fitted in an earlier cell.
clusters_spect = clustering.labels_
X = data.values
# t-SNE is stochastic: pin the seed so the 2-D embedding (and every plot built
# from it) is identical after a kernel restart.
X_embedded = TSNE(n_components=2, random_state=0).fit_transform(X)

# Visualization comparing ```npEM``` with ```SpectralClustering```

In [None]:
# Gather the two t-SNE coordinates and both label vectors into one plotting frame.
df = pd.DataFrame(dict(
    x=X_embedded[:, 0],
    y=X_embedded[:, 1],
    cluster_np=clusters_np,
    cluster_spect=clusters_spect,
))

# One figure with two panels: same embedding, coloured by each algorithm's labels.
fig, axs = plt.subplots(ncols=2)
fig.set_size_inches(20, 15)

# Left panel: spectral clustering labels.
g = sns.scatterplot(
    data=df, x='x', y='y',
    hue="cluster_spect", alpha=0.7, palette='bright',
    ax=axs[0],
)
g.set_title("Visualization using t-SNE (spectral)", fontsize=25)

# Right panel: npEM labels.
g2 = sns.scatterplot(
    data=df, x='x', y='y',
    hue="cluster_np", alpha=0.7, palette='bright',
    ax=axs[1],
)
# Trailing semicolon keeps the Text repr out of the cell output.
g2.set_title("Visualization using t-SNE (npEM)", fontsize=25);

In [None]:
# Project X onto its first two principal components.
# Depending on the scikit-learn version and the data shape, PCA may select a
# randomized solver, so the seed is pinned to keep the projection reproducible.
X_pca = PCA(n_components=2, random_state=0).fit_transform(X)

# Plotting frame: the two component scores plus both label vectors.
df2 = pd.DataFrame({
    'x':X_pca[:,0],
    'y':X_pca[:,1],
    'cluster_np':clusters_np,
    'cluster_spect':clusters_spect
})

# Size the figure once at creation instead of resizing it after each panel.
fig, axs = plt.subplots(ncols=2, figsize=(20, 15))

# Left panel: spectral clustering labels.
g = sns.scatterplot(data=df2, x='x', y='y', hue="cluster_spect", alpha=0.7, palette='bright', ax=axs[0])
g.set_title("Visualization using PCA (spectral)", fontsize=25)

# Right panel: npEM labels.
g2 = sns.scatterplot(data=df2, x='x', y='y', hue="cluster_np", alpha=0.7, palette='bright', ax=axs[1])
g2.set_title("Visualization using PCA (npEM)", fontsize=25)

# Replace the generic 'x'/'y' axis labels so each panel can be read on its own.
for ax in axs:
    ax.set(xlabel="PC 1", ylabel="PC 2")

# Proportional size of clusters in each algorithm

In [None]:
# Fraction of rows assigned to each npEM cluster (normalized counts).
df['cluster_np'].value_counts(normalize=True)

In [None]:
# Fraction of rows assigned to each spectral-clustering cluster (normalized counts).
df['cluster_spect'].value_counts(normalize=True)