In [1]:
%pylab inline
import pandas as pd
import numpy as np

import torch
from torchvision import transforms
import matplotlib.pyplot as plt

import torch.nn as nn
import torch.nn.functional as F
from torch.nn.parameter import Parameter

import torch.optim as optim                          # optimization

import os
from tqdm import tqdm                                # for progress bar
from sklearn.model_selection import train_test_split
from sklearn.cluster import AgglomerativeClustering, KMeans
from sklearn.metrics import adjusted_rand_score

from clustering import clus

from numba import njit

if torch.cuda.is_available():
    device = torch.device("cuda:0")
    print("running on the gpu")
else:
    device = torch.device("cpu")
    print("running on the cpu")
    
from compression import pipeline

Populating the interactive namespace from numpy and matplotlib
running on the gpu
running on the gpu


In [2]:
path = "xin"
do_clus = True
latent = pipeline(path, 1024, 10, False, 1024, [10, 20], vae_choice='paper', retrain=True) # a list with 3 differently trained latent variables
print(len(latent), latent[0].shape)


non_negative_kernel_autoencoder(
  (encoder): Linear(in_features=39851, out_features=50, bias=True)
  (decoder): Linear(in_features=50, out_features=39851, bias=True)
)
training non-negative kernel autoencoder


100%|██████████| 2/2 [00:00<00:00,  6.84it/s]


Loss:  0.002848167670890689


100%|██████████| 2/2 [00:00<00:00, 401.00it/s]


Loss:  0.0028481644112616777


100%|██████████| 2/2 [00:00<00:00, 661.09it/s]


Loss:  0.0028481604531407356


100%|██████████| 2/2 [00:00<00:00, 578.92it/s]


Loss:  0.002848156727850437


100%|██████████| 2/2 [00:00<00:00, 501.38it/s]


Loss:  0.0028481523040682077


100%|██████████| 2/2 [00:00<00:00, 668.31it/s]


Loss:  0.0028481478802859783


100%|██████████| 2/2 [00:00<00:00, 668.47it/s]


Loss:  0.0028481436893343925


100%|██████████| 2/2 [00:00<00:00, 661.15it/s]


Loss:  0.002848139265552163


100%|██████████| 2/2 [00:00<00:00, 496.46it/s]


Loss:  0.00284813460893929


100%|██████████| 2/2 [00:00<00:00, 662.71it/s]


Loss:  0.002848129952326417
paper_encoder(
  (encoder): Linear(in_features=1600, out_features=64, bias=True)
  (bn): BatchNorm1d(64, eps=0.001, momentum=0.01, affine=True, track_running_stats=True)
  (mu): Linear(in_features=64, out_features=15, bias=True)
  (var): Linear(in_features=64, out_features=15, bias=True)
  (sample_expander): ModuleList(
    (0): Linear(in_features=15, out_features=64, bias=True)
    (1): Linear(in_features=15, out_features=64, bias=True)
  )
  (decoder): ModuleList(
    (0): Linear(in_features=64, out_features=1600, bias=True)
    (1): Linear(in_features=64, out_features=1600, bias=True)
  )
)
training stacked bayesian autoencoder

###############################
#phase 1: the warm-up process#
######################################


100%|██████████| 2/2 [00:00<00:00, 133.70it/s]


Loss: 0.31130990386009216


100%|██████████| 2/2 [00:00<00:00, 111.40it/s]


Loss: 0.2785595655441284


100%|██████████| 2/2 [00:00<00:00, 133.70it/s]


Loss: 0.24904029071331024


100%|██████████| 2/2 [00:00<00:00, 133.69it/s]


Loss: 0.22244927287101746


100%|██████████| 2/2 [00:00<00:00, 143.25it/s]


Loss: 0.199151411652565


100%|██████████| 2/2 [00:00<00:00, 133.68it/s]


Loss: 0.17809465527534485


100%|██████████| 2/2 [00:00<00:00, 117.94it/s]


Loss: 0.15863338112831116


100%|██████████| 2/2 [00:00<00:00, 167.13it/s]


Loss: 0.1420782208442688


100%|██████████| 2/2 [00:00<00:00, 95.51it/s]


Loss: 0.1273421049118042


100%|██████████| 2/2 [00:00<00:00, 154.26it/s]


Loss: 0.11358308792114258

##############################
#phase 2: the VAE stage#
############################


100%|██████████| 2/2 [00:00<00:00, 125.33it/s]


Loss: 3.3040802478790283


100%|██████████| 2/2 [00:00<00:00, 83.45it/s]


Loss: 3.1267006397247314


100%|██████████| 2/2 [00:00<00:00, 125.06it/s]


Loss: 2.9736621379852295


100%|██████████| 2/2 [00:00<00:00, 105.38it/s]


Loss: 2.8461782932281494


100%|██████████| 2/2 [00:00<00:00, 133.38it/s]


Loss: 2.7409515380859375


100%|██████████| 2/2 [00:00<00:00, 133.69it/s]


Loss: 2.6558711528778076


100%|██████████| 2/2 [00:00<00:00, 124.79it/s]


Loss: 2.582597255706787


100%|██████████| 2/2 [00:00<00:00, 105.36it/s]


Loss: 2.5216832160949707


100%|██████████| 2/2 [00:00<00:00, 125.34it/s]


Loss: 2.470773220062256


100%|██████████| 2/2 [00:00<00:00, 133.69it/s]


Loss: 2.4248881340026855


100%|██████████| 2/2 [00:00<00:00, 143.25it/s]


Loss: 2.3880579471588135


100%|██████████| 2/2 [00:00<00:00, 133.98it/s]


Loss: 2.354860305786133


100%|██████████| 2/2 [00:00<00:00, 137.46it/s]


Loss: 2.3250207901000977


100%|██████████| 2/2 [00:00<00:00, 95.50it/s]


Loss: 2.3000283241271973


100%|██████████| 2/2 [00:00<00:00, 142.95it/s]


Loss: 2.275865077972412


100%|██████████| 2/2 [00:00<00:00, 125.34it/s]


Loss: 2.25368070602417


100%|██████████| 2/2 [00:00<00:00, 105.55it/s]


Loss: 2.235820770263672


100%|██████████| 2/2 [00:00<00:00, 111.21it/s]


Loss: 2.2171449661254883


100%|██████████| 2/2 [00:00<00:00, 133.70it/s]


Loss: 2.202585220336914


100%|██████████| 2/2 [00:00<00:00, 133.69it/s]


Loss: 2.1903324127197266
paper_encoder(
  (encoder): Linear(in_features=1600, out_features=64, bias=True)
  (bn): BatchNorm1d(64, eps=0.001, momentum=0.01, affine=True, track_running_stats=True)
  (mu): Linear(in_features=64, out_features=15, bias=True)
  (var): Linear(in_features=64, out_features=15, bias=True)
  (sample_expander): ModuleList(
    (0): Linear(in_features=15, out_features=64, bias=True)
    (1): Linear(in_features=15, out_features=64, bias=True)
  )
  (decoder): ModuleList(
    (0): Linear(in_features=64, out_features=1600, bias=True)
    (1): Linear(in_features=64, out_features=1600, bias=True)
  )
)
training stacked bayesian autoencoder

###############################
#phase 1: the warm-up process#
######################################


100%|██████████| 2/2 [00:00<00:00, 166.67it/s]


Loss: 0.31458908319473267


100%|██████████| 2/2 [00:00<00:00, 167.10it/s]


Loss: 0.2818939983844757


100%|██████████| 2/2 [00:00<00:00, 166.67it/s]


Loss: 0.2518848180770874


100%|██████████| 2/2 [00:00<00:00, 167.12it/s]


Loss: 0.22541843354701996


100%|██████████| 2/2 [00:00<00:00, 100.27it/s]


Loss: 0.20161989331245422


100%|██████████| 2/2 [00:00<00:00, 154.27it/s]


Loss: 0.18013408780097961


100%|██████████| 2/2 [00:00<00:00, 154.27it/s]


Loss: 0.1605500876903534


100%|██████████| 2/2 [00:00<00:00, 153.89it/s]


Loss: 0.14347267150878906


100%|██████████| 2/2 [00:00<00:00, 154.23it/s]


Loss: 0.12790025770664215


100%|██████████| 2/2 [00:00<00:00, 154.26it/s]


Loss: 0.11442844569683075

##############################
#phase 2: the VAE stage#
############################


100%|██████████| 2/2 [00:00<00:00, 111.20it/s]


Loss: 3.3057875633239746


100%|██████████| 2/2 [00:00<00:00, 133.70it/s]


Loss: 3.123231887817383


100%|██████████| 2/2 [00:00<00:00, 105.55it/s]


Loss: 2.9715194702148438


100%|██████████| 2/2 [00:00<00:00, 133.70it/s]


Loss: 2.842543840408325


100%|██████████| 2/2 [00:00<00:00, 133.71it/s]


Loss: 2.7387871742248535


100%|██████████| 2/2 [00:00<00:00, 133.70it/s]


Loss: 2.649380922317505


100%|██████████| 2/2 [00:00<00:00, 111.41it/s]


Loss: 2.5784459114074707


100%|██████████| 2/2 [00:00<00:00, 143.22it/s]


Loss: 2.51662540435791


100%|██████████| 2/2 [00:00<00:00, 133.69it/s]


Loss: 2.466731071472168


100%|██████████| 2/2 [00:00<00:00, 133.39it/s]


Loss: 2.4224987030029297


100%|██████████| 2/2 [00:00<00:00, 133.36it/s]


Loss: 2.3828701972961426


100%|██████████| 2/2 [00:00<00:00, 95.49it/s]


Loss: 2.349799633026123


100%|██████████| 2/2 [00:00<00:00, 133.69it/s]


Loss: 2.320326328277588


100%|██████████| 2/2 [00:00<00:00, 133.69it/s]


Loss: 2.294290065765381


100%|██████████| 2/2 [00:00<00:00, 133.69it/s]


Loss: 2.2705368995666504


100%|██████████| 2/2 [00:00<00:00, 111.18it/s]


Loss: 2.250584602355957


100%|██████████| 2/2 [00:00<00:00, 133.46it/s]


Loss: 2.232520580291748


100%|██████████| 2/2 [00:00<00:00, 133.69it/s]


Loss: 2.214327812194824


100%|██████████| 2/2 [00:00<00:00, 142.92it/s]


Loss: 2.200326442718506


100%|██████████| 2/2 [00:00<00:00, 125.08it/s]


Loss: 2.187258243560791
paper_encoder(
  (encoder): Linear(in_features=1600, out_features=64, bias=True)
  (bn): BatchNorm1d(64, eps=0.001, momentum=0.01, affine=True, track_running_stats=True)
  (mu): Linear(in_features=64, out_features=15, bias=True)
  (var): Linear(in_features=64, out_features=15, bias=True)
  (sample_expander): ModuleList(
    (0): Linear(in_features=15, out_features=64, bias=True)
    (1): Linear(in_features=15, out_features=64, bias=True)
  )
  (decoder): ModuleList(
    (0): Linear(in_features=64, out_features=1600, bias=True)
    (1): Linear(in_features=64, out_features=1600, bias=True)
  )
)
training stacked bayesian autoencoder

###############################
#phase 1: the warm-up process#
######################################


100%|██████████| 2/2 [00:00<00:00, 154.26it/s]


Loss: 0.31567955017089844


100%|██████████| 2/2 [00:00<00:00, 167.12it/s]


Loss: 0.2828906774520874


100%|██████████| 2/2 [00:00<00:00, 167.11it/s]


Loss: 0.25388216972351074


100%|██████████| 2/2 [00:00<00:00, 117.96it/s]


Loss: 0.22705602645874023


100%|██████████| 2/2 [00:00<00:00, 167.11it/s]


Loss: 0.20325708389282227


100%|██████████| 2/2 [00:00<00:00, 125.09it/s]


Loss: 0.18193906545639038


100%|██████████| 2/2 [00:00<00:00, 91.02it/s]


Loss: 0.16319110989570618


100%|██████████| 2/2 [00:00<00:00, 154.27it/s]


Loss: 0.1458778977394104


100%|██████████| 2/2 [00:00<00:00, 91.02it/s]


Loss: 0.13048981130123138


100%|██████████| 2/2 [00:00<00:00, 111.42it/s]


Loss: 0.11637197434902191

##############################
#phase 2: the VAE stage#
############################


100%|██████████| 2/2 [00:00<00:00, 111.41it/s]


Loss: 3.3982386589050293


100%|██████████| 2/2 [00:00<00:00, 125.34it/s]


Loss: 3.209719657897949


100%|██████████| 2/2 [00:00<00:00, 125.35it/s]


Loss: 3.047663927078247


100%|██████████| 2/2 [00:00<00:00, 80.21it/s]


Loss: 2.9104442596435547


100%|██████████| 2/2 [00:00<00:00, 102.78it/s]


Loss: 2.798534870147705


100%|██████████| 2/2 [00:00<00:00, 125.33it/s]


Loss: 2.703227996826172


100%|██████████| 2/2 [00:00<00:00, 133.69it/s]


Loss: 2.6268978118896484


100%|██████████| 2/2 [00:00<00:00, 117.95it/s]


Loss: 2.558938980102539


100%|██████████| 2/2 [00:00<00:00, 125.33it/s]


Loss: 2.5013298988342285


100%|██████████| 2/2 [00:00<00:00, 95.50it/s]


Loss: 2.4523348808288574


100%|██████████| 2/2 [00:00<00:00, 95.49it/s]


Loss: 2.411496639251709


100%|██████████| 2/2 [00:00<00:00, 125.34it/s]


Loss: 2.3753538131713867


100%|██████████| 2/2 [00:00<00:00, 125.33it/s]


Loss: 2.3408660888671875


100%|██████████| 2/2 [00:00<00:00, 117.96it/s]


Loss: 2.314054489135742


100%|██████████| 2/2 [00:00<00:00, 127.56it/s]


Loss: 2.2876296043395996


100%|██████████| 2/2 [00:00<00:00, 133.70it/s]


Loss: 2.2664542198181152


100%|██████████| 2/2 [00:00<00:00, 133.70it/s]


Loss: 2.245938777923584


100%|██████████| 2/2 [00:00<00:00, 105.55it/s]


Loss: 2.2282943725585938


100%|██████████| 2/2 [00:00<00:00, 117.96it/s]


Loss: 2.2107436656951904


100%|██████████| 2/2 [00:00<00:00, 105.54it/s]


Loss: 2.1965765953063965
3 (1600, 15)


In [3]:

if do_clus:
    # Use an ensemble of data projection models to achieve higher accuracy and to avoid local minima, not needed if we use kmeans++
    # first repeat the data projection
    labels = []
    for hidden in latent:
        # labels.append(clus(hidden, k=6, nmax=100))
        labels.append(clus(hidden, nmax=50))
    labels = np.array(labels) 
    print(labels)   
    S = np.zeros((len(labels), len(labels)))  # chance that cell i and j are in the same cluster
    for i, row in enumerate(S):
        for j, _ in enumerate(S):
            if not (i==j):
                S[i, j] = adjusted_rand_score(labels[i], labels[j])
    for i, row in enumerate(S):
        S[i,i] = row.mean()
    print(S)
    found = False
    if (S[S < 0.7]).sum() > 0:
        i = 2
    else:
        i = 1
        
    # find best guessed label (latent variable)
    while not found:
        # print(f'i={i}')
        tmp = KMeans(n_clusters = i, n_init = 100, max_iter = 5000).fit(S)
        k = tmp.labels_
        max = 0
        for c in range(tmp.cluster_centers_.shape[0]): # for k clusters
            score = S[k == c, k == c].mean()
            if score > max and (k==c).sum() > 1:
                max = score
                idx = (k == c)
        if max > 0.8:
            found = True
        if i >= 3:
            found = True
        
        i += 1
    # guess number of clusters
    tmp = []
    for label in labels[idx]:
        tmp.append(np.unique(label).shape[0])
        print(tmp)
    cluster_max = np.floor(np.mean(tmp)+0.5).astype(np.int)
    
    
    # (i) calculate cell-cell weighted similarity matrix 
    W = S * (1 - S)
    print(W.max(), W.min())
    # then combine the clustering results using the wMetaC
    # wMetaC = AgglomerativeClustering(n_clusters=k_classes, linkage='ward')
    # # wMetaC = AgglomerativeClustering(n_clusters=k_classes, affinity='precomputed')
    # wMetaC.fit(latent)
    # # wMetaC.fit(clustered.affinity_matrix_.toarray())
    # print(wMetaC)
    # print(clustered.labels_)
    # print(wMetaC.labels_)

        # print(latent.size())
    
    


(1600, 15)
finding best k...
best k is 5
running SC on k=5...
SpectralClustering(affinity='nearest_neighbors', eigen_solver='arpack',
                   n_clusters=5, n_jobs=-1, n_neighbors=7, random_state=0)
(1600, 15)
finding best k...
best k is 5
running SC on k=5...
SpectralClustering(affinity='nearest_neighbors', eigen_solver='arpack',
                   n_clusters=5, n_jobs=-1, n_neighbors=7, random_state=0)
(1600, 15)
finding best k...
best k is 5
running SC on k=5...
SpectralClustering(affinity='nearest_neighbors', eigen_solver='arpack',
                   n_clusters=5, n_jobs=-1, n_neighbors=7, random_state=0)
[[1 3 3 ... 3 1 3]
 [0 0 0 ... 0 3 1]
 [2 0 2 ... 3 3 4]]
[[ 4.09367810e-05  1.27577776e-03 -1.15296742e-03]
 [ 1.27577776e-03  6.17736990e-04  5.77433208e-04]
 [-1.15296742e-03  5.77433208e-04 -1.91844737e-04]]


NameError: name 'idx' is not defined

In [4]:
pipeline('campbell', 1000, 3, False, 1000, 5, vae_choice='paper')

training non-negative kernel autoencoder


100%|██████████| 22/22 [00:00<00:00, 106.05it/s]


Loss:  0.001624945318326354


100%|██████████| 22/22 [00:00<00:00, 107.14it/s]


Loss:  0.0016046841628849506


100%|██████████| 22/22 [00:00<00:00, 108.25it/s]


Loss:  0.0015911390073597431
training stacked bayesian autoencoder
phase 1: the warm-up process


100%|██████████| 22/22 [00:00<00:00, 154.05it/s]


Loss:  0.02688029780983925


100%|██████████| 22/22 [00:00<00:00, 159.85it/s]


Loss:  0.01863638311624527


100%|██████████| 22/22 [00:00<00:00, 152.01it/s]


Loss:  0.0181533545255661


100%|██████████| 22/22 [00:00<00:00, 156.57it/s]


Loss:  0.01757773384451866


100%|██████████| 22/22 [00:00<00:00, 161.01it/s]


Loss:  0.01729338802397251
phase 2: the VAE stage


100%|██████████| 22/22 [00:00<00:00, 132.88it/s]


Loss:  2.0298502445220947


100%|██████████| 22/22 [00:00<00:00, 140.50it/s]


Loss:  1.9593911170959473


100%|██████████| 22/22 [00:00<00:00, 116.08it/s]


Loss:  1.9687726497650146


100%|██████████| 22/22 [00:00<00:00, 139.61it/s]


Loss:  1.9809030294418335


100%|██████████| 22/22 [00:00<00:00, 146.08it/s]


Loss:  1.951000690460205


tensor([[-0.0781,  0.0789,  0.0101,  ...,  0.0791, -0.1060,  0.0511],
        [-0.0928,  0.0692,  0.0293,  ...,  0.0207, -0.0778,  0.0243],
        [-0.0822,  0.0621, -0.0014,  ...,  0.0103, -0.1345,  0.0899],
        ...,
        [-0.0850,  0.0382, -0.0398,  ...,  0.0492, -0.0836,  0.1068],
        [-0.0944,  0.0489, -0.0496,  ...,  0.0135, -0.0710,  0.1006],
        [ 0.0677,  0.8861,  0.1220,  ...,  0.1088, -0.0515, -0.2227]],
       device='cuda:0')