# scVAEDer pipeline

In [None]:
import torch
import numpy as np
import matplotlib.pyplot as plt
import os
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import matplotlib.pyplot as plt
from utils.helper import sample_batch,extract,make_beta_schedule
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical
from torch.utils.data import DataLoader, TensorDataset
import pandas as pd
import umap.umap_ as umap
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Load the table; the 'clusters' column holds the label of each row and every
# other column is used as a feature.
Data_fin = pd.read_csv("clust.csv", index_col=0)
Y = Data_fin['clusters']
feature_mask = Data_fin.columns != 'clusters'

# Distinct cluster names and how many rows carry each of them.
classes, counts = np.unique(Y, return_counts=True)

# Integer-encode the cluster names (used later to colour the plots).
label_encoder = LabelEncoder()
numeric_labels = label_encoder.fit_transform(Data_fin["clusters"])

# Feature matrix as a float64 NumPy array.
data = Data_fin.loc[:, feature_mask].to_numpy(dtype=np.float64)

After reading and preprocessing the data, we divide it into training and test sets (you can also add a validation set here if you want to find the best architecture for your model). There are a lot of ways to divide the data in order to train a machine learning model (for more information please check: https://machinelearningmastery.com/train-test-split-for-evaluating-machine-learning-algorithms/ — thank you, Jason!).

In [None]:
# Split by row position: the first 1100 rows are the training set, the
# remaining rows are the test set.
# NOTE(review): rows are taken in file order without shuffling — confirm the
# CSV is not sorted by cluster.
train_data, test_data = data[:1100], data[1100:]

# Convert data to PyTorch tensors and create data loaders
train_data = torch.Tensor(train_data)
test_data = torch.Tensor(test_data)
loader_options = {"batch_size": 200, "shuffle": True}
train_loader = DataLoader(TensorDataset(train_data), **loader_options)
test_loader = DataLoader(TensorDataset(test_data), **loader_options)




Let's build our autoencoder (AE). You can choose different types of AE; in our paper we used a VAE.

In [None]:
laten_size = 30    # Size of the latent layer
layer1 = 100       # Size of the first layer of encoder and decoder
input_size = 1845  # Width of the encoder input and of the decoder output


# Define the variational autoencoder model
class VAE(nn.Module):
    """Fully connected variational autoencoder.

    The encoder maps the input to a ``latent_dim``-wide code; two linear heads
    (``mu`` and ``log_var``) turn that code into the parameters of the
    approximate posterior, and the decoder maps a latent sample back to the
    input width.

    Args:
        input_dim: Width of the input/output. Defaults to the module-level
            ``input_size``.
        hidden_dim: Width of the hidden layer of encoder and decoder.
            Defaults to the module-level ``layer1``.
        latent_dim: Width of the latent code. Defaults to the module-level
            ``laten_size``.
    """

    def __init__(self, input_dim=None, hidden_dim=None, latent_dim=None):
        super().__init__()
        # Resolve the sizes at construction time so that the module-level
        # defaults can still be edited before the model is instantiated.
        self.input_dim = input_size if input_dim is None else input_dim
        self.hidden_dim = layer1 if hidden_dim is None else hidden_dim
        self.latent_dim = laten_size if latent_dim is None else latent_dim

        self.encoder = nn.Sequential(
            nn.Linear(self.input_dim, self.hidden_dim),
            nn.ReLU(),
            nn.Linear(self.hidden_dim, self.latent_dim),
        )
        self.decoder = nn.Sequential(
            nn.Linear(self.latent_dim, self.hidden_dim),
            nn.ReLU(),
            nn.Dropout(p=0.2),
            nn.Linear(self.hidden_dim, self.input_dim),
        )
        self.mu = nn.Linear(self.latent_dim, self.latent_dim)
        self.log_var = nn.Linear(self.latent_dim, self.latent_dim)

    def reparameterize(self, mu, log_var):
        """Draw ``mu + eps * std`` with ``eps ~ N(0, I)`` and ``std = exp(log_var / 2)``."""
        std = torch.exp(0.5 * log_var)
        eps = torch.randn_like(std)
        return mu + eps * std

    def forward(self, x):
        """Return ``(reconstruction, mu, log_var)`` for the batch ``x``."""
        x = self.encoder(x)
        mu = self.mu(x)
        log_var = self.log_var(x)
        z = self.reparameterize(mu, log_var)
        x = self.decoder(z)
        return x, mu, log_var

    def sample(self, num_samples):
        """Decode ``num_samples`` latent vectors drawn from a standard normal."""
        # Draw the noise where the weights live; a CPU tensor would not be
        # accepted by a decoder that has been moved to another device.
        param_device = next(self.parameters()).device
        z = torch.randn(num_samples, self.latent_dim, device=param_device)
        return self.decoder(z)

# Build the VAE and an Adam optimizer (learning rate 1e-3) over its parameters
model = VAE()
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)

# Define the loss function: we also tried Binary Cross-Entropy (BCE) as the
# reconstruction loss of the VAE, but MSE worked better (lower error).
def vae_loss(x, x_recon, mu, log_var):
    """Return summed squared reconstruction error plus the KL divergence term.

    Both terms are summed (not averaged) over every element of the batch.
    """
    reconstruction = F.mse_loss(x_recon, x, reduction='sum')
    kl_terms = 1 + log_var - mu.pow(2) - log_var.exp()
    kl_divergence = -0.5 * torch.sum(kl_terms)
    return reconstruction + kl_divergence

# Train the model, recording the per-sample loss of both splits every epoch
train_losses = []
test_losses = []
num_epochs = 150
for epoch in range(num_epochs):
    # --- optimisation pass over the training batches ---
    model.train()
    train_loss = 0
    for (x,) in train_loader:
        optimizer.zero_grad()
        x_recon, mu, log_var = model(x)
        loss = vae_loss(x, x_recon, mu, log_var)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
    # vae_loss is a sum over the batch, so divide by the number of samples.
    train_losses.append(train_loss / len(train_data))

    # --- evaluation pass over the test batches (no gradients, eval mode) ---
    model.eval()
    test_loss = 0
    with torch.no_grad():
        for (x,) in test_loader:
            x_recon, mu, log_var = model(x)
            loss = vae_loss(x, x_recon, mu, log_var)
            test_loss += loss.item()
    test_losses.append(test_loss / len(test_data))

    print(f"Epoch {epoch+1}, Train Loss: {train_losses[-1]:.4f}, Test Loss: {test_losses[-1]:.4f}")

# Plot the per-epoch training and test loss curves
for curve, curve_label in ((train_losses, 'Train Loss'), (test_losses, 'Test Loss')):
    plt.plot(curve, label=curve_label)
plt.legend()
plt.show()

# Convert the full data matrix to a float32 PyTorch tensor
data_tensor = torch.tensor(data).float()

# Compute the latent layer for the data: the raw output of model.encoder
# (the mu / log_var heads are not applied here), in eval mode, without gradients.
model.eval()
with torch.no_grad():
    latent_layer = model.encoder(data_tensor)

We can now check what the latent layer looks like by using UMAP or t-SNE (you can change the parameter values of UMAP to get a nicer plot). We also found that normalizing the latent layer (subtracting its mean and dividing by its standard deviation) helps the training process of the DDM.

In [None]:
# Standardise the latent layer with a single scalar mean and standard
# deviation taken over all of its entries.
latent_mean = latent_layer.mean()
latent_std = latent_layer.std()
latent_layer = (latent_layer - latent_mean) / latent_std

In [None]:
# Visualize the latent layer using UMAP, colouring points by encoded cluster
reducer = umap.UMAP(n_neighbors=100, min_dist=0.3, random_state=42)
umap_embedding = reducer.fit_transform(latent_layer)
plt.scatter(umap_embedding[:, 0], umap_embedding[:, 1], c=numeric_labels, s=5, cmap='viridis')
plt.show()

# The latent codes are the training data of the diffusion model.
dataset = torch.Tensor(latent_layer).float()

Now we want to train the DDM. First, as described in the method section, we define different types of schedulers (more details can be found here: https://github.com/acids-ircam/diffusion_models). ***IMPORTANT: I FOUND THE NUMBER OF STEPS (num_steps) EXTREMELY IMPORTANT FOR THE QUALITY OF THE GENERATED SAMPLES, SO PLEASE TRY DIFFERENT VALUES (hyper-parameter search):***

In [None]:
def make_beta_schedule(schedule='linear', num_steps=1000, start=1e-5, end=1e-2):
    """Return a 1-D tensor with ``num_steps`` noise variances (betas).

    Args:
        schedule: ``'linear'`` (evenly spaced between ``start`` and ``end``),
            ``'quad'`` (evenly spaced in square-root space, then squared) or
            ``'sigmoid'`` (a sigmoid over ``[-6, 6]`` rescaled to
            ``[start, end]``).
        num_steps: Number of diffusion steps.
        start: Lower bound of the schedule.
        end: Upper bound of the schedule.

    Raises:
        ValueError: If ``schedule`` is not one of the supported names.
    """
    if schedule == 'linear':
        betas = torch.linspace(start, end, num_steps)
    elif schedule == "quad":
        betas = torch.linspace(start ** 0.5, end ** 0.5, num_steps) ** 2
    elif schedule == "sigmoid":
        betas = torch.linspace(-6, 6, num_steps)
        betas = torch.sigmoid(betas) * (end - start) + start
    else:
        # Without this branch an unknown name fell through to ``return betas``
        # and failed with an UnboundLocalError.
        raise ValueError(
            "Unknown schedule {!r}; expected 'linear', 'quad' or 'sigmoid'".format(schedule)
        )
    return betas

def extract(input, t, x):
    """Pick ``input[t]`` and reshape it so that it broadcasts against ``x``.

    The result has shape ``(len(t), 1, ..., 1)`` with as many dimensions as
    ``x``.
    """
    gathered = torch.gather(input, 0, t.to(input.device))
    trailing_ones = (1,) * (len(x.shape) - 1)
    return gathered.reshape(t.shape[0], *trailing_ones)

def plot_schedule(num_steps, schedule):
    """Plot the module-level ``betas`` and the two curves derived from ``alphas_prod``.

    ``schedule`` is only used for the figure title.
    """
    steps = list(range(num_steps))
    curves = (
        ('betas', betas),
        ('sqrt_alphas_prod', torch.sqrt(alphas_prod)),
        ('sqrt_one_minus_alphas_prod', torch.sqrt(1 - alphas_prod)),
    )
    for curve_name, values in curves:
        plt.plot(steps, values.numpy(), label=curve_name)
    plt.legend([curve_name for curve_name, _ in curves], loc='upper left')
    plt.xlabel('steps')
    plt.ylabel('value')
    plt.title('{} schedule'.format(schedule))
    plt.show()

num_steps = 2000
schedule = 'sigmoid'

# Noise schedule and the quantities derived from it.
betas = make_beta_schedule(schedule=schedule, num_steps=num_steps, start=1e-5, end=1e-2)
alphas = 1 - betas
alphas_prod = torch.cumprod(alphas, 0)
# Cumulative product shifted by one step, with 1 prepended for the first step.
alphas_prod_p = torch.cat([torch.ones(1, dtype=torch.float32), alphas_prod[:-1]], 0)
one_minus_alphas_prod = 1 - alphas_prod
alphas_bar_sqrt = alphas_prod.sqrt()
one_minus_alphas_bar_log = one_minus_alphas_prod.log()
one_minus_alphas_bar_sqrt = one_minus_alphas_prod.sqrt()
plot_schedule(num_steps, schedule)



Calculate the sample value x_t at any time step t from x_0 using the reparameterization trick; x_t can be computed directly from the initial value x_0 without iterating through the intermediate steps.

In [None]:
def q_x(x_0, t):
    """Return a noised version of ``x_0`` at time step ``t``.

    Computes ``alphas_bar_sqrt[t] * x_0 + one_minus_alphas_bar_sqrt[t] * noise``
    with standard-normal ``noise``, using the module-level schedule tensors.
    The result lives on the module-level ``device``.
    """
    noise = torch.randn_like(x_0).to(device)
    # Move x_0 as well: noise and coefficients are already sent to ``device``,
    # so leaving x_0 elsewhere would mix devices in the expression below.
    x_0 = x_0.to(device)
    alphas_t = alphas_bar_sqrt[t].to(device)
    alphas_1_m_t = one_minus_alphas_bar_sqrt[t].to(device)
    return alphas_t * x_0 + alphas_1_m_t * noise

Run the code below to check the forward process. (It is not necessary, so I have commented it out.)

In [None]:
#num_shows = 20
#fig,axs = plt.subplots(2,10,figsize=(20,5))
#plt.rc('text',color='black')

#for i in range(num_shows):
    #print(i)
    #j = i//10
    #k = i%10
    #q_i = q_x(dataset,torch.tensor([i*num_steps//num_shows]))#Generate sample data at time t
    #umap_emb = umap.UMAP(n_neighbors=80, min_dist=0.3).fit_transform(q_i)
    #axs[j,k].scatter(umap_emb[:, 0], umap_emb[:, 1], s=5);

As can be seen from the figure above, adding random noise to the initial data gradually degrades its structure. Throughout the forward diffusion process, noise is systematically added, resulting in increasingly noisy data. A neural network is then trained to learn the noise that is added at each iteration of the forward process, i.e. the patterns and characteristics of the noise introduced at each step.

In [None]:
class MLPDiffusion(nn.Module):
    """MLP that maps a latent vector and a time step to a same-sized output.

    Three ``Linear -> (+ step embedding) -> ReLU`` stages are followed by a
    final linear layer back to the latent width.

    Args:
        n_steps: Number of diffusion steps (size of each embedding table).
        num_units: Width of the hidden layers and of the step embeddings.
        latent_dim: Width of the input/output vectors. Defaults to the
            module-level ``laten_size``.
    """

    def __init__(self, n_steps, num_units=512, latent_dim=None):
        super().__init__()
        # Fall back to the notebook-wide latent size when none is given.
        if latent_dim is None:
            latent_dim = laten_size
        self.latent_dim = latent_dim

        # Layer layout (and therefore state-dict keys) is kept as-is:
        # linears.{0,2,4,6} are Linear, linears.{1,3,5} are ReLU.
        self.linears = nn.ModuleList(
            [
                nn.Linear(latent_dim, num_units),
                nn.ReLU(),
                nn.Linear(num_units, num_units),
                nn.ReLU(),
                nn.Linear(num_units, num_units),
                nn.ReLU(),
                nn.Linear(num_units, latent_dim),
            ]
        )
        # One learned embedding table per hidden stage.
        self.step_embeddings = nn.ModuleList(
            [
                nn.Embedding(n_steps, num_units),
                nn.Embedding(n_steps, num_units),
                nn.Embedding(n_steps, num_units),
            ]
        )

    def forward(self, x, t):
        """Return the network output for inputs ``x`` at integer time steps ``t``."""
        for idx, embedding_layer in enumerate(self.step_embeddings):
            t_embedding = embedding_layer(t)
            x = self.linears[2 * idx](x)      # Linear
            x = x + t_embedding               # inject the time-step information
            x = self.linears[2 * idx + 1](x)  # ReLU

        return self.linears[-1](x)


We define the error function for training:


In [None]:
def diffusion_loss_fn(model, x_0, alphas_bar_sqrt, one_minus_alphas_bar_sqrt, n_steps):
    """Sample random time steps, noise ``x_0`` accordingly and score the model.

    Args:
        model: Callable ``model(x, t)`` returning a tensor shaped like ``x``.
        x_0: Batch of clean samples, shape ``(batch, ...)``.
        alphas_bar_sqrt: Per-step coefficient applied to ``x_0``.
        one_minus_alphas_bar_sqrt: Per-step coefficient applied to the noise.
        n_steps: Number of diffusion steps.

    Returns:
        Scalar tensor: mean squared error between the drawn noise and the
        model output.
    """
    batch_size = x_0.shape[0]

    # Draw half of the time steps at random and mirror them (t, n_steps-1-t).
    # Rounding the half up and trimming keeps exactly one step per sample even
    # when the batch size is odd.
    half = (batch_size + 1) // 2
    t = torch.randint(0, n_steps, size=(half,), device=x_0.device)
    t = torch.cat([t, n_steps - 1 - t], dim=0)[:batch_size]
    t = t.unsqueeze(-1)  # (batch, 1) so the coefficients broadcast over features

    # Coefficient of x_0
    a = alphas_bar_sqrt[t]

    # Coefficient of eps
    aml = one_minus_alphas_bar_sqrt[t]

    # Random noise eps, created like x_0 (same shape, dtype and device)
    e = torch.randn_like(x_0)

    # Noised input of the model
    x = x_0 * a + e * aml

    # Model output for the noised input at time t
    output = model(x, t.squeeze(-1))

    # Mean squared error against the noise that was actually added
    return (e - output).square().mean()

In [None]:
def p_sample_loop(model, shape, n_steps, betas, one_minus_alphas_bar_sqrt):
    """Run the reverse process from pure noise, step ``n_steps-1`` down to 0.

    Returns a list of ``n_steps + 1`` tensors: the initial noise followed by
    the output of every call to ``p_sample``.
    """
    cur_x = torch.randn(shape).to(device)
    x_seq = [cur_x]
    for step in range(n_steps - 1, -1, -1):
        cur_x = p_sample(model, cur_x, step, betas, one_minus_alphas_bar_sqrt).to(device)
        x_seq.append(cur_x)
    return x_seq


def p_sample(model, x, t, betas, one_minus_alphas_bar_sqrt):
    """Perform one reverse-diffusion step from time ``t``.

    Args:
        model: Callable ``model(x, t)`` returning a tensor shaped like ``x``.
        x: Current sample.
        t: Integer time step.
        betas: 1-D tensor of per-step betas.
        one_minus_alphas_bar_sqrt: 1-D tensor of per-step coefficients.

    Returns:
        ``mean + sqrt(betas[t]) * z`` with ``z ~ N(0, I)`` and
        ``mean = (x - betas[t] / one_minus_alphas_bar_sqrt[t] * model(x, t))
        / sqrt(1 - betas[t])``. The result carries no autograd history.
    """
    # Work wherever the current sample lives instead of forcing the CPU.
    work_device = x.device
    t = torch.tensor([t], device=work_device)
    betas = betas.to(work_device)
    one_minus_alphas_bar_sqrt = one_minus_alphas_bar_sqrt.to(work_device)

    # Sampling needs no gradients; without no_grad every step would extend an
    # autograd graph through all previous steps of the sampling loop.
    with torch.no_grad():
        beta_t = betas[t]
        coeff = beta_t / one_minus_alphas_bar_sqrt[t]

        eps_theta = model(x, t).to(work_device)

        mean = (1 / (1 - beta_t).sqrt()) * (x - coeff * eps_theta)

        # NOTE(review): noise is added at every step, including t == 0 —
        # confirm that this is intended for the final step.
        z = torch.randn_like(x)
        sigma_t = beta_t.sqrt()

        sample = mean + sigma_t * z

    return sample

The trained model can be used to generate novel data. To accomplish this, we simply start from Gaussian noise and use the formula mentioned in the method section to perform sampling. It has previously been reported that an Exponential Moving Average (EMA) can improve the training process.

In [None]:
class EMA():
    """Exponential moving average of named values.

    ``mu`` is the weight given to each new value; the stored average keeps
    weight ``1 - mu``.
    """

    def __init__(self, mu=0.001):
        self.shadow = {}
        self.mu = mu

    def register(self, name, val):
        """Start tracking ``name`` with a copy of ``val`` as its average."""
        self.shadow[name] = val.clone()

    def __call__(self, name, x):
        """Blend ``x`` into the average of ``name`` and return the new average."""
        assert name in self.shadow
        previous = self.shadow[name]
        decay = 1.0 - self.mu
        blended = self.mu * x + decay * previous
        # Store a copy so the returned tensor and the stored one stay distinct.
        self.shadow[name] = blended.clone()
        return blended

# Train on the GPU when one is present; fall back to the CPU otherwise
# (a hard-coded "cuda" fails on machines without a GPU).
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
total_loss = []  # mean training loss of every epoch
print('Training model...')
batch_size = 1200
dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True)
num_epoch = 3000
plt.rc('text', color='blue')
model2 = MLPDiffusion(num_steps)
# The DataParallel wrapper is kept so that the saved state-dict keys carry
# the "module." prefix.
model2 = torch.nn.DataParallel(model2).to(device)
alphas_bar_sqrt = alphas_bar_sqrt.to(device)
one_minus_alphas_bar_sqrt = one_minus_alphas_bar_sqrt.to(device)
optimizer = torch.optim.Adam(model2.parameters(), lr=1e-3)

for t in range(num_epoch):
    epoch_loss = 0.0
    for batch_x in dataloader:
        batch_x = batch_x.to(device)
        loss = diffusion_loss_fn(model2, batch_x, alphas_bar_sqrt, one_minus_alphas_bar_sqrt, num_steps)
        optimizer.zero_grad()
        loss.backward()
        # Clip the global gradient norm to 1 before the update.
        torch.nn.utils.clip_grad_norm_(model2.parameters(), 1.)
        optimizer.step()
        epoch_loss += loss.item()
    total_loss.append(epoch_loss / len(dataloader))
    # Report after the epoch so the printed value is the loss of this epoch,
    # not a value left over from earlier code.
    print("The values are: {} and {}".format(total_loss[-1], t))

Now let's save the model and change the device from GPU to CPU.

In [None]:
from collections import OrderedDict

# Specify a path to save to
PATH = "model_interpolate.pth"  # Choose whatever you like
# Save (keys carry a "module." prefix because model2 is wrapped in DataParallel)
torch.save(model2.state_dict(), PATH)
# Load
device = torch.device('cpu')
model4 = MLPDiffusion(num_steps)

# Read the checkpoint back from the same path; map_location places the stored
# tensors on `device` regardless of the device they were saved from.
state_dict = torch.load(PATH, map_location=device)
# Create a new OrderedDict whose keys do not contain the `module.` prefix;
# keys without the prefix are kept unchanged.
prefix = "module."
new_state_dict = OrderedDict(
    (key[len(prefix):] if key.startswith(prefix) else key, value)
    for key, value in state_dict.items()
)
# load params
model4.load_state_dict(new_state_dict)

By using the sampling formula we can generate novel data:

In [None]:
# Run the full reverse process; x_seq holds the starting noise plus one
# tensor per denoising step.
x_seq = p_sample_loop(
    model=model4,
    shape=dataset.shape,
    n_steps=num_steps,
    betas=betas,
    one_minus_alphas_bar_sqrt=one_minus_alphas_bar_sqrt,
)
len(x_seq)

By selecting the desired number of denoising steps, the generated data can be visualized using UMAP.
Keep in mind that increasing the number of steps leads to higher-quality results. For example, if noise is added in 1000 steps in the forward process, then I select 1000 steps for denoising (the reverse process) and save the output in cur_x:

In [None]:
# Take the final element of the sampling sequence and embed it with UMAP
cur_x = x_seq[-1].detach().cpu().numpy()
generated_reducer = umap.UMAP(n_neighbors=100, min_dist=0.2)
umap_emb = generated_reducer.fit_transform(cur_x)
plt.scatter(umap_emb[:, 0], umap_emb[:, 1], s=10);

Now we can perform clustering on the generated samples (I chose k-means; you can try other clustering methods if you want).

In [None]:
from sklearn.cluster import KMeans
import time

start_time = time.time()

# Cluster the final element of x_seq into 5 groups with KMeans
num_clusters = 5
last_tensor = x_seq[-1].detach().cpu()
kmeans = KMeans(n_clusters=num_clusters, random_state=0)
cluster_labels = kmeans.fit_predict(last_tensor)

# Embed the same samples with UMAP and colour each point by its cluster
cluster_reducer = umap.UMAP(n_neighbors=15, min_dist=0.09)
umap_emb = cluster_reducer.fit_transform(last_tensor.numpy())
fig, ax = plt.subplots(figsize=(10, 10))
scatter = ax.scatter(umap_emb[:, 0], umap_emb[:, 1], c=cluster_labels, cmap='viridis', s=10)
plt.title('UMAP plot with cluster assignments')
plt.show()

# Print the total time taken to run the code
print(f'Total time taken: {time.time() - start_time:.2f} seconds')


In [None]:
def cell_sample(cell_type):
    """Return the latent vectors of all rows whose cluster is ``cell_type``.

    Reads the module-level ``latent_layer`` and ``Data_fin``; the result is a
    DataFrame indexed like ``Data_fin`` without the 'clusters' column.
    """
    # Latent codes as a DataFrame sharing the row index of the original table.
    latent_df = pd.DataFrame(latent_layer.numpy(), index=Data_fin.index)
    labelled = latent_df.join(Data_fin['clusters'])
    selected = labelled[labelled['clusters'] == cell_type]
    return selected.drop(columns='clusters')

In [None]:
# Latent vectors of the two cell types to interpolate between
thromb = cell_sample("Thrombocytes")
hspc = cell_sample("HSPC")

# Diffuse both groups forward to the last time step
last_step = torch.tensor([num_steps - 1])
Thromb_diff_latent = q_x(torch.tensor(thromb.iloc[:, :].values), last_step)
hspc_diff_latent = q_x(torch.tensor(hspc.iloc[:, :].values), last_step)

# Mean noised latent vector of each group. The means are used directly:
# wrapping an existing tensor in torch.tensor(...) only makes a redundant
# copy and triggers a copy-construct warning.
Thromb_diff_latent_m = Thromb_diff_latent.mean(dim=0)
hspc_diff_latent_m = hspc_diff_latent.mean(dim=0)

# 2000 evenly spaced mixing weights: row i of `intp` is the blend
# (1 - alpha[i]) * Thrombocyte mean + alpha[i] * HSPC mean.
alpha = torch.tensor(np.linspace(0, 1, 2000, dtype=np.float32)).to(device)
intp = Thromb_diff_latent_m * (1 - alpha[:, None]) + hspc_diff_latent_m * alpha[:, None]

def p_sample_loop2(model, shape, n_steps, betas, one_minus_alphas_bar_sqrt):
    """Run the reverse process starting from the module-level ``intp``.

    ``shape`` is accepted but not used. Returns a list of ``n_steps + 1``
    tensors: ``intp`` followed by the output of every ``p_sample`` call.
    """
    cur_x = intp.to(device)
    x_seq = [cur_x]
    for step in range(n_steps - 1, -1, -1):
        cur_x = p_sample(model, cur_x, step, betas, one_minus_alphas_bar_sqrt)
        x_seq.append(cur_x)
    return x_seq

In [None]:
# Display the interpolated starting points (one row per mixing weight).
intp

In [None]:
# Denoise the interpolated points with the trained diffusion model
x_seq_zero = p_sample_loop2(
    model=model4,
    shape=intp.shape,
    n_steps=num_steps,
    betas=betas,
    one_minus_alphas_bar_sqrt=one_minus_alphas_bar_sqrt,
)

# Embed the final element of the sequence with UMAP
cur_x2 = x_seq_zero[-1].detach().numpy()
interp_reducer = umap.UMAP(n_neighbors=10, min_dist=0.1)
umap_emb2 = interp_reducer.fit_transform(cur_x2)
plt.scatter(umap_emb2[:, 0], umap_emb2[:, 1], s=10);

# Decode element 1500 of the denoising sequence with the VAE decoder
with torch.no_grad():
    decod_intrp = model.decoder(x_seq_zero[1500])
column_names = list(Data_fin.columns)
