## Import Packages

In [1]:
# system
import os
import pickle
import gzip

# data manipulation
import pandas as pd
import numpy as np

# deep learning
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split

# plotting and progress reporting
import matplotlib.pyplot as plt
from tqdm import tqdm

## Load Dataset

In [2]:
# Load the gzip-compressed pickle holding gene IDs, DE labels and the
# one-hot encoded upstream regions.
# NOTE: pickle.load can execute arbitrary code while deserialising, so this
# file must come from a trusted source.
ohe_pickle_path = '../../result/one_hot_encoding/gene_id_label_ohe.pkl.gz'
with gzip.open(ohe_pickle_path, 'rb') as handle:
    data = pickle.load(handle)

In [3]:
# Show the loaded table; the notebook renders a truncated preview of it.
data

Unnamed: 0,ensembl_gene_id,DE,upstream_region_encoded
0,ENSG00000000457,0,"[[1, 0, 0, 0], [0, 0, 1, 0], [1, 0, 0, 0], [0,..."
1,ENSG00000000460,0,"[[0, 0, 0, 1], [0, 0, 0, 1], [0, 0, 0, 1], [0,..."
2,ENSG00000000938,0,"[[1, 0, 0, 0], [0, 0, 1, 0], [0, 1, 0, 0], [1,..."
3,ENSG00000000971,1,"[[0, 0, 1, 0], [0, 1, 0, 0], [0, 0, 0, 1], [0,..."
4,ENSG00000001460,0,"[[0, 0, 0, 1], [0, 0, 0, 1], [1, 0, 0, 0], [0,..."
...,...,...,...
55221,ENSG00000284520,0,"[[0, 1, 0, 0], [0, 0, 0, 1], [0, 1, 0, 0], [1,..."
55222,ENSG00000284544,0,"[[0, 1, 0, 0], [0, 0, 0, 1], [0, 0, 0, 1], [1,..."
55223,ENSG00000284554,0,"[[1, 0, 0, 0], [0, 0, 1, 0], [1, 0, 0, 0], [0,..."
55224,ENSG00000284568,0,"[[1, 0, 0, 0], [0, 0, 0, 1], [1, 0, 0, 0], [0,..."


## Quality Control

In [4]:
# Stack the per-row encoded sequences into a single array and pull out the
# DE labels as a separate vector.
encoded_sequences = data['upstream_region_encoded'].values
X = np.stack(encoded_sequences)
Y = data['DE'].values

In [5]:
# Report the dimensions of the feature array and of the label vector, one per line.
print(X.shape, Y.shape, sep='\n')

(55223, 2000, 4)
(55223,)


## Train-test Split

In [6]:
# Hold out 20% of the rows as a development set; the fixed random_state keeps
# the shuffled split repeatable.
X_train, X_dev, Y_train, Y_dev = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=123,
    shuffle=True,
)

In [7]:
# Report the shapes of the four split arrays, one per line.
print(X_train.shape, X_dev.shape, Y_train.shape, Y_dev.shape, sep='\n')

(44178, 2000, 4)
(11045, 2000, 4)
(44178,)
(11045,)


## Prepare Training Data for Generative Model

In [8]:
# The generative model is trained on positive examples only (label == 1).
positive_mask = Y_train == 1
X_train_positive = X_train[positive_mask]
print(X_train_positive.shape)

# Convert to float tensors and swap the last two axes so the one-hot channel
# axis comes before the sequence axis, i.e. (N, L, 4) -> (N, 4, L).
gen_train_data = torch.from_numpy(X_train_positive).float().transpose(1, 2)
gen_train_dataset = TensorDataset(gen_train_data)
gen_train_dataloader = DataLoader(gen_train_dataset, batch_size=32, shuffle=True)

(3342, 2000, 4)


## Define Gen Model and Training Function

In [9]:
class TimeEmbedding(nn.Module):
    """Embed a scalar timestep into a ``dim``-dimensional vector with a small MLP.

    The MLP is Linear(1, dim) -> SiLU -> Linear(dim, dim).

    NOTE(review): the training loop in this notebook passes raw integer
    timestep indices, so the first linear layer sees unscaled values; consider
    normalising or using a sinusoidal encoding -- TODO confirm the effect on
    early training loss.

    Args:
        dim: Size of the returned embedding.
    """

    def __init__(self, dim):
        super().__init__()
        self.dim = dim
        # Layers are created in the same order as they are applied.
        project_in = nn.Linear(1, dim)
        activation = nn.SiLU()
        project_out = nn.Linear(dim, dim)
        self.layers = nn.Sequential(project_in, activation, project_out)

    def forward(self, t):
        """Return the embedding for ``t``.

        ``t`` is cast to float and flattened to a column of shape (N, 1), so
        the result has shape (N, dim) where N is the number of elements in ``t``.
        """
        timestep_column = t.float().view(-1, 1)
        embedding = self.layers(timestep_column)
        return embedding

class DNADiffusionModel(nn.Module):
    """Noise-prediction network for one-hot DNA sequences, built from 1D convolutions.

    Maps a noisy batch ``x`` of shape (B, channels, L) and timesteps ``t`` of
    shape (B,) to an output with the same shape as ``x``. A time embedding is
    added to the hidden features after the input convolution and again after
    each of the four residual blocks.

    Args:
        sequence_length: Nominal sequence length, stored on the instance. The
            forward pass broadcasts the time embedding over whatever length
            ``x`` actually has, so other lengths are accepted as well.
        channels: Number of input and output channels (4 for one-hot DNA).
        hidden_dim: Width of the hidden feature maps. Every GroupNorm layer
            uses 8 groups, so this must be divisible by 8.
    """

    def __init__(self, sequence_length=2000, channels=4, hidden_dim=128):
        super().__init__()
        self.sequence_length = sequence_length
        self.channels = channels

        self.time_embed = TimeEmbedding(hidden_dim)

        # 1D CNN layers; kernel size 3 with padding 1 preserves sequence length.
        self.input_conv = nn.Conv1d(channels, hidden_dim, 3, padding=1)

        # Four residual blocks, each made of two convolutions.
        self.conv_blocks = nn.ModuleList([
            nn.ModuleList([
                nn.Conv1d(hidden_dim, hidden_dim, 3, padding=1),
                nn.Conv1d(hidden_dim, hidden_dim, 3, padding=1)
            ]) for _ in range(4)
        ])

        self.output_conv = nn.Conv1d(hidden_dim, channels, 3, padding=1)
        # One norm for the input conv plus two per residual block: 1 + 4 * 2 = 9.
        self.norm_layers = nn.ModuleList([
            nn.GroupNorm(8, hidden_dim) for _ in range(9)
        ])

    def forward(self, x, t):
        """Predict the noise contained in ``x`` at timesteps ``t``.

        Args:
            x: Tensor of shape (B, C, L); C is 4 in this notebook.
            t: Tensor of shape (B,) with one timestep per batch element.

        Returns:
            Tensor of shape (B, C, L).
        """
        # (B, hidden_dim) -> (B, hidden_dim, 1). Broadcasting spreads the
        # embedding over the length axis, so no fixed length is baked in.
        t_emb = self.time_embed(t).unsqueeze(-1)

        h = self.input_conv(x)
        h = F.silu(self.norm_layers[0](h))
        h = h + t_emb

        # norm_layers[0] belongs to the input conv; block k uses the layers at
        # indices 1 + 2k and 2 + 2k.
        for block_idx, (conv1, conv2) in enumerate(self.conv_blocks):
            norm1 = self.norm_layers[1 + 2 * block_idx]
            norm2 = self.norm_layers[2 + 2 * block_idx]

            residual = h
            h = F.silu(norm1(conv1(h)))
            h = F.silu(norm2(conv2(h)))
            h = residual + h
            h = h + t_emb

        return self.output_conv(h)

class DiffusionTrainer:
    """Forward-noising and reverse-sampling helper for a DDPM-style diffusion process.

    The noise schedule is linear in beta. All schedule tensors are created on
    ``device`` instead of being forced onto CUDA, so the class also works on
    CPU-only machines.

    Args:
        sequence_length: Length L of the sequences to sample.
        channels: Number of channels C (4 for one-hot DNA).
        timesteps: Number of diffusion steps T.
        beta_start: First value of the linear beta schedule.
        beta_end: Last value of the linear beta schedule.
        device: Device for the schedule tensors. ``None`` selects CUDA when it
            is available and the CPU otherwise.
    """

    def __init__(self, sequence_length=2000, channels=4, timesteps=1000,
                 beta_start=1e-4, beta_end=0.02, device=None):
        self.sequence_length = sequence_length
        self.channels = channels
        self.timesteps = timesteps

        if device is None:
            device = "cuda" if torch.cuda.is_available() else "cpu"
        self.device = torch.device(device)

        # Built on the CPU and then moved, as before, so values are unchanged.
        self.betas = torch.linspace(beta_start, beta_end, timesteps).to(self.device)
        self.alphas = 1. - self.betas
        self.alphas_cumprod = torch.cumprod(self.alphas, dim=0)
        self.sqrt_alphas_cumprod = torch.sqrt(self.alphas_cumprod)
        self.sqrt_one_minus_alphas_cumprod = torch.sqrt(1. - self.alphas_cumprod)

    def add_noise(self, x, t):
        """Diffuse clean samples ``x`` to timesteps ``t``.

        Args:
            x: Tensor of shape (B, C, L).
            t: Long tensor of shape (B,) with values in [0, timesteps).

        Returns:
            A ``(noisy_x, noise)`` tuple, both on ``x``'s device, where
            ``noisy_x = sqrt(alpha_bar_t) * x + sqrt(1 - alpha_bar_t) * noise``.
        """
        noise = torch.randn_like(x)

        # Index the schedule on its own device, then follow x's device.
        schedule_index = t.to(self.sqrt_alphas_cumprod.device)
        sqrt_alpha_cumprod = (
            self.sqrt_alphas_cumprod[schedule_index].view(-1, 1, 1).to(x.device)
        )
        sqrt_one_minus_alpha_cumprod = (
            self.sqrt_one_minus_alphas_cumprod[schedule_index].view(-1, 1, 1).to(x.device)
        )

        return sqrt_alpha_cumprod * x + sqrt_one_minus_alpha_cumprod * noise, noise

    def sample(self, model, batch_size, device='cuda'):
        """Draw ``batch_size`` sequences by running the reverse process from pure noise.

        The model is switched to eval mode and left there. ``model`` must
        already live on ``device``.

        Args:
            model: Callable ``model(x, t)`` returning predicted noise shaped like ``x``.
            batch_size: Number of sequences to generate.
            device: Device on which sampling runs.

        Returns:
            Float tensor of shape (batch_size, channels, sequence_length) that is
            one-hot along the channel axis (argmax of the final state).
        """
        device = torch.device(device)
        betas = self.betas.to(device)
        alphas = self.alphas.to(device)
        alphas_cumprod = self.alphas_cumprod.to(device)

        model.eval()
        with torch.no_grad():
            x = torch.randn(batch_size, self.channels, self.sequence_length, device=device)

            for t in range(self.timesteps - 1, -1, -1):
                t_batch = torch.full((batch_size,), t, dtype=torch.long, device=device)
                predicted_noise = model(x, t_batch)

                alpha = alphas[t]
                alpha_cumprod = alphas_cumprod[t]
                beta = betas[t]

                # Posterior mean given the predicted noise.
                x = (1 / torch.sqrt(alpha)) * (
                    x - ((1 - alpha) / torch.sqrt(1 - alpha_cumprod)) * predicted_noise
                )

                # Fresh noise is injected at every step except the last (t == 0).
                if t > 0:
                    x = x + torch.sqrt(beta) * torch.randn_like(x)

            # Collapse the continuous state to a one-hot base per position.
            x = F.one_hot(x.argmax(dim=1), num_classes=self.channels).permute(0, 2, 1).float()
        return x

def train_diffusion(model, trainer, train_loader, optimizer, num_epochs=100):
    """Train ``model`` to predict the noise added by ``trainer``.

    For every batch a random timestep is drawn per example, the batch is noised
    with ``trainer.add_noise``, and the MSE between predicted and true noise is
    minimised. The mean batch loss is printed after each epoch.

    Batches are moved to the device the model's parameters live on, so the
    function runs on CPU as well as CUDA.

    Args:
        model: Module called as ``model(noisy_x, t)``.
        trainer: Object exposing ``timesteps`` and ``add_noise(x, t)``.
        train_loader: Iterable of batches whose first element is the data tensor.
        optimizer: Optimizer over ``model``'s parameters.
        num_epochs: Number of passes over ``train_loader``.

    Raises:
        ValueError: If ``train_loader`` yields no batches in an epoch.
    """
    # Follow the model's own device; fall back to CPU for parameter-free models.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    model.train()
    for epoch in range(num_epochs):
        total_loss = 0.0
        num_batches = 0
        for batch in train_loader:
            x = batch[0].to(device)

            # One uniformly sampled timestep per example in the batch.
            t = torch.randint(
                0, trainer.timesteps, (x.shape[0],), dtype=torch.long, device=device
            )

            noisy_x, target_noise = trainer.add_noise(x, t)
            predicted_noise = model(noisy_x, t)
            loss = F.mse_loss(predicted_noise, target_noise)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            num_batches += 1

        if num_batches == 0:
            raise ValueError("train_loader yielded no batches; cannot compute an average loss")

        avg_loss = total_loss / num_batches
        print(f"Epoch {epoch+1}, Average Loss: {avg_loss:.4f}")

In [10]:
# Seed torch before building the model so weight initialisation, noise draws
# and batch shuffling start from a known state (same value as the random_state
# used for the train/dev split). Some GPU kernels may still be nondeterministic.
SEED = 123
torch.manual_seed(SEED)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = DNADiffusionModel().to(device)
trainer = DiffusionTrainer()
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)

train_diffusion(model, trainer, gen_train_dataloader, optimizer)

Epoch 1, Average Loss: 6569.5255
Epoch 2, Average Loss: 13.3280
Epoch 3, Average Loss: 12.2871
Epoch 4, Average Loss: 11.3294
Epoch 5, Average Loss: 10.7546
Epoch 6, Average Loss: 10.0462
Epoch 7, Average Loss: 9.5349
Epoch 8, Average Loss: 8.9760
Epoch 9, Average Loss: 8.3820
Epoch 10, Average Loss: 7.5747
Epoch 11, Average Loss: 7.0177
Epoch 12, Average Loss: 6.5203
Epoch 13, Average Loss: 6.0056
Epoch 14, Average Loss: 5.5307
Epoch 15, Average Loss: 4.8236
Epoch 16, Average Loss: 4.6197
Epoch 17, Average Loss: 4.0778
Epoch 18, Average Loss: 3.6245
Epoch 19, Average Loss: 2.9283
Epoch 20, Average Loss: 2.5272
Epoch 21, Average Loss: 2.1937
Epoch 22, Average Loss: 1.9475
Epoch 23, Average Loss: 1.7547
Epoch 24, Average Loss: 1.5951
Epoch 25, Average Loss: 1.4241
Epoch 26, Average Loss: 1.2691
Epoch 27, Average Loss: 1.1465
Epoch 28, Average Loss: 1.0391
Epoch 29, Average Loss: 0.9072
Epoch 30, Average Loss: 0.7879
Epoch 31, Average Loss: 0.6687
Epoch 32, Average Loss: 0.6704
Epoch 33,

In [11]:
# Generate N_GEN_BATCHES * GEN_BATCH_SIZE = 110 * 500 = 55,000 fake positive sequences.
N_GEN_BATCHES = 110
GEN_BATCH_SIZE = 500

generated_data = []
for _ in tqdm(range(N_GEN_BATCHES)):
    # trainer.sample already runs under torch.no_grad(), so no extra context is needed.
    new_sequences = trainer.sample(model, batch_size=GEN_BATCH_SIZE, device=device)
    # Move each finished batch to host memory so the full set of samples does
    # not pile up on the accelerator while sampling continues.
    generated_data.append(new_sequences.cpu())

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 110/110 [2:12:46<00:00, 72.42s/it]


In [12]:
# Join the per-batch tensors along the batch axis (dim 0). torch.cat keeps the
# trailing (channels, length) dimensions as they are, so no shape is hard-coded.
# The isinstance guard keeps this cell re-runnable: after the first run
# `generated_data` is already a single tensor.
if isinstance(generated_data, (list, tuple)):
    generated_data = torch.cat(generated_data, dim=0)
print(generated_data.shape)

torch.Size([55000, 4, 2000])


In [13]:
# Copy the generated tensor to host memory as a NumPy array and write it to an
# uncompressed .npz archive under the key 'X_gen'.
generated_array = generated_data.cpu().numpy()
np.savez('generated_X.npz', X_gen=generated_array)