In [11]:
# An file for on-target DeepPE (pre-)training.

import os, sys
import numpy as np
import pandas as pd
import torch
from torch.optim import AdamW, lr_scheduler
from torch.utils.data import DataLoader
from tqdm import tqdm

device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

def preprocess_seq(data, seq_length):

    seq_onehot = np.zeros((len(data), 1, seq_length, 4), dtype=float)

    for l in range(len(data)):
        for i in range(seq_length):
            try:
                data[l][i]
            except Exception:
                print(data[l], i, seq_length, len(data))

            if   data[l][i] in "Aa":  seq_onehot[l, 0, i, 0] = 1
            elif data[l][i] in "Cc":  seq_onehot[l, 0, i, 1] = 1
            elif data[l][i] in "Gg":  seq_onehot[l, 0, i, 2] = 1
            elif data[l][i] in "Tt":  seq_onehot[l, 0, i, 3] = 1
            elif data[l][i] in "Xx":  pass
            elif data[l][i] in "Nn.": pass
            else:
                print("[Input Error] Non-ATGC character " + data[l])
                sys.exit()

    return seq_onehot
def seq_concat(data, col1='WT74_On', col2='Edited74_On', seq_length=74):
    wt = preprocess_seq(data[col1], seq_length)
    ed = preprocess_seq(data[col2], seq_length)
    g = np.concatenate((wt, ed), axis=1)
    g = 2 * g - 1

    return g

# LOAD & PREPROCESS GENES

data_id = 'DP_variant_293T_PE2_Conv_220428'
train_file = pd.read_csv('docs/dataset/%s.csv' % data_id)

gene_path = 'docs/dataset/genes/%s.npy' % data_id
if not os.path.isfile(gene_path):
    g_train = seq_concat(train_file)
    np.save(gene_path, g_train)
else:
    g_train = np.load(gene_path)

print(type(g_train))
print(g_train.shape)

<class 'numpy.ndarray'>
(4064, 2, 74, 4)


In [8]:
g_train = torch.tensor(g_train, dtype=torch.float32, device=device)
print(type(g_train))
print(g_train.shape)

<class 'torch.Tensor'>
torch.Size([4064, 2, 74, 4])


In [9]:
g = g_train.permute((0, 3, 1, 2))
print(type(g))
print(g.shape)

<class 'torch.Tensor'>
torch.Size([4064, 4, 2, 74])


In [6]:
import sys, os
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.nn.functional as F

class DeepCas9Model(nn.Module):
    def __init__(self, filter_size, filter_num, node_1=80, node_2=60):
        super(DeepCas9Model, self).__init__()
        length = 30

        # Define layers using torch.nn
        self.conv1 = nn.Conv2d(4, filter_num[0], kernel_size=(1, filter_size[0]))
        self.pool1 = nn.AvgPool2d(kernel_size=(1, 2), stride=(1, 2))

        self.conv2 = nn.Conv2d(filter_num[0], filter_num[1], kernel_size=(1, filter_size[1]))
        self.norm2 = nn.BatchNorm2d(filter_num[1])
        self.pool2 = nn.AvgPool2d(kernel_size=(1, 2), stride=(1, 2))

        # 원래 DeepSpCas9은 sequential하게 가는 것이 아니긴 한데... 어차피 이미 달라진 모델이니..
        # 위에 2d CNN을 2번 거치고, 이후에는 1d CNN

        self.conv3 = nn.Conv1d(filter_num[1], filter_num[2], kernel_size=filter_size[2])
        self.pool3 = nn.AvgPool1d(kernel_size=2, stride=2)

        self.flatten = nn.Flatten()
        self.dense1 = nn.Linear(filter_num[0] * ((length - filter_size[0]) // 2 + 1), node_1)
        self.dense2 = nn.Linear(filter_num[1] * ((length - filter_size[1]) // 2 + 1), node_2)
        self.output_layer = nn.Linear(filter_num[2] * ((length - filter_size[2]) // 2 + 1), 1)

    def forward(self, inputs):
        x = F.relu(self.conv1(inputs))
        x = self.pool1(x)
        x = F.relu(self.conv2(x))
        x = self.norm2(x)
        x = self.pool2(x)
        x = torch.squeeze(x, 2) # dimmension 축소
        print('step marker')
        x = F.relu(self.conv3(x)) # Error
        print('step marker')
        x = self.pool3(x)
        print('step marker')
        x = self.flatten(x)
        print('step marker')
        x = F.relu(self.dense1(x))
        print('step marker')
        x = F.relu(self.dense2(x))
        print('step marker')
        outputs = self.output_layer(x)
        return outputs


def preprocess_seq(data, seq_length):

    seq_onehot = np.zeros((len(data), 1, seq_length, 4), dtype=float)

    for l in range(len(data)):
        for i in range(seq_length):
            try:
                data[l][i]
            except Exception:
                print(data[l], i, seq_length, len(data))

            if   data[l][i] in "Aa":  seq_onehot[l, 0, i, 0] = 1
            elif data[l][i] in "Cc":  seq_onehot[l, 0, i, 1] = 1
            elif data[l][i] in "Gg":  seq_onehot[l, 0, i, 2] = 1
            elif data[l][i] in "Tt":  seq_onehot[l, 0, i, 3] = 1
            elif data[l][i] in "Xx":  pass
            elif data[l][i] in "Nn.": pass
            else:
                print("[Input Error] Non-ATGC character " + data[l])
                sys.exit()

    return seq_onehot


# Create an instance of the custom model
filter_size = [3, 5, 7]
filter_num  = [100, 70, 40]
node_1 = 80
node_2 = 60

custom_model = DeepCas9Model(filter_size=filter_size, filter_num=filter_num, node_1=node_1, node_2=node_2)

# Define the optimizer and loss function
optimizer = torch.optim.Adam(custom_model.parameters(), lr=0.005)
loss_object = nn.MSELoss()

# Function for training the model
def train_step(inputs, targets):
    optimizer.zero_grad()
    predictions = custom_model(inputs)
    loss = loss_object(predictions, targets)
    loss.backward()
    optimizer.step()
    return loss.item()

# Function for predicting using the model
def predict(inputs):
    with torch.no_grad():
        outputs = custom_model(inputs)
    return outputs.numpy()


In [20]:
import os, sys
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
import torch.nn as nn
import math


class GeneInteractionModel(nn.Module):

    def __init__(self, hidden_size=128, num_layers=1, num_features=24, dropout=0.1):
        super(GeneInteractionModel, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        self.c1 = nn.Sequential(
            nn.Conv2d(in_channels=4, out_channels=128, kernel_size=(1, 3), stride=1, padding=(0, 1)),
            nn.BatchNorm2d(128),
            nn.GELU(),
        )
        self.c2 = nn.Sequential(
            nn.Conv1d(in_channels=128, out_channels=108, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm1d(108),
            nn.GELU(),
            nn.AvgPool1d(kernel_size=2, stride=2),

            nn.Conv1d(in_channels=108, out_channels=108, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm1d(108),
            nn.GELU(),
            nn.AvgPool1d(kernel_size=2, stride=2),

            nn.Conv1d(in_channels=108, out_channels=128, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm1d(128),
            nn.GELU(),
            nn.AvgPool1d(kernel_size=2, stride=2),
        )

        self.r = nn.GRU(128, hidden_size, num_layers, batch_first=True, bidirectional=True)

        self.s = nn.Linear(2 * hidden_size, 12, bias=False)

        self.d = nn.Sequential(
            nn.Linear(num_features, 96, bias=False),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(96, 64, bias=False),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(64, 128, bias=False)
        )

        self.head = nn.Sequential(
            nn.BatchNorm1d(140),
            nn.Dropout(dropout),
            nn.Linear(140, 1, bias=True),
        )

    def forward(self, g):
        g = torch.squeeze(self.c1(g), 2)
        g = self.c2(g)
        g, _ = self.r(torch.transpose(g, 1, 2))
        g = self.s(g[:, -1, :])
        
        out = self.head(torch.cat((g), dim=1))

        return F.softplus(out)


In [4]:
df = pd.read_csv('docs/dataset/DeepSpCas9_train.csv')
df

Unnamed: 0,Target_context,indel
0,TTCTGCCTTGTTTCTTTCCTCTCTGGGTCG,24.287805
1,ACGACCTTCAGCTCAGTGACAGTGAGGACA,69.500438
2,AGGACGACGACTACAATAAGCCTCTGGATC,25.994760
3,GCAGCAAACTGACGGAGAACCTTGTGGCCC,57.964590
4,CCGGCAGATATCCGTGAAGGCTCTAGGTAC,39.355020
...,...,...
12827,GGGAATACGACGACCAGAGAGCGCTGGAGA,40.853256
12828,TCATGGATTTCCTGGCTCGGGGACTGGTCT,11.480880
12829,GCCTTGTTTCTTTCCTCTCTGGGTCGGATT,63.861469
12830,TCGCACCTGATAGAGCATGTGACAAGGAGA,51.650932


In [8]:
# Prepare train dataset

random_seed = 0
device = 'cuda' if torch.cuda.is_available() else 'cpu'

torch.manual_seed(random_seed)
torch.cuda.manual_seed(random_seed)
torch.cuda.manual_seed_all(random_seed)
np.random.seed(random_seed)

x_train = preprocess_seq(df['Target_context'], 30)
y_train = df['indel']

x_train = torch.tensor(x_train, dtype=torch.float32, device=device)
x_train = x_train.permute((0, 3, 1, 2))
y_train = torch.tensor(y_train.to_numpy(), dtype=torch.float32, device=device)

In [9]:
x_train.shape

torch.Size([12832, 4, 1, 30])

In [32]:
# TRAINING

# Create an instance of the custom model
filter_size = [3, 5, 5]
filter_num  = [100, 70, 40]
node_1 = 80
node_2 = 60

model = DeepCas9Model(filter_size=filter_size, filter_num=filter_num, node_1=node_1, node_2=node_2).to(device)

# Define the optimizer and loss function
loss_object = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.005)

# Function for training the model
def train_step(inputs, targets):
    optimizer.zero_grad()
    predictions = model(inputs)
    loss = loss_object(predictions, targets)
    loss.backward()
    optimizer.step()
    return loss.item()

train_ = train_step(x_train, y_train)

step marker
step marker
step marker
step marker
step marker
step marker
step marker
step marker


RuntimeError: Given input size: (40x1x1). Calculated output size: (40x1x0). Output size is too small

In [21]:
# TRAINING

# Create an instance of the custom model
filter_size = [3, 5, 5]
filter_num  = [100, 70, 40]
node_1 = 80
node_2 = 60

model = DeepCas9Model(filter_size=filter_size, filter_num=filter_num, node_1=node_1, node_2=node_2).to(device)

# Define the optimizer and loss function
loss_object = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.005)

# PARAMS

batch_size = 2048
learning_rate = 5e-3
weight_decay = 5e-2
T_0 = 10
T_mult = 1
hidden_size = 128
n_layers = 1
n_epochs = 10
n_models = 5


# TRAINING

for m in range(n_models):

    random_seed = m

    torch.manual_seed(random_seed)
    torch.cuda.manual_seed(random_seed)
    torch.cuda.manual_seed_all(random_seed)
    np.random.seed(random_seed)

    model = GeneInteractionModel(hidden_size=hidden_size, num_layers=n_layers).to(device)

    criterion = nn.MSELoss()
    optimizer = AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay)



    pbar = tqdm(range(n_epochs))
    for epoch in pbar:
        train_loss = []
        train_count = 0

        pred = model(x_train)
        print(pred.shape)
        loss = criterion(pred, y_train)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        train_loss.append(x.size(0) * loss.detach().cpu().numpy())
        train_count += x.size(0)

        train_loss = sum(train_loss) / train_count
        pbar.set_description('M {:02} | {:.4}'.format(m, train_loss))

    torch.save(model.state_dict(),'docs/models/test_model_{}.pt'.format(random_seed))

  0%|          | 0/10 [00:00<?, ?it/s]


RuntimeError: CUDA out of memory. Tried to allocate 226.00 MiB (GPU 0; 8.00 GiB total capacity; 7.21 GiB already allocated; 0 bytes free; 7.30 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF