<a href="https://colab.research.google.com/github/MartinekV/DL-for-bio-course/blob/master/04_DNA_enhancers_CNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Feeding sequence into a NN

In [None]:
# Activate gpu
# Runtime -> Change runtime type -> T4 GPU

# Toy DNA sequence used to walk through the encoding steps.
example_seq = 'ACCCTGCCAACACGGGACTTTAC'

# Give every nucleotide an integer id, in the fixed order A, C, T, G.
vocab = {base: index for index, base in enumerate('ACTG')}

# Look up the id of each base, keeping the order of the sequence.
numericalized = list(map(vocab.__getitem__, example_seq))

print(numericalized)

In [None]:
import torch

# Turn the list of integer ids into a tensor so it can be one-hot encoded.
numericalized_tensor = torch.tensor(numericalized)
# One row per base, one column per vocabulary entry (num_classes=4).
# The result is cast to float because it is later fed to a linear layer.
ohe_seq = torch.nn.functional.one_hot(numericalized_tensor, num_classes=4).float()

print(ohe_seq)
print(ohe_seq.shape)

In [None]:
# Collapse the 2-D one-hot matrix into a single 1-D vector, the input
# format expected by a plain linear layer.
flattened_seq = ohe_seq.flatten()

print(flattened_seq)
print(flattened_seq.shape)

In [None]:
import torch
import torch.nn as nn

# in_features=92: the 23-base example sequence one-hot encoded with
# 4 classes and flattened gives 23 * 4 = 92 values.
model = nn.Linear(in_features=92, out_features=1, bias=True)
# Apply the untrained layer to the flattened example sequence.
model(flattened_seq)

In [None]:
import torch.nn as nn

class SimpleClassifier(nn.Module):
  """Single linear layer followed by a sigmoid.

  Maps an input with `input_size` features to one value in (0, 1).
  """

  def __init__(self, input_size):
    super().__init__()

    layers = [nn.Linear(input_size, 1), nn.Sigmoid()]
    self.net = nn.Sequential(*layers)

  def forward(self, x):
    probabilities = self.net(x)
    return probabilities

# input_size=92 matches the flattened example sequence
# (23 bases * 4 one-hot classes).
model = SimpleClassifier(input_size=92) # Quiz - how many parameters do we have?
model(flattened_seq)

# Real dataset

## Data exploration

In [6]:
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install -q genomic-benchmarks torchmetrics

In [None]:
from genomic_benchmarks.dataset_getters.pytorch_datasets import HumanNontataPromoters
import pandas as pd

# Load the training split of the benchmark dataset.
train_dset =  HumanNontataPromoters('train')
# Every dataset item unpacks into a pair; keep the two parts in columns
# 'x' and 'y' of a DataFrame so they are easy to explore.
train_df = pd.DataFrame(data=[{'x':x,'y':y} for x,y in train_dset])

train_df

In [None]:
# How many sequences are there of each length?
length_counts = train_df['x'].map(len).value_counts()
print("Length counts of the DNA sequences:", length_counts, sep="\n")

# How many examples are there of each label?
label_counts = train_df['y'].value_counts()
print("\nCounts of the labels y:", label_counts, sep="\n")

In [None]:
from torch.utils.data import Dataset, DataLoader
import torch

class FlattenedDataset(Dataset):
    """Serves rows of a DataFrame as flattened one-hot vectors.

    Each row of `df` unpacks into (sequence, label). The sequence is
    one-hot encoded with the 5-entry vocabulary N, A, C, T, G and then
    flattened into a 1-D float tensor; the label becomes a float tensor
    of shape (1,).
    """

    def __init__(self, df):
        self.df = df
        # Character -> class index: N=0, A=1, C=2, T=3, G=4.
        self.vocab = dict(zip('NACTG', range(5)))

    def __len__(self):
        return self.df.shape[0]

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        sequence, label = row

        token_ids = torch.tensor([self.vocab[base] for base in sequence])
        one_hot = torch.nn.functional.one_hot(token_ids, num_classes=len(self.vocab))

        # (length, 5) -> (length * 5,)
        features = one_hot.flatten().float()
        target = torch.tensor([label]).float()

        return features, target

# Wrap the exploration DataFrame in the dataset defined above.
dset = FlattenedDataset(train_df)

# Fetch the first example to inspect what a single item looks like.
sample_x, sample_y = dset[0]

print(sample_x, sample_y)
print(sample_x.shape, sample_y.shape)

In [None]:
# 1255 = 251 sequence positions * 5 vocabulary entries (N, A, C, T, G),
# i.e. the length of one flattened dataset sample.
model = SimpleClassifier(input_size=1255)

# Run the untrained model on a single (un-batched) example.
x,y = dset[0]
model(x)

In [11]:
# Serve the dataset in shuffled mini-batches of 32 examples.
train_loader = DataLoader(dset, batch_size=32, shuffle=True)

In [None]:
# Peek at the first mini-batch only; `break` stops after one iteration.
# x_batch / y_batch stay defined afterwards and are used by the next cell.
for x_batch, y_batch in train_loader:
  print(x_batch)
  print(y_batch)
  print(x_batch.shape, y_batch.shape)
  break

In [None]:
# The same model also accepts a whole batch: one output row per example.
model(x_batch)

# Training

In [14]:
class SimpleClassifier(nn.Module):
  """Single linear layer followed by a sigmoid.

  Same model as defined at the start of the notebook, repeated here so
  the Training section can be read on its own. Maps an input with
  `input_size` features to one value in (0, 1).
  """

  def __init__(self, input_size):
    super().__init__()

    layers = [nn.Linear(input_size, 1), nn.Sigmoid()]
    self.net = nn.Sequential(*layers)

  def forward(self, x):
    probabilities = self.net(x)
    return probabilities

In [15]:
import numpy as np

# Training using classification loss and Adam optimizer
def train(model, dataset, batch_size=32, lr=1e-3, epochs=3, gpu=True):
  """Train a binary classifier with binary cross-entropy loss and Adam.

  The model is updated in place and the mean batch loss is printed after
  every epoch. Nothing is returned.

  Args:
    model: Module whose output is a probability in [0, 1] with the same
      shape as the targets (a requirement of nn.BCELoss).
    dataset: Dataset yielding (input, target) pairs of float tensors.
    batch_size: Number of examples per mini-batch.
    lr: Learning rate passed to Adam.
    epochs: Number of full passes over the dataset.
    gpu: Train on CUDA when True. If CUDA is not available, training
      falls back to the CPU instead of failing.
  """
  use_gpu = gpu and torch.cuda.is_available()
  if gpu and not use_gpu:
    print('CUDA is not available - training on the CPU')
  if use_gpu:
    model.cuda()

  # A preceding evaluate() call leaves the model in eval mode; switch the
  # training-only layers (dropout, batch norm, ...) back on.
  model.train()

  train_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

  loss_function = nn.BCELoss()
  optimizer = torch.optim.Adam(model.parameters(), lr=lr)

  for _ in range(epochs):
    batch_losses = []
    for batch_x, batch_y in train_loader:
      if use_gpu:
        batch_x, batch_y = batch_x.cuda(), batch_y.cuda()

      # Clear gradients first so nothing left over from earlier backward
      # passes leaks into this update.
      optimizer.zero_grad()
      outputs = model(batch_x)
      loss = loss_function(outputs, batch_y)
      loss.backward()
      optimizer.step()

      batch_losses.append(loss.item())

    print('Mean loss', np.mean(batch_losses))

In [None]:
# Each item of this dataset is a flattened one-hot vector plus its label.
train_dset = FlattenedDataset(train_df)
# 1255 = 251 sequence positions * 5 vocabulary entries.
model = SimpleClassifier(input_size=1255)
# 10 epochs; batch size and learning rate use the defaults of train().
train(model, train_dset, epochs=10)

# Evaluation

In [17]:
from tqdm import tqdm
from torchmetrics import Accuracy

def evaluate(model, dataset, gpu=True):
  """Print the accuracy of `model` on `dataset`.

  Predictions and labels of all batches are collected and scored once
  with the binary torchmetrics Accuracy metric. Nothing is returned.

  Args:
    model: Module mapping a batch of inputs to predictions.
    dataset: Dataset yielding (input, label) tensor pairs.
    gpu: Evaluate on CUDA when True. If CUDA is not available, the
      evaluation falls back to the CPU instead of failing.

  Note:
    The model is switched to eval mode and is left in that mode.
  """
  use_gpu = gpu and torch.cuda.is_available()
  if gpu and not use_gpu:
    print('CUDA is not available - evaluating on the CPU')

  accuracy_function = Accuracy(task='binary')

  if use_gpu:
    model.cuda()
    accuracy_function.cuda()

  loader = DataLoader(dataset, batch_size=32, shuffle=False)
  model.eval() # Turn off training-only layers
  all_predictions = []
  all_labels = []
  with torch.no_grad(): # Don't track gradients
    for batch_x,batch_y in tqdm(loader):
      if use_gpu:
        batch_x, batch_y = batch_x.cuda(), batch_y.cuda()

      all_predictions.append(model(batch_x))
      all_labels.append(batch_y)

  print('Accuracy:', accuracy_function(torch.cat(all_predictions), torch.cat(all_labels)).item())

In [18]:
# Accuracy on the same data the model was trained on.
evaluate(model, train_dset)

100%|██████████| 847/847 [00:07<00:00, 110.41it/s]


Accuracy: 0.8309776186943054


In [19]:
# Build the test split with the same DataFrame layout and encoding as
# the training data.
test_df = pd.DataFrame(data=[{'x':x,'y':y} for x,y in HumanNontataPromoters('test')])
test_dset = FlattenedDataset(test_df)

# Accuracy on the test split, which was not used for training.
evaluate(model, test_dset)

100%|██████████| 283/283 [00:03<00:00, 85.16it/s] 

Accuracy: 0.8149213790893555





## Multi-layer perceptron (MLP)

In [None]:
class MLP(nn.Module):
  """Exercise: multi-layer perceptron for the flattened one-hot sequences.

  A reference implementation is given in the "Solutions" section at the
  end of the notebook.
  """
  def __init__(self, input_size, hidden_size):
    super().__init__()
    #TODO: create the layers here. Inputs have `input_size` features.
    # train() uses nn.BCELoss against targets of shape (batch, 1), so the
    # model has to end with one probability in [0, 1] per example.
    pass

  def forward(self,x):
    #TODO: pass `x` through the layers created in __init__ and return
    # the result.
    pass

# Test the MLP
# 1255 input features = 251 sequence positions * 5 one-hot classes.
mlp_model = MLP(input_size=1255, hidden_size = 100)
# Random stand-in for one batch: 32 examples with 1255 features each.
sample_input = torch.rand(32,1255)
# Shows the output size for that batch once MLP is implemented.
mlp_model(sample_input).size()

In [None]:
# Train the MLP on the flattened training set for 10 epochs; batch size
# and learning rate use the defaults of train().
train(mlp_model, train_dset, epochs=10)

In [23]:
# Compare accuracy on the training split with the test split. In the
# recorded run the gap is large (about 0.999 vs 0.831): the MLP fits the
# training data far better than it generalizes.
evaluate(mlp_model, train_dset)
evaluate(mlp_model, test_dset)

100%|██████████| 847/847 [00:06<00:00, 128.18it/s]


Accuracy: 0.9987083673477173


100%|██████████| 283/283 [00:02<00:00, 100.67it/s]


Accuracy: 0.8308612108230591


# Convolutional Neural Network (CNN)

In [None]:
class CNN(nn.Module):
  """Exercise: 1-D convolutional network for one-hot encoded sequences.

  A reference implementation is given in the "Solutions" section at the
  end of the notebook.
  """
  def __init__(self, in_channels):
    super().__init__()
    #TODO: create the layers here. Inputs have shape
    # (batch, in_channels, sequence_length). train() uses nn.BCELoss
    # against targets of shape (batch, 1), so the model has to end with
    # one probability in [0, 1] per example.
    pass

  def forward(self,x):
    #TODO: pass `x` through the layers created in __init__ and return
    # the result.
    pass

# Test the CNN
# 5 input channels = one per vocabulary entry (N, A, C, T, G).
cnn_model = CNN(in_channels=5)
# Random stand-in for one batch: 32 sequences, 5 channels, 251 positions.
sample_input = torch.rand(32,5,251)
# Shows the output size for that batch. The reference solution ends in an
# nn.LazyLinear, whose input size is only fixed by a first forward pass
# like this one, so run this cell before training.
cnn_model(sample_input).size()

In [39]:
# Change pre-processing to fit CNN

class CNNDataset(Dataset):
    """Serves rows of a DataFrame as channel-first one-hot matrices.

    Each row of `df` unpacks into (sequence, label). The sequence is
    one-hot encoded with the 5-entry vocabulary N, A, C, T, G and
    returned as a float tensor of shape (5, sequence_length); the label
    becomes a float tensor of shape (1,).
    """

    def __init__(self, df):
        self.df = df
        # Character -> class index: N=0, A=1, C=2, T=3, G=4.
        self.vocab = dict(zip('NACTG', range(5)))

    def __len__(self):
        return self.df.shape[0]

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        sequence, label = row

        token_ids = torch.tensor([self.vocab[base] for base in sequence])
        one_hot = torch.nn.functional.one_hot(token_ids, num_classes=len(self.vocab))

        # Unlike the flattened variant, keep the matrix 2-D and put the
        # class axis first: (length, 5) -> (5, length).
        features = one_hot.transpose(0, 1).float()
        target = torch.tensor([label]).float()

        return features, target

# NOTE: this rebinds train_dset from the flattened dataset to the CNN
# dataset. Cells above that train on flattened inputs need
# FlattenedDataset(train_df) to be recreated before they are re-run.
train_dset = CNNDataset(train_df)
# Inspect one example: x has the class axis first, y holds one label.
sample_x, sample_y = train_dset[0]
print(sample_x.shape)
print(sample_y.shape)

torch.Size([5, 251])
torch.Size([1])


In [40]:
# Train the CNN on the channel-first training set for 35 epochs; batch
# size and learning rate use the defaults of train().
train(cnn_model, train_dset, epochs=35)

Mean loss 0.4623096344779205
Mean loss 0.3963526786605189
Mean loss 0.3641015295084421
Mean loss 0.3432862030291642
Mean loss 0.33586755254192646
Mean loss 0.32506120926233223
Mean loss 0.3182017371375136
Mean loss 0.3117506699779521
Mean loss 0.3094997005504166
Mean loss 0.3012959602450536
Mean loss 0.29917281947880453
Mean loss 0.295575422442649
Mean loss 0.29208975065380793
Mean loss 0.2874255469145854
Mean loss 0.2848425682506567
Mean loss 0.28374105215143003
Mean loss 0.2788504198052807
Mean loss 0.27752759593507054
Mean loss 0.2747937791849956
Mean loss 0.2735106994297879
Mean loss 0.27134237456033194
Mean loss 0.2699836543513794
Mean loss 0.26868443381054485
Mean loss 0.26443246123958225
Mean loss 0.263358213975058
Mean loss 0.26224901846916787
Mean loss 0.2606262507746183
Mean loss 0.25921774928270014
Mean loss 0.26050435498339225
Mean loss 0.25808443061769926
Mean loss 0.25756072216043785
Mean loss 0.25614851859201365
Mean loss 0.2535916131293267
Mean loss 0.2537094357488358
M

In [41]:
# The test split needs the same channel-first encoding as the training data.
test_dset = CNNDataset(test_df)

# Compare accuracy on the training split with the test split.
evaluate(cnn_model, train_dset)
evaluate(cnn_model, test_dset)

100%|██████████| 847/847 [00:07<00:00, 119.64it/s]


Accuracy: 0.9005424976348877


100%|██████████| 283/283 [00:02<00:00, 129.04it/s]

Accuracy: 0.8536639213562012





# Solutions

In [20]:
class MLP(nn.Module):
  """Reference solution: perceptron with one hidden layer.

  Linear(input_size -> hidden_size), ReLU, Linear(hidden_size -> 1) and
  a final sigmoid, so every example is mapped to one value in (0, 1).
  """

  def __init__(self, input_size, hidden_size):
    super().__init__()
    hidden_layer = nn.Linear(input_size, hidden_size)
    output_layer = nn.Linear(hidden_size, 1)
    self.net = nn.Sequential(hidden_layer, nn.ReLU(), output_layer, nn.Sigmoid())

  def forward(self,x):
    probabilities = self.net(x)
    return probabilities

In [36]:
class CNN(nn.Module):
  """Reference solution: two convolution blocks and a linear classifier.

  Each block is Conv1d (kernel 3) -> MaxPool1d (kernel 3) -> ReLU, with
  16 and then 32 output channels. The result is flattened and mapped to
  one value in (0, 1) by a lazily sized linear layer and a sigmoid.
  """

  def __init__(self, in_channels):
    super().__init__()
    conv_blocks = [
        nn.Conv1d(in_channels, 16, kernel_size=3),
        nn.MaxPool1d(3),
        nn.ReLU(),
        nn.Conv1d(16, 32, kernel_size=3),
        nn.MaxPool1d(3),
        nn.ReLU(),
    ]
    classifier_head = [
        nn.Flatten(),
        # Input size is inferred from the first batch that passes through.
        nn.LazyLinear(1),
        nn.Sigmoid(),
    ]
    self.net = nn.Sequential(*conv_blocks, *classifier_head)

  def forward(self,x):
    probabilities = self.net(x)
    return probabilities