<a href="https://colab.research.google.com/github/MartinekV/DL-for-bio-course/blob/master/04_DNA_enhancers_CNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Feeding sequence into a NN

In [None]:
# To enable a GPU in Colab:
# Runtime -> Change runtime type -> T4 GPU

example_seq = 'ACCCTGCCAACACGGGACTTTAC'

# Integer id assigned to each nucleotide letter.
vocab = dict(zip('ACTG', range(4)))

# Replace every base of the example sequence by its integer id.
numericalized = list(map(vocab.__getitem__, example_seq))

print(numericalized)

In [None]:
import torch

# Wrap the integer ids in a tensor so they can be one-hot encoded.
numericalized_tensor = torch.tensor(numericalized)

# One row per base, one column per nucleotide class; cast to float afterwards.
one_hot = torch.nn.functional.one_hot(numericalized_tensor, num_classes=4)
ohe_seq = one_hot.float()

print(ohe_seq, ohe_seq.shape, sep='\n')

In [None]:
# Collapse the one-hot matrix into a single 1-D feature vector.
flattened_seq = ohe_seq.reshape(-1)

print(flattened_seq, flattened_seq.shape, sep='\n')

In [None]:
import torch
import torch.nn as nn

# 92 input features = 23 bases x 4 one-hot channels of the flattened example sequence.
model = nn.Linear(92, 1)  # the bias term is enabled by default
model(flattened_seq)

In [None]:
import torch.nn as nn

class SimpleClassifier(nn.Module):
  """Logistic-regression style binary classifier.

  A single linear layer maps the input vector to one value, which a sigmoid
  then squashes into a number between 0 and 1.
  """

  def __init__(self, input_size):
    """Build the network.

    Args:
      input_size: number of features in each (flattened) input vector.
    """
    super().__init__()

    layers = [
      nn.Linear(in_features=input_size, out_features=1),
      nn.Sigmoid(),
    ]
    self.net = nn.Sequential(*layers)

  def forward(self, x):
    """Return the sigmoid output for ``x`` (last dimension = ``input_size``)."""
    probability = self.net(x)
    return probability

# Quiz - how many parameters do we have?
model = SimpleClassifier(92)  # 92 = length of the flattened example sequence
model(flattened_seq)

# Real dataset

## Data exploration

In [None]:
!pip install -q genomic-benchmarks
!pip install torchmetrics -q

In [None]:
from genomic_benchmarks.dataset_getters.pytorch_datasets import HumanNontataPromoters
import pandas as pd

# Load the 'train' split of the benchmark dataset.
train_dset = HumanNontataPromoters('train')

# One DataFrame row per (sequence, label) pair: column 'x' is the sequence, 'y' the label.
rows = [dict(x=seq, y=label) for seq, label in train_dset]
train_df = pd.DataFrame(data=rows)

train_df

In [None]:
# How many sequences there are of each length.
length_counts = train_df['x'].map(len).value_counts()
print("Length counts of the DNA sequences:", length_counts, sep="\n")

# How many examples there are of each label.
label_counts = train_df['y'].value_counts()
print("\nCounts of the labels y:", label_counts, sep="\n")

In [None]:
from torch.utils.data import Dataset, DataLoader
import torch

class FlattenedDataset(Dataset):
    """Dataset that serves each DNA sequence as a flat one-hot vector.

    Every row of the wrapped DataFrame is read as ``(sequence, label)``.
    The sequence is one-hot encoded over the 5-letter vocabulary and then
    flattened, so a sequence of length L becomes a float vector of size L * 5.
    """

    def __init__(self, df):
        """Keep a reference to ``df`` and define the letter -> id vocabulary."""
        self.df = df
        self.vocab = {'N':0,'A':1,'C':2,'T':3,'G':4}

    def __len__(self):
        """Number of examples, i.e. number of rows in the DataFrame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``(x, y)`` for row ``idx``.

        ``x`` is the flattened float one-hot encoding of the sequence and
        ``y`` is a float tensor of shape ``(1,)`` holding the label.
        A letter outside the vocabulary raises ``KeyError``.
        """
        row = self.df.iloc[idx]
        sequence, label = row

        token_ids = torch.tensor([self.vocab[letter] for letter in sequence])
        encoded = torch.nn.functional.one_hot(token_ids, num_classes=len(self.vocab))

        features = encoded.reshape(-1).float()
        target = torch.tensor([label]).float()
        return features, target

dset = FlattenedDataset(train_df)

# Look at the first encoded example and its label.
first_example = dset[0]
sample_x, sample_y = first_example

for pair in ((sample_x, sample_y), (sample_x.shape, sample_y.shape)):
  print(*pair)

In [None]:
# Take the first encoded example and push it through a fresh classifier.
x, y = dset[0]

# 1255 input features - presumably 251 bases x 5 vocabulary entries; compare with the shape printed above.
model = SimpleClassifier(1255)
model(x)

In [12]:
# Serve the flattened dataset in shuffled mini-batches of 32 examples.
train_loader = DataLoader(dset, batch_size=32, shuffle=True)

In [None]:
# Peek at the first mini-batch only, then stop iterating.
for x_batch, y_batch in train_loader:
  for tensor in (x_batch, y_batch):
    print(tensor)
  print(x_batch.shape, y_batch.shape)
  break

In [None]:
# Run the classifier on the whole mini-batch at once; the result is shown as the cell output.
model(x_batch)

# Training

In [15]:
class SimpleClassifier(nn.Module):
  """Single linear layer followed by a sigmoid (binary classifier).

  Same model as the SimpleClassifier defined earlier in the notebook,
  repeated at the start of the training section.
  """

  def __init__(self, input_size):
    """Create the layers for inputs with ``input_size`` features."""
    super().__init__()

    linear = nn.Linear(input_size, 1)
    squash = nn.Sigmoid()
    self.net = nn.Sequential(linear, squash)

  def forward(self, x):
    """Apply the linear layer and the sigmoid to ``x``."""
    output = self.net(x)
    return output

In [62]:
def train(model, dataset, batch_size=32, lr=1e-3, epochs=3):
  """Fit ``model`` on ``dataset`` with binary cross-entropy and Adam.

  Args:
    model: module whose outputs are probabilities in [0, 1] (BCELoss is
      applied to them directly) with the same shape as the targets.
    dataset: dataset yielding ``(features, target)`` pairs of float tensors.
    batch_size: number of examples per mini-batch.
    lr: learning rate handed to Adam.
    epochs: number of full passes over ``dataset``.

  Prints the sample-weighted mean loss of every epoch. Returns nothing; the
  model is updated in place and is left in training mode.
  """
  train_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

  loss_function = nn.BCELoss()
  optimizer = torch.optim.Adam(model.parameters(), lr=lr)

  # evaluate() switches the model to eval mode; switch back explicitly so
  # training-only layers (dropout, batch norm, ...) are active again.
  model.train()

  for epoch in range(epochs):
    loss_sum = 0.0
    samples_seen = 0

    for batch_x, batch_y in train_loader:
      outputs = model(batch_x)
      loss = loss_function(outputs, batch_y)
      loss.backward()
      optimizer.step()
      optimizer.zero_grad()

      # BCELoss averages over the batch, so weight by batch size to get a
      # correct epoch mean even when the last batch is smaller.
      batch_len = batch_x.size(0)
      loss_sum += loss.item() * batch_len
      samples_seen += batch_len

    # Report the epoch mean instead of the (noisy) last mini-batch loss; the
    # guard also avoids touching an unbound `loss` when no batch was produced.
    if samples_seen:
      print(f'Epoch {epoch + 1}/{epochs} - mean loss: {loss_sum / samples_seen:.4f}')
    else:
      print(f'Epoch {epoch + 1}/{epochs} - no batches to train on')

In [None]:
# Encode the training DataFrame, create a fresh classifier and fit it with the default settings.
train_dset = FlattenedDataset(train_df)
model = SimpleClassifier(1255)
train(model=model, dataset=train_dset)

# Evaluation

In [68]:
from tqdm import tqdm
from torchmetrics import Accuracy

def evaluate(model, dataset):
  """Print the binary accuracy of ``model`` on ``dataset``.

  Args:
    model: module whose outputs are passed to ``Accuracy(task='binary')``
      as predictions.
    dataset: dataset yielding ``(features, target)`` pairs.

  The model is put into eval mode for the duration of the evaluation and its
  previous train/eval mode is restored afterwards, so calling this function
  between training runs does not silently disable training-only layers.
  Returns nothing.
  """
  loader = DataLoader(dataset, batch_size=32, shuffle=False)
  accuracy_function = Accuracy(task='binary')

  was_training = model.training  # remember the mode so it can be restored
  model.eval()  # Turn off training-only layers
  all_predictions = []
  all_labels = []
  try:
    with torch.no_grad():  # Don't track gradients
      for batch_x, batch_y in tqdm(loader):
        output = model(batch_x)
        all_predictions.append(output)
        all_labels.append(batch_y)
  finally:
    # Restore the original mode even if the forward pass raised.
    model.train(was_training)

  print('Accuracy:', accuracy_function(torch.cat(all_predictions), torch.cat(all_labels)).item())

In [None]:
# Accuracy on the same data the model was trained on.
evaluate(model, train_dset)

In [None]:
# Load the 'test' split and encode it exactly like the training data.
test_split = HumanNontataPromoters('test')
test_df = pd.DataFrame(data=[dict(x=seq, y=label) for seq, label in test_split])
test_dset = FlattenedDataset(test_df)

evaluate(model, test_dset)

# Multi-layer perceptron (MLP)

In [None]:
class MLP(nn.Module):
  """Exercise stub: multi-layer perceptron classifier.

  The cells below call it with a float batch of shape (32, 1255) and pass it
  to train(), which applies BCELoss to the outputs. The outputs therefore
  presumably need to be probabilities of shape (batch, num_classes) - confirm
  against the exercise instructions.
  """

  def __init__(self, input_size, hidden_size, num_classes):
    """Create the layers.

    Args:
      input_size: number of features of each flattened input vector.
      hidden_size: width of the hidden layer.
      num_classes: number of output units.
    """
    super().__init__()
    #TODO: define the layers of the network here

  def forward(self,x):
    """Map a batch ``x`` to predictions (to be implemented)."""
    #TODO: replace the line below with the forward pass
    # Fail loudly instead of silently returning None, which would only surface
    # later as "'NoneType' object has no attribute 'size'".
    raise NotImplementedError('MLP.forward is not implemented yet - complete the TODOs in the MLP class')

# Test the MLP: feed a random batch of 32 flattened inputs and show the output size.
mlp_model = MLP(input_size=1255, hidden_size=100, num_classes=1)
sample_input = torch.rand(32, 1255)
prediction = mlp_model(sample_input)
prediction.size()

In [None]:
# Fit the MLP with train()'s default settings (batch_size=32, lr=1e-3, epochs=3).
train(mlp_model, train_dset)

In [None]:
# Report accuracy on the training split first, then on the test split.
for split in (train_dset, test_dset):
  evaluate(mlp_model, split)

# Convolutional Neural Network (CNN)

In [None]:
class CNN(nn.Module):
  """Exercise stub: convolutional classifier for one-hot encoded sequences.

  The cells below call it with a float batch of shape (32, 5, 251) and
  ``in_channels=5``, and pass it to train(), which applies BCELoss to the
  outputs. The outputs therefore presumably need to be probabilities of shape
  (batch, num_classes) - confirm against the exercise instructions.
  """

  def __init__(self, in_channels, num_classes):
    """Create the layers.

    Args:
      in_channels: number of input channels (size of dimension 1 of the input).
      num_classes: number of output units.
    """
    super().__init__()
    #TODO: define the convolutional and classification layers here

  def forward(self,x):
    """Map a batch ``x`` to predictions (to be implemented)."""
    #TODO: replace the line below with the forward pass
    # Fail loudly instead of silently returning None, which would only surface
    # later as "'NoneType' object has no attribute 'size'".
    raise NotImplementedError('CNN.forward is not implemented yet - complete the TODOs in the CNN class')

# Test the CNN: feed a random batch shaped (32, 5, 251) and show the output size.
cnn_model = CNN(in_channels=5, num_classes=1)
sample_input = torch.rand(32, 5, 251)
prediction = cnn_model(sample_input)
prediction.size()

In [None]:
class CNNDataset(Dataset):
    """Dataset meant to feed the CNN (pre-processing still to be adapted).

    At the moment each row ``(sequence, label)`` of the DataFrame is one-hot
    encoded over the 5-letter vocabulary and flattened into a single float
    vector; see the TODO in ``__getitem__``.
    """

    def __init__(self, df):
        """Keep a reference to ``df`` and define the letter -> id vocabulary."""
        self.df = df
        self.vocab = {'N':0,'A':1,'C':2,'T':3,'G':4}

    def __len__(self):
        """Number of rows in the wrapped DataFrame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``(x, y)`` for row ``idx`` as float tensors; ``y`` has shape ``(1,)``."""
        #TODO change pre-processing to fit CNN

        sequence, label = self.df.iloc[idx]

        # Letter -> integer id; a letter missing from the vocabulary raises KeyError.
        ids = torch.tensor(list(map(self.vocab.__getitem__, sequence)))
        n_letters = len(self.vocab)
        ohe_seq = torch.nn.functional.one_hot(ids, num_classes=n_letters)

        return ohe_seq.flatten().float(), torch.tensor([label]).float()

# Re-encode the training data with the CNN dataset and check the shapes of one example.
train_dset = CNNDataset(train_df)
sample_x, sample_y = train_dset[0]
print(sample_x.shape, sample_y.shape, sep='\n')

In [None]:
# Fit the CNN with train()'s default settings (batch_size=32, lr=1e-3, epochs=3).
train(cnn_model, train_dset)

In [None]:
# Encode the test split the same way as the training split.
test_dset = CNNDataset(test_df)

# Report accuracy on the training split first, then on the test split.
for split in (train_dset, test_dset):
  evaluate(cnn_model, split)