<a href="https://colab.research.google.com/github/ML-Bioinfo-CEITEC/ECCB2022/blob/main/notebooks/02_fastai_CNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install datasets torchmetrics --quiet

[K     |████████████████████████████████| 365 kB 4.3 MB/s 
[K     |████████████████████████████████| 419 kB 43.7 MB/s 
[K     |████████████████████████████████| 120 kB 64.9 MB/s 
[K     |████████████████████████████████| 115 kB 60.2 MB/s 
[K     |████████████████████████████████| 212 kB 64.0 MB/s 
[K     |████████████████████████████████| 127 kB 40.1 MB/s 
[?25h

In [10]:
import torch
from torch.nn import functional as F
from torch.utils.data import DataLoader

# Integer id for every symbol a sequence may contain.
# Ids follow the order of the alphabet string: A=0, C=1, T=2, G=3, N=4.
nucleotide_to_number = {base: index for index, base in enumerate('ACTGN')}

def numericalize(x, vocab):
  """Translate every symbol of `x` into its integer id from `vocab`.

  x: iterable of symbols (for example a string of nucleotide letters).
  vocab: mapping from symbol to integer id, indexed with `vocab[symbol]`.

  Returns a list of ids in the same order as the input. A symbol that
  `vocab` cannot resolve propagates the lookup error (KeyError for a dict).
  """
  ids = []
  for symbol in x:
    ids.append(vocab[symbol])
  return ids

In [11]:
from torch.utils.data import DataLoader
from datasets import load_dataset

# Fetch the "train" and "test" splits of the same dataset, in that order.
train_dset, test_dset = (
    load_dataset("simecek/human_nontata_promoters", split=split_name)
    for split_name in ("train", "test")
)

def preprocess(batch):
  """Collate a list of dataset examples into a (sequences, labels) tensor pair.

  batch: iterable of examples, each indexable by 'seq' and 'labels'.

  Every 'seq' is converted to integer ids with `nucleotide_to_number`, and
  every 'labels' value is wrapped in a one-element list, so the label tensor
  has a trailing dimension of size 1. Labels are cast to float.
  Assumes all sequences in a batch have the same length, since
  `torch.tensor` needs a rectangular nested list.
  """
  fields = [(record['seq'], record['labels']) for record in batch]
  token_ids = [numericalize(seq, vocab=nucleotide_to_number) for seq, _ in fields]
  targets = [[label] for _, label in fields]
  return torch.tensor(token_ids), torch.tensor(targets).float()
  
# Mini-batches of 32 examples, collated by `preprocess`.
# Only the training loader shuffles its examples.
train_loader = DataLoader(
    train_dset,
    collate_fn=preprocess,
    batch_size=32,
    shuffle=True,
)
test_loader = DataLoader(
    test_dset,
    collate_fn=preprocess,
    batch_size=32,
)



In [12]:
import torch
from torch import nn
from torch.nn import functional as F

class FullyConv(torch.nn.Module):
    """Small 1D convolutional classifier over integer-encoded sequences.

    The network is two Conv1d + ReLU stages (kernel size 5, stride 1), a
    flatten, a lazily-sized linear layer and a final sigmoid, so outputs
    lie in [0, 1].

    Args:
      input_dim: number of distinct input symbols; used both as the one-hot
        width in `forward` and as the channel count of the first convolution.
      hidden_dim: number of channels produced by each convolution.
      output_dim: number of output units of the final linear layer.
    """
    def __init__(self, input_dim, hidden_dim, output_dim):
      super().__init__()
      # Kept so forward() one-hot encodes to exactly the channel count the
      # first convolution expects, instead of a hard-coded 5.
      self.input_dim = input_dim
      self.net = nn.Sequential(
          nn.Conv1d(in_channels=input_dim, out_channels=hidden_dim, kernel_size=5, stride=1),
          nn.ReLU(),
          nn.Conv1d(in_channels=hidden_dim, out_channels=hidden_dim, kernel_size=5, stride=1),
          nn.ReLU(),
          nn.Flatten(),
          nn.LazyLinear(out_features=output_dim), #Lazy layer allows us to skip the in_features parameter and derive it automatically
          nn.Sigmoid(),
      )

    def forward(self, x):
      """Return sigmoid outputs for a batch of integer-encoded sequences.

      x: integer tensor of symbol ids, indexed as (batch, position); every id
        must be smaller than `input_dim`.
      """
      x = F.one_hot(x, num_classes=self.input_dim).float()
      x = x.transpose(1, 2) #Transposing because convolutional layers expect channels to be the second dimension, e.g. [32, 251, 5] -> [32, 5, 251]
      x = self.net(x)
      return x

# 5 input channels, 30 hidden channels, 1 output unit.
net = FullyConv(input_dim=5, hidden_dim=30, output_dim=1)  # .to('cuda')



In [13]:
from torchmetrics import Accuracy

# `Accuracy()` with no arguments only works on older torchmetrics releases;
# newer ones require a `task` argument and raise TypeError without it.
# Try the original call first so behavior is unchanged where it still works.
try:
  acc = Accuracy()#.to('cuda')
except TypeError:
  acc = Accuracy(task="binary")#.to('cuda')

def test_accuracy(x,y):
  """Metric callback: accuracy of predictions `x` against targets `y`.

  The targets are cast to int before being handed to the torchmetrics
  `acc` object, because `preprocess` produces them as floats for the loss.
  """
  return acc(x, y.int())

In [14]:
from fastai.text.all import *

# Bundle the two loaders for fastai: training loader first, then the test
# loader, which fastai uses for validation.
data = DataLoaders(train_loader, test_loader)

# The model already ends in a sigmoid, so plain binary cross-entropy on
# probabilities is used as the loss; optimisation is plain SGD.
learn = Learner(
    data,
    net,
    metrics=[test_accuracy],
    opt_func=SGD,
    loss_func=F.binary_cross_entropy,
)

In [15]:
# Train for 3 epochs with the one-cycle schedule, peaking at a learning rate of 0.01.
learn.fit_one_cycle(n_epoch=3, lr_max=1e-2)

epoch,train_loss,valid_loss,test_accuracy,time
0,0.514333,0.505296,0.762453,00:17
1,0.449703,0.458567,0.781492,00:19
2,0.440904,0.4513,0.784481,00:17
