<a href="https://colab.research.google.com/github/ML-Bioinfo-CEITEC/ECCB2022/blob/main/notebooks/02_fastai_CNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install genomic-benchmarks torchmetrics --quiet

[K     |████████████████████████████████| 419 kB 44.3 MB/s 
[K     |████████████████████████████████| 2.3 MB 30.7 MB/s 
[?25h  Building wheel for genomic-benchmarks (setup.py) ... [?25l[?25hdone


In [2]:
import torch
from torch.nn import functional as F
from torch.utils.data import DataLoader

# Integer id assigned to each nucleotide symbol; the position in the string
# is the id ('A' -> 0, 'C' -> 1, 'T' -> 2, 'G' -> 3, 'N' -> 4).
nucleotide_to_number = {base: idx for idx, base in enumerate('ACTGN')}

def numericalize(x, vocab=nucleotide_to_number):
  """Translate a sequence of symbols into a list of integer ids.

  Args:
    x: Iterable of symbols (e.g. a DNA string); each one is looked up in `vocab`.
    vocab: Mapping from symbol to id. Defaults to `nucleotide_to_number`.

  Returns:
    A list with one id per symbol of `x`, in the same order.

  Raises:
    KeyError: If a symbol of `x` is missing from `vocab`.
  """
  ids = []
  for symbol in x:
    ids.append(vocab[symbol])
  return ids

In [30]:
from genomic_benchmarks.dataset_getters.pytorch_datasets import HumanNontataPromoters
from torch.utils.data import DataLoader

# Build the training and test splits of the same benchmark dataset,
# in that order ('train' first, then 'test').
train_dset, test_dset = [
    HumanNontataPromoters(split) for split in ('train', 'test')
]

def preprocess(batch):
  """Collate a batch of (sequence, label) pairs into two tensors.

  Args:
    batch: Iterable of `(x, y)` pairs; `x` is passed to `numericalize`
      and `y` is the label of that sample.

  Returns:
    A tuple `(inputs, targets)`:
      * `inputs`  - tensor built from the numericalized sequences, one row
        per sample (all sequences must have equal length for this to work).
      * `targets` - float tensor holding each label wrapped in its own
        length-1 row, i.e. one column per sample.
  """
  # Single pass over the batch: encode the sequence and wrap the label.
  pairs = [(numericalize(seq), [label]) for seq, label in batch]

  inputs = torch.tensor([encoded for encoded, _ in pairs])
  targets = torch.tensor([wrapped for _, wrapped in pairs]).float()
  return inputs, targets
  
# Both loaders share the batch size and collate function; only the training
# loader shuffles (shuffle=False is DataLoader's default for the test loader).
train_loader, test_loader = [
    DataLoader(dset, batch_size=32, shuffle=do_shuffle, collate_fn=preprocess)
    for dset, do_shuffle in ((train_dset, True), (test_dset, False))
]


In [31]:
import torch
from torch import nn
from torch.nn import functional as F

class FullyConv(torch.nn.Module):
    """Small 1D convolutional classifier over integer-encoded sequences.

    The input is one-hot encoded, passed through two `Conv1d` + `ReLU` stages
    (kernel size 5, stride 1, no padding), flattened, and mapped through a
    lazily-sized linear layer followed by a sigmoid.

    Args:
        input_dim: Number of distinct token ids. Used both as the one-hot
            depth and as the input channel count of the first convolution.
        hidden_dim: Number of channels produced by each convolution.
        output_dim: Number of output units of the final linear layer.
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        # Remembered so forward() one-hot encodes to exactly the channel count
        # the first convolution expects (it used to be hard-coded to 5, which
        # broke every model built with input_dim != 5).
        self.input_dim = input_dim
        self.net = nn.Sequential(
            nn.Conv1d(in_channels=input_dim, out_channels=hidden_dim, kernel_size=5, stride=1),
            nn.ReLU(),
            nn.Conv1d(in_channels=hidden_dim, out_channels=hidden_dim, kernel_size=5, stride=1),
            nn.ReLU(),
            nn.Flatten(),
            # Lazy layer: in_features is inferred from the first batch, so it
            # does not have to be computed from the sequence length by hand.
            nn.LazyLinear(out_features=output_dim),
            nn.Sigmoid(),
        )

    def forward(self, x):
        """Run the network on a batch of integer-encoded sequences.

        Args:
            x: Integer (long) tensor of token ids in `[0, input_dim)`, laid
                out as (batch, length), e.g. [32, 251].

        Returns:
            Float tensor with `output_dim` sigmoid activations per sample.
        """
        x = F.one_hot(x, num_classes=self.input_dim).float()
        # Transposing because convolutional layers expect channels to be the
        # second dimension: [32, 251, 5] -> [32, 5, 251].
        x = x.transpose(1, 2)
        return self.net(x)

# 5 input symbols, 30 convolution channels, 1 output unit.
# No explicit device placement here (the original `.to('cuda')` call was disabled).
net = FullyConv(input_dim=5, hidden_dim=30, output_dim=1)

In [32]:
from torchmetrics import Accuracy

# Current torchmetrics releases require the `task` argument; the model emits a
# single sigmoid probability per sample, so this is the binary task.
# The metric is not pinned to a device here: see test_accuracy().
acc = Accuracy(task="binary")

def test_accuracy(x, y):
  """Accuracy of predictions `x` against targets `y` for one batch.

  The metric object is moved to the device of `x` before it is called, so
  the function works on CPU-only machines as well as on GPU (it used to be
  hard-wired to 'cuda').

  Args:
    x: Tensor of predicted probabilities.
    y: Tensor of targets; cast to int before being handed to the metric.

  Returns:
    The value returned by the torchmetrics `Accuracy` metric for this batch.
  """
  return acc.to(x.device)(x, y.int())

In [33]:
from fastai.text.all import *

# Wrap the two PyTorch loaders for fastai: first is training, second is validation.
data = DataLoaders(train_loader, test_loader)

# An alternative metric list, [accuracy, BalancedAccuracy(), test_accuracy],
# is left disabled; only the torchmetrics-based accuracy is reported.
learn = Learner(
    data,
    net,
    loss_func=F.binary_cross_entropy,
    opt_func=SGD,
    metrics=[test_accuracy],
)


In [34]:
# Train for 10 epochs with the one-cycle schedule, peak learning rate 1e-2.
learn.fit_one_cycle(n_epoch=10, lr_max=1e-2)

epoch,train_loss,valid_loss,test_accuracy,time
0,0.627349,0.610222,0.716958,00:05
1,0.467112,0.504875,0.762232,00:05
2,0.428674,0.445241,0.789905,00:05
3,0.400784,0.405004,0.818685,00:05
4,0.372335,0.379104,0.833961,00:06
5,0.361406,0.373024,0.841488,00:05
6,0.357887,0.363064,0.845915,00:05
7,0.338035,0.360716,0.844255,00:05
8,0.353169,0.359559,0.845362,00:05
9,0.342356,0.359502,0.84658,00:05


In [35]:
# Evaluate on the validation loader; returns [valid_loss, test_accuracy].
# (The call was commented out although the cell's stored output is its result.)
learn.validate()

(#2) [0.35950160026550293,0.8465796113014221]