In [1]:
# Make the project's modules discoverable so the classes below can be imported:
# switch to the repository root and put its `src` directory on the module search path.

import sys
import os

# The chdir target is relative, so naively re-running this cell would climb three more
# directories every time. Remember where the kernel started and always resolve the
# target from there, which makes the cell safe to re-run.
if '_NOTEBOOK_START_DIR' not in globals():
    _NOTEBOOK_START_DIR = os.getcwd()
os.chdir(os.path.join(_NOTEBOOK_START_DIR, '../../../nucleotran/'))

# Register an absolute path (it stays valid if the working directory changes later)
# and do not stack duplicate entries when the cell is executed again.
_SRC_DIR = os.path.abspath('./src')
if _SRC_DIR not in sys.path:
    sys.path.append(_SRC_DIR)

In [2]:
from data.dataloaders import CoverageDatasetHDF5

from features.nucleotide import DNATokenizer
from torch import tensor

import numpy as np
import pandas as pd

import torch
from torch.utils.data import DataLoader
from torch.nn import Embedding
from torch import nn
from torch import permute

In [3]:
import pytorch_lightning as pl

In [4]:
from pytorch_lightning.callbacks import RichProgressBar

In [5]:
# set the "sequence order", i.e. the token/word length in nucleotides (2 = dinucleotides);
# it is passed to the DNATokenizer and used when resizing the input windows below
seq_order = 2

In [6]:
# The tokenizer fixes the token length (`seq_order`) and the stride between tokens.
dnatokenizer = DNATokenizer(
    seq_order=seq_order,
    stride=1,
    allow_N=False,
)

# `dna_embed` is a frozen embedding built from the one-hot weight matrix W; looking up
# a token id in it converts the token representation to its one-hot representation.
W, mapping = dnatokenizer.get_one_hot_weights_matrix(N_max=0)
dna_embed = Embedding.from_pretrained(tensor(W), freeze=True)

In [7]:
def _one_hot_channels_first(x):
    """One-hot encode the tokens in `x` via `dna_embed` and swap the last two axes."""
    # the loader serves up the data in "channels-last" format, but pytorch wants
    # "channels-first", so we permute
    return permute(dna_embed(tensor(x)), [0, 2, 1])


# NOTE(review): the reference genome is an absolute, machine-specific path.
dataset = CoverageDatasetHDF5(
    'data/processed/GRCh38/toydata/regions.bed',  # BED file with the regions
    '/dhc/dsets/reference_genomes/ensembl/Homo_sapiens.GRCh38.dna.primary_assembly_renamed.fa',  # reference genome
    'data/processed/GRCh38/toydata/overlaps.h5',  # presumably the labels per region — TODO confirm
    dna_tokenizer=dnatokenizer,
    random_shift=0,
    random_reverse_complement=True,
    transform=_one_hot_channels_first,
    target_transform=tensor,
)

BED-file contains 10000 regions.
93.250% of regions have at least 1 label.


In [8]:
# Widen the regions by (seq_order - 1) nucleotides, i.e. add one for bi-nucleotide
# encoding, to keep the input sequence length the same after tokenization.
n_positions = 128
dataset.bedseqloader.resize(n_positions + seq_order - 1)

In [9]:
# we decide we want to use chromosomes 9 and 10 as the test set
test_chromosomes = ['9', '10']
i_train, i_test = dataset.train_test_split_chromosomes(test_chromosomes)

In [10]:
# inspect the training indices returned by the chromosome split
i_train

array([   0,    1,    2, ..., 9997, 9998, 9999])

In [11]:
# Carve the validation set out of the training set: take the first 20 entries of
# every run of 1000 consecutive training positions (i.e. consecutive regions).
train_positions = np.arange(len(i_train))
in_validation = (train_positions % 1000) < 20
i_val = i_train[in_validation]

# NOTE: `i_train` is overwritten here, so re-running this cell shrinks it again;
# re-run the split cell above first if this cell has to be executed a second time.
i_train = np.setdiff1d(i_train, i_val)

print(f'train len: {len(i_train)}')
print(f'val len: {len(i_val)}')
print(f'test len: {len(i_test)}')

train len: 8896
val len: 200
test len: 904


In [12]:
batch_size = 2048 # we could probably fit much more, but this is just an example...


def _random_batch_sampler(indices):
    """Build a sampler that yields lists of up to `batch_size` indices drawn from `indices` in random order."""
    shuffled = torch.utils.data.sampler.SubsetRandomSampler(indices)
    return torch.utils.data.sampler.BatchSampler(
        shuffled,
        batch_size=batch_size,
        drop_last=False,  # keep the final, smaller batch
    )


train_sampler = _random_batch_sampler(i_train)
val_sampler = _random_batch_sampler(i_val)
test_sampler = _random_batch_sampler(i_test)

# NOTE: the batch samplers are passed as `sampler=` while the DataLoader keeps its
# default `batch_size=1`. Each "sample" is therefore a whole list of indices that the
# dataset resolves in one call, and the DataLoader's collation then adds a leading
# dimension of size 1 in front of every batch (which is why the model indexes `[0]`).
train_dataloader, val_dataloader, test_dataloader = (
    DataLoader(dataset, sampler=batch_sampler)
    for batch_sampler in (train_sampler, val_sampler, test_sampler)
)

In [21]:
def get_init_bias(i_train, dataset, n_sample=1000):

    """
    Estimates the class-frequencies from n_sample samples, and returns the corresponding bias vector to initialize the model with

    The bias of class c is the logit of its smoothed frequency, log(f_c / (1 - f_c)), so
    that a sigmoid output layer with zero weights predicts f_c.

    Parameters
    ----------
    i_train : array-like of int
        Candidate dataset indices to sample from (drawn without replacement).
    dataset : indexable
        `dataset[indices]` must return a pair `(x, y)` where `y` is a torch tensor of
        shape (n_drawn, n_classes) holding 0/1 labels.
    n_sample : int
        Maximum number of examples to draw; capped at `len(i_train)`.

    Returns
    -------
    numpy.ndarray of shape (n_classes,)
        Finite bias (logit) values, one per class.

    Raises
    ------
    ValueError
        If no example can be drawn (empty `i_train` or `n_sample < 1`).
    """

    n_draw = min(n_sample, len(i_train))
    if n_draw < 1:
        raise ValueError('need at least one training example to estimate class frequencies')

    _, y = dataset[np.random.choice(i_train, size=n_draw, replace=False)]
    y = y.detach().numpy()

    # Add-one smoothing on BOTH outcomes: (count + 1) / (n + 2) lies strictly between 0
    # and 1. The former (count + 1) / n exceeded 1 for a class that is positive in every
    # sample (NaN bias) and equalled 1 for a class positive in all but one (infinite bias).
    frq = (y.sum(axis=0) + 1) / (len(y) + 2)

    # logit(frq) == -log(1/frq - 1), written in a form that stays accurate near 0 and 1
    b = np.log(frq) - np.log1p(-frq)
    return b

In [14]:
# estimate the initial bias vector from (at most) 1000 randomly drawn training examples
B = get_init_bias(i_train, dataset, n_sample=1000)

In [15]:
class DinucleotideBaseline(pl.LightningModule):

    """
    Baseline model that works only with dinucleotide frequencies.

    The input is averaged over the sequence axis, which turns a one-hot encoded sequence
    into its 16 dinucleotide frequencies; a single linear layer followed by a sigmoid then
    maps those frequencies to per-class probabilities. Trained with binary cross-entropy.

    Expects input of shape (batch, 16, l), where l is the sequence length and must equal
    `input_length` so that the pooling leaves exactly one position per channel.

    Parameters
    ----------
    input_length : int
        Sequence length l of the inputs (the width of the average-pooling window).
    n_classes : int
        Number of output classes (labels).
    init_bias : array-like, optional
        Initial values for the bias of the linear layer; must be broadcastable to
        shape (n_classes,).
    """

    def __init__(self, input_length=128, n_classes=1, init_bias=None):
        super().__init__()

        self.avgpool = nn.AvgPool1d(kernel_size=input_length, stride=1, padding=0)
        self.linear = nn.Linear(in_features=16, out_features=n_classes)
        self.sigmoid = nn.Sigmoid()

        if init_bias is not None:
            # Copy the values into the existing parameter instead of replacing its
            # storage: this keeps the parameter's dtype consistent with the weights,
            # fails loudly on an incompatible shape, and does not alias the caller's
            # tensor.
            with torch.no_grad():
                self.linear.bias.copy_(torch.as_tensor(init_bias))

        self.loss_fun = nn.BCELoss(reduction='mean')


    def forward(self, x):
        # (batch, 16, l) -> average over the sequence axis -> (batch, 16)
        x = torch.flatten(self.avgpool(x),1,2)
        x = self.linear(x)
        out = self.sigmoid(x)
        return out

    def _shared_step(self, batch):
        """Compute the loss for one batch; shared by the training and validation steps."""
        x, y = batch
        # The loaded data has an additional dimension of size 1 in front: the
        # DataLoaders in this notebook receive a BatchSampler via `sampler=` while
        # keeping the default `batch_size=1`, so every pre-assembled batch is collated
        # as a single sample. Strip that dimension here.
        x = x[0]
        y = y[0]
        y_hat = self(x)
        return self.loss_fun(y_hat, y)

    def training_step(self, batch, batch_idx):
        loss = self._shared_step(batch)
        self.log("train_loss", loss, on_step=True, on_epoch=True, prog_bar=True, logger=True)
        return loss


    def validation_step(self, batch, batch_idx):
        loss = self._shared_step(batch)
        self.log("val_loss", loss, on_step=False, prog_bar=True, on_epoch=True, logger=True)

    def configure_optimizers(self, lr=None):
        # Called without arguments by the trainer, so the default learning rate applies.
        if lr is None:
            lr = 1e-2
        return torch.optim.Adam(self.parameters(), lr=lr)


In [16]:
# Trainer: 20 epochs, log at every step, check validation every 5 training batches.
trainer = pl.Trainer(
    max_epochs=20,
    log_every_n_steps=1,
    val_check_interval=5,
    callbacks=[RichProgressBar()],
)

# Model: one output per label, bias initialised from the estimated class frequencies.
model = DinucleotideBaseline(
    input_length=128,
    n_classes=dataset.labelloader.n_labels,
    init_bias=tensor(B),
)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [17]:
# cast all model parameters to float64
# (presumably to match the dtype of the tensors served by the dataset — TODO confirm)
model.double()

DinucleotideBaseline(
  (avgpool): AvgPool1d(kernel_size=(128,), stride=(1,), padding=(0,))
  (linear): Linear(in_features=16, out_features=2106, bias=True)
  (sigmoid): Sigmoid()
  (loss_fun): BCELoss()
)

In [18]:
# train the model on the training batches, validating on the validation batches
trainer.fit(model, train_dataloaders=train_dataloader, val_dataloaders=val_dataloader)

In [19]:
# Average the learned weights of the linear layer over all outputs, giving one mean
# weight per input token, and label each value with its token.
mean_weight_per_token = model.linear.weight.detach().numpy().mean(axis=0)
token_names = list(dnatokenizer.mapping.keys())
pd.Series(mean_weight_per_token, index=token_names)

AA   -0.589999
AC   -0.122978
AG    0.050020
AT   -0.675674
CA   -0.197982
CC    0.540073
CG    0.669514
CT    0.086498
GA    0.067287
GC    0.566212
GG    0.566071
GT   -0.126278
TA   -0.672120
TC    0.007918
TG   -0.136977
TT   -0.554329
dtype: float64

In [20]:
# TODO: add evaluation step to the model
# TODO: load the metadata, visualize evaluation metrics stratified by different groups