# Testing Janggu's Integration 

**Authorship:**
Adam Klie, *08/04/2022*
***
**Description:**
Notebook for testing the compatibility of Janggu's `data` module with EUGENe.

In [1]:
import os
import sys
from pkg_resources import resource_filename
import eugene as eu

# Enable IPython's autoreload so edits to imported modules are picked up
# without restarting the kernel. The extension is loaded only if it is not
# already registered; mode 2 is then selected on every run of this cell.
ipython_shell = get_ipython()
if 'autoreload' not in ipython_shell.extension_manager.loaded:
    ipython_shell.run_line_magic('load_ext', 'autoreload')
ipython_shell.run_line_magic('autoreload', '2')

Global seed set to 13


GPU is available: True
Number of GPUs: 1
Current GPU: 0
GPUs: Quadro RTX 5000


In [3]:
eu.external.janggu?

[0;31mType:[0m        module
[0;31mString form:[0m <module 'eugene.external.janggu' from '/mnt/beegfs/users/aklie/projects/EUGENe/eugene/external/janggu/__init__.py'>
[0;31mFile:[0m        /mnt/beegfs/users/aklie/projects/EUGENe/eugene/external/janggu/__init__.py
[0;31mDocstring:[0m   <no docstring>


In [4]:
# Example inputs resolved relative to the installed `eugene` package
# (files under external/janggu/resources).
REFGENOME = resource_filename('eugene', 'external/janggu/resources/pseudo_genome.fa')
ROI_TRAIN_FILE = resource_filename('eugene', 'external/janggu/resources/roi_train.bed')
ROI_TEST_FILE = resource_filename('eugene', 'external/janggu/resources/roi_test.bed')
PEAK_FILE = resource_filename('eugene', 'external/janggu/resources/scores.bed')

# Directory that receives the serialized SeqData snapshots written below.
# The default is the original cluster location; set EUGENE_JANGGU_OUT to run
# the notebook on another machine without editing this cell.
OUT = os.environ.get(
    "EUGENE_JANGGU_OUT",
    "/cellar/users/aklie/data/eugene/janggu/mem_test",
)
# Create the directory up front so later writes do not depend on it already existing.
os.makedirs(OUT, exist_ok=True)

In [5]:
# Display the resolved input paths and the output directory as one tuple.
(
    REFGENOME,
    ROI_TRAIN_FILE,
    ROI_TEST_FILE,
    PEAK_FILE,
    OUT,
)

('/mnt/beegfs/users/aklie/projects/EUGENe/eugene/external/janggu/resources/pseudo_genome.fa',
 '/mnt/beegfs/users/aklie/projects/EUGENe/eugene/external/janggu/resources/roi_train.bed',
 '/mnt/beegfs/users/aklie/projects/EUGENe/eugene/external/janggu/resources/roi_test.bed',
 '/mnt/beegfs/users/aklie/projects/EUGENe/eugene/external/janggu/resources/scores.bed',
 '/cellar/users/aklie/data/eugene/janggu/mem_test')

Load the datasets for training and testing

In [7]:
# Build the training SeqData from the score BED file, restricted to the
# training regions of interest and read against the pseudo genome.
train_bed_kwargs = dict(
    bed_file=PEAK_FILE,
    roi_file=ROI_TRAIN_FILE,
    ref_file=REFGENOME,
    binsize=200,
)
sdata_train = eu.dl.read_bed(**train_bed_kwargs)

In [15]:
# Show the summary of the freshly loaded training SeqData object.
# (The cell previously held the truncated name `sdata_`, which is undefined
# on a fresh kernel; the recorded output is the repr of `sdata_train`.)
sdata_train

SeqData object with = 7797 seqs
seqs = None
names = (7797,)
rev_seqs = None
ohe_seqs = (7797, 200, 4)
ohe_rev_seqs = None
seqs_annot: 'target'
pos_annot: None
seqsm: None
uns: None

In [14]:
# Snapshot 1: the object as loaded, before any extra fields are added.
raw_h5sd_path = os.path.join(OUT, "sdata_train_raw.h5sd")
sdata_train.write_h5sd(raw_h5sd_path)

In [17]:
# Decode the one-hot array back into sequences and store them on the object.
decoded_seqs = eu.pp.decode_DNA_seqs(sdata_train.ohe_seqs)
sdata_train.seqs = decoded_seqs

Decoding DNA sequences:   0%|          | 0/7797 [00:00<?, ?it/s]

In [18]:
# Snapshot 2: after `seqs` has been populated.
seqs_h5sd_path = os.path.join(OUT, "sdata_train_seqs_added.h5sd")
sdata_train.write_h5sd(seqs_h5sd_path)

In [19]:
# Reverse-complement the decoded sequences and store them on the object.
rev_comp_seqs = eu.pp.reverse_complement_seqs(sdata_train.seqs)
sdata_train.rev_seqs = rev_comp_seqs

Reverse complementing DNA sequences:   0%|          | 0/7797 [00:00<?, ?it/s]

In [20]:
# Snapshot 3: after both `seqs` and `rev_seqs` have been populated.
revseqs_h5sd_path = os.path.join(OUT, "sdata_train_seqs-revseqs_added.h5sd")
sdata_train.write_h5sd(revseqs_h5sd_path)

In [30]:
# One-hot encode the reverse-complement sequences and store them on the object.
ohe_rev = eu.pp.ohe_DNA_seqs(sdata_train.rev_seqs)
sdata_train.ohe_rev_seqs = ohe_rev

One-hot-encoding sequences:   0%|          | 0/7797 [00:00<?, ?it/s]

In [32]:
# Snapshot 4: after `seqs`, `rev_seqs` and `ohe_rev_seqs` have all been populated.
full_h5sd_path = os.path.join(OUT, "sdata_train_seqs-revseqs-oherevseqs_added.h5sd")
sdata_train.write_h5sd(full_h5sd_path)

In [33]:
!ls -lh $OUT

total 39M
-rw-r--r-- 1 aklie carter-users 6.2M Aug 10 09:16 sdata_train_raw.h5sd
-rw-r--r-- 1 aklie carter-users 7.7M Aug 10 09:18 sdata_train_seqs_added.h5sd
-rw-r--r-- 1 aklie carter-users 9.2M Aug 10 09:19 sdata_train_seqs-revseqs_added.h5sd
-rw-r--r-- 1 aklie carter-users  16M Aug 10 09:24 sdata_train_seqs-revseqs-oherevseqs_added.h5sd


In [6]:
# Read the most complete snapshot back from disk to check the round trip.
full_snapshot_path = os.path.join(OUT, "sdata_train_seqs-revseqs-oherevseqs_added.h5sd")
sdata_read = eu.dl.read_h5sd(full_snapshot_path)

In [7]:
# Inspect the one-hot encoded sequences of the object read back from disk.
# NOTE(review): this displays the array itself; `.shape` / `.dtype` would be a
# more compact check if only the structure matters.
sdata_read.ohe_seqs

array([[[0, 1, 0, 0],
        [1, 0, 0, 0],
        [0, 1, 0, 0],
        ...,
        [0, 1, 0, 0],
        [1, 0, 0, 0],
        [1, 0, 0, 0]],

       [[0, 1, 0, 0],
        [0, 0, 0, 1],
        [0, 1, 0, 0],
        ...,
        [1, 0, 0, 0],
        [1, 0, 0, 0],
        [0, 1, 0, 0]],

       [[1, 0, 0, 0],
        [1, 0, 0, 0],
        [0, 0, 1, 0],
        ...,
        [0, 0, 0, 1],
        [0, 0, 0, 1],
        [0, 1, 0, 0]],

       ...,

       [[0, 0, 0, 1],
        [0, 1, 0, 0],
        [0, 0, 0, 1],
        ...,
        [1, 0, 0, 0],
        [0, 0, 1, 0],
        [1, 0, 0, 0]],

       [[0, 0, 1, 0],
        [0, 0, 0, 1],
        [0, 0, 1, 0],
        ...,
        [0, 0, 0, 1],
        [0, 0, 1, 0],
        [1, 0, 0, 0]],

       [[0, 0, 1, 0],
        [1, 0, 0, 0],
        [0, 0, 1, 0],
        ...,
        [0, 0, 1, 0],
        [0, 0, 0, 1],
        [1, 0, 0, 0]]], dtype=int8)

In [12]:
# Show the summary of the reloaded object to confirm which fields were
# restored from the file.
sdata_read

SeqData object with = 7797 seqs
seqs = (7797,)
names = (7797,)
rev_seqs = (7797,)
ohe_seqs = (7797, 200, 4)
ohe_rev_seqs = (7797, 200, 4)
seqs_annot: 'target'
pos_annot: None
seqsm: None
uns: None

In [40]:
# Instantiate the Kopp21CNN architecture for length-200 inputs with a single output.
model = eu.models.Kopp21CNN(input_len=200, output_dim=1)



In [42]:
# Inspect the `scheduler` attribute of the newly constructed model.
model.scheduler

'lr_scheduler'

In [36]:
# Split the training data; per the message printed by this call, it modifies
# `sdata_train` by adding a `train` entry to `seqs_annot`.
eu.pp.train_test_split_data(sdata_train)

SeqData object modified:
    seqs_annot:
        + train


In [43]:
# Short smoke-test fit: train for two epochs against the `target` annotation.
eu.train.fit(
    model,
    sdata_train,
    target="target",
    epochs=2,
)

No transforms given, assuming just need to tensorize).
No transforms given, assuming just need to tensorize).


Missing logger folder: /cellar/users/aklie/eugene_log/dsKopp21CNN_binary_classification
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
Set SLURM handle signals.

  | Name       | Type        | Params
-------------------------------------------
0 | hp_metric  | AUROC       | 0     
1 | conv       | Conv1d      | 450   
2 | maxpool    | MaxPool1d   | 0     
3 | batchnorm  | BatchNorm1d | 20    
4 | conv2      | Conv1d      | 248   
5 | batchnorm2 | BatchNorm1d | 16    
6 | linear     | Linear      | 9     
-------------------------------------------
743       Trainable params
0         Non-trainable params
743       Total params
0.003     Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

  f"The dataloader, {name}, does not have many workers which may be a bottleneck."
Global seed set to 13
  f"The dataloader, {name}, does not have many workers which may be a bottleneck."
  f"The number of training samples ({self.num_training_batches}) is smaller than the logging interval"


Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Metric val_loss improved. New best score: 0.421


Validating: 0it [00:00, ?it/s]

Metric val_loss improved by 0.138 >= min_delta = 0.0. New best score: 0.283


---

# Scratch