# Test out the model from the AI-ATAC repo on a test dataset from the first notebook
We are going to use the code from AI-ATAC repo: https://github.com/smaslova/AI-TAC. You'll need to clone the repo with Git and then add the path to the code as shown below.

# Set-up

In [18]:
# Imports for AI-ATAC model
import sys
sys.path.append('/cellar/users/aklie/opt/AI-TAC/code')
import aitac
import torch
import torch.nn as nn

# Imports for eugene
from eugene import models
from eugene import train
import seqdata as sd
from eugene import settings

In [22]:
settings.logging_dir = "/cellar/users/aklie/data/datasets/AI-ATAC/analysis/10Nov23/seqdata"

In [23]:
# Hyper parameters
num_epochs = 10
num_classes = 81
batch_size = 100
learning_rate = 0.001
num_filters = 300

# Load data

In [24]:
sdata = sd.open_zarr("/cellar/users/aklie/data/datasets/AI-ATAC/analysis/10Nov23/seqdata/fold_0/ai-atac_train.zarr")

In [25]:
# reshape the ohe 
sdata['ohe_seq'] = sdata['ohe_seq'].transpose('_sequence', '_ohe', 'length')

# Build model

In [33]:
class ArchWrapper(nn.Module):
    def __init__(self, arch):
        super().__init__()
        self.arch = arch
    def forward(self, x):
        return self.arch(x)[0]

In [34]:
# Define the model
model = ArchWrapper(aitac.ConvNet(num_classes, num_filters))

# Define a SequenceModule
module = models.SequenceModule(
    arch=model,
    input_len=250,
    output_dim=num_classes,
    task='regression',
    #loss_fxn=aitac.pearson_loss,
    loss_fxn="mse",
    optimizer='adam',
    optimizer_lr=learning_rate,
    metric="pearson",
    seed=1234
    
)
module

[rank: 0] Global seed set to 1234


SequenceModule(
  (arch): ArchWrapper(
    (arch): ConvNet(
      (layer1_conv): Sequential(
        (0): Conv2d(1, 300, kernel_size=(4, 19), stride=(1, 1))
        (1): ReLU()
      )
      (layer1_process): Sequential(
        (0): MaxPool2d(kernel_size=(1, 3), stride=(1, 3), padding=(0, 1), dilation=1, ceil_mode=False)
        (1): BatchNorm2d(300, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
      (layer2): Sequential(
        (0): Conv2d(300, 200, kernel_size=(1, 11), stride=(1, 1))
        (1): ReLU()
        (2): MaxPool2d(kernel_size=(1, 4), stride=(1, 4), padding=(0, 1), dilation=1, ceil_mode=False)
        (3): BatchNorm2d(200, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
      (layer3): Sequential(
        (0): Conv2d(200, 200, kernel_size=(1, 7), stride=(1, 1))
        (1): ReLU()
        (2): MaxPool2d(kernel_size=(1, 4), stride=(1, 4), padding=(0, 1), dilation=1, ceil_mode=False)
        (3): BatchNorm2d(200, eps=1e-05

# Train the model

In [35]:
# Rename the _celltypes coord to _targets
sdata = sdata.rename({'_celltypes': '_targets'})

ValueError: cannot rename '_celltypes' because it is not a variable or dimension in this dataset

In [41]:
import pandas as pd
test = sdata[["fold_0_train", "chrom"]].to_dataframe()
pd.crosstab(test["fold_0_train"], test["chrom"])

chrom,chr10,chr11,chr12,chr13,chr14,chr15,chr16,chr17,chr18,chr19,chr2,chr4,chr5,chr7,chr8,chr9
fold_0_train,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
False,0,0,0,0,0,0,12279,0,0,0,0,0,0,0,16619,0
True,18954,20082,14493,16120,13725,13836,0,14523,11203,9158,26891,20462,20112,19294,0,17288


In [42]:
train.fit_sequence_module(
    model=module,
    sdata=sdata,
    seq_var="ohe_seq",
    target_vars="peak_height",
    train_var="fold_0_train",
    batch_size=batch_size,
    epochs=num_epochs,
    in_memory=True,
    seed=1234,
    name="fold_0",
    version="AI-ATAC_MSE",
)

Dropping 0 sequences with NaN targets.
Loading ohe_seq and ['peak_height'] into memory


[rank: 0] Global seed set to 1234
  rank_zero_warn(
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name         | Type        | Params
---------------------------------------------
0 | arch         | ArchWrapper | 3.0 M 
1 | train_metric | R2Score     | 0     
2 | val_metric   | R2Score     | 0     
3 | test_metric  | R2Score     | 0     
---------------------------------------------
3.0 M     Trainable params
0         Non-trainable params
3.0 M     Total params
12.192    Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

  rank_zero_warn(
  rank_zero_warn(


Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]