In [2]:
import os
import glob
import torch
import numpy as np
import eugene as eu

In [9]:
from eugene.dataload import ProfileDataset
from eugene.models._profile_models import BPNet
# Directory EUGENe uses for logged runs; the checkpoint lookup further down globs inside it.
eu.settings.logging_dir = "/cellar/users/aklie/projects/ML4GLand/use_cases/avsec21/models"

In [10]:
from bpnetlite.io import extract_peaks
from bpnetlite.performance import calculate_performance_measures

In [5]:
# Input locations: per-experiment data (peaks + bigWig tracks) and the reference genome
data_dir = "/cellar/users/aklie/data/eugene/avsec21/ENCSR000EGM/data"
reference_dir = "/cellar/users/aklie/data/eugene/avsec21/reference"

peaks, seqs = os.path.join(data_dir, "peaks.bed"), os.path.join(reference_dir, "hg38.fa")

# Track files are listed plus strand first, then minus strand
signals = [os.path.join(data_dir, track) for track in ("plus.bw", "minus.bw")]
controls = [os.path.join(data_dir, track) for track in ("control_plus.bw", "control_minus.bw")]

# Validation chromosomes only: chr18 through chr22 (range end is exclusive)
valid_chroms = [f"chr{i}" for i in range(18, 23)]

# EUGENe

## Load data

In [6]:
# Load the validation-chromosome examples through EUGENe's profile reader.
# The call returns three objects, unpacked here as sequences, signal targets and control tracks.
# NOTE(review): max_jitter=0 presumably disables random shifting of the windows — confirm in eugene docs.
X_val, y_val, X_ctl_val = eu.dl.read_profile(peaks, seqs, signals, controls, max_jitter=0, chroms=valid_chroms)
# Display the three shapes as a quick sanity check
X_val.shape, y_val.shape, X_ctl_val.shape

(torch.Size([7051, 4, 2114]),
 torch.Size([7051, 2, 1000]),
 torch.Size([7051, 2, 2114]))

In [7]:
# Wrap the validation tensors in a ProfileDataset and expose them as a dataloader.
# shuffle=False keeps batches in dataset order, so concatenated predictions line up with y_val.
X_val_dataset = ProfileDataset(X_val, y_val, X_ctl_val)
X_val_loader = X_val_dataset.to_dataloader(batch_size=64, num_workers=4, shuffle=False)

## Load model

In [11]:
# Locate the checkpoint written for run "version_0" under the EUGENe logging directory.
ckpt_pattern = os.path.join(eu.settings.logging_dir, "version_0", "checkpoints", "*.ckpt")

# glob returns matches in arbitrary order, so sort them to make the choice
# reproducible when more than one checkpoint is present.
ckpt_files = sorted(glob.glob(ckpt_pattern))
if not ckpt_files:
    # Fail with the pattern in the message instead of an opaque IndexError
    raise FileNotFoundError(f"No checkpoint found matching {ckpt_pattern}")

model_file = ckpt_files[0]
model_file

'/cellar/users/aklie/models/eugene/avsec21/version_0/checkpoints/epoch=4-step=3579.ckpt'

In [12]:
# Restore the EUGENe BPNet from the checkpoint file. The architecture arguments are
# passed explicitly alongside the checkpoint path.
# NOTE(review): these values presumably must match the ones used at training time — confirm.
model = BPNet.load_from_checkpoint(
    model_file, 
    input_len=2114,
    output_dim=1000,
    n_outputs=2,
    n_control_tracks=2, 
    # (2114 - 1000) // 2 == 557, i.e. half the difference between input and output length
    trimming=(2114 - 1000) // 2
)
# Switch to eval mode and move to the GPU; as the last expression, the model repr is displayed
model.eval().cuda()

BPNet(
  (iconv): Conv1d(4, 64, kernel_size=(21,), stride=(1,), padding=(10,))
  (irelu): ReLU()
  (rconvs): ModuleList(
    (0): Conv1d(64, 64, kernel_size=(3,), stride=(1,), padding=(2,), dilation=(2,))
    (1): Conv1d(64, 64, kernel_size=(3,), stride=(1,), padding=(4,), dilation=(4,))
    (2): Conv1d(64, 64, kernel_size=(3,), stride=(1,), padding=(8,), dilation=(8,))
    (3): Conv1d(64, 64, kernel_size=(3,), stride=(1,), padding=(16,), dilation=(16,))
    (4): Conv1d(64, 64, kernel_size=(3,), stride=(1,), padding=(32,), dilation=(32,))
    (5): Conv1d(64, 64, kernel_size=(3,), stride=(1,), padding=(64,), dilation=(64,))
    (6): Conv1d(64, 64, kernel_size=(3,), stride=(1,), padding=(128,), dilation=(128,))
    (7): Conv1d(64, 64, kernel_size=(3,), stride=(1,), padding=(256,), dilation=(256,))
  )
  (rrelus): ModuleList(
    (0): ReLU()
    (1): ReLU()
    (2): ReLU()
    (3): ReLU()
    (4): ReLU()
    (5): ReLU()
    (6): ReLU()
    (7): ReLU()
  )
  (fconv): Conv1d(66, 2, kernel_s

In [13]:
# Smoke test: push a single validation batch through the model and inspect output shapes.
batch = next(iter(X_val_loader))

# Inference only, so disable autograd; otherwise `example_output` keeps a
# computation graph (and its activations) alive on the GPU.
with torch.no_grad():
    # batch[0] / batch[1] are fed as the model's two inputs, matching how `predict` uses them
    example_output = model(batch[0].cuda(), batch[1].cuda())

example_output[0].shape, example_output[1].shape

(torch.Size([64, 2, 1000]), torch.Size([64, 1]))

## Get predictions

In [15]:
from tqdm.auto import tqdm
def predict(model, loader):
    """Run `model` over every batch of `loader` and gather its outputs on the CPU.

    Parameters
    ----------
    model : torch.nn.Module
        Called as ``model(X, X_ctl)`` and expected to return a pair of tensors
        ``(profiles, counts)``. It is used in whatever train/eval mode it is
        already in; call ``model.eval()`` beforehand for inference.
    loader : sized iterable
        Yields batches where ``batch[0]`` and ``batch[1]`` are the two model
        inputs. Must support ``len()`` (used for the progress bar total).

    Returns
    -------
    (torch.Tensor, torch.Tensor)
        The per-batch profile outputs and count outputs, each concatenated
        along dim 0 in loader order and moved to the CPU.
    """
    # Send inputs to wherever the model's weights live instead of assuming CUDA,
    # so the same function works on GPU and CPU. A parameter-free model falls back to CPU.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    y_profiles, y_counts = [], []
    with torch.no_grad():  # inference only; no autograd graph needed
        for batch in tqdm(loader, total=len(loader)):
            y_profiles_, y_counts_ = model(batch[0].to(device), batch[1].to(device))
            # Move results off the device right away so accelerator memory stays bounded
            y_profiles.append(y_profiles_.cpu())
            y_counts.append(y_counts_.cpu())

    return torch.cat(y_profiles), torch.cat(y_counts)

In [16]:
# Run the EUGENe model over the whole validation loader; results come back on the CPU
y_profile_eugene, y_counts_eugene = predict(model, X_val_loader)
# Display both shapes to confirm one prediction per validation example
y_profile_eugene.shape, y_counts_eugene.shape

HBox(children=(FloatProgress(value=0.0, max=111.0), HTML(value='')))




(torch.Size([7051, 2, 1000]), torch.Size([7051, 1]))

In [17]:
# Clean up profile
z = y_profile_eugene.shape
y_profile_clean_eugene = y_profile_eugene.reshape(y_profile_eugene.shape[0], -1)
y_profile_clean_eugene = torch.nn.functional.log_softmax(y_profile_clean_eugene, dim=-1)
y_profile_clean_eugene = y_profile_clean_eugene.reshape(*z)

In [18]:
# Score the EUGENe predictions against the observed validation signal
eugene_measures = calculate_performance_measures(
    y_profile_clean_eugene,
    y_val,
    y_counts_eugene,
    kernel_sigma=7,
    kernel_width=81,
    measures=['profile_mnll', 'profile_pearson', 'count_pearson', 'count_mse'],
)

profile_corr, count_corr = (
    eugene_measures[key] for key in ('profile_pearson', 'count_pearson')
)

# Combined loss: mean profile MNLL plus the model's alpha times the mean count MSE
valid_loss = eugene_measures['profile_mnll'].mean()
valid_loss += model.alpha * eugene_measures['count_mse'].mean()

# One report line per metric
print(
    f"Profile correlations for each task: {profile_corr.numpy().mean(axis=0)}",
    f"Count correlations for each task: {count_corr.numpy()}",
    f"Profile MNLLL: {eugene_measures['profile_mnll'].mean():.3f}",
    f"Count MSE: {eugene_measures['count_mse'].mean():.3f}",
    f"Validation loss: {valid_loss:.3f}",
    sep="\n",
)

Profile correlations for each task: [0.41662365 0.41636887]
Count correlations for each task: [0.66827923 0.6687726 ]
Profile MNLLL: 266.908
Count MSE: 1.019
Validation loss: 267.927


# bpnet-lite

## Load data

In [19]:
# Load the same peaks, genome, tracks and validation chromosomes through bpnet-lite's own
# reader, so the bpnet-lite model is evaluated on data produced by its native pipeline.
X_valid, y_valid, X_ctl_valid = extract_peaks(peaks, seqs, signals, controls, chroms=valid_chroms, max_jitter=0)
# Display the three shapes as a quick sanity check
X_valid.shape, y_valid.shape, X_ctl_valid.shape


(torch.Size([7051, 4, 2114]),
 torch.Size([7051, 2, 1000]),
 torch.Size([7051, 2, 2114]))

## Load model

In [20]:
# Folder holding the saved bpnet-lite model (same path string as eu.settings.logging_dir set earlier).
models_dir = "/cellar/users/aklie/projects/ML4GLand/use_cases/avsec21/models"
# NOTE(review): torch.load here restores a whole pickled model object (.eval() is called on the
# result), which executes arbitrary code from the file — only load files from a trusted source.
# Recent PyTorch releases may also require an explicit weights_only=False for this — confirm
# against the installed version.
pretrained_model = torch.load(os.path.join(models_dir, "bpnet.64.8.torch")).eval().cuda()

In [21]:
# Predict with the bpnet-lite model's own predict method on the full validation tensors.
# NOTE(review): presumably handles batching and device transfer internally — confirm in bpnet-lite.
y_profile_lite, y_counts_lite = pretrained_model.predict(X_valid, X_ctl_valid)

In [22]:
# Turn raw bpnet-lite profile outputs into log-probabilities.
# Everything after the first dimension is flattened into one axis so the log-softmax
# normalises over all of it jointly, then the original shape is restored.
z = y_profile_lite.shape
y_profile_clean_lite = torch.nn.functional.log_softmax(
    y_profile_lite.reshape(z[0], -1),
    dim=-1,
).reshape(*z)

In [23]:
# Score the bpnet-lite predictions against the observed validation signal
lite_measures = calculate_performance_measures(
    y_profile_clean_lite,
    y_valid,
    y_counts_lite,
    kernel_sigma=7,
    kernel_width=81,
    measures=['profile_mnll', 'profile_pearson', 'count_pearson', 'count_mse']
)

profile_corr = lite_measures['profile_pearson']
count_corr = lite_measures['count_pearson']

# Combined loss: mean profile MNLL plus alpha times the mean count MSE.
# Use the bpnet-lite model's own alpha rather than the EUGENe `model`'s, so this cell
# does not depend on the other section's model object and reflects this model's weighting.
valid_loss = lite_measures['profile_mnll'].mean()
valid_loss += pretrained_model.alpha * lite_measures['count_mse'].mean()

print(f"Profile correlations for each task: {profile_corr.numpy().mean(axis=0)}")
print(f"Count correlations for each task: {count_corr.numpy()}")
print(f"Profile MNLLL: {lite_measures['profile_mnll'].mean():.3f}")
print(f"Count MSE: {lite_measures['count_mse'].mean():.3f}")
print(f"Validation loss: {valid_loss:.3f}")

Profile correlations for each task: [0.41187093 0.41174912]
Count correlations for each task: [0.6579376 0.660682 ]
Profile MNLLL: 267.863
Count MSE: 0.782
Validation loss: 268.645
