In [1]:
from pathlib import Path
from functools import partial
from squid.mutagenizer import RandomMutagenesis
from squid.predictor import ScalarPredictor
from squid.mave import InSilicoMAVE

from insilico_mpra.predict import predict_ensemble_from_onehot

  __import__("pkg_resources").declare_namespace(__name__)


In [2]:
# Notebook-wide settings.

# Both values are forwarded to predict_ensemble_from_onehot by the partial
# built in the MAVE setup cell.
num_workers = 4
batch_size = 1024

# Not referenced in the visible cells.
# NOTE(review): equals the first dimension of x loaded further down (30000) -- confirm intent.
N = 30000

# Root directory scanned by load_models(): one numbered sub-directory per
# ensemble member, each holding best.ckpt and config.json.
weight_dir = '/grid/koo/home/nagai/projects/continual_learning/insilico_mpra/model_weight'

In [3]:
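# One-off setup, disabled and kept for reference: for each numbered
# sub-directory of weight_dir, copy the first best_model*.ckpt to best.ckpt,
# the file name that load_models() looks for.
# import os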
# import glob
# import shutil

# weight_dir = '/grid/koo/home/nagai/projects/continual_learning/insilico_mpra/model_weight'
# for i in range(10):
#     sub_path = os.path.join(weight_dir, str(i))
#     if os.path.isdir(sub_path):
#         ckpt_pattern = os.path.join(sub_path, "best_model*.ckpt")
#         ckpt_list = glob.glob(ckpt_pattern)
#         if ckpt_list:
#             # Take the first matching file (assumes one per subdir)
#             src_ckpt = ckpt_list[0]
#             dst_ckpt = os.path.join(weight_dir, f'{i}/best.ckpt')
#             shutil.copyfile(src_ckpt, dst_ckpt)

In [4]:
from insilico_mpra.predict import load_model

def load_models(weight_dir, num_models=10):
    """Load the ensemble members stored under ``weight_dir``.

    Member ``i`` (for ``i`` in ``range(num_models)``) is read from
    ``<weight_dir>/<i>/best.ckpt`` together with ``<weight_dir>/<i>/config.json``.

    Args:
        weight_dir: Directory containing the numbered member sub-directories.
        num_models: Number of members to load, starting at index 0.

    Returns:
        list: The first element of each ``load_model(config_path, model_path)``
        result, in index order.

    Raises:
        FileNotFoundError: As soon as a member is missing its checkpoint or
            its config file; members after it are not examined.
    """
    loaded = []
    for i in range(num_models):
        member_dir = Path(weight_dir) / str(i)
        model_path = member_dir / 'best.ckpt'
        config_path = member_dir / 'config.json'

        # Fail fast instead of returning a partial ensemble.
        if not (model_path.exists() and config_path.exists()):
            raise FileNotFoundError(f"Model or config file not found for model {i} at {model_path} or {config_path}")

        # load_model returns a pair; only its first element (the model) is kept.
        model, _ = load_model(config_path, model_path)
        loaded.append(model)
    return loaded

# Load all ten ensemble members once; the prediction cells below reuse `models`.
models = load_models(weight_dir=weight_dir, num_models=10)

In [5]:
import torch
# Smoke test: run an all-zero batch of 10 inputs through the ensemble and
# print the shape of the result.
# NOTE(review): this batch is laid out as (10, 4, 230), whereas x read from the
# HDF5 file later prints as (30000, 230, 4) -- confirm which layout
# predict_ensemble_from_onehot expects.
onehot = torch.zeros(10, 4, 230, dtype=torch.float32)
out = predict_ensemble_from_onehot(models, onehot)
print(out.shape)



(10, 1)


In [11]:
import numpy as np

# Scratch check: concatenating 7 references to the ensemble output along axis 0
# gives a (7 * batch, 1) array -- (70, 1) for the (10, 1) output above.
# The bare `out[0]` statement that used to open this cell was dropped: only the
# last expression of a cell is displayed, so its value was silently discarded.
ls = [out] * 7
np.concatenate(ls, axis=0).shape

(70, 1)

In [7]:
import h5py

# Read the arrays stored under the 'x' and 'y' keys of one HDF5 file and
# report their shapes.
tmpfile = '/grid/koo/home/shared/clg_procap/mpra/250808/18107.h5'
with h5py.File(tmpfile, mode='r') as f:
    # Assumption carried over from the original cell: 'x' holds the one-hot
    # encoded sequences and 'y' the labels. Slicing with [:] is done inside the
    # `with` block so x and y are still usable after the file is closed.
    x, y = f['x'][:], f['y'][:]
print(f'x shape: {x.shape}, y shape: {y.shape}')

x shape: (30000, 230, 4), y shape: (1,)


In [8]:
# Display the label array read from the HDF5 file (a single float32 value in the recorded output).
y

array([-0.59592503], dtype=float32)

In [23]:
# Mutagenesis scheme: random mutations with mut_rate=0.1.
mut_generator = RandomMutagenesis(mut_rate=0.1)

# Wrap the ensemble predictor so that only the sequence batch remains to be
# supplied. `models` is bound POSITIONALLY: the direct call
# predict_ensemble_from_onehot(models, onehot) earlier in this notebook shows
# it is the first positional parameter. Binding it by keyword
# (models=models) makes any positional call ensemble_fun(x) fail with
# "got multiple values for argument 'models'".
# NOTE(review): ScalarPredictor is assumed to call pred_fun with the batch as a
# positional argument -- confirm against the squid version in use.
# NOTE(review): the smoke test fed a (10, 4, 230) batch while x from the HDF5
# file is (30000, 230, 4) -- confirm the layout the ensemble function receives
# here matches what it expects.
ensemble_fun = partial(
    predict_ensemble_from_onehot,
    models,
    batch_size=batch_size,
    num_workers=num_workers,
)
mut_predictor = ScalarPredictor(pred_fun=ensemble_fun)

# In-silico MAVE over 230-long sequences with mut_window=[15, 215].
mave = InSilicoMAVE(
    mut_generator=mut_generator,
    mut_predictor=mut_predictor,
    seq_length=230,
    mut_window=[15, 215],
)