In [18]:
import yaml
import torch
import ankh
from data_prepare import load_embeddings_to_df
from torch_utils import InferenceDataset, load_models

In [16]:
# Load the models via the project helper `torch_utils.load_models`; the scoring
# loop further down iterates over this collection and averages the models' logits.
models = load_models()

In [3]:
# NOTE(review): `prepare_embed_df` is not imported in the visible import cell, so this
# cell presumably relies on a definition left over in the kernel — confirm where it
# lives and import it explicitly so the notebook survives Restart & Run All.
df = prepare_embed_df()

In [4]:
# Split `df` into folds; the helper returns a (train, valid) pair.
# NOTE(review): `make_folds` is not imported in the visible import cell — confirm its
# source module and import it explicitly.
train_folds, valid_folds = make_folds(df)

In [12]:
# Count how often each `label` value occurs in the first training fold.
train_folds[0].label.value_counts()

1    28136
0    27761
Name: label, dtype: int64

Количество параметров в модели

In [3]:
# Parse the YAML configuration file from the current working directory.
with open("config.yml", "r") as config_file:
    config = yaml.safe_load(config_file)

# %%
# Model hyperparameters: look the section up once, then pull each value out of it.
model_cfg = config["model_config"]
input_dim = model_cfg["input_dim"]
nhead = model_cfg["nhead"]
hidden_dim = model_cfg["hidden_dim"]
num_hidden_layers = model_cfg["num_hidden_layers"]
num_layers = model_cfg["num_layers"]
kernel_size = model_cfg["kernel_size"]
dropout = model_cfg["dropout"]
pooling = model_cfg["pooling"]


# Training settings, read the same way from their own section.
training_cfg = config["training_config"]
epochs = training_cfg["epochs"]
lr = training_cfg["lr"]
factor = training_cfg["factor"]
patience = training_cfg["patience"]
min_lr = training_cfg["min_lr"]
batch_size = training_cfg["batch_size"]
seed = training_cfg["seed"]
num_workers = training_cfg["num_workers"]

In [5]:
# Collect the model hyperparameters in one mapping, then instantiate
# ankh.ConvBertForBinaryClassification from it.
model_kwargs = {
    "input_dim": input_dim,
    "nhead": nhead,
    "hidden_dim": hidden_dim,
    "num_hidden_layers": num_hidden_layers,
    "num_layers": num_layers,
    "kernel_size": kernel_size,
    "dropout": dropout,
    "pooling": pooling,
}
binary_classification_model = ankh.ConvBertForBinaryClassification(**model_kwargs)

In [7]:
# Total number of scalar elements across all model parameters, trainable or not.
total_params = 0
for param in binary_classification_model.parameters():
    total_params += param.numel()

In [9]:
# Number of scalar elements in parameters that receive gradients.
trainable_params = 0
for param in binary_classification_model.parameters():
    if not param.requires_grad:
        continue  # skip parameters excluded from gradient computation
    trainable_params += param.numel()

In [10]:
# Show both counts side by side; equal values mean every parameter is trainable.
total_params, trainable_params

(13013775, 13013775)

In [2]:
# Load the embeddings stored in the .h5 file via the project helper
# `data_prepare.load_embeddings_to_df`. The path is relative, so it resolves against
# the notebook's working directory.
embed_df = load_embeddings_to_df("../data/embeddings/ankh_embeddings/not_annotated_seqs_v1_2d.h5")

In [3]:
# Wrap the loaded embeddings in the project's InferenceDataset so a DataLoader can
# iterate over them.
# NOTE(review): the variable name is a typo of "inference_dataset"; it is kept because
# the DataLoader cell refers to this exact name — rename both together.
inderence_dataset = InferenceDataset(embed_df)

In [5]:
from torch.utils.data import DataLoader

In [20]:
# Serve the inference dataset one item per batch, in dataset order (no shuffling),
# using a single worker process.
loader_options = {
    "num_workers": 1,
    "shuffle": False,
    "batch_size": 1,
}
inference_dataloader = DataLoader(inderence_dataset, **loader_options)

In [9]:
# Pull a single batch from the loader to inspect it; each batch unpacks into an
# identifier part (`id_`) and a model input part (`x`).
id_, x = next(iter(inference_dataloader))

In [None]:
# Display the identifier part of the sampled batch.
id_

In [19]:
# Run on the first CUDA device when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = torch.device("cuda:0")
else:
    DEVICE = torch.device("cpu")

In [42]:
# Ensemble inference: for every batch, average the logits of all loaded models and
# pass the averaged logits through a sigmoid to obtain the scores.

# Number of batches to score. 1 keeps this cell a quick smoke test (the behaviour of
# the former unconditional `break`); set to None to score the whole dataset.
# Must be None or a positive integer.
MAX_BATCHES = 1

# Device placement and eval mode do not depend on the batch, so apply them once up
# front instead of once per model per batch. nn.Module.to() moves the module in place.
for model in models:
    model.to(DEVICE)
    model.eval()

identifiers = []
scores = []
with torch.no_grad():  # inference only: no autograd bookkeeping needed
    for batch_idx, (id_, x) in enumerate(inference_dataloader, start=1):
        x = x.to(DEVICE)

        # Stack the per-model logits on a new leading axis and average over it.
        ens_logits = torch.stack([model(x).logits for model in models], dim=0)
        ens_logits = torch.mean(ens_logits, dim=0)
        prob_score = torch.sigmoid(ens_logits)

        identifiers.extend(id_)
        # Flatten and extend instead of calling .item(), which raises for batches
        # holding more than one element. With batch_size=1 this appends the same
        # single Python float as before.
        # Assumes the model emits one logit per item — TODO confirm.
        scores.extend(prob_score.reshape(-1).cpu().tolist())

        # Stop after the requested number of batches without fetching another one.
        if MAX_BATCHES is not None and batch_idx >= MAX_BATCHES:
            break

In [29]:
# Boolean: does the first collected score exceed 0.5?
# NOTE(review): `a` and the literal 0.5 are uninformative; consider a descriptive name
# and a named threshold constant once the downstream use of `a` is confirmed.
a = scores[0] > 0.5

In [44]:
# Display the identifiers collected by the scoring loop.
identifiers

['A0A023PYF4']

: 