In [25]:
import gc
import sys

import pandas as pd
import torch

# Make the project's src/ directory importable before the local imports below.
sys.path.insert(0, "../../src/")

from embedding_representation.ESMBasedEmbedding import ESMBasedEmbedding
from embedding_representation.MistralBasedEmbedding import MistralBasedEmbedding
from embedding_representation.Prot5BasedEmbedding import Prot5BasedEmbedding

In [26]:
# Release unused cached GPU memory held by PyTorch's caching allocator
# before any model is loaded in the cells below.
torch.cuda.empty_cache()

In [27]:
# One row per model to process: (model identifier, embedding dimension).
# The identifier is used as both model and tokenizer name by process_model,
# and "embedding_dim" is forwarded to zeroPaddingEmbedding as
# `embedding_dimension`.
model_table = pd.DataFrame(
    [
        ("Rostlab/prot_t5_xl_uniref50", 1024),
        ("Rostlab/prot_t5_xl_bfd", 1024),
        ("RaphaelMourad/Mistral-Peptide-v1-15M", 256),
        ("RaphaelMourad/Mistral-Peptide-v1-134M", 768),
        ("RaphaelMourad/Mistral-Peptide-v1-422M", 768),
    ],
    columns=["model_name", "embedding_dim"],
)

In [28]:
def get_embedding_class(model_name):
    """Return the embedding wrapper class that matches a model identifier.

    The choice is made from the (case-sensitive) organisation prefix of
    ``model_name``:

    * ``"facebook/"``      -> ``ESMBasedEmbedding``
    * ``"Rostlab/"``       -> ``Prot5BasedEmbedding``
    * ``"RaphaelMourad/"`` -> ``MistralBasedEmbedding``

    Raises
    ------
    ValueError
        If ``model_name`` starts with none of the prefixes above.
    """
    supported_prefixes = ("facebook/", "Rostlab/", "RaphaelMourad/")

    # Reject unknown models up front so the happy path is a flat lookup.
    if not model_name.startswith(supported_prefixes):
        raise ValueError(f"No se reconoce clase para el modelo: {model_name}")

    if model_name.startswith("facebook/"):
        return ESMBasedEmbedding
    if model_name.startswith("Rostlab/"):
        return Prot5BasedEmbedding
    return MistralBasedEmbedding

In [29]:
def process_model(row, df_data, dataset_name=None,
                  output_dir="../../results_demos", max_length=50):
    """Embed ``df_data`` with one model and export the resulting embeddings.

    For the model described by ``row`` this function loads the model and
    tokenizer, computes the embeddings, exports a reduced version (with the
    ``"target"`` column attached) and exports a zero-padded, non-reduced
    version as ``npz``.

    Parameters
    ----------
    row : mapping-like
        Must provide ``"model_name"`` and ``"embedding_dim"`` (for example a
        row of ``model_table`` as passed by ``DataFrame.apply(axis=1)``).
    df_data : pandas.DataFrame
        Dataset handed to the embedding class. Its ``"sequence"`` column is
        used as the sequence column, ``"target"`` is passed as a column to
        ignore and is copied onto the reduced embeddings.
    dataset_name : str, optional
        Identifier used in the exported file names. When omitted, the
        notebook-level variable ``name_data`` is used (original behaviour).
    output_dir : str, optional
        Directory prefix for the exported files.
    max_length : int, optional
        ``max_length`` forwarded to ``zeroPaddingEmbedding``.

    Returns
    -------
    str
        ``"<model_id> is done"``, where ``model_id`` is the part of the model
        name after the last ``"/"``.

    Raises
    ------
    ValueError
        Propagated from ``get_embedding_class`` for unrecognised model names.
    """
    # Read the model name and its embedding dimension from the row.
    model_name = row["model_name"]
    embedding_dim = row["embedding_dim"]
    model_id = model_name.split("/")[-1]

    if dataset_name is None:
        # Backward-compatible fallback: resolved from the notebook globals at
        # call time, exactly as the original implementation did.
        dataset_name = name_data

    print(f"{model_name} is processing...")

    # Pick the device: ProtT5 models are forced onto the CPU, every other
    # model uses CUDA when it is available.
    if "prot_t5" in model_name.lower():
        device_name = "cpu"
    else:
        device_name = "cuda" if torch.cuda.is_available() else "cpu"

    # Instantiate the embedding wrapper that matches the model family.
    EmbeddingClass = get_embedding_class(model_name)
    model = EmbeddingClass(
        name_model=model_name,
        name_tokenizer=model_name,
        dataset=df_data,
        column_seq="sequence",
        columns_ignore=["target"],
        name_device=device_name
    )

    reduced_embedding = None
    embeddings_processed = None
    try:
        # Load the model and tokenizer, then compute the embeddings.
        model.loadModelTokenizer()
        model.getEmbedding()

        # Reduced representation, exported together with the target column.
        reduced_embedding = model.reduceEmbedding(
            type_reduction=1,
            embedding_matrix=model.embeddings,
            dimension_based=1
        )
        reduced_embedding["target"] = df_data["target"].values

        model.exportingEmbeddings(
            embeddings=reduced_embedding,
            name_export=f"{output_dir}/demo_{dataset_name}_{model_id}"
        )

        # Zero-padded, non-reduced representation exported as npz.
        embeddings_processed = model.zeroPaddingEmbedding(
            embedding_matrix=model.embeddings,
            max_length=max_length,
            embedding_dimension=embedding_dim
        )

        model.exportingEmbeddings(
            embeddings=embeddings_processed,
            type_export=2,
            name_export=f"{output_dir}/demo_non_reduced_{dataset_name}_{model_id}",
            extension="npz"
        )
    finally:
        # Always attempt to release memory, even when a step above failed.
        # Drop our references first, then collect garbage BEFORE emptying the
        # CUDA cache: tensors kept alive by reference cycles only return to
        # the allocator once the collector has run.
        del model
        reduced_embedding = None
        embeddings_processed = None
        gc.collect()
        torch.cuda.empty_cache()

    return f"{model_id} is done"


In [30]:
# Dataset identifier: selects the input CSV and is embedded in the names of
# the files exported by process_model.
name_data = "antiviral_homology_90"
df_data = pd.read_csv(f"../../dataset_demos/{name_data}.csv")

# Fail fast, before any model is loaded: process_model reads the "sequence"
# column for embedding and copies "target" onto the reduced embeddings.
assert {"sequence", "target"} <= set(df_data.columns), (
    f"{name_data}.csv must contain the 'sequence' and 'target' columns"
)

In [None]:
# Run the full pipeline once per row of model_table; the resulting Series
# holds one status message per model.
results = model_table.apply(
    lambda model_row: process_model(model_row, df_data),
    axis=1,
)
print(results)

Rostlab/prot_t5_xl_uniref50 is processing...
Using device:  cpu
