## Demo embedding extraction

This notebook shows how to use the embedding extraction module to generate numerical representations of sequences using pre-trained models.

The following families of pre-trained models are available for testing:

- Prot5Based
- ESMBasedEmbedding
- Ankh2BasedEmbedding
- BertBasedMebedding
- MistralBasedEmbedding

In [27]:
# Suppress every Python warning for the rest of the kernel session,
# presumably to keep the demo output uncluttered.
# NOTE(review): this is global, so it also hides deprecation and runtime
# warnings raised by the libraries imported in the cells below.
import warnings
warnings.filterwarnings("ignore")

In [28]:
import sys
# Put "../../utils/" first on the module search path. Presumably this is where
# the embedding_extraction and dimensionality_reductions packages imported
# below live — TODO confirm. The path is relative, so it is resolved against
# the kernel's current working directory.
sys.path.insert(0, "../../utils/")

In [29]:
import pandas as pd

- Loading modules

In [30]:
from embedding_extraction.prot5_based import Prot5Based
from embedding_extraction.esm_based import ESMBasedEmbedding
from embedding_extraction.bert_based import BertBasedMebedding
from embedding_extraction.mistral_based import MistralBasedEmbedding
from dimensionality_reductions.non_linear_reductions import NonLinearReductions

ModuleNotFoundError: No module named 'umap'

In [None]:
import seaborn as sns

- Read the AMP dataset and preview its first 5 rows (the full dataset is used in the sections below)

In [None]:
# Build the CSV location from the dataset name, load it into a DataFrame,
# and show a short preview of the loaded rows.
name_data = "antiviral_homology_90"
data_path = f"../../dataset_demos/{name_data}.csv"
df_data = pd.read_csv(data_path)

# head() defaults to the first 5 rows.
df_data.head()

### ProT5 evaluation

In [None]:
# ProT5 embedding extraction.
# Work on a copy so df_data keeps the raw sequences for the other models.
df_to_prot5 = df_data.copy()
# " ".join on a string inserts a space between every character ("MKV" -> "M K V").
df_to_prot5["sequence"] = df_to_prot5["sequence"].apply(lambda x: " ".join(x)) # this process is necessary for the application of the pre-trained model

# NOTE(review): the device is hard-coded to "cuda"; presumably this fails on a
# machine without a GPU — confirm whether "cpu" is an accepted value.
prot5_based = Prot5Based(
    name_device="cuda",
    dataset=df_to_prot5,
    name_model="Rostlab/ProstT5",
    name_tokenizer="Rostlab/ProstT5",
    column_seq="sequence",
    columns_ignore=["label"],
)

print("Loading model/tokenizer")
prot5_based.load_model_tokenizer()

print("Generating embedding")
try:
    df_embedding = prot5_based.embedding_process(batch_size=5)
finally:
    # Run the cleanup even when embedding_process raises, so a failed run does
    # not leave the loaded model behind in the kernel (cleaning_memory
    # presumably releases the model/tokenizer memory — TODO confirm).
    prot5_based.cleaning_memory()
print("Process finished")

df_embedding

In [None]:
# Reduce the ProT5 embeddings with UMAP and plot the result coloured by label.
# The label column is dropped first so that it is not passed to the reduction.
df_values = df_embedding.drop(columns=["label"])
nonlinear_instance = NonLinearReductions(dataset=df_values)

transform_values_umap = nonlinear_instance.applyUMAP()
# .values hands over a plain array, so labels are attached by row position
# rather than by index alignment.
transform_values_umap["label"] = df_embedding["label"].values

# Title the figure so it can be told apart from the other models' plots.
# The trailing semicolon keeps the cell from echoing the repr of its last expression.
ax = sns.scatterplot(data=transform_values_umap, x="p_1", y="p_2", hue="label", palette="Set2")
ax.set_title("UMAP projection of ProT5 embeddings");


### ESM evaluation

In [None]:
# ESM embedding extraction.
# Unlike the ProT5 and BERT cells, the raw df_data sequences are passed as-is
# (no space-separated copy is built here).
# NOTE(review): the device is hard-coded to "cuda"; presumably this fails on a
# machine without a GPU — confirm whether "cpu" is an accepted value.
esm_based = ESMBasedEmbedding(
    name_device="cuda",
    dataset=df_data,
    name_model="facebook/esm2_t6_8M_UR50D",
    name_tokenizer="facebook/esm2_t6_8M_UR50D",
    column_seq="sequence",
    columns_ignore=["label"],
)

print("Loading model/tokenizer")
esm_based.load_model_tokenizer()

print("Generating embedding")
try:
    df_embedding = esm_based.embedding_process(batch_size=5)
finally:
    # Run the cleanup even when embedding_process raises, so a failed run does
    # not leave the loaded model behind in the kernel (cleaning_memory
    # presumably releases the model/tokenizer memory — TODO confirm).
    esm_based.cleaning_memory()
print("Process finished")

df_embedding

In [None]:
# Reduce the ESM embeddings with UMAP and plot the result coloured by label.
# The label column is dropped first so that it is not passed to the reduction.
df_values = df_embedding.drop(columns=["label"])
nonlinear_instance = NonLinearReductions(dataset=df_values)

transform_values_umap = nonlinear_instance.applyUMAP()
# .values hands over a plain array, so labels are attached by row position
# rather than by index alignment.
transform_values_umap["label"] = df_embedding["label"].values

# Title the figure so it can be told apart from the other models' plots.
# The trailing semicolon keeps the cell from echoing the repr of its last expression.
ax = sns.scatterplot(data=transform_values_umap, x="p_1", y="p_2", hue="label", palette="Set2")
ax.set_title("UMAP projection of ESM embeddings");


### Bert evaluation

In [None]:
# BERT embedding extraction.
# Work on a copy so df_data keeps the raw sequences for the other models.
df_to_bert = df_data.copy()
# " ".join on a string inserts a space between every character ("MKV" -> "M K V").
df_to_bert["sequence"] = df_to_bert["sequence"].apply(lambda x: " ".join(x)) # this process is necessary for the application of the pre-trained model

# NOTE(review): the device is hard-coded to "cuda"; presumably this fails on a
# machine without a GPU — confirm whether "cpu" is an accepted value.
bert_based = BertBasedMebedding(
    name_device="cuda",
    dataset=df_to_bert,
    name_model="Rostlab/prot_bert_bfd_ss3",
    name_tokenizer="Rostlab/prot_bert_bfd_ss3",
    column_seq="sequence",
    columns_ignore=["label"],
)

print("Loading model/tokenizer")
bert_based.load_model_tokenizer()

print("Generating embedding")
try:
    df_embedding = bert_based.embedding_process(batch_size=5)
finally:
    # Run the cleanup even when embedding_process raises, so a failed run does
    # not leave the loaded model behind in the kernel (cleaning_memory
    # presumably releases the model/tokenizer memory — TODO confirm).
    bert_based.cleaning_memory()
print("Process finished")

df_embedding

In [None]:
# Reduce the BERT embeddings with UMAP and plot the result coloured by label.
# The label column is dropped first so that it is not passed to the reduction.
df_values = df_embedding.drop(columns=["label"])
nonlinear_instance = NonLinearReductions(dataset=df_values)

transform_values_umap = nonlinear_instance.applyUMAP()
# .values hands over a plain array, so labels are attached by row position
# rather than by index alignment.
transform_values_umap["label"] = df_embedding["label"].values

# Title the figure so it can be told apart from the other models' plots.
# The trailing semicolon keeps the cell from echoing the repr of its last expression.
ax = sns.scatterplot(data=transform_values_umap, x="p_1", y="p_2", hue="label", palette="Set2")
ax.set_title("UMAP projection of BERT embeddings");
