In [1]:
!pip install esm
!pip install matplotlib



In [2]:
from google.colab import drive

# Mount Google Drive; the data and output paths used later in this notebook
# all live under /content/drive.
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
import os
from getpass import getpass

from esm.sdk import client
from tqdm import tqdm

# SECURITY: never store a Forge API token in the notebook source. The tokens
# that were previously hardcoded in this cell must be considered leaked and
# should be revoked in the Forge console.
# The token is taken from the ESM_API_KEY environment variable when set;
# otherwise the user is prompted without echoing the input.
token = os.environ.get("ESM_API_KEY") or getpass("Token from Forge console: ")

# Remote inference client used by every embedding call below.
model = client(
    model="esmc-600m-2024-12", url="https://forge.evolutionaryscale.ai", token=token
)

In [4]:
from concurrent.futures import ThreadPoolExecutor
from typing import Sequence

from esm.sdk.api import (
    ESM3InferenceClient,
    ESMProtein,
    ESMProteinError,
    LogitsConfig,
    LogitsOutput,
    ProteinType,
)

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Shared request configuration for every logits call made by embed_sequence.
# All three flags are switched on: sequence, return_embeddings and
# return_hidden_states.
EMBEDDING_CONFIG = LogitsConfig(
    sequence=True,
    return_embeddings=True,
    return_hidden_states=True,
)


def embed_sequence(model: ESM3InferenceClient, sequence: str) -> LogitsOutput:
    """Run a single sequence through the model and return its logits output.

    Args:
        model: Client exposing ``encode`` and ``logits``.
        sequence: Sequence string wrapped into an ``ESMProtein``.

    Returns:
        Whatever ``model.logits`` returns for the encoded protein when called
        with the module-level ``EMBEDDING_CONFIG``.
    """
    encoded = model.encode(ESMProtein(sequence=sequence))
    return model.logits(encoded, EMBEDDING_CONFIG)


# def batch_embed(
#     model: ESM3InferenceClient, inputs: Sequence[ProteinType]
# ) -> Sequence[LogitsOutput]:
#     """Forge supports auto-batching. So batch_embed() is as simple as running a collection
#     of embed calls in parallel using asyncio.
#     """
#     total = len(inputs)
#     with ThreadPoolExecutor() as executor:
#         futures = [
#             executor.submit(embed_sequence, model, protein) for protein in inputs
#         ]
#         results = []
#         for future in tqdm(futures, total=total, desc="Processing sequences"):
#             try:
#                 results.append(future.result())
#             except Exception as e:
#                 print(f"Error processing sequence: {e}")
#                 break
#                 # results.append(ESMProteinError(500, str(e)))
#     return results

def batch_embed(
    model: ESM3InferenceClient, inputs: Sequence[ProteinType]
) -> Sequence[LogitsOutput]:
    """Embed a collection of sequences concurrently using a thread pool.

    Every input is submitted to ``embed_sequence`` on a ``ThreadPoolExecutor``
    and the results are collected in submission order, so ``results[i]``
    belongs to ``inputs[i]``.

    Args:
        model: Client forwarded unchanged to ``embed_sequence``.
        inputs: Items to embed. Each one is passed to ``embed_sequence`` as
            its ``sequence`` argument, so in practice these are sequence
            strings.

    Returns:
        Outputs in input order. If embedding an input fails, the error is
        printed, collection stops, and only the outputs for the inputs before
        the failing one are returned -- the result can therefore be shorter
        than ``inputs``. Callers should compare lengths before relying on
        positional alignment.
    """
    total = len(inputs)
    results = []
    with ThreadPoolExecutor() as executor:
        futures = [
            executor.submit(embed_sequence, model, protein) for protein in inputs
        ]

        # Iterate in submission order (not completion order) to keep results
        # aligned with inputs.
        for idx, future in enumerate(
            tqdm(futures, total=total, desc="Processing sequences")
        ):
            try:
                results.append(future.result())
            except Exception as e:
                print(f"Error processing sequence {idx}: {e}")
                print(f"Stopping early: returning {len(results)} of {total} results")
                # Leaving the `with` block waits for all outstanding work, so
                # cancel requests that have not started yet; their results
                # would be thrown away anyway.
                for pending in futures[idx + 1:]:
                    pending.cancel()
                break
            else:
                # Report a 1-based count so the last message reads total/total.
                tqdm.write(f"Completed sequence {idx + 1}/{total}")
    return results

In [5]:
# train_df = pd.read_csv("/content/drive/MyDrive/Protein-binding/data/development_set/full_grouped_train_binding_sites_df.csv")
# Load the independent test set from the mounted Drive.
test_df = pd.read_csv(
    "/content/drive/MyDrive/Protein-binding/data/independent_set/grouped_test_46_new_binding_sites.csv"
)

In [6]:
# Pull the sequences out as a plain Python list, then embed them all.
test_sequences_list = test_df["sequence"].to_list()
test_outputs = batch_embed(model=model, inputs=test_sequences_list)

Processing sequences:   2%|▏         | 1/46 [00:07<05:50,  7.79s/it]

Completed sequence 0/46


Processing sequences:  30%|███       | 14/46 [00:34<00:54,  1.70s/it]

Completed sequence 1/46
Completed sequence 2/46
Completed sequence 3/46
Completed sequence 4/46
Completed sequence 5/46
Completed sequence 6/46
Completed sequence 7/46
Completed sequence 8/46
Completed sequence 9/46
Completed sequence 10/46
Completed sequence 11/46
Completed sequence 12/46
Completed sequence 13/46
Completed sequence 14/46


Processing sequences:  30%|███       | 14/46 [00:38<00:54,  1.70s/it]

Completed sequence 15/46


Processing sequences:  37%|███▋      | 17/46 [00:51<01:17,  2.68s/it]

Completed sequence 16/46


Processing sequences:  39%|███▉      | 18/46 [00:59<01:29,  3.20s/it]

Completed sequence 17/46
Completed sequence 18/46
Completed sequence 19/46
Completed sequence 20/46
Completed sequence 21/46
Completed sequence 22/46
Completed sequence 23/46
Completed sequence 24/46


Processing sequences:  57%|█████▋    | 26/46 [01:13<00:49,  2.45s/it]

Completed sequence 25/46
Completed sequence 26/46


Processing sequences:  61%|██████    | 28/46 [01:27<00:56,  3.14s/it]

Completed sequence 27/46
Completed sequence 28/46
Completed sequence 29/46
Completed sequence 30/46


Processing sequences:  70%|██████▉   | 32/46 [01:33<00:37,  2.68s/it]

Completed sequence 31/46
Completed sequence 32/46
Completed sequence 33/46
Completed sequence 34/46
Completed sequence 35/46


Processing sequences:  80%|████████  | 37/46 [01:41<00:20,  2.23s/it]

Completed sequence 36/46
Completed sequence 37/46
Completed sequence 38/46
Completed sequence 39/46
Completed sequence 40/46
Completed sequence 41/46


Processing sequences: 100%|██████████| 46/46 [01:49<00:00,  2.39s/it]

Completed sequence 42/46
Completed sequence 43/46
Completed sequence 44/46
Completed sequence 45/46





In [7]:
# Sanity check: batch_embed stops collecting at the first failed sequence, so
# this count should be compared against the number of input sequences.
print(len(test_outputs))

46


In [10]:
# Wrap the list of outputs in a NumPy array and persist it to Drive.
# NOTE(review): the elements are the objects returned by batch_embed, so this
# is presumably an object array that np.save stores via pickle -- confirm.
test_outputs_np = np.asarray(test_outputs)
np.save(
    '/content/drive/MyDrive/Protein-binding/test46_sequences_esm3_medium_embeddings_Apr_15.npy',
    test_outputs_np,
)

In [11]:
# SECURITY: allow_pickle=True lets np.load unpickle arbitrary Python objects,
# which can execute code. Only load .npy files that this notebook produced.
test_sequences_esm_embeddings = np.load(
    '/content/drive/MyDrive/Protein-binding/test46_sequences_esm3_medium_embeddings_Apr_15.npy',
    allow_pickle=True,
)

In [None]:
# Summarise the loaded array instead of dumping every stored output object,
# which would flood the notebook with text.
print(test_sequences_esm_embeddings.shape, test_sequences_esm_embeddings.dtype)
print(test_sequences_esm_embeddings[:1])

In [None]:
# sample_element = test_sequences_esm_embeddings[10]
# sample_embeddings = sample_element.embeddings
# print(f"Sample embeddings:\n {sample_embeddings}")
# num_layers = sample_element.hidden_states.shape[0]  # Get the first dimension's size
# print(f"Number of layers in hidden_states: {num_layers}")