In [15]:
! pip install --upgrade transformers py3Dmol accelerate pandas biopython

Collecting biopython
  Downloading biopython-1.85-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Downloading biopython-1.85-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m59.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: biopython
Successfully installed biopython-1.85


In [2]:
# Mount Google Drive at /content/drive; the dataset CSVs read below and the
# PDB output folder written by the folding loop both live under that mount.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [23]:
import torch
import pandas as pd
import numpy as np
import py3Dmol
import time

from tqdm import tqdm
from transformers.utils import send_example_telemetry
from transformers import AutoTokenizer, EsmForProteinFolding
from transformers.models.esm.openfold_utils.protein import to_pdb, Protein as OFProtein
from transformers.models.esm.openfold_utils.feats import atom14_to_atom37
from Bio import PDB
from Bio.PDB.DSSP import DSSP
from Bio.PDB.Polypeptide import is_aa
from Bio.SeqUtils import seq1

In [4]:
# NOTE(review): presumably reports example-notebook usage to Hugging Face — confirm.
send_example_telemetry("protein_folding_notebook", framework="pytorch")
# Load the tokenizer and folding model for the "facebook/esmfold_v1" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("facebook/esmfold_v1")
model = EsmForProteinFolding.from_pretrained("facebook/esmfold_v1", low_cpu_mem_usage=True)

# Move the whole model to the GPU, then cast only the `esm` submodule to half
# precision; the other submodules keep the dtype they were loaded with.
model = model.cuda()
model.esm = model.esm.half()
# NOTE(review): chunk size 64 presumably trades speed for lower peak memory in
# the folding trunk — confirm against the transformers documentation.
model.trunk.set_chunk_size(64)

# Allow TF32 for CUDA matrix multiplications.
torch.backends.cuda.matmul.allow_tf32 = True

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/40.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/72.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/121 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/2.10k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


pytorch_model.bin:   0%|          | 0.00/8.44G [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/8.44G [00:00<?, ?B/s]

Some weights of EsmForProteinFolding were not initialized from the model checkpoint at facebook/esmfold_v1 and are newly initialized: ['esm.contact_head.regression.bias', 'esm.contact_head.regression.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
# Development (training) set. It must be loaded because the single-protein demo
# cell further down looks up its sequence in `train_df`; with this line
# commented out, a fresh "Restart & Run All" stops there with a NameError.
train_df = pd.read_csv('/content/drive/MyDrive/Protein-binding/data/development_set/full_grouped_train_binding_sites_df.csv')
# Independent test set: every row is folded by the batch loop further down.
test_df = pd.read_csv('/content/drive/MyDrive/Protein-binding/data/independent_set/grouped_test_46_new_binding_sites.csv')

In [6]:
def convert_outputs_to_pdb(outputs):
    """Turn a batch of folding-model outputs into PDB text, one entry per batch item.

    Args:
        outputs: Mapping of output names to tensors. The keys read here are
            "positions", "atom37_atom_exists", "aatype", "residue_index",
            "plddt" and, when present, "chain_index".

    Returns:
        list: The result of ``to_pdb`` for each item along the first axis of
        ``outputs["aatype"]`` (presumably PDB-format strings; callers write
        them straight to ``.pdb`` files).
    """
    # The atom14 -> atom37 conversion needs the original tensors, so it runs
    # before everything is copied to CPU NumPy arrays.
    atom37_positions = atom14_to_atom37(outputs["positions"][-1], outputs)
    arrays = {name: tensor.to("cpu").numpy() for name, tensor in outputs.items()}
    atom37_positions = atom37_positions.cpu().numpy()
    atom37_mask = arrays["atom37_atom_exists"]
    has_chain_index = "chain_index" in arrays

    def _entry_to_pdb(entry):
        # Residue indices are shifted by one (presumably 0-based -> 1-based
        # PDB numbering); pLDDT is stored in the B-factor field.
        protein = OFProtein(
            aatype=arrays["aatype"][entry],
            atom_positions=atom37_positions[entry],
            atom_mask=atom37_mask[entry],
            residue_index=arrays["residue_index"][entry] + 1,
            b_factors=arrays["plddt"][entry],
            chain_index=arrays["chain_index"][entry] if has_chain_index else None,
        )
        return to_pdb(protein)

    batch_size = arrays["aatype"].shape[0]
    return [_entry_to_pdb(entry) for entry in range(batch_size)]

In [None]:
# def folding_protein(protein_sequence):
#     # Ensure protein_sequence is a string and clean it
#     if isinstance(protein_sequence, list):
#         # If it's a list, take the first element (assuming it's the sequence)
#         protein_sequence = protein_sequence[0] if len(protein_sequence) > 0 else ""
#     # Remove newlines, spaces, and FASTA headers
#     protein_sequence = protein_sequence.strip().replace("\n", "").replace(" ", "")
#     if protein_sequence.startswith(">"):
#         protein_sequence = protein_sequence.split("\n", 1)[1] if "\n" in protein_sequence else ""

#     # Tokenize with padding and truncation
#     tokenized_input = tokenizer(
#         [protein_sequence],
#         return_tensors="pt",
#         add_special_tokens=False,
#         padding=True,
#         truncation=True,
#         max_length=1024  # ESMFold typically supports up to 1024 residues
#     )['input_ids']

#     # Move to GPU
#     tokenized_input = tokenized_input.cuda()

#     # Run prediction
#     with torch.no_grad():
#         outputs = model(tokenized_input)
#     return outputs

In [8]:
def folding_protein(protein_sequence, max_length=1024):
    """Run the folding model on one protein sequence.

    Args:
        protein_sequence: A residue string, a FASTA record (a header line
            starting with ">" followed by sequence lines), or a list/tuple
            whose first element is one of those.
        max_length: Maximum number of tokens passed to the tokenizer; longer
            inputs are truncated by the tokenizer (``truncation=True``).

    Returns:
        The model outputs, or ``None`` (after printing a warning) when no
        residues remain after cleaning, e.g. for an empty string, a header-only
        FASTA record, or a non-string value such as ``None`` / NaN.
    """
    protein_sequence = _clean_protein_sequence(protein_sequence)

    # Check if the sequence is valid after cleaning
    if not protein_sequence:
        print("Warning: Empty protein sequence encountered. Skipping folding.")
        return None

    # Make truncation visible: a truncated prediction no longer covers the
    # full sequence (assumes one token per residue — TODO confirm).
    if len(protein_sequence) > max_length:
        print(
            f"Warning: sequence has {len(protein_sequence)} residues but "
            f"max_length={max_length}; the input will be truncated."
        )

    # Tokenize with padding and truncation
    tokenized_input = tokenizer(
        [protein_sequence],
        return_tensors="pt",
        add_special_tokens=False,
        padding=True,
        truncation=True,
        max_length=max_length,
    )['input_ids']

    # Move to GPU
    tokenized_input = tokenized_input.cuda()

    # Run prediction without tracking gradients
    with torch.no_grad():
        outputs = model(tokenized_input)
    return outputs


def _clean_protein_sequence(protein_sequence):
    """Return the bare residue string for ``protein_sequence``, or "" if there is none.

    FASTA header lines are dropped *before* line breaks are removed; only the
    first FASTA record is kept. Spaces and line breaks inside the sequence are
    removed. Non-string values yield "".
    """
    if isinstance(protein_sequence, (list, tuple)):
        # If it's a list, take the first element (assuming it's the sequence)
        protein_sequence = protein_sequence[0] if len(protein_sequence) > 0 else ""
    if not isinstance(protein_sequence, str):
        return ""

    residue_lines = []
    for line in protein_sequence.splitlines():
        line = line.strip()
        if not line:
            continue
        if line.startswith(">"):
            if residue_lines:
                break  # a second FASTA record starts here; keep only the first
            continue  # header line carries no residues
        residue_lines.append(line.replace(" ", ""))
    return "".join(residue_lines)

In [None]:
# Pick one protein from the development set and show its ID, sequence and length.
prot_id = "Q9NZV6"  # another ID used for this demo: Q00277
prot_seq = train_df.loc[train_df['prot_id'] == prot_id, 'sequence'].values[0]

print(prot_id, prot_seq, len(prot_seq), sep="\n")

In [None]:
# Fold the selected sequence. folding_protein returns None when the cleaned
# sequence is empty, in which case the conversion cell below cannot run.
outputs = folding_protein(prot_seq)

In [None]:
# Convert the model outputs into PDB entries, one per item in the batch.
pdbs = convert_outputs_to_pdb(outputs)

In [None]:
# Show the first entry; a single sequence was folded, so it is the only one.
print(pdbs[0])

In [None]:
# index = 10

# prot_id = train_df.iloc[index]['prot_id']
# prot_seq = train_df.iloc[index]['sequence']

# outputs = folding_protein(prot_seq)
# pdbs = convert_outputs_to_pdb(outputs)

In [None]:
# for idx, row in tqdm(train_df.iterrows(), total = train_df.shape[0]):
#     prot_id = row['prot_id']
#     prot_seq = row['sequence']
#     try:
#         outputs = folding_protein(prot_seq)
#     except:
#         print(f"Error while folding protein ID {prot_id}")
#     finally:
#         pdbs = convert_outputs_to_pdb(outputs)
#         with open(f"/content/drive/MyDrive/Protein-binding/esmFold_pdb_files/{prot_id}.pdb", "w") as f:
#             f.write(pdbs[0])


In [9]:
# Fold every protein of the independent test set and save one PDB file per protein.
for idx, row in tqdm(test_df.iterrows(), total = test_df.shape[0]):
    prot_id = row['prot_id']
    prot_seq = row['sequence']
    try:
        outputs = folding_protein(prot_seq)
    except Exception as exc:
        # Skip this protein entirely. Converting and writing after a failure
        # would reuse `outputs` from the previous iteration and save the wrong
        # structure under this ID (or hit a NameError on the first iteration).
        # `Exception` rather than a bare `except` keeps Ctrl-C / kernel
        # interrupts working.
        print(f"Error while folding protein ID {prot_id}: {exc}")
        continue

    if outputs is None:
        # folding_protein returns None when the sequence is empty after cleaning.
        print(f"No structure predicted for protein ID {prot_id}; skipping.")
        continue

    pdbs = convert_outputs_to_pdb(outputs)
    with open(f"/content/drive/MyDrive/Protein-binding/esmFold_pdb_files/{prot_id}.pdb", "w") as f:
        f.write(pdbs[0])



  0%|          | 0/46 [00:00<?, ?it/s][A
  2%|▏         | 1/46 [00:03<02:25,  3.22s/it][A
  4%|▍         | 2/46 [00:05<01:51,  2.53s/it][A
  7%|▋         | 3/46 [00:07<01:33,  2.18s/it][A
  9%|▊         | 4/46 [00:11<02:16,  3.25s/it][A
 11%|█         | 5/46 [00:13<01:50,  2.70s/it][A
 13%|█▎        | 6/46 [00:15<01:35,  2.38s/it][A
 15%|█▌        | 7/46 [00:17<01:24,  2.17s/it][A
 17%|█▋        | 8/46 [00:18<01:14,  1.96s/it][A
 20%|█▉        | 9/46 [00:20<01:10,  1.90s/it][A
 22%|██▏       | 10/46 [00:22<01:07,  1.88s/it][A
 24%|██▍       | 11/46 [00:24<01:09,  1.99s/it][A
 26%|██▌       | 12/46 [00:26<01:05,  1.93s/it][A
 28%|██▊       | 13/46 [00:28<01:02,  1.89s/it][A
 30%|███       | 14/46 [00:29<00:59,  1.85s/it][A
 33%|███▎      | 15/46 [00:31<00:57,  1.84s/it][A
 35%|███▍      | 16/46 [00:33<00:58,  1.96s/it][A
 37%|███▋      | 17/46 [00:44<02:09,  4.46s/it][A
 39%|███▉      | 18/46 [00:51<02:29,  5.34s/it][A
 41%|████▏     | 19/46 [00:53<01:53,  4.19s/it]

### Test functions

In [20]:
def get_structure(prot_id, pdb_file):
    """Parse ``pdb_file`` with a fresh ``PDB.PDBParser`` and return the structure.

    Args:
        prot_id: Identifier passed to the parser as the structure's id.
        pdb_file: The PDB file to parse (a path in this notebook).
    """
    return PDB.PDBParser().get_structure(prot_id, pdb_file)

def extract_coordinates(structure):
    """Collect the Cα ("CA") atom coordinates of every residue in ``structure``.

    Walks all models, chains and residues in iteration order; residues without
    a "CA" atom are skipped.

    Returns:
        list: One ``get_coord()`` result per residue that has a Cα atom.
    """
    return [
        residue["CA"].get_coord()
        for model in structure
        for chain in model
        for residue in chain
        if "CA" in residue
    ]

def calculate_residue_distances(coordinates):
    """
    Calculate pairwise Euclidean distances between residues in a protein structure.

    The distances are computed with NumPy broadcasting instead of a Python
    double loop over all residue pairs.

    Args:
        coordinates: Sequence of per-residue coordinates (NumPy arrays or any
            array-like of equal length), one entry per residue.
    Returns:
        np.ndarray: Symmetric float64 array of shape (num_residues, num_residues)
        with zeros on the diagonal; shape (0, 0) for empty input.
    """
    num_residues = len(coordinates)
    if num_residues == 0:
        # np.asarray([]) would be 1-D, so keep the empty case explicit.
        return np.zeros((0, 0))

    points = np.asarray(coordinates, dtype=np.float64).reshape(num_residues, -1)
    # deltas[i, j] = points[i] - points[j]; the norm over the last axis gives
    # the distance for every (i, j) pair at once.
    deltas = points[:, np.newaxis, :] - points[np.newaxis, :, :]
    return np.linalg.norm(deltas, axis=-1)

In [21]:
# Protein whose predicted structure is inspected below; the path points into
# the same Drive folder the folding loop writes its PDB files to.
prot_id = "A0A0B0QJR1"
structure_file = f"/content/drive/MyDrive/Protein-binding/esmFold_pdb_files/{prot_id}.pdb"

In [24]:
# Parse the PDB file, collect the Cα coordinates, and build the pairwise
# residue-distance matrix from them.
structure = get_structure(prot_id, structure_file)
coordinates = extract_coordinates(structure)
distances = calculate_residue_distances(coordinates)

