In [None]:
import torch
from transformers import AutoTokenizer, AutoModel
import pandas as pd
import csv
import json

# ProtBERT checkpoint shared by the tokenizer and the encoder
PROT_BERT_CHECKPOINT = "Rostlab/prot_bert"

tokenizer = AutoTokenizer.from_pretrained(PROT_BERT_CHECKPOINT)
# .eval() returns the module itself, so the model is already in inference
# mode (dropout disabled) by the time it is bound to `model`.
model = AutoModel.from_pretrained(PROT_BERT_CHECKPOINT).eval()

# Function to get token embeddings
def get_embeddings(sequence):
    """Return ProtBERT's last-layer hidden states for one protein sequence.

    Args:
        sequence: Amino-acid sequence as a string of one-letter codes
            (any iterable of single-residue strings also works, since it is
            only passed to ``" ".join``).

    Returns:
        A nested list with one embedding vector per *token*. With the
        tokenizer's default settings the special tokens are included, so the
        result is expected to hold ``len(sequence) + 2`` vectors
        ([CLS] first, [SEP] last) -- callers that need per-residue alignment
        should drop the first and last entries.

    NOTE(review): the ProtBERT model card maps the rare residues U/Z/O/B to X
    before tokenizing; that mapping is not applied here -- confirm whether the
    dataset contains such residues.
    """
    spaced = " ".join(sequence)  # ProtBERT expects space-separated residues

    # A single sequence needs no padding, and no max_length is configured, so
    # `padding=True, truncation=True` did nothing except emit the
    # "Asking to truncate to max_length..." warning. Truncating would also
    # break the residue/label alignment, so neither option is requested.
    inputs = tokenizer(spaced, return_tensors="pt")

    with torch.no_grad():
        outputs = model(**inputs)

    # Index the batch axis explicitly: a bare .squeeze() would also collapse
    # any other axis that happened to have size 1.
    return outputs.last_hidden_state[0].tolist()

# Load dataset
csv_file = "Train.csv"  # Update the path if needed
df = pd.read_csv(csv_file)
sequences = df['Sequence'].tolist()

# Output CSV file
output_file = "/content/drive/MyDrive/PSSP big/train_embeddings_2"
start_idx = 1000  # Start from 1001st sequence (index 1000)
end_idx = 2000  # Stop at 2000th sequence (index 1999)

# Slicing clamps to the dataset size, so the batch may be shorter than the
# requested range (or empty); keep it in a variable so the summary below can
# report what was actually processed.
batch = sequences[start_idx:end_idx]

# Write embeddings to CSV, flushing after every row so that rows already
# computed reach the file even if the session dies mid-run.
with open(output_file, mode="w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["sequence", "embedding"])  # CSV header

    # `i` is the 1-based position of the sequence in the full dataset
    for i, seq in enumerate(batch, start=start_idx + 1):
        embeddings = get_embeddings(seq)
        print(f"Processed {i}: {seq[:10]}... -> {len(embeddings)} embeddings")  # Output progress
        writer.writerow([seq, json.dumps(embeddings)])  # Embedding stored as a JSON string
        f.flush()

if batch:
    print(f"Processed sequences {start_idx + 1} to {start_idx + len(batch)}. Embeddings saved to {output_file}")
else:
    print(f"No sequences found in range {start_idx + 1} to {end_idx}; only the header was written to {output_file}")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/86.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/361 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/81.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.68G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.68G [00:00<?, ?B/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Processed 1001: MAASNATYNN... -> 119 embeddings
Processed 1002: MSITLSDSAA... -> 109 embeddings
Processed 1003: MEHICGTSRI... -> 202 embeddings
Processed 1004: MSVFDSKFKG... -> 412 embeddings
Processed 1005: SDAATTFLQR... -> 209 embeddings
Processed 1006: MRTELLSKLY... -> 264 embeddings
Processed 1007: MPELEVKGKK... -> 117 embeddings
Processed 1008: AFRILTINPG... -> 383 embeddings
Processed 1009: GSHAFAPSDR... -> 108 embeddings
Processed 1010: MDLTNKNVIF... -> 256 embeddings
Processed 1011: MKKTQTWILT... -> 275 embeddings
Processed 1012: ATNKQVEISA... -> 128 embeddings
Processed 1013: GAKNYYDITL... -> 215 embeddings
Processed 1014: AARGANHVYL... -> 303 embeddings
Processed 1015: GHMDSMDHRI... -> 119 embeddings
Processed 1016: MSLMGYKNNR... -> 276 embeddings
Processed 1017: MGGSHHHHHH... -> 166 embeddings
Processed 1018: MDWLPRNTNC... -> 116 embeddings
Processed 1019: MHHHHHHSSG... -> 242 embeddings
Processed 1020: MDLLSCTVND... -> 77 embeddings
Processed 1021: GSQRVLVEPD... -> 262 embe

In [None]:
import pandas as pd
import json

# Load the CSV file containing embeddings
csv_file = "/content/drive/MyDrive/PSSP big/train_embeddings_2"  # Update with the correct file
df = pd.read_csv(csv_file)

# Iterate over the first 5 sequences
for i in range(min(5, len(df))):
    sequence = df.loc[i, "sequence"]
    embeddings = json.loads(df.loc[i, "embedding"])  # Convert from JSON string to list

    # The stored list is the model's full last_hidden_state, which is expected
    # to carry one extra vector at each end for the [CLS]/[SEP] special tokens.
    # Without stripping them, "AA 1" would print the [CLS] vector and every
    # residue label would be off by one. Only strip when the length matches
    # that layout, so a file that is already residue-aligned is left untouched.
    if len(embeddings) == len(sequence) + 2:
        embeddings = embeddings[1:-1]

    print(f"Sequence {i+1}: {sequence[:10]}...")  # Print first 10 AAs for reference
    print("First 5 amino acid embeddings:")

    # Print embeddings for first 5 amino acids
    for j in range(min(5, len(embeddings))):
        print(f"AA {j+1}: {embeddings[j][:5]}")  # Print first 5 dimensions for readability

    print("-" * 50)  # Separator for clarity


Sequence 1: MAASNATYNN...
First 5 amino acid embeddings:
AA 1: [0.16351790726184845, 0.026101328432559967, 0.08993391692638397, -0.2651503086090088, 0.2581440508365631]
AA 2: [0.08048053830862045, 0.08980249613523483, 0.040511604398489, -0.1695208102464676, 0.14786012470722198]
AA 3: [0.10991034656763077, 0.04478240758180618, 0.029447514563798904, -0.09592229127883911, 0.07564140856266022]
AA 4: [0.007083553355187178, 0.07397487759590149, 0.017930403351783752, -0.11953765898942947, 0.06586409360170364]
AA 5: [-0.01959274895489216, -0.0016199809033423662, 0.08816464990377426, -0.061735354363918304, 0.08922812342643738]
--------------------------------------------------
Sequence 2: MSITLSDSAA...
First 5 amino acid embeddings:
AA 1: [0.10945986956357956, 0.019381141290068626, 0.04453824833035469, -0.325766384601593, 0.2756304144859314]
AA 2: [0.10565010458230972, 0.07076577097177505, 0.08626673370599747, -0.03645419701933861, 0.34749260544776917]
AA 3: [0.09130249917507172, 0.135194450616