In [1]:
import pandas as pd
import numpy as np
from Bio import SeqIO
from Bio.PDB import PDBParser, DSSP
import sklearn as sl
import gc
import matplotlib.pyplot as plt
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.utils.class_weight import compute_class_weight
from imblearn.over_sampling import RandomOverSampler
import numpy as np
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset
from tqdm.notebook import tqdm
import torch.optim as optim
from torch.utils.data import WeightedRandomSampler, DataLoader
from sklearn.metrics import f1_score
from torch.nn.utils.rnn import pad_sequence
from torch.optim import Adam

In [2]:
data = np.load('cullpdb+profile_5926_filtered.npy.gz', allow_pickle=False)
print(data.shape)
reshaped = data.reshape(np.shape(data)[0], 700, 57)
print(reshaped.shape)

(5365, 39900)
(5365, 700, 57)


### Get primary structures and secondary structures

In [3]:
all_residues = ['A','C','E','D','G','F','I','H','K','M','L','N','Q','P','S','R','T','W','V','Y','X','NoSeq']
desired_sequence = 'MEPKVAELKQKIEDTLCPFGFEVYPFQVAWYNELLPPAFHLPLPGPTLAFLVLSTPAMFDRALKPFLQSCHLRMLTDPVDQCVAYHLGRVRESLPELQIEIIADYEVHPNRRPKILAQTAAHVAGAAYYYQRQDVEADPWGNQRISGVCIHPRFGGWFAIRGVVLLPGIEVPDLPPRKPHDCVPTRADRIALLEGFNFHWRDWTYRDAVTPQERYSEEQKAYFSTPPAQRLALLGLAQPSEKPSSPSPDLPFTTPAPKKPGNPSRARSWLSPRVSPPASPGP'
target_sequence = list(desired_sequence)
all_structures = ['L', 'B', 'E', 'G', 'I', 'H', 'S', 'T', 'NoSeqS']
structures_classes = ['H','E','C']
dssp8_to_q3 = {
    5: 0,  # H
    3: 0,  # G
    4: 0,  # I
    2: 1,  # E
    1: 1,  # B
    0: 2,  # L
    6: 2,  # S
    7: 2   # T
}
str8_to_3 = {
    'H':'H',
    'G':'H',
    'I':'H',
    'E':'E',
    'B':'E',
    'L':'C',
    'S':'C',
    'T':'C'
}

# creating the list of sequences
sequences = []
structures = []
for i in range(int(np.shape(reshaped)[0])):
    prot = reshaped[i,:,:]

    indices = np.argmax(prot[:,:22], axis = 1)
    indices = indices[np.where(indices != 21)]
    indices = [all_residues[index] for index in indices]
    sequences.append(indices)

    sec_indices = np.argmax(prot[:,22:31], axis = 1)
    sec_indices = sec_indices[np.where(sec_indices != 8)]
    sec_indices = [all_structures[index] for index in sec_indices]
    # map into 3 classes
    sec_indices = np.vectorize(str8_to_3.get)(sec_indices)
    structures.append(sec_indices)
    

print(len(sequences[1]))
print(len(structures[1]))
print(sequences[1])
print(structures[1])
print("Total proteins converted in single residues:",len(sequences))

148
148
['R', 'P', 'E', 'S', 'E', 'L', 'I', 'R', 'Q', 'S', 'W', 'R', 'V', 'V', 'S', 'R', 'S', 'P', 'L', 'E', 'H', 'G', 'T', 'V', 'L', 'F', 'A', 'R', 'L', 'F', 'A', 'L', 'E', 'P', 'S', 'L', 'L', 'P', 'L', 'F', 'Q', 'Y', 'N', 'G', 'R', 'Q', 'F', 'S', 'S', 'P', 'E', 'D', 'S', 'L', 'S', 'S', 'P', 'E', 'F', 'L', 'D', 'H', 'I', 'R', 'K', 'V', 'M', 'L', 'V', 'I', 'D', 'A', 'A', 'V', 'T', 'N', 'V', 'E', 'D', 'L', 'S', 'S', 'L', 'E', 'E', 'Y', 'L', 'T', 'S', 'L', 'G', 'R', 'K', 'H', 'R', 'A', 'V', 'G', 'V', 'R', 'L', 'S', 'S', 'F', 'S', 'T', 'V', 'G', 'E', 'S', 'L', 'L', 'Y', 'M', 'L', 'E', 'K', 'S', 'L', 'G', 'P', 'D', 'F', 'T', 'P', 'A', 'T', 'R', 'T', 'A', 'W', 'S', 'R', 'L', 'Y', 'G', 'A', 'V', 'V', 'Q', 'A', 'M', 'S', 'R', 'G', 'W', 'D', 'G']
['C' 'C' 'C' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'C' 'C' 'H'
 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'C' 'H' 'H' 'H'
 'H' 'H' 'H' 'C' 'C' 'E' 'C' 'C' 'E' 'C' 'C' 'C' 'C' 'H' 'H' 'H' 'H' 'C'
 'C' 'C' 'H' 'H' 'H' 'H' 'H' 'H'

In [4]:
# remamp into numbers
aa_to_idx = {
    'A': 0, 'C': 1, 'D': 2, 'E': 3,
    'F': 4, 'G': 5, 'H': 6, 'I': 7,
    'K': 8, 'L': 9, 'M': 10, 'N': 11,
    'P': 12, 'Q': 13, 'R': 14, 'S': 15,
    'T': 16, 'V': 17, 'W': 18, 'Y': 19, 'X': 20
}
dssp_to_idx = {
    'H': 0,
    'E': 1,
    'C': 2
}

# GET PRIMARY STRUCTURES into TORCH
aa_pad_idx = 21  # index for padding
# Convert each sequence to tensor of indices
seq_tensors = [torch.tensor([aa_to_idx.get(aa, aa_pad_idx) for aa in seq]) for seq in sequences]
# Pad them to max length with padding value aa_pad_idx
primary_structures = pad_sequence(seq_tensors, batch_first=True, padding_value=aa_pad_idx)

# GET SECONDARY STRUCTURES
ss_pad_idx = 3  # padding idx for DSSP if needed
ss_tensors = [torch.tensor([dssp_to_idx.get(ss, ss_pad_idx) for ss in ss_seq]) for ss_seq in structures]
secondary_structures = pad_sequence(ss_tensors, batch_first=True, padding_value=ss_pad_idx)

print(primary_structures.shape)  # (num_sequences, max_seq_length)
print(secondary_structures.shape)  # (num_sequences, max_seq_length)



torch.Size([5365, 696])
torch.Size([5365, 696])


# MODEL

In [5]:
class ProteinTransformer(nn.Module):
    def __init__(self, vocab_size, ss_vocab_size, d_model=128, n_heads=4, num_layers=2, dim_feedforward=256, dropout=0.1, aa_pad_idx=21, ss_pad_idx=3):
        super().__init__()
        
        self.aa_pad_idx = aa_pad_idx
        self.ss_pad_idx = ss_pad_idx
        
        # Embeddings with padding_idx to zero-out padding embeddings
        self.aa_embed = nn.Embedding(vocab_size, d_model, padding_idx=aa_pad_idx)
        self.ss_embed = nn.Embedding(ss_vocab_size, d_model, padding_idx=ss_pad_idx)
        
        # Positional encoding
        self.pos_embed = nn.Embedding(700, d_model)  # max seq length
        
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=n_heads,
            dim_feedforward=dim_feedforward,
            dropout=dropout,
            activation='gelu',
            batch_first=True
        )
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        
        self.fc_out = nn.Linear(d_model, vocab_size)

    def forward(self, aa_ids, ss_ids):
        """
        aa_ids: (batch, seq_len)
        ss_ids: (batch, seq_len)
        """
        bsz, seq_len = aa_ids.size()
        
        # Create padding mask: True where padding tokens are present
        # Shape: (batch, seq_len)
        padding_mask = (aa_ids == self.aa_pad_idx)
        
        # Embeddings
        x = self.aa_embed(aa_ids) + self.ss_embed(ss_ids)
        
        # Positional encoding
        positions = torch.arange(seq_len, device=aa_ids.device).unsqueeze(0).expand(bsz, seq_len)
        x = x + self.pos_embed(positions)
        
        # Transformer expects src_key_padding_mask with True for padded positions
        x = self.transformer(x, src_key_padding_mask=padding_mask)
        
        logits = self.fc_out(x)  # (batch, seq_len, vocab_size)
        return logits

## Datasets and dataloaders

In [6]:
# datasets and dataloaders
frac = 0.2 # fraction of points in valid set
train_ps = primary_structures[int(np.shape(primary_structures)[0] * frac):,:]
valid_ps = primary_structures[:int(np.shape(primary_structures)[0] * frac),:]
train_ss = secondary_structures[int(np.shape(secondary_structures)[0] * frac):,:]
valid_ss = secondary_structures[:int(np.shape(secondary_structures)[0] * frac),:]


print('Training set:',np.shape(train_ps))
print('Validation set:',np.shape(valid_ps))

Training set: torch.Size([4292, 696])
Validation set: torch.Size([1073, 696])


In [7]:
import torch
from torch.utils.data import Dataset, DataLoader

class ProteinDataset(Dataset):
    def __init__(self, ps_tensor, ss_tensor):
        assert ps_tensor.shape == ss_tensor.shape, "ps and ss must have the same shape"
        self.ps = ps_tensor
        self.ss = ss_tensor
    
    def __len__(self):
        return self.ps.shape[0]
    
    def __getitem__(self, idx):
        return self.ps[idx], self.ss[idx]

# Create datasets
train_dataset = ProteinDataset(torch.tensor(train_ps), torch.tensor(train_ss))
valid_dataset = ProteinDataset(torch.tensor(valid_ps), torch.tensor(valid_ss))

# Create dataloaders
batch_size = 128
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=batch_size)


  train_dataset = ProteinDataset(torch.tensor(train_ps), torch.tensor(train_ss))
  valid_dataset = ProteinDataset(torch.tensor(valid_ps), torch.tensor(valid_ss))


In [8]:
# training 
model = ProteinTransformer(vocab_size=22, ss_vocab_size=4, aa_pad_idx=21, ss_pad_idx=3)

optimizer = Adam(model.parameters(), lr=1e-4)
criterion = nn.CrossEntropyLoss(ignore_index=21)  # ignore padding in target loss
num_epochs = 1

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    loop = tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}")
    for aa_ids, ss_ids in loop:
        aa_ids = aa_ids.to(device)
        ss_ids = ss_ids.to(device)

        optimizer.zero_grad()

        input_aa = aa_ids[:, :-1]
        input_ss = ss_ids[:, :-1]
        target_aa = aa_ids[:, 1:]

        logits = model(input_aa, input_ss)

        logits = logits.reshape(-1, logits.size(-1))
        target_aa = target_aa.reshape(-1)

        loss = criterion(logits, target_aa)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        loop.set_postfix(loss=running_loss / (loop.n + 1))

Epoch 1/1:   0%|          | 0/34 [00:00<?, ?it/s]

In [9]:
print(type(model))
model.eval()
total_loss = 0
with torch.no_grad():
    for aa_ids, ss_ids in valid_loader:
        aa_ids = aa_ids.to(device)
        ss_ids = ss_ids.to(device)

        input_aa = aa_ids[:, :-1]
        input_ss = ss_ids[:, :-1]
        target_aa = aa_ids[:, 1:]

        logits = model(input_aa, input_ss)
        logits = logits.reshape(-1, logits.size(-1))
        target_aa = target_aa.reshape(-1)

        loss = criterion(logits, target_aa)
        total_loss += loss.item() * aa_ids.size(0)  # weighted sum
val_loss = total_loss / len(valid_loader.dataset)
perplexity = torch.exp(torch.tensor(val_loss))
print(f"Validation Loss: {val_loss:.4f}")


<class '__main__.ProteinTransformer'>


  output = torch._nested_tensor_from_mask(


Validation Loss: 2.9517


# Evaluate on MMACHC

In [10]:
# produce the sequence
code = 'Q9Y4U1'
faa = f"fasta/{code}.fasta"
pdb_file = f"pdb/{code}.pdb"
sec_map = ['H', 'E', 'C']
max_len = 700
try:
    # --- 1. Load primary sequence ---
    record = SeqIO.read(faa, "fasta")
    seq = str(record.seq)
    L = len(seq)

    # --- 2. One-hot encode primary structure ---
    prot = np.zeros((max_len, len(all_residues)), dtype=float)
    for j, aa in enumerate(seq[:max_len]):  # truncate to 700
        idx = all_residues.index(aa) if aa in all_residues else no_seq_idx
        prot[j, idx] = 1

    # --- 3. Parse secondary structure with DSSP ---
    parser = PDBParser(QUIET=True)
    structure = parser.get_structure(code, pdb_file)
    model_DSSP = next(structure.get_models())

    dssp = DSSP(model_DSSP, pdb_file)  # Requires DSSP installed
    ss_list = []
    for key in dssp.keys():
        ss = dssp[key][2]  # DSSP secondary structure code
        ss_list.append(ss if ss != ' ' else 'C')  # convert blanks to coil

    # --- 4. One-hot encode secondary structure ---
    sec = np.zeros((max_len, len(sec_map)), dtype=float)
    for j, ss in enumerate(ss_list[:max_len]):  # truncate to 700
        idx = sec_map.index(ss) if ss in sec_map else sec_map.index('C')
        sec[j, idx] = 1

    # --- 5. Combine features: primary (22) + secondary (3) ---
    feat = np.concatenate([prot, sec], axis=1)  # shape: (700, 25)

    print(f"{code} → {feat.shape}")
except Exception as e:
    print(f"Skipping {code} due to error: {e}")

Q9Y4U1 → (700, 25)


In [19]:
ps_test = np.argmax(prot, axis = 1)[:282]
ss_test = np.argmax(sec, axis = 1)[:282]



In [24]:
# Encode secondary structure for your sequence (length must match primary)
ss_seq = torch.tensor(ss_test, device=device)  # shape (1, seq_len)

model.eval()
with torch.no_grad():
    seq_idx = torch.tensor(ps_test, device=device)  # shape (1, seq_len)
    
    input_aa = seq_idx[:-1]
    input_ss = ss_seq[:-1]
    target_aa = seq_idx[:]
    input_aa = torch.tensor(ps_test).unsqueeze(0)  # Shape: (1, seq_len)
    input_ss = torch.tensor(ss_test).unsqueeze(0)  # Shape: (1, seq_len)
    
    
    logits = model(input_aa, input_ss)
    
    logits = logits.reshape(-1, logits.size(-1))
    target_aa = target_aa.reshape(-1)
    
    #loss = criterion(logits, target_aa)
    print(f"Loss: {loss.item():.4f}")
    
    probs = torch.softmax(logits, dim=-1)
    predicted = torch.argmax(probs, dim=-1)
    
    accuracy = (predicted == target_aa).float().mean().item()
    print(f"Accuracy: {accuracy:.4f}")


Loss: 2.9535
Accuracy: 0.0355


In [26]:
print(predicted[:10])
print(target_aa[:10])

tensor([5, 5, 9, 0, 9, 9, 9, 0, 9, 9])
tensor([ 9,  2, 13,  8, 18,  0,  2, 10,  8, 12])


# Pretrained model

In [28]:
from transformers import AutoTokenizer, AutoModelForMaskedLM

tokenizer = AutoTokenizer.from_pretrained("Rostlab/prot_bert", do_lower_case=False)
model = AutoModelForMaskedLM.from_pretrained("Rostlab/prot_bert")


tokenizer_config.json:   0%|          | 0.00/86.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/361 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/81.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.68G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.68G [00:00<?, ?B/s]

Some weights of the model checkpoint at Rostlab/prot_bert were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
