# Extract sequences from PDB files and save them in parts

In [6]:
!pip install biopython



In [1]:
import os
import json
from Bio.PDB import PDBParser
from Bio.Seq import Seq
from Bio import SeqIO

def load_jsonl_in_dict(jsonl_file_path):
    """Load a JSON Lines file into a dictionary keyed by each record's 'id'.

    Args:
        jsonl_file_path: Path to a text file holding one JSON value per line.

    Returns:
        A dict mapping ``record['id']`` to the parsed record. Records that
        have no 'id' key are dropped; when an id occurs more than once, the
        record that appears last in the file wins.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    data_dict = {}

    # JSON text is UTF-8 by specification, so do not depend on the locale default
    with open(jsonl_file_path, 'r', encoding='utf-8') as file:
        for line in file:
            line = line.strip()
            # Blank lines (e.g. a trailing newline at the end of the file)
            # carry no record and would make json.loads raise
            if not line:
                continue

            item = json.loads(line)
            if 'id' in item:
                data_dict[item['id']] = item

    return data_dict


def extract_sequence(file_path):
    """Return the one-letter amino-acid sequence read from a PDB file.

    Residues are taken from every chain of the first model, in file order,
    and concatenated into one string. Hetero residues, waters and residue
    names outside the 20 standard amino acids are skipped.

    Args:
        file_path: Path to the PDB file.

    Returns:
        The sequence as a string of one-letter codes; an empty string when
        the file contains no model or no standard residue.
    """
    # The structure id is only a label for Biopython; derive it from the
    # file name without its extension
    structure_id = os.path.splitext(os.path.basename(file_path))[0]
    structure = PDBParser().get_structure(structure_id, file_path)

    # Three-letter to one-letter codes of the 20 standard amino acids
    three_to_one = {
        'ALA': 'A', 'ARG': 'R', 'ASN': 'N', 'ASP': 'D', 'CYS': 'C',
        'GLN': 'Q', 'GLU': 'E', 'GLY': 'G', 'HIS': 'H', 'ILE': 'I',
        'LEU': 'L', 'LYS': 'K', 'MET': 'M', 'PHE': 'F', 'PRO': 'P',
        'SER': 'S', 'THR': 'T', 'TRP': 'W', 'TYR': 'Y', 'VAL': 'V'
    }

    # Multi-model files (e.g. NMR ensembles) hold the same chains once per
    # model; walking all models would repeat the sequence, so use the first
    first_model = next(iter(structure), None)
    if first_model is None:
        return ''

    return ''.join(
        three_to_one[residue.resname]
        for chain in first_model
        for residue in chain
        # A blank hetero flag marks a regular residue (not HETATM / water)
        if residue.id[0] == " " and residue.resname in three_to_one
    )

In [13]:
# Load ../data/train.jsonl into a dict keyed by each record's 'id'
d = load_jsonl_in_dict("../data/train.jsonl")

In [15]:
# Preview the first five (id, record) pairs of the loaded dictionary
counter = 0
for counter, (record_id, record) in enumerate(d.items(), start=1):
    print(record_id, record)
    if counter == 5:
        break

1es5-A {'id': '1es5-A', 'sequence': 'VTKPTIAAVGGYAMNNGTGTTLYTKAADTRRSTGSTTKIMTAKVVLAQSNLNLDAKVTIQKAYSDYVVANNASQAHLIVGDKVTVRQLLYGLMLPSGCDAAYALADKYGSGSTRAARVKSFIGKMNTAATNLGLHNTHFDSFDGIGNGANYSTPRDLTKIASSAMKNSTFRTVVKTKAYTAKTVTKTGSIRTMDTWKNTNGLLSSYSGAIGVKTGAGPEAKYCLVFAATRGGKTVIGTVLASTSIPARESDATKIMNYGFAL', 'label': 'CCCCCCCCCEEEEEECCCCCEEEEECCCCCECCHHHHHHHHHHHHHCCCCCCCCCEEECCHHHHHHHHHCCCCCCCCCCCCEEEHHHHHHHHHCCCCHHHHHHHHHHHCCCCCHHHHHHHHHHHHHHHHHHCCCCCCECCCCCCCCCCCCEECHHHHHHHHHHHCCCHHHHHHHCCCEECCEEECCCCCEEECCCEECCCCHHHHCCCEEEEEEEEECCCEEEEEEEEEECCEEEEEEEEEECCHHHHHHHHHHHHHHHHHC', 'resolved': '0011111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111'}
2a6h-E {'id': '2a6h-E', 'sequence': 'MAEPGIDKLFGMVDSKYRLTVVVAKRAQQLLRHGFKNTVLEPEERPKMQTLEGLFDDPNAETWAMKELLTGRLVFGENLVPEDRLQKEMERIYPGEREE', 'label

In [28]:
# Sequence as read from the PDB file of entry 1es5-A
seq_pdb = extract_sequence("../data/train_pdbs/1es5-A.pdb")
print("PDB: ", seq_pdb)

KPTIAAVGGYAMNNGTGTTLYTKAADTRRSTGSTTKIMTAKVVLAQSNLNLDAKVTIQKAYSDYVVANNASQAHLIVGDKVTVRQLLYGLMLPSGCDAAYALADKYGSGSTRAARVKSFIGKMNTAATNLGLHNTHFDSFDGIGNGANYSTPRDLTKIASSAMKNSTFRTVVKTKAYTAKTVTKTGSIRTMDTWKNTNGLLSSYSGAIGVKTGAGPEAKYCLVFAATRGGKTVIGTVLASTSIPARESDATKIMNYGFAL


In [29]:
# Sequence stored for entry 1es5-A in the JSONL records
print("JSONL:", d["1es5-A"]["sequence"])

JSONL: VTKPTIAAVGGYAMNNGTGTTLYTKAADTRRSTGSTTKIMTAKVVLAQSNLNLDAKVTIQKAYSDYVVANNASQAHLIVGDKVTVRQLLYGLMLPSGCDAAYALADKYGSGSTRAARVKSFIGKMNTAATNLGLHNTHFDSFDGIGNGANYSTPRDLTKIASSAMKNSTFRTVVKTKAYTAKTVTKTGSIRTMDTWKNTNGLLSSYSGAIGVKTGAGPEAKYCLVFAATRGGKTVIGTVLASTSIPARESDATKIMNYGFAL


# Save the parts

In [None]:
from tqdm import tqdm

folder = "../data/train_pdbs/"
parts_folder = "../data/parts"
part_size = 1000

# Keep only PDB files (a notebook-managed directory may also contain entries
# such as .ipynb_checkpoints) and sort them so that the assignment of
# structures to parts is reproducible; os.listdir returns arbitrary order
files = sorted(name for name in os.listdir(folder) if name.endswith(".pdb"))

# Create the output directory up front instead of failing on the first write
os.makedirs(parts_folder, exist_ok=True)


def write_part(part_no, rows):
    """Write one batch of (pid, sequence) rows as 'pid;sequence' lines."""
    with open(f"{parts_folder}/train_{part_no}.csv", "w") as wf:
        for pid, seq in rows:
            wf.write(f"{pid};{seq}\n")


pdb_seqs = []
counter = 0
part_no = 0
for file in tqdm(files):
    pid = os.path.splitext(file)[0]
    seq_pdb = extract_sequence(os.path.join(folder, file))
    pdb_seqs.append((pid, seq_pdb))
    counter += 1
    if counter % part_size == 0:
        write_part(part_no, pdb_seqs)
        part_no += 1
        pdb_seqs = []

# Flush the remainder: without this, the sequences collected after the last
# full batch of `part_size` files would never be written
if pdb_seqs:
    write_part(part_no, pdb_seqs)
    part_no += 1
    pdb_seqs = []
    













































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































 84%|████████▎ | 8121/9712 [25:48<04:25,  6.00it/s]