In [1]:
#import string
from dataclasses import dataclass
from pathlib import Path
from typing import List, Optional

from Bio import SeqIO


# Base directory on the analysis host.
# NOTE(review): no cell visible here references base_path -- confirm it is still needed.
base_path = Path('/srv/data1/home/jo0348st')


In [12]:
# @dataclass
# class Protein:

#     chain_list: List[str]

#     def __post_init__(self):

#         letters = string.ascii_lowercase

#         chains = []

#         for i, chain in enumerate(self.chain_list):

#             chain_name = f'chain_{letters[i]}'

#             setattr(self, chain_name, chain)

#             chains.append(chain_name)

#         self.chains = chains

@dataclass
class Protein:
    """Placeholder container for protein-related helpers (no fields yet)."""

    @staticmethod
    def pdb_to_seq(pdb_path: Path):
        """Stub: not implemented yet, always returns None.

        Declared as a static method because it takes no ``self``; without the
        decorator, calling it on an instance raises ``TypeError``.

        Parameters
        ----------
        pdb_path : Path
            Currently unused.
            NOTE(review): the name suggests a PDB-to-sequence conversion --
            confirm the intended behaviour before implementing.
        """
        pass

@dataclass
class Crosslinking:
    """Placeholder container for cross-linking helpers (no fields yet)."""

    @staticmethod
    def x():
        """Stub: not implemented yet, always returns None.

        Declared as a static method because it takes no ``self``; without the
        decorator, calling it on an instance raises ``TypeError``.
        """
        pass


In [25]:
def kojak_generator(sequence: str,
                    pos: int,
                    xl_residue: str = 'K',
                    len_cutoff: int = 4) -> Optional[str]:
    """Extract the tryptic peptide surrounding one cross-linked residue.

    This function helps to extract each XL peptide from sequence.
    For PEPKPEP--QPEPKQPEP we need to call this function two times to give us
    both peptides. The returned peptide can then be searched for the linked
    residue to obtain the position required by the kojak format.

    This function only generates tryptic peptides: the peptide starts right
    after the closest K/R upstream of ``pos`` (or at the sequence start) and
    ends with the closest K/R downstream of ``pos`` (or at the sequence end).
    The residue at ``pos`` itself is never treated as a cleavage site.

    Originally authored by Hamed Khakzad (Fri Dec  2 14:37:17 2016),
    and edited and commented by Joel Ströbaek (Nov 2022).

    Parameters
    ----------
    sequence : str
        Full amino-acid sequence (one-letter codes).
    pos : int
        0-based index of the cross-linked residue in ``sequence``.
    xl_residue : str, optional
        Residue letter(s) accepted at ``pos``. Default ``'K'``.
    len_cutoff : int, optional
        Minimum peptide length; shorter peptides are discarded. Default 4.

    Returns
    -------
    str or None
        The peptide containing ``sequence[pos]``, or ``None`` when the residue
        at ``pos`` is not in ``xl_residue`` (``"ERROR"`` is printed) or when
        the peptide is shorter than ``len_cutoff``.

    Raises
    ------
    IndexError
        If ``pos`` lies outside ``sequence``.
    """

    # TODO
    #   - Make a proper digestor

    cleavage_residues = 'KR'

    # Make sure the correct input position was given.
    if sequence[pos] not in xl_residue:
        print("ERROR")
        return None

    # Check upstream for trypsin cleavage site; default to the N-terminus.
    start = 0
    for i in range(pos - 1, -1, -1):
        if sequence[i] in cleavage_residues:
            start = i + 1
            break

    # Check downstream for trypsin cleavage site; default to the C-terminus.
    # Initialising here (rather than inside the loop) also covers a linked
    # residue at the very last position, where the loop body never runs.
    end = len(sequence)
    for i in range(pos + 1, len(sequence)):
        if sequence[i] in cleavage_residues:
            end = i + 1
            break

    peptide = sequence[start:end]

    return peptide if len(peptide) >= len_cutoff else None

def xl_residue_index(seq: str,
                     xl_residue: str = 'K', kojak_indexing: bool = True) -> int:
    """Return the position of the first linkable residue in a peptide.

    ``xl_residue`` is treated as a set of one-letter residue codes, matching
    the membership test used by ``kojak_generator``; for a single letter this
    is identical to ``seq.find(xl_residue)``.

    Parameters
    ----------
    seq : str
        Peptide sequence to search.
    xl_residue : str, optional
        Residue letter(s) to look for. Default ``'K'``.
    kojak_indexing : bool, optional
        If True (default) the result is 1-based, otherwise 0-based.

    Returns
    -------
    int
        Index of the first matching residue. When no residue matches, the
        result is ``0`` with 1-based indexing and ``-1`` with 0-based indexing.
    """

    offset = 1 if kojak_indexing else 0

    # -1 mirrors str.find's "not found" value so existing callers see the
    # same sentinel as before.
    first_hit = next(
        (i for i, residue in enumerate(seq) if residue in xl_residue), -1)

    return first_hit + offset


In [33]:
import warnings

from Bio import SeqIO
from Bio import BiopythonParserWarning


# Ignore every BiopythonParserWarning from here on.
# NOTE(review): presumably these are raised while parsing the PDB files below -- confirm.
warnings.simplefilter('ignore', BiopythonParserWarning)


def protein_to_rec_list(pdb_path: Path, file_type: str = 'pdb-atom'):
    """Parse a structure or sequence file into a list of records.

    Parameters
    ----------
    pdb_path : Path
        File to read. Anything accepted by ``open`` works (the notebook also
        passes a plain string path).
    file_type : str, optional
        Format name handed to ``SeqIO.parse``. Default ``'pdb-atom'``.

    Returns
    -------
    list
        All records yielded by ``SeqIO.parse``, in file order.
    """

    # The file is only read, so open it read-only: 'r+' additionally demands
    # write permission and fails on read-only files.
    with open(pdb_path, 'r') as f:

        # Materialise inside the ``with`` block; the parser reads lazily.
        return list(SeqIO.parse(f, file_type))

def rec_to_xls(rec_list, linker_res: str = 'K') -> List[tuple]:
    """Collect the unique cross-linkable peptides of a list of records.

    Every residue contained in ``linker_res`` is handed to
    ``kojak_generator`` to obtain its surrounding peptide; the position of the
    residue inside that peptide comes from ``xl_residue_index``.

    Parameters
    ----------
    rec_list : iterable
        Records exposing a ``seq`` attribute. Records whose sequence was
        already seen are skipped.
    linker_res : str, optional
        Residue letter(s) that can be cross-linked. Default ``'K'``.

    Returns
    -------
    List[tuple]
        Unique ``(peptide, index)`` tuples in order of first appearance.

    Notes
    -----
    The index is that of the first occurrence of the residue in the peptide.
    For K and R this is the linked position itself, because the peptide starts
    right after the preceding K/R. For other residues it may point at an
    earlier occurrence.
    """

    # Sets give constant-time duplicate checks; the list keeps the order.
    seen_seqs = set()
    seen_xls = set()

    xls = []

    for rec in rec_list:

        seq = str(rec.seq)

        if seq in seen_seqs:
            continue

        seen_seqs.add(seq)

        for i, residue in enumerate(seq):

            if residue not in linker_res:
                continue

            # Pass the matched residue on so that a non-default ``linker_res``
            # is honoured by both helpers instead of being silently ignored.
            xl = kojak_generator(seq, i, xl_residue=residue)

            if not xl:
                continue

            xl_tuple = (xl, xl_residue_index(xl, xl_residue=residue))

            if xl_tuple not in seen_xls:

                seen_xls.add(xl_tuple)

                xls.append(xl_tuple)

    return xls


# Directory of the prediction run that holds both input models.
path = Path('/srv/data1/home/jo0348st/data/2023-di_heusel_m1_mAbs/pdb/pred/230209')

# First cross-link partner.
# NOTE(review): "ab"/"ag" presumably mean antibody/antigen -- confirm.
ab = path / 'targets/top05_230209_r0/top05_230209_r0.pdb'

ab_recs = protein_to_rec_list(ab)

ab_xls = rec_to_xls(ab_recs)

# Second cross-link partner.
ag = path / 'binders/bait-m1_sec02_230209_r0.pdb'

ag_recs = protein_to_rec_list(ag)

ag_xls = rec_to_xls(ag_recs)

# Optional reference sequence used to filter the second partner's peptides.
m1_path = '/srv/data1/home/jo0348st/data/2023-di_heusel_m1_mAbs/fasta/bait/bait-m1.fasta'

if m1_path:

    m1_recs = protein_to_rec_list(m1_path, 'fasta')

    m1_xls = rec_to_xls(m1_recs)

    m1_xl_ref = [x[0] for x in m1_xls]

    m1_xl_lookup = set(m1_xl_ref)

    # Keep only peptides that also occur in the reference. A new list is built
    # because removing items from a list while iterating over it skips the
    # element following each removal.
    ag_xls = [(xl, idx) for xl, idx in ag_xls if xl in m1_xl_lookup]

# All pairwise combinations in kojak notation; dict.fromkeys drops duplicates
# while keeping first-seen order.
kojak_xls = list(dict.fromkeys(
    f'-.{pep_1}({idx_1})--{pep_2}({idx_2}).-'
    for pep_1, idx_1 in ab_xls
    for pep_2, idx_2 in ag_xls
))

# with open('/home/jo0348st/test/xls.txt', 'w+') as f:

#     for xl in kojak_xls:

#         f.write(f'{xl}\n')

kojak_xls


['-.DIKMDQSPSSMYASLGER(3)--ATALEKELEEK(6).-',
 '-.DIKMDQSPSSMYASLGER(3)--ELEEKK(5).-',
 '-.DIKMDQSPSSMYASLGER(3)--KALELAIDQASQDYNR(1).-',
 '-.DIKMDQSPSSMYASLGER(3)--ANVLEKELETITR(6).-',
 '-.DIKMDQSPSSMYASLGER(3)--NLLGNAKLELDQLSSEK(7).-',
 '-.DIKMDQSPSSMYASLGER(3)--LELDQLSSEKEQLTIEK(10).-',
 '-.DIKMDQSPSSMYASLGER(3)--EQLTIEKAK(7).-',
 '-.DIKMDQSPSSMYASLGER(3)--AKLEEEK(2).-',
 '-.DIKMDQSPSSMYASLGER(3)--LEEEKQISDASR(5).-',
 '-.DIKMDQSPSSMYASLGER(3)--EAKK(3).-',
 '-.DIKMDQSPSSMYASLGER(3)--KQVEK(1).-',
 '-.VIITCKASQDNNSYLSWFQQK(6)--ATALEKELEEK(6).-',
 '-.VIITCKASQDNNSYLSWFQQK(6)--ELEEKK(5).-',
 '-.VIITCKASQDNNSYLSWFQQK(6)--KALELAIDQASQDYNR(1).-',
 '-.VIITCKASQDNNSYLSWFQQK(6)--ANVLEKELETITR(6).-',
 '-.VIITCKASQDNNSYLSWFQQK(6)--NLLGNAKLELDQLSSEK(7).-',
 '-.VIITCKASQDNNSYLSWFQQK(6)--LELDQLSSEKEQLTIEK(10).-',
 '-.VIITCKASQDNNSYLSWFQQK(6)--EQLTIEKAK(7).-',
 '-.VIITCKASQDNNSYLSWFQQK(6)--AKLEEEK(2).-',
 '-.VIITCKASQDNNSYLSWFQQK(6)--LEEEKQISDASR(5).-',
 '-.VIITCKASQDNNSYLSWFQQK(6)--EAKK(3).-',
 '-.

In [34]:
# Read both cross-link lists with pathlib instead of shelling out to `cat`:
# a missing file now raises FileNotFoundError instead of ending up as data.
all_xls = set(Path('/home/jo0348st/test/all_xls.txt').read_text().splitlines())
xls = set(Path('/home/jo0348st/test/xls.txt').read_text().splitlines())

# Fraction of the entries in all_xls that do not occur in xls.
len(all_xls.difference(xls)) / len(all_xls)


0.21025641025641026