In [None]:
# default_exp proteolytic_cleavage

# Protease information

## Protease dictionary

A numba compatible dictionary that stores different regular expressions for proteolytic enzymes. 
The dictionary is identical to the one in *AlphaPept* and was largely taken from the *Pyteomics* website, which in turn derived the rules from *Expasy*.

In [None]:
#export
# Maps a protease name to the regular expression that matches its cleavage
# positions in an amino acid sequence. All patterns are written as raw strings
# so that regex escapes such as `\w` are passed through to `re` verbatim and
# are not interpreted as (invalid) Python string escapes.
protease_dict = dict()

protease_dict["arg-c"] = r"R"
protease_dict["asp-n"] = r"\w(?=D)"
protease_dict["bnps-skatole"] = r"W"
protease_dict["caspase 1"] = r"(?<=[FWYL]\w[HAT])D(?=[^PEDQKR])"
protease_dict["caspase 2"] = r"(?<=DVA)D(?=[^PEDQKR])"
protease_dict["caspase 3"] = r"(?<=DMQ)D(?=[^PEDQKR])"
protease_dict["caspase 4"] = r"(?<=LEV)D(?=[^PEDQKR])"
protease_dict["caspase 5"] = r"(?<=[LW]EH)D"
protease_dict["caspase 6"] = r"(?<=VE[HI])D(?=[^PEDQKR])"
protease_dict["caspase 7"] = r"(?<=DEV)D(?=[^PEDQKR])"
protease_dict["caspase 8"] = r"(?<=[IL]ET)D(?=[^PEDQKR])"
protease_dict["caspase 9"] = r"(?<=LEH)D"
protease_dict["caspase 10"] = r"(?<=IEA)D"
protease_dict["chymotrypsin high specificity"] = r"([FY](?=[^P]))|(W(?=[^MP]))"
protease_dict["chymotrypsin low specificity"] = r"([FLY](?=[^P]))|(W(?=[^MP]))|(M(?=[^PY]))|(H(?=[^DMPW]))"
protease_dict["clostripain"] = r"R"
protease_dict["cnbr"] = r"M"
protease_dict["enterokinase"] = r"(?<=[DE]{3})K"
protease_dict["factor xa"] = r"(?<=[AFGILTVM][DE]G)R"
protease_dict["formic acid"] = r"D"
protease_dict["glutamyl endopeptidase"] = r"E"
protease_dict["granzyme b"] = r"(?<=IEP)D"
protease_dict["hydroxylamine"] = r"N(?=G)"
protease_dict["iodosobenzoic acid"] = r"W"
protease_dict["lysc"] = r"K"
protease_dict["ntcb"] = r"\w(?=C)"
protease_dict["pepsin ph1.3"] = r"((?<=[^HKR][^P])[^R](?=[FL][^P]))|((?<=[^HKR][^P])[FL](?=\w[^P]))"
protease_dict["pepsin ph2.0"] = r"((?<=[^HKR][^P])[^R](?=[FLWY][^P]))|((?<=[^HKR][^P])[FLWY](?=\w[^P]))"
protease_dict["proline endopeptidase"] = r"(?<=[HKR])P(?=[^P])"
protease_dict["proteinase k"] = r"[AEFILTVWY]"
protease_dict["staphylococcal peptidase i"] = r"(?<=[^E])E"
protease_dict["thermolysin"] = r"[^DE](?=[AFILMV])"
protease_dict["thrombin"] = r"((?<=G)R(?=G))|((?<=[AFGILTVM][AFGILTVWA]P)R(?=[^DE][^DE]))"
protease_dict["trypsin_full"] = r"([KR](?=[^P]))|((?<=W)K(?=P))|((?<=M)R(?=P))"
protease_dict["trypsin_exception"] = r"((?<=[CD])K(?=D))|((?<=C)K(?=[HY]))|((?<=C)R(?=K))|((?<=R)R(?=[HR]))"
# The empty group matches the empty string, i.e. at every position.
protease_dict["non-specific"] = r"()"
protease_dict["trypsin"] = r"([KR](?=[^P]))"
# No pattern is stored for this key; presumably a placeholder to be filled
# with a user-supplied regular expression before use -- TODO confirm.
protease_dict["custom_enzyme"] = None

In [None]:
# Example lookup: show the regular expression stored for trypsin.
protease_dict["trypsin"]

'([KR](?=[^P]))'

In [None]:
#hide
def test_get_protease_dict():
    """Check that the trypsin entry of protease_dict holds the expected regex."""
    expected_trypsin_regex = "([KR](?=[^P]))"
    assert protease_dict["trypsin"] == expected_trypsin_regex

test_get_protease_dict()

## Get proteolytic cleavage sites for a protein sequence

In [None]:
#export
import re
def get_cleavage_sites(sequence: str, protease: str) -> list:
    """
    Function to get the position of proteolytic cleavage sites in a sequence.

    Args:
        sequence (str): Amino acid sequence.
        protease (str): Protease to use for in silico digestion. Must be a
            key of ``protease_dict``.
    Returns:
        list: List of cleavage site indices for the selected protease. Each
            index is the 0-based start position of a match of the protease's
            regular expression in ``sequence``.
    Raises:
        KeyError: If ``protease`` is not a key of ``protease_dict``.
        TypeError: If no pattern is stored for ``protease`` (the entry is
            ``None``, as for the ``"custom_enzyme"`` key).

    """
    cleavage_regex = protease_dict[protease]
    if cleavage_regex is None:
        # re.compile(None) would fail with an unhelpful message; keep the
        # exception type but tell the caller what to do about it.
        raise TypeError(
            f"No cleavage pattern is defined for protease '{protease}'. "
            f"Assign a regular expression to protease_dict['{protease}'] first."
        )
    return [match.start() for match in re.finditer(cleavage_regex, sequence)]

In [None]:
#hide
import numpy as np
def test_get_cleavage_sites():
    """Compare get_cleavage_sites against known site indices for several proteases."""
    # (sequence, protease, expected cleavage site indices)
    expectations = [
        ("PEPTIDERANGEKATRAT", "trypsin", [7, 12, 15]),
        ("PEPTIDERANGEKATRAT", "lysc", [12]),
        ("PEPTIDERANGEKATRAT", "caspase 2", []),
        ("PEPVDVADTIDE", "caspase 2", [7]),
    ]
    for sequence, protease, expected_sites in expectations:
        observed_sites = get_cleavage_sites(sequence, protease)
        np.testing.assert_equal(observed_sites, expected_sites)

test_get_cleavage_sites()

In [None]:
#hide

###### Export notebook to script ###### 

In [None]:
#hide
from nbdev.showdoc import *

In [None]:
#hide
from nbdev.export import *
# Run nbdev's notebook-to-script export; the cell output lists each notebook converted.
notebook2script()

Converted Importing.ipynb.
Converted Preprocessing.ipynb.
Converted SequencePlot.ipynb.
Converted Uniprot_integration.ipynb.
Converted index.ipynb.
Converted organisms_data.ipynb.
Converted proteolytic_cleavage.ipynb.
