# AlphaBase provides a wide range of functionality for MS-based proteomics

Starting from a precursor dataframe, for example:

In [1]:
import pandas as pd

# Example precursor table, written one row per precursor:
# (sequence, mods, mod_sites, charge).
# `mods` and `mod_sites` are ';'-separated strings whose entries line up one to
# one (e.g. 'Phospho@S;Oxidation@M' with '1;6'); an empty string in both
# columns marks an unmodified peptide.
precursor_records = [
    ('ACDEFHIK', 'Carbamidomethyl@C', '2', 1),
    ('APDEFMNIK', '', '', 2),
    ('SWDEFMNTIRAAAAKDDDDR', 'Phospho@S;Oxidation@M', '1;6', 3),
]
df = pd.DataFrame(
    precursor_records,
    columns=['sequence', 'mods', 'mod_sites', 'charge'],
)
df

Unnamed: 0,sequence,mods,mod_sites,charge
0,ACDEFHIK,Carbamidomethyl@C,2,1
1,APDEFMNIK,,,2
2,SWDEFMNTIRAAAAKDDDDR,Phospho@S;Oxidation@M,1;6,3


### Mass calculation

`alphabase.peptide.fragment.create_fragment_mz_dataframe()`

In [2]:
from alphabase.peptide.fragment import create_fragment_mz_dataframe

OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.


### Precursor isotopes

`alphabase.peptide.precursor.calc_precursor_isotope_intensity()`

In [3]:
from alphabase.peptide.precursor import calc_precursor_isotope_intensity

### Protein processing

`alphabase.protein.fasta.Digest` class.

In [4]:
from alphabase.protein.fasta import Digest
import string

# Tryptic in-silico digestion: allow up to 2 missed cleavages and keep only
# peptides that are 7 to 45 residues long.
trypsin_settings = {
    "protease": "trypsin",
    "max_missed_cleavages": 2,
    "peptide_length_min": 7,
    "peptide_length_max": 45,
}
digest = Digest(**trypsin_settings)

# The alphabet A-Z acts as a toy protein sequence. The result is unpacked into
# four parallel lists: peptide sequences, missed-cleavage counts, and two flag
# lists for peptides at the protein N-terminus / C-terminus.
cleaved = digest.cleave_sequence(string.ascii_uppercase)
seqs, miss_cleaves, prot_nterms, prot_cterms = cleaved
seqs, miss_cleaves, prot_nterms, prot_cterms

(['ABCDEFGHIJK',
  'ABCDEFGHIJKLMNOPQR',
  'ABCDEFGHIJKLMNOPQRSTUVWXYZ',
  'LMNOPQR',
  'LMNOPQRSTUVWXYZ',
  'STUVWXYZ'],
 [0, 1, 2, 0, 1, 0],
 [True, True, True, False, False, False],
 [False, False, True, False, True, True])

Here is the built-in protease dictionary used for digestion. Each protease name maps to a regular expression, and the digestion itself is performed by regular-expression matching.

In [5]:
from alphabase.protein.fasta import protease_dict
protease_dict

{'arg-c': 'R',
 'asp-n': '\\w(?=D)',
 'bnps-skatole': 'W',
 'caspase 1': '(?<=[FWYL]\\w[HAT])D(?=[^PEDQKR])',
 'caspase 2': '(?<=DVA)D(?=[^PEDQKR])',
 'caspase 3': '(?<=DMQ)D(?=[^PEDQKR])',
 'caspase 4': '(?<=LEV)D(?=[^PEDQKR])',
 'caspase 5': '(?<=[LW]EH)D',
 'caspase 6': '(?<=VE[HI])D(?=[^PEDQKR])',
 'caspase 7': '(?<=DEV)D(?=[^PEDQKR])',
 'caspase 8': '(?<=[IL]ET)D(?=[^PEDQKR])',
 'caspase 9': '(?<=LEH)D',
 'caspase 10': '(?<=IEA)D',
 'chymotrypsin high specificity': '([FY](?=[^P]))|(W(?=[^MP]))',
 'chymotrypsin low specificity': '([FLY](?=[^P]))|(W(?=[^MP]))|(M(?=[^PY]))|(H(?=[^DMPW]))',
 'chymotrypsin': '([FLY](?=[^P]))|(W(?=[^MP]))|(M(?=[^PY]))|(H(?=[^DMPW]))',
 'clostripain': 'R',
 'cnbr': 'M',
 'enterokinase': '(?<=[DE]{3})K',
 'factor xa': '(?<=[AFGILTVM][DE]G)R',
 'formic acid': 'D',
 'glutamyl endopeptidase': 'E',
 'glu-c': 'E',
 'granzyme b': '(?<=IEP)D',
 'hydroxylamine': 'N(?=G)',
 'iodosobenzoic acid': 'W',
 'lys-c': 'K',
 'lys-n': '\\w(?=K)',
 'ntcb': '\\w(?=C)',
 'peps

We also support arbitrary digestion rules, provided we design the correct regular expression. For example, to combine `trypsin` (`[KR]`), `asp-n` (`\\w(?=D)`, cleave before `D`), and `lys-n` (`\\w(?=K)`, cleave before `K`), the final regular expression is `([KR]|\\w(?=D)|\\w(?=K))`.

In [6]:
# A custom cleavage rule can be supplied as a regular expression in place of a
# protease name. This one merges three rules: cleave after K/R (trypsin),
# before D (asp-n) and before K (lys-n).
combined_cleavage_regex = "([KR]|\\w(?=D)|\\w(?=K))"
custom_settings = {
    "protease": combined_cleavage_regex,
    "max_missed_cleavages": 2,
    "peptide_length_min": 7,
    "peptide_length_max": 45,
}
digest = Digest(**custom_settings)

# Digest the same A-Z toy sequence and unpack the four parallel result lists.
cleaved = digest.cleave_sequence(string.ascii_uppercase)
seqs, miss_cleaves, prot_nterms, prot_cterms = cleaved
seqs, miss_cleaves, prot_nterms, prot_cterms

(['ABCDEFGHIJ',
  'ABCDEFGHIJK',
  'DEFGHIJ',
  'DEFGHIJK',
  'DEFGHIJKLMNOPQR',
  'KLMNOPQR',
  'KLMNOPQRSTUVWXYZ',
  'LMNOPQR',
  'LMNOPQRSTUVWXYZ',
  'STUVWXYZ'],
 [1, 2, 0, 1, 2, 1, 2, 0, 1, 0],
 [True, True, False, False, False, False, False, False, False, False],
 [False, False, False, False, False, False, True, False, True, True])

`alphabase.protein.fasta.SpecLibFasta` provides functionality to process peptides from a given FASTA file.

In [None]:
from alphabase.protein.fasta import SpecLibFasta
# Configure a FASTA-based spectral library object. Every option is passed by
# keyword. The grouping comments below are read off the parameter names only
# (the SpecLibFasta definition is not part of this notebook) — confirm the
# exact semantics against the AlphaBase API documentation.
fastalib = SpecLibFasta(
    # Fragment types to generate — presumably b/y ions at charge 1 and 2
    # (TODO confirm the "<ion>_z<charge>" naming).
    charged_frag_types = ["b_z1", "b_z2", "y_z1", "y_z2"],
    # Digestion settings: same keyword names as the Digest(...) calls earlier
    # in this notebook (note the shorter maximum peptide length used here).
    protease = "trypsin",
    max_missed_cleavages = 2,
    peptide_length_min = 7,
    peptide_length_max = 35,
    # Precursor charge range and m/z window.
    precursor_charge_min = 2,
    precursor_charge_max = 4,
    precursor_mz_min = 400.0,
    precursor_mz_max = 2000.0,
    # Variable and fixed modifications, written as "Name@site" strings, with
    # bounds on the number of variable modifications per peptide.
    var_mods = ["Acetyl@Protein_N-term", "Oxidation@M"],
    min_var_mod_num = 0,
    max_var_mod_num = 2,
    fix_mods = ["Carbamidomethyl@C"],
    # No labeling channels are configured in this example.
    labeling_channels = None,
    # Special modifications: the list is empty here, so the count bounds and
    # terminal restrictions below presumably have no effect — verify.
    special_mods = [],
    min_special_mod_num = 0,
    max_special_mod_num = 1,
    special_mods_cannot_modify_pep_n_term = False,
    special_mods_cannot_modify_pep_c_term = False,
    # Remaining switches are all left off / unset in this example.
    decoy = None,
    include_contaminants = False,
    I_to_L = False,
)

# Not executed in this notebook: replace [...] with real FASTA file path(s)
# and uncomment to run the import/processing step on them.
# fastalib.import_and_process_fasta(fasta_files=[...])

### PSM readers

`alphabase.psm_reader`

In [7]:
from alphabase.psm_reader import (
    # DDA
    AlphaPeptReader,
    pFindReader,
    MaxQuantReader,
    SageReaderTSV,
    SageReaderParquet,
    MSFragger_PSM_TSV_Reader,
    MSFraggerPepXML,
    # DIA
    DiannReader,
    SpectronautReader,
    SpectronautReportReader,
    SwathReader,
)