# SpecLibFasta usage

In [1]:
# Development convenience: (re)load IPython's autoreload extension and use
# mode 2 so edits to imported modules are picked up without restarting the kernel.
%reload_ext autoreload
%autoreload 2

In [2]:
from alphabase.protein.fasta import SpecLibFasta

Proteins from a dict (or loaded from fasta files)

In [3]:
# Two toy protein sequences. `prot2` occurs verbatim inside `prot1`, so some
# digested peptides will map to both proteins.
prot1 = 'MABCDESTKAFGHIJKLMNOPQRAFGHIJK'
prot2 = 'AFGHIJKLMNOPQR'

# Protein records keyed by protein id; each record carries the id, the gene
# name (may be empty) and the amino-acid sequence.
protein_dict = {
    protein_id: {
        'protein_id': protein_id,
        'gene_name': gene_name,
        'sequence': sequence,
    }
    for protein_id, gene_name, sequence in (
        ('xx', '', prot1),
        ('yy', 'gene', prot2),
    )
}

`alphabase.protein.fasta.SpecLibFasta.get_peptides_from_protein_dict` will digest a protein dict into a peptide dataframe. 

`alphabase.protein.fasta.SpecLibFasta.get_peptides_from_fasta` will digest a fasta file or a fasta list into a peptide dataframe. 

In [4]:
# Build the library object with the 'b_z1' / 'y_z1' fragment types, I_to_L
# switched off, 'pseudo_reverse' decoys, and the variable / fixed
# modifications that are applied later in this notebook.
fasta_lib = SpecLibFasta(
    ['b_z1','y_z1'],
    I_to_L=False,
    decoy='pseudo_reverse',
    var_mods=[
        'Acetyl@Protein N-term',
        'Oxidation@M',
    ],
    fix_mods=['Carbamidomethyl@C'],
)

# Digest the in-memory protein dict. For fasta input the alternative call is:
# fasta_lib.get_peptides_from_fasta(fasta_files)
fasta_lib.get_peptides_from_protein_dict(protein_dict)

# Show the digested peptides.
fasta_lib.precursor_df

Unnamed: 0,sequence,protein_idxes,miss_cleavage,is_prot_nterm,is_prot_cterm,mods,mod_sites,nAA
0,AFGHIJK,0;1,0,True,True,,,7
1,LMNOPQR,0;1,0,False,True,,,7
2,ABCDESTK,0,0,True,False,,,8
3,MABCDESTK,0,0,True,False,,,9
4,AFGHIJKLMNOPQR,0;1,1,True,True,,,14
5,LMNOPQRAFGHIJK,0,1,False,True,,,14
6,ABCDESTKAFGHIJK,0,1,True,False,,,15
7,MABCDESTKAFGHIJK,0,1,True,False,,,16
8,AFGHIJKLMNOPQRAFGHIJK,0,2,False,True,,,21
9,ABCDESTKAFGHIJKLMNOPQR,0,2,True,False,,,22


In [5]:
# Protein table; the `protein_idxes` values in precursor_df refer to its row index.
fasta_lib.protein_df

Unnamed: 0,protein_id,gene_name,sequence
0,xx,,MABCDESTKAFGHIJKLMNOPQRAFGHIJK
1,yy,gene,AFGHIJKLMNOPQR


We can also append the protein names to precursor_df

In [6]:
# Adds the `proteins` and `genes` columns to precursor_df (compare the output
# below with the earlier precursor_df display).
fasta_lib.append_protein_name()
fasta_lib.precursor_df

Unnamed: 0,sequence,protein_idxes,miss_cleavage,is_prot_nterm,is_prot_cterm,mods,mod_sites,nAA,proteins,genes
0,AFGHIJK,0;1,0,True,True,,,7,xx;yy,gene
1,LMNOPQR,0;1,0,False,True,,,7,xx;yy,gene
2,ABCDESTK,0,0,True,False,,,8,xx,
3,MABCDESTK,0,0,True,False,,,9,xx,
4,AFGHIJKLMNOPQR,0;1,1,True,True,,,14,xx;yy,gene
5,LMNOPQRAFGHIJK,0,1,False,True,,,14,xx,
6,ABCDESTKAFGHIJK,0,1,True,False,,,15,xx,
7,MABCDESTKAFGHIJK,0,1,True,False,,,16,xx,
8,AFGHIJKLMNOPQRAFGHIJK,0,2,False,True,,,21,xx,
9,ABCDESTKAFGHIJKLMNOPQR,0,2,True,False,,,22,xx,


If we have our own precursor_df loaded by psm_readers, we can directly assign it to fasta_lib. 

``` python
fasta_lib._precursor_df = precursor_df
```
Thus, we can still use SpecLibFasta functionalities for this precursor_df.

Add modifications, including both var_mods (`Acetyl@Protein N-term`, `Oxidation@M`; see the initialization of fasta_lib) and fix_mods (`Carbamidomethyl@C`), into the precursor_df.

In [7]:
# Applies the var_mods / fix_mods given at construction. A sequence can now
# appear on several rows, one per modification combination.
fasta_lib.add_modifications()
# Show only the columns relevant to modifications.
fasta_lib.precursor_df[['sequence','mods','mod_sites']]

Unnamed: 0,sequence,mods,mod_sites
0,AFGHIJK,,
1,AFGHIJK,Acetyl@Protein N-term,0
2,LMNOPQR,Oxidation@M,2
3,LMNOPQR,,
4,ABCDESTK,Carbamidomethyl@C,3
5,ABCDESTK,Carbamidomethyl@C;Acetyl@Protein N-term,3;0
6,MABCDESTK,Carbamidomethyl@C;Oxidation@M,4;1
7,MABCDESTK,Carbamidomethyl@C,4
8,MABCDESTK,Carbamidomethyl@C;Acetyl@Protein N-term;Oxidat...,4;0;1
9,MABCDESTK,Carbamidomethyl@C;Acetyl@Protein N-term,4;0


`alphabase.protein.fasta.SpecLibFasta.add_additional_modifications` is specially designed for `Phospho`, as it may generate thousands of peptidoforms for a peptide with multiple phospho sites. 

In [8]:
# Add phospho peptidoforms on S/T on top of the existing modifications.
# NOTE(review): `max_mod_num` presumably caps the number of additional mods per
# peptidoform and `max_peptidoform_num` the peptidoforms generated per
# peptide -- confirm against the SpecLibFasta documentation.
fasta_lib.add_additional_modifications(
    ['Phospho@S','Phospho@T'], max_mod_num=1, max_peptidoform_num=100,
)
fasta_lib.precursor_df

Unnamed: 0,sequence,protein_idxes,miss_cleavage,is_prot_nterm,is_prot_cterm,mods,mod_sites,nAA,proteins,genes
0,AFGHIJK,0;1,0,True,True,,,7,xx;yy,gene
1,AFGHIJK,0;1,0,True,True,Acetyl@Protein N-term,0,7,xx;yy,gene
2,LMNOPQR,0;1,0,False,True,Oxidation@M,2,7,xx;yy,gene
3,LMNOPQR,0;1,0,False,True,,,7,xx;yy,gene
4,ABCDESTK,0,0,True,False,Carbamidomethyl@C;Phospho@S,3;6,8,xx,
...,...,...,...,...,...,...,...,...,...,...
79,MABCDESTKAFGHIJKLMNOPQR,0,2,True,False,Carbamidomethyl@C;Acetyl@Protein N-term;Oxidat...,4;0;1;18;8,23,xx,
80,MABCDESTKAFGHIJKLMNOPQR,0,2,True,False,Carbamidomethyl@C;Acetyl@Protein N-term;Oxidat...,4;0;1;18,23,xx,
81,MABCDESTKAFGHIJKLMNOPQR,0,2,True,False,Carbamidomethyl@C;Acetyl@Protein N-term;Phospho@S,4;0;7,23,xx,
82,MABCDESTKAFGHIJKLMNOPQR,0,2,True,False,Carbamidomethyl@C;Acetyl@Protein N-term;Phospho@T,4;0;8,23,xx,


Flexible method to add peptide labeling

In [9]:
# Maps a label channel name to the list of labeling modifications for that
# channel; the channel name appears in the new `label_channel` column.
fasta_lib.add_peptide_labeling({
    '': [], # unlabeled channel, kept as a reference
    '0': ['Dimethyl@Any N-term','Dimethyl@K'],
    '8': ['Dimethyl:2H(6)13C(2)@Any N-term','Dimethyl:2H(6)13C(2)@K'],
})
fasta_lib.precursor_df

Unnamed: 0,sequence,protein_idxes,miss_cleavage,is_prot_nterm,is_prot_cterm,mods,mod_sites,nAA,proteins,genes,label_channel
0,AFGHIJK,0;1,0,True,True,,,7,xx;yy,gene,
1,AFGHIJK,0;1,0,True,True,Acetyl@Protein N-term,0,7,xx;yy,gene,
2,LMNOPQR,0;1,0,False,True,Oxidation@M,2,7,xx;yy,gene,
3,LMNOPQR,0;1,0,False,True,,,7,xx;yy,gene,
4,ABCDESTK,0,0,True,False,Carbamidomethyl@C;Phospho@S,3;6,8,xx,,
...,...,...,...,...,...,...,...,...,...,...,...
247,MABCDESTKAFGHIJKLMNOPQR,0,2,True,False,Carbamidomethyl@C;Acetyl@Protein N-term;Oxidat...,4;0;1;18;8;9;16,23,xx,,8
248,MABCDESTKAFGHIJKLMNOPQR,0,2,True,False,Carbamidomethyl@C;Acetyl@Protein N-term;Oxidat...,4;0;1;18;9;16,23,xx,,8
249,MABCDESTKAFGHIJKLMNOPQR,0,2,True,False,Carbamidomethyl@C;Acetyl@Protein N-term;Phosph...,4;0;7;9;16,23,xx,,8
250,MABCDESTKAFGHIJKLMNOPQR,0,2,True,False,Carbamidomethyl@C;Acetyl@Protein N-term;Phosph...,4;0;8;9;16,23,xx,,8


In [10]:
# Adds a `charge` column; each peptidoform is repeated once per charge state
# (the recorded output below shows charges 2, 3 and 4).
fasta_lib.add_charge()
fasta_lib.precursor_df[['sequence','mods','mod_sites','charge']]

Unnamed: 0,sequence,mods,mod_sites,charge
0,AFGHIJK,,,2
1,AFGHIJK,,,3
2,AFGHIJK,,,4
3,AFGHIJK,Acetyl@Protein N-term,0,2
4,AFGHIJK,Acetyl@Protein N-term,0,3
...,...,...,...,...
751,MABCDESTKAFGHIJKLMNOPQR,Carbamidomethyl@C;Acetyl@Protein N-term;Phosph...,4;0;8;9;16,3
752,MABCDESTKAFGHIJKLMNOPQR,Carbamidomethyl@C;Acetyl@Protein N-term;Phosph...,4;0;8;9;16,4
753,MABCDESTKAFGHIJKLMNOPQR,Carbamidomethyl@C;Acetyl@Protein N-term;Dimeth...,4;0;9;16,2
754,MABCDESTKAFGHIJKLMNOPQR,Carbamidomethyl@C;Acetyl@Protein N-term;Dimeth...,4;0;9;16,3


Append precursor mz and isotope information

In [11]:
# Compute the precursor m/z first, then the isotope columns.
fasta_lib.calc_precursor_mz()
fasta_lib.calc_precursor_isotope()

# Display `precursor_mz` followed by every column whose name starts with
# 'isotope', in the order they occur in precursor_df.
fasta_lib.precursor_df[
    [
        'precursor_mz',
        *(
            name
            for name in fasta_lib.precursor_df.columns
            if name.startswith('isotope')
        ),
    ]
]

Unnamed: 0,precursor_mz,isotope_m1_intensity,isotope_apex_intensity,isotope_apex_offset,isotope_right_most_intensity,isotope_right_most_offset,isotope_m1_mz,isotope_apex_mz,isotope_right_most_mz
0,414.242338,0.479110,1.000000,0,0.479110,1,414.743988,414.242338,414.743988
1,506.273844,0.536589,1.000000,0,0.208670,2,506.775494,506.273844,507.277144
2,498.276387,0.536208,1.000000,0,0.206410,2,498.778037,498.276387,499.279687
3,889.498341,0.993457,1.000000,0,0.239382,3,889.999991,889.498341,891.003291
4,593.334653,0.993457,1.000000,0,0.239382,3,593.669086,593.334653,594.337953
...,...,...,...,...,...,...,...,...,...
78,884.893321,1.259619,1.259619,1,0.510575,3,885.227755,885.227755,885.896621
79,663.921810,1.259619,1.259619,1,0.510575,3,664.172635,664.172635,664.674285
80,1318.838886,1.258972,1.258972,1,0.507694,3,1319.340536,1319.340536,1320.343836
81,879.561683,1.258972,1.258972,1,0.507694,3,879.896116,879.896116,880.564983


Use `alphabase.spectral_library.base.SpecLibBase.calc_fragment_mz_df` to calculate the fragment m/z dataframe.

In [12]:
# Builds fragment_mz_df with one column per fragment type passed to the
# constructor (`b_z1`, `y_z1`).
fasta_lib.calc_fragment_mz_df()
fasta_lib.fragment_mz_df

Unnamed: 0,b_z1,y_z1
0,114.054955,714.429722
1,261.123369,567.361308
2,318.144833,510.339844
3,455.203744,373.280932
4,568.287808,260.196868
...,...,...
1123,2034.240795,603.436978
1124,2091.262258,546.415514
1125,2228.321170,409.356602
1126,2341.405234,296.272538


`calc_fragment_mz_df()` also generates the pointers `frag_start_idx` and `frag_end_idx` in the precursor_df to locate the fragments of each precursor.

In [13]:
# Per-precursor row range into fragment_mz_df; used below as iloc[start:end].
fasta_lib.precursor_df[['frag_start_idx','frag_end_idx']]

Unnamed: 0,frag_start_idx,frag_end_idx
0,0,6
1,6,12
2,12,18
3,18,31
4,31,44
...,...,...
78,1028,1048
79,1048,1068
80,1068,1088
81,1088,1108


Note that all fragment ions are stored from the peptide's N-terminus to its C-terminus, so the b-ions are in ascending order (from b1 to bn) and the y-ions are in descending order (from yn to y1).

In [14]:
# Fragment pointers of the precursor at position 1 (the second row).
start, end = (
    fasta_lib.precursor_df[['frag_start_idx', 'frag_end_idx']]
    .values[1]
)

# Rows start..end-1 of fragment_mz_df are that precursor's fragments.
fasta_lib.fragment_mz_df.iloc[start:end, :]

Unnamed: 0,b_z1,y_z1
6,114.09134,898.456348
7,261.12674,751.420948
8,375.169668,637.37802
9,612.317394,400.230294
10,709.370158,303.17753
11,837.428736,175.118952


Save protein_df, precursor_df, fragment_mz_df, and fragment_intensity_df into an HDF file.

In [15]:
# Left commented out so that running the notebook does not write a file;
# uncomment and replace the placeholder path to save the library.
# fasta_lib.save_hdf('path/to/hdf_file.hdf')