In [None]:
# Reload the IPython autoreload extension and switch it to mode 2, so that
# edited modules are picked up without restarting the kernel.
%reload_ext autoreload
%autoreload 2

# Testing fasta

Use SpecLibFasta to build a library (database)

#### Init fasta lib

In [None]:
from alphabase.protein.fasta import SpecLibFasta

# Two toy proteins keyed by protein id; each entry stores its own id next to
# the sequence string.
protein_dict = {
    prot_id: {'protein_id': prot_id, 'sequence': prot_seq}
    for prot_id, prot_seq in (
        ('xx', 'MACDESTYKBKFGHIKLMNPQRST'),
        ('yy', 'FGHIKLMNPQR'),
    )
}

# Create the library object. The first argument is the list of fragment types
# b_z1, b_z2, y_z1, y_z2; the keyword arguments set the variable/fixed
# modifications, the decoy mode and the I_to_L flag.
fastalib = SpecLibFasta(
    [f'{ion}_z{charge}' for ion in ('b', 'y') for charge in (1, 2)],
    fix_mods=['Carbamidomethyl@C'],
    var_mods=['Oxidation@M', 'Acetyl@Protein N-term'],
    I_to_L=False,
    decoy='pseudo_reverse',
)


Call `import_and_process_protein_dict` or `import_and_process_fasta` to load proteins, append decoys, add modifications and add charge states.

```
fastalib.import_and_process_fasta([fasta1, fasta2])
```

In [None]:
# Feed the in-memory protein dictionary to the library.
fastalib.import_and_process_protein_dict(protein_dict)
# Last expression of the cell: display the library's protein table.
fastalib.protein_df

Unnamed: 0,protein_id,sequence
0,xx,MACDESTYKBKFGHIKLMNPQRST
1,yy,FGHIKLMNPQR


In [None]:
# After the import step the precursor table must contain the decoy flag,
# both modification columns and the charge column.
assert all(
    column in fastalib.precursor_df.columns
    for column in ('decoy', 'mods', 'mod_sites', 'charge')
)

Call `calc_precursor_isotope` to calculate the precursor_mz, the M1/M2 isotope mz and intensity, and the mz, intensity and offset of the apex and right-most isotope peaks.

In [None]:
# Compute the precursor isotope information, then verify that every expected
# column was added to the precursor table.
fastalib.calc_precursor_isotope()
assert all(
    column in fastalib.precursor_df.columns
    for column in (
        'precursor_mz',
        'isotope_apex_mz',
        'isotope_apex_intensity',
        'isotope_apex_offset',
        'isotope_right_most_mz',
        'isotope_right_most_intensity',
        'isotope_right_most_offset',
        'isotope_m1_mz',
        'isotope_m1_intensity',
    )
)

Call `calc_fragment_mz_df` to calculate the fragment dataframe

In [None]:
import numpy as np

# Build the fragment m/z table and check that the precursor table received
# the start/stop index columns.
fastalib.calc_fragment_mz_df()
assert all(
    column in fastalib.precursor_df.columns
    for column in ('frag_start_idx', 'frag_stop_idx')
)
# The fragment table is expected to hold nAA - 1 rows for every precursor.
assert len(fastalib.fragment_mz_df) == np.sum(fastalib.precursor_df['nAA'].values - 1)

Use `save_hdf` to save the library to an HDF file:
```
fastalib.save_hdf(hdf_file_path)
```

Then use `load_hdf` to load precursor and fragment dataframes:
```
fastalib.load_hdf(hdf_file_path, load_mod_seq=True)
```

#### Test protein decoy

In [None]:
from alphabase.protein.protein_level_decoy import ProteinReverseDecoy
# Same two toy proteins as before, now also carrying a full name and a gene
# name so those columns can be checked on the decoy side.
protein_dict = {
    prot_id: {
        'protein_id': prot_id,
        'full_name': full_name,
        'gene_name': gene_name,
        'sequence': prot_seq,
    }
    for prot_id, full_name, gene_name, prot_seq in (
        ('xx', 'xx_xx', 'x_x', 'MACDESTYKBKFGHIKLMNPQRST'),
        ('yy', 'yy_yy', 'y_y', 'FGHIKLMNPQR'),
    )
}
# Fresh library object with the fragment types b_z1, b_z2, y_z1, y_z2 and the
# same modification, decoy and I_to_L settings as the first example.
fastalib = SpecLibFasta(
    [f'{ion}_z{charge}' for ion in ('b', 'y') for charge in (1, 2)],
    fix_mods=['Carbamidomethyl@C'],
    var_mods=['Oxidation@M', 'Acetyl@Protein N-term'],
    I_to_L=False,
    decoy='pseudo_reverse',
)

# Only derive the peptides from the proteins here.
fastalib.get_peptides_from_protein_dict(protein_dict=protein_dict)

# Build the protein-level reverse decoys from the target library and display
# the decoy protein table.
rev_decoy = ProteinReverseDecoy(fastalib)
rev_decoy.decoy_sequence()
rev_decoy.protein_df

Unnamed: 0,sequence,protein_id,full_name,gene_name
0,,REV_,REV_,REV_
1,,REV_,REV_,REV_
2,TSRQPNMLKIHGFKBKYTSEDCAM,REV_xx,REV_xx_xx,REV_x_x
3,RQPNMLKIHGF,REV_yy,REV_yy_yy,REV_y_y


In [None]:
# Display the precursor table held by the reverse-decoy object.
rev_decoy.precursor_df

Unnamed: 0,sequence,protein_idxes,miss_cleavage,is_prot_nterm,is_prot_cterm,mods,mod_sites,nAA
0,IHGFKBK,2,1,False,False,,,7
1,RQPNMLK,3,1,True,False,,,7
2,YTSEDCAM,2,0,False,True,,,8
3,TSRQPNMLK,2,1,True,False,,,9
4,BKYTSEDCAM,2,1,False,True,,,10
5,QPNMLKIHGF,3,1,False,True,,,10
6,QPNMLKIHGFK,2,1,False,False,,,11
7,RQPNMLKIHGF,3,2,True,True,,,11
8,QPNMLKIHGFKBK,2,2,False,False,,,13
9,TSRQPNMLKIHGFK,2,2,True,False,,,14


In [None]:
# For each target protein, the decoy table must not contain the original
# sequence or id, but must contain the reversed sequence and the
# "REV_"-prefixed id.
decoy_sequences = rev_decoy.protein_df.sequence.values
decoy_protein_ids = rev_decoy.protein_df.protein_id.values
for target_seq, target_id in zip(
    fastalib.protein_df['sequence'], fastalib.protein_df['protein_id']
):
    assert target_seq not in decoy_sequences
    assert target_seq[::-1] in decoy_sequences
    assert target_id not in decoy_protein_ids
    assert "REV_"+target_id in decoy_protein_ids

In [None]:
# Display the target library's precursor table.
fastalib.precursor_df

Unnamed: 0,sequence,protein_idxes,miss_cleavage,is_prot_nterm,is_prot_cterm,mods,mod_sites,nAA
0,BKFGHIK,0,1,False,False,,,7
1,LMNPQRST,0,1,False,True,,,8
2,ACDESTYK,0,0,True,False,,,8
3,MACDESTYK,0,0,True,False,,,9
4,ACDESTYKBK,0,1,True,False,,,10
5,MACDESTYKBK,0,1,True,False,,,11
6,FGHIKLMNPQR,0;1,1,True,True,,,11
7,BKFGHIKLMNPQR,0,2,False,False,,,13
8,FGHIKLMNPQRST,0,2,False,True,,,13
9,ACDESTYKBKFGHIK,0,2,True,False,,,15


In [None]:
# Append the decoy entries to the target library; the tables displayed
# afterwards contain both kinds of rows plus a `decoy` column.
rev_decoy.concat_to_target_lib()

In [None]:
# Display the library's protein table after the decoys were concatenated.
fastalib.protein_df

Unnamed: 0,protein_id,full_name,gene_name,sequence,decoy
0,xx,xx_xx,x_x,MACDESTYKBKFGHIKLMNPQRST,0
1,yy,yy_yy,y_y,FGHIKLMNPQR,0
2,REV_xx,REV_xx_xx,REV_x_x,TSRQPNMLKIHGFKBKYTSEDCAM,1
3,REV_yy,REV_yy_yy,REV_y_y,RQPNMLKIHGF,1


In [None]:
# Every target protein (decoy == 0) must have its reversed sequence and its
# "REV_"-prefixed id somewhere in the merged protein table.
target_proteins = fastalib.protein_df.query('decoy==0')
for target_seq, target_id in zip(
    target_proteins['sequence'], target_proteins['protein_id']
):
    assert target_seq[::-1] in fastalib.protein_df.sequence.values
    assert "REV_"+target_id in fastalib.protein_df.protein_id.values

In [None]:
# Display the library's precursor table after the decoys were concatenated.
fastalib.precursor_df

Unnamed: 0,sequence,protein_idxes,miss_cleavage,is_prot_nterm,is_prot_cterm,mods,mod_sites,nAA,decoy
0,BKFGHIK,0,1,False,False,,,7,0
1,RQPNMLK,3,1,True,False,,,7,1
2,IHGFKBK,2,1,False,False,,,7,1
3,LMNPQRST,0,1,False,True,,,8,0
4,ACDESTYK,0,0,True,False,,,8,0
5,YTSEDCAM,2,0,False,True,,,8,1
6,MACDESTYK,0,0,True,False,,,9,0
7,TSRQPNMLK,2,1,True,False,,,9,1
8,ACDESTYKBK,0,1,True,False,,,10,0
9,QPNMLKIHGF,3,1,False,True,,,10,1


In [None]:
# Each decoy precursor lists its proteins as a ';'-separated string of
# indices; all of them must point into the second half of the protein table.
decoy_protein_idxes = fastalib.precursor_df.query('decoy==1').protein_idxes
assert all(
    int(idx) >= len(fastalib.protein_df)//2
    for idx_str in decoy_protein_idxes
    for idx in idx_str.split(';')
)