In [None]:
#---#| default_exp psm_reader.psm_reader

In [None]:
#| hide
%reload_ext autoreload
%autoreload 2

# Base Class for PSM Readers

In [None]:
from alphabase.psm_reader.psm_reader import translate_other_modification, PSMReaderBase, psm_reader_yaml

In [None]:
#| hide
# Checks for `translate_other_modification(mod_str, mod_dict)`.
# The expected value must be a parenthesized tuple: `assert 'a', [] == f(...)`
# parses as `assert 'a', <message>` and therefore always passes.
_mod_dict = {'A': 'a', 'B': 'b'}
assert ('a', []) == translate_other_modification('A', _mod_dict)
assert ('b', []) == translate_other_modification('B', _mod_dict)
assert ('a;a', []) == translate_other_modification('A;A', _mod_dict)
assert ('a;b', []) == translate_other_modification('A;B', _mod_dict)
# NOTE(review): when an unknown modification is present, only the list of
# unknown names is checked here; the first returned element for this case is
# not visible from this notebook -- confirm against the implementation and
# tighten this assertion.
_, _unknown_mods = translate_other_modification('A;B;X', _mod_dict)
assert _unknown_mods == ['X']

## PSMReaderBase 

`PSMReaderBase` is the base abstract class for all readers. It defines the basic procedures for importing other search engine results into AlphaBase format.

The main entry method is `import_file(filename)`, and it will generate `self._psm_df` (or property `self.psm_df`) after `import_file`.

In the `import_file` method, we designed five steps to load result files into AlphaBase format:

1. `origin_df = self._load_file(filename)`. We load result files into a dataframe without doing any file conversion. Different search engines use different file formats, and some of them are not tabular. **All subclasses of `PSMReaderBase` need to re-implement this method**.

2. `self._translate_columns(origin_df)`. We translate columns in `origin_df` into AlphaBase columns by `self.column_mapping`. `self.column_mapping` provides a flexible way for developers to extract their required columns. 

3. `self._load_modifications(origin_df)`. Different search engines represent modifications differently, so we use this method to extract the modifications into `self._psm_df['mods']` and `self._psm_df['mod_sites']`. Note that the modification names are still in the other search engine's format. **All subclasses of `PSMReaderBase` need to re-implement this method**.

4. `self._translate_modifications`. Convert modification names into AlphaBase names (`unimod_name@AA`). For most search engines, we need a dict (`self.modification_mapping`) to map the search engine's modification format into the AlphaBase format (`unimod_name@AA`, where `unimod_name` is the `title` in `<umod:mod title=...>` in the unimod xml file). **All subclasses of `PSMReaderBase` need to re-implement this method**.

5. `self._post_process(filename, origin_df)`. Any required post-processing steps. For example, we remove unknown modifications here.

### Other results must be converted into the alphabase dataframe with required columns:
1. `sequence` (str): AA sequence, for example, 'ATMYPEDR'.
2. `mods` (str): modification names, separated by ';'. For example, 'Oxidation@M', 'Acetyl@Protein_N-term;Oxidation@M'.
3. `mod_sites` (str): modification sites, separated by ';'. For example, '3', '0;3'. The N-term site is 0, and the C-term site is -1, and all other modification sites start from 1.
4. `nAA` (int): number of AA in the sequence, could be set by `df['nAA'] = df.sequence.str.len()`.
5. `charge` (int): precursor charge states.
6. `rt` (float): retention time (RT) of peptides, in minutes by default.
7. `rt_norm` (float): RT normalized by the maximum value, could be set by `df['rt_norm'] = df.rt/df.rt.max()`.
### and optional columns:
8. `ccs` (float): collisional cross section (CCS) value, required for IM data.
9. `mobility` (float): precursor ion mobility value, required for IM data.
10. `precursor_mz` (float): precursor m/z value.
11. `proteins` (str): protein names, separated by ';'.
12. `genes` (str): gene names, separated by ';'.
13. `protein_ids` (str): protein ids or uniprot ids, separated by ';'.
14. `score` (float): PSM score. Larger scores mean better PSMs, so `E-value` or `P-value` scores must be converted with `-log`.
15. `fdr` (float): FDR or q-value.
16. `raw_name` (str): Raw file name.
17. `spec_idx` (int): spectrum index starting from 0 in RAW data. For Thermo RAW, it is also Scan number - 1. We can use it to locate the MS2 spectrum for identification.
18. `query_id` (int or str): a unique id that identifies not only the spectrum (`spec_idx`), but also the precursor or MS1 isotope index. It could be `query_idx` in alphapept.
19. `decoy`: 0 if the peptide is target match, otherwise 1.

`modification_mapping` example (MaxQuant, unimod will be automatically added):

```python
{
  'Acetyl@Protein_N-term': [
    '_(Acetyl (Protein_N-term))',
    '_(ac)',
  ],
  'Carbamidomethyl@C': [
    'C(Carbamidomethyl (C))',
  ],
  'Oxidation@M': [
    'M(Oxidation (M))',
    'M(ox)',
  ],
  'Phospho@S': [
    'S(Phospho (S))',
    'S(Phospho (ST))',
    'S(Phospho (STY))',
    'S(ph)',
    'pS',
  ],
  'Phospho@T': [
    'T(Phospho (T))',
    'T(Phospho (ST))',
    'T(Phospho (STY))',
    'T(ph)',
    'pT',
  ],
  'Phospho@Y': [
    'Y(Phospho (Y))',
    'Y(Phospho (STY))',
    'Y(ph)',
    'pY',
  ],
  'Deamidated@N': ['N(Deamidation (NQ))', 'N(de)'],
  'Deamidated@Q': ['Q(Deamidation (NQ))', 'Q(de)'],
  'GlyGly@K': ['K(GlyGly (K))', 'K(gl)'],
}
```

In [None]:
#| hide
class TestReader(PSMReaderBase):
    """Minimal `PSMReaderBase` subclass used only to build a reader object.

    The hooks are no-op stubs. Their signatures mirror how the import
    pipeline is documented to call them (`self._load_file(filename)` and
    `self._load_modifications(origin_df)`), so the stubs stay call-compatible
    if that pipeline is ever invoked on this class.
    """
    def _init_column_mapping(self):
        pass

    def _load_file(self, filename):
        pass

    def _load_modifications(self, origin_df):
        pass

reader = TestReader(
    modification_mapping={'A':'a','B':'b'}
)
# Every "other engine" name listed under a modification must be present in
# the reverse mapping and must map back to that same modification.
for mod, other_mods in reader.modification_mapping.items():
    for other in other_mods:
        assert other in reader.rev_mod_mapping
        assert mod == reader.rev_mod_mapping[other]

# PSMReaderProvider

To make it easier to create different readers, we design a `Provider` or `Factory` called `PSMReaderProvider` to manage all reader classes. `PSMReaderProvider` is instantiated as a global object `psm_reader_provider`. 

After a subclass of `PSMReaderBase` is defined, for example `AlphaPeptReader`, we can then register it into `psm_reader_provider` by using `psm_reader_provider.register_reader('alphapept', AlphaPeptReader)`. When we want to use it, we just need to create an `AlphaPeptReader` object with `psm_reader_provider.get_reader('alphapept')`.

As we have loaded all readers in `psm_reader_provider` within alphabase.psm_reader.\__init__.py, we can easily access all registered readers by `psm_reader_provider`.

In [None]:
#| hide
from alphabase.psm_reader import psm_reader_provider
from alphabase.psm_reader import (
    alphapept_reader, maxquant_reader, 
    pfind_reader, dia_psm_reader
)

In [None]:
#| hide
# Each reader configuration in `psm_reader_yaml` should resolve to an
# instance of the matching reader class.
assert isinstance(psm_reader_provider.get_reader_by_yaml(psm_reader_yaml['alphapept']), alphapept_reader.AlphaPeptReader)
assert isinstance(psm_reader_provider.get_reader_by_yaml(psm_reader_yaml['maxquant']), maxquant_reader.MaxQuantReader)
assert isinstance(psm_reader_provider.get_reader_by_yaml(psm_reader_yaml['diann']), dia_psm_reader.DiannReader)
assert isinstance(psm_reader_provider.get_reader_by_yaml(psm_reader_yaml['spectronaut']), dia_psm_reader.SpectronautReader)
# NOTE(review): the pFind check below is disabled; the reason is not visible here.
# assert isinstance(psm_reader_provider.get_reader_by_yaml(psm_reader_yaml['pfind']), pfind_reader.pFindReader)

# The DIA-NN reader's 'Phospho@S' entry should contain exactly these names.
reader = psm_reader_provider.get_reader_by_yaml(psm_reader_yaml['diann'])
assert set(reader.modification_mapping['Phospho@S']) == {
    'pS',
    'S(ph)',
    'S(UniMod:21)',
    'S(Phospho (S))',
    'S(Phospho (ST))',
    'S(Phospho (STY))',
    'S(Phospho (STYDH))',
    'S[ph]',
    'S[UniMod:21]',
    'S[Phospho (S)]',
    'S[Phospho (ST)]',
    'S[Phospho (STY)]',
    'S[Phospho (STYDH)]',
}

# Requesting an unknown reader must raise KeyError. The `else` branch makes
# the check fail when nothing is raised, instead of passing silently.
try:
    psm_reader_provider.get_reader_by_yaml(psm_reader_yaml['unknown'])
except KeyError:
    pass
else:
    raise AssertionError("expected KeyError for unknown reader type 'unknown'")