In [None]:
#| default_exp scoring.ml_scoring_base

# Base Class of ML Scoring Methods

In [None]:
#| export
import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.base import BaseEstimator

from alphabase.scoring.feature_extraction_base import BaseFeatureExtractor
from alphabase.scoring.fdr import (
    calculate_fdr,
    calculate_fdr_from_ref,
    fdr_to_q_values,
    fdr_from_ref,
)

There are two key modules in ML-based rescoring: the feature extractor and the rescoring algorithm. We designed both modules to be as flexible as possible to allow future extensions.

## Feature extraction

The feature extractor is more important than the ML method, so we designed a flexible architecture for feature extraction. As shown in `BaseFeatureExtractor`, a feature extractor that inherits from `BaseFeatureExtractor` must re-implement `BaseFeatureExtractor.extract_features`, and must tell the ML method which features were extracted by providing `BaseFeatureExtractor.feature_list`.

For example, if we have two feature extractors, `AlphaPeptFE` and `AlphaPeptDeepFE`:

```python
class AlphaPeptFE(BaseFeatureExtractor):
    def extract_features(self, psm_df):
        psm_df['ap_f1'] = ...
        self._feature_list.append('ap_f1')
        psm_df['ap_f2'] = ...
        self._feature_list.append('ap_f2')

class AlphaPeptDeepFE(BaseFeatureExtractor):
    def extract_features(self, psm_df):
        psm_df['ad_f1'] = ...
        self._feature_list.append('ad_f1')
        psm_df['ad_f2'] = ...
        self._feature_list.append('ad_f2')
```

We can easily design a new feature extractor which combines these two and more feature extractors:

```python
class CombFE(BaseFeatureExtractor):
    def __init__(self):
        self.fe_list = [AlphaPeptFE(),AlphaPeptDeepFE()]

    def extract_features(self, psm_df):
        for fe in self.fe_list:
            fe.extract_features(psm_df)

    @property
    def feature_list(self):
        f_set = set()
        for fe in self.fe_list:
            f_set.update(fe.feature_list)
        return list(f_set)
```

This will be useful for rescoring with DL features, for instance, when AlphaPeptDeep is or is not installed.

## Rescoring Algorithm

The `Percolator` rescoring algorithm (Käll et al. 2007), which is based on semi-supervised learning, is still the most widely used one in MS-based proteomics. Therefore, we use `Percolator` as the base rescoring class, and subclasses can re-implement its methods to support different rescoring algorithms as well as different ML models:

1. Rescoring algorithm. We have provided the base rescoring code structure in `Percolator`. If we are going to support DiaNN's brute-force supervised learning methods, we can define the class like this:

```python
class DiaNNRescoring(Percolator):
    def __init__(self):
        super().__init__()
        self.training_fdr = 100000 # disable target filtration on FDR, which is the same as DiaNN but different from Percolator

        self._ml_model.fit(
            train_df[self.feature_list].values, 
            train_label
        )
    def rescore(self, psm_df):
        # We don't need iteration anymore, but cross validation is still necessary
        df = self._cv_score(psm_df)
        return self._estimate_fdr(df)
```

2. ML models. Personally, `Percolator` with a linear classifier (SVM or LogisticRegression) is preferred. But as a framework, we should support different ML models. We can easily switch to a random forest with `self.ml_model = RandomForestClassifier()`. We can also use a DL model for rescoring, as long as it provides the sklearn-like `fit()` and `decision_function()` (or `predict_proba()`) APIs.

In [None]:
#| export

class Percolator:
    """
    Base class for Percolator-style (semi-supervised) PSM rescoring.

    The workflow is: extract features, train `ml_model` to separate 
    confident targets from decoys, write the model output into the 
    `ml_score` column and estimate the FDR (q-values) into the `fdr` column.
    Subclasses can override the private methods to implement 
    other rescoring algorithms.

    The PSM DataFrame is expected to contain:

    - `score` and every column in `feature_list`;
    - `decoy`: 0 for targets and 1 for decoys;
    - `raw_name` for per-raw FDR and for the rerank workflow;
    - `sequence`, `mods`, `mod_sites` (and `charge`) for 
      FDR levels other than 'psm'.
    """

    # Columns (besides `decoy`) that define one entry at each 
    # non-PSM FDR level. Any other level falls back to `['sequence']`.
    _FDR_GROUP_COLUMNS = {
        'precursor': ['sequence','mods','mod_sites','charge'],
        'peptide': ['sequence','mods','mod_sites'],
    }

    def __init__(self):
        self._feature_extractor:BaseFeatureExtractor = BaseFeatureExtractor()
        self._ml_model = LogisticRegression()
        
        # FDR level used for the final FDR estimation
        self.fdr_level = 'psm' # psm, precursor, peptide, or sequence
        # Only targets with `fdr <= training_fdr` are used for training
        self.training_fdr = 0.01
        # If True, the final FDR is estimated separately for each `raw_name`
        self.per_raw_fdr = False

        # Targets and decoys are each down-sampled to at most this many rows
        self.max_training_sample = 200000
        # Training is skipped if fewer confident targets or decoys 
        # (per CV fold) are available
        self.min_training_sample = 100
        # Number of cross-validation folds; <=1 disables cross validation
        self.cv_fold = 1
        # Number of train/score/FDR iterations in `rescore()`
        self.iter_num = 1

        self._base_features = ['score','nAA','charge']

    @property
    def feature_list(self)->list:
        """ 
        Get extracted feature_list. Property, read-only.

        Base features come first, followed by the features of the 
        feature extractor. Duplicates are removed while keeping the 
        first occurrence, so the column order is deterministic 
        (a plain `set` would make the order depend on string hashing).
        """
        return list(dict.fromkeys(
            self._base_features+
            list(self.feature_extractor.feature_list)
        ))

    @property
    def ml_model(self):
        """ 
        ML model in Percolator.
        It can be sklearn models or other models but implement 
        the methods `fit()` and `decision_function()` (or `predict_proba()`) 
        which are the same as sklearn models.
        """
        return self._ml_model
    
    @ml_model.setter
    def ml_model(self, model):
        self._ml_model = model

    @property
    def feature_extractor(self)->"BaseFeatureExtractor":
        """
        The feature extractor inherited from `BaseFeatureExtractor`
        """
        return self._feature_extractor
    
    @feature_extractor.setter
    def feature_extractor(self, fe:"BaseFeatureExtractor"):
        self._feature_extractor = fe

    def extract_features(self,
        psm_df:pd.DataFrame,
        *args, **kwargs
    )->pd.DataFrame:
        """
        Extract features for rescoring.

        The `ml_score` column is initialized with `score` (written into 
        the given `psm_df`), and the PSM-level `fdr` is estimated from it 
        before the feature extractor is called.

        *args and **kwargs are used for 
        `self.feature_extractor.extract_features`.

        Parameters
        ----------
        psm_df : pd.DataFrame
            PSM DataFrame

        Returns
        -------
        pd.DataFrame
            PSMs sorted by descending `ml_score` (a re-indexed copy 
            of `psm_df`) with `ml_score`, `fdr` and the feature columns.
        """
        psm_df['ml_score'] = psm_df.score
        psm_df = self._estimate_psm_fdr(psm_df)
        extracted_df = self._feature_extractor.extract_features(
            psm_df, *args, **kwargs
        )
        # Extractors that only append columns inplace may return None
        return psm_df if extracted_df is None else extracted_df

    def rescore(self, 
        df:pd.DataFrame
    )->pd.DataFrame:
        """
        Estimate ML scores and then FDRs (q-values)

        For each of the `iter_num` iterations, the PSMs are rescored 
        (with cross validation if `cv_fold > 1`), the PSM-level FDR is 
        re-estimated for the next training round, and the feature 
        extractor gets the chance to update its features. 
        Finally, the FDR is estimated at `self.fdr_level`.

        Parameters
        ----------
        df : pd.DataFrame
            psm_df

        Returns
        -------
        pd.DataFrame
            psm_df with `ml_score` and `fdr` columns updated. 
            The rows are re-ordered and re-indexed.
        """
        for _ in range(self.iter_num):
            df = self._cv_score(df)
            df = self._estimate_fdr(df, 'psm', False)
            updated_df = self.feature_extractor.update_features(df)
            # Keep `df` if the extractor updated the features inplace
            if updated_df is not None:
                df = updated_df
        return self._estimate_fdr(df)

    def run_rerank_workflow(self,
        top_k_psm_df:pd.DataFrame,
        rerank_column:str='spec_idx',
        *args, **kwargs
    )->pd.DataFrame:
        """
        Run percolator workflow with reranking 
        the peptides for each spectrum.

        - self.extract_features()
        - train the model on the top-scored PSM of each group
        - predict `ml_score` for all candidate PSMs
        - keep the top-scored PSM of each group and estimate its FDR

        If there are not enough training samples, the model is not 
        applied and the ranking is based on the original `score`.

        *args and **kwargs are used for 
        `self.feature_extractor.extract_features`.

        Parameters
        ----------
        top_k_psm_df : pd.DataFrame
            PSM DataFrame

        rerank_column : str
            The column use to rerank PSMs. 
            
            For example, use the following code to select 
            the top-ranked peptide for each spectrum.
            ```
            rerank_column = 'spec_idx' # scan_num
            idx = top_k_psm_df.groupby(
                ['raw_name',rerank_column]
            )['ml_score'].idxmax()
            psm_df = top_k_psm_df.loc[idx].copy()
            ```
        Returns
        -------
        pd.DataFrame
            Only top-scored PSM is returned for 
            each group of the `rerank_column`, with `fdr` 
            estimated from the returned `ml_score` values.
        """
        group_columns = ['raw_name',rerank_column]

        top_k_psm_df = self.extract_features(
            top_k_psm_df, *args, **kwargs
        )
        idxmax = top_k_psm_df.groupby(
            group_columns
        )['ml_score'].idxmax()
        df = top_k_psm_df.loc[idxmax].copy()

        # Predicting with a model that was not trained on these PSMs 
        # would fail (unfitted model) or be meaningless, so fall back 
        # to the ranking by the original score.
        if not self._has_enough_training_samples(
            df[df.decoy == 0], df[df.decoy != 0]
        ):
            return self._estimate_fdr(df)

        self._train_and_score(df)

        top_k_psm_df = self._predict(top_k_psm_df)
        idxmax = top_k_psm_df.groupby(
            group_columns
        )['ml_score'].idxmax()
        # The `fdr` column still refers to the original score here, 
        # so it must be re-estimated from the new `ml_score`.
        return self._estimate_fdr(
            top_k_psm_df.loc[idxmax].copy()
        )

    def run_rescore_workflow(self,
        psm_df:pd.DataFrame,
        *args, **kwargs
    )->pd.DataFrame:
        """
        Run percolator workflow:

        - self.extract_features()
        - self.rescore()

        *args and **kwargs are used for 
        `self.feature_extractor.extract_features`.

        Parameters
        ----------
        psm_df : pd.DataFrame
            PSM DataFrame

        Returns
        -------
        pd.DataFrame
            PSMs with the feature columns, `ml_score` and `fdr`.
        """
        df = self.extract_features(
            psm_df, *args, **kwargs
        )
        return self.rescore(df)

    def _has_enough_training_samples(self,
        df_target:pd.DataFrame,
        df_decoy:pd.DataFrame,
        n_fold:int=1,
    )->bool:
        """
        Check if both the confident targets (`fdr <= training_fdr`) 
        and the decoys reach `min_training_sample * n_fold` rows.
        """
        min_sample = self.min_training_sample*n_fold
        n_confident_target = (
            df_target.fdr<=self.training_fdr
        ).sum()
        return bool(
            n_confident_target >= min_sample 
            and len(df_decoy) >= min_sample
        )

    def _estimate_fdr_per_raw(self,
        df:pd.DataFrame,
        fdr_level:str
    )->pd.DataFrame:
        """
        Estimate the FDR at `fdr_level` separately for each `raw_name` 
        and concatenate the results with a fresh index.
        """
        df_list = [
            self._estimate_fdr(df_raw, 
                fdr_level = fdr_level,
                per_raw_fdr = False
            )
            for _, df_raw in df.groupby('raw_name')
        ]
        # `pd.concat` raises on an empty list (i.e. empty `df`)
        if not df_list:
            return df
        return pd.concat(df_list, ignore_index=True)

    def _estimate_psm_fdr(self,
        df:pd.DataFrame,
    )->pd.DataFrame:
        """
        Estimate q-values from the `ml_score` and `decoy` columns.

        Returns a copy of `df` sorted by descending `ml_score` with 
        a fresh index and the `fdr` column set.
        """
        # For tied scores, decoys are ranked before targets, 
        # which is the conservative choice.
        df = df.sort_values(
            ['ml_score','decoy'], ascending=False
        ).reset_index(drop=True)
        decoy_values = df['decoy'].values
        decoy_cumsum = np.cumsum(decoy_values)
        target_cumsum = np.cumsum(1-decoy_values)
        # Clamp the denominator: there are no targets yet 
        # if the top-ranked entries are decoys.
        fdr_values = decoy_cumsum/np.maximum(target_cumsum, 1)
        df['fdr'] = fdr_to_q_values(fdr_values)
        return df
        
    def _estimate_fdr(self, 
        df:pd.DataFrame,
        fdr_level:str=None,
        per_raw_fdr:bool=None,
    )->pd.DataFrame:
        """
        Estimate the FDR (q-values) of `df` into the `fdr` column.

        Parameters
        ----------
        df : pd.DataFrame
            PSMs with `ml_score` and `decoy` columns

        fdr_level : str, optional
            'psm', 'precursor' or 'peptide'; any other value is 
            treated as sequence level. 
            Defaults to None, i.e. `self.fdr_level`.

        per_raw_fdr : bool, optional
            Estimate the FDR for each `raw_name` separately.
            Defaults to None, i.e. `self.per_raw_fdr`.

        Returns
        -------
        pd.DataFrame
            PSMs sorted by descending `ml_score` (re-indexed) 
            with the `fdr` column updated.
        """
        if fdr_level is None: 
            fdr_level = self.fdr_level
        if per_raw_fdr is None: 
            per_raw_fdr = self.per_raw_fdr

        if per_raw_fdr:
            return self._estimate_fdr_per_raw(
                df, fdr_level=fdr_level
            )

        if fdr_level == 'psm':
            return self._estimate_psm_fdr(df)

        group_columns = self._FDR_GROUP_COLUMNS.get(
            fdr_level, ['sequence']
        )
        # Best score of each precursor/peptide/sequence. `as_index=False` 
        # keeps `decoy` and `ml_score` as columns, which 
        # `_estimate_psm_fdr` requires.
        ref_df = df.groupby(
            group_columns+['decoy'], as_index=False
        )['ml_score'].max()
        ref_df = self._estimate_psm_fdr(ref_df)

        # Same row order as the PSM-level branch.
        # NOTE(review): `fdr_from_ref` presumably walks both score arrays 
        # in descending order; sorting here is harmless otherwise. 
        # Confirm against `alphabase.scoring.fdr`.
        df = df.sort_values(
            ['ml_score','decoy'], ascending=False
        ).reset_index(drop=True)
        df['fdr'] = fdr_from_ref(
            df['ml_score'].values, ref_df['ml_score'].values, 
            ref_df['fdr'].values
        )
        return df

    def _train(self, 
        train_t_df:pd.DataFrame, 
        train_d_df:pd.DataFrame
    ):
        """
        Fit `self._ml_model` with the confident targets 
        (`fdr <= training_fdr`, label 1) and the decoys (label 0).

        Targets and decoys are each randomly down-sampled to 
        `max_training_sample` rows with a fixed random state.
        """
        train_t_df = train_t_df[train_t_df.fdr<=self.training_fdr]

        if len(train_t_df) > self.max_training_sample:
            train_t_df = train_t_df.sample(
                n=self.max_training_sample, 
                random_state=1337
            )
        if len(train_d_df) > self.max_training_sample:
            train_d_df = train_d_df.sample(
                n=self.max_training_sample,
                random_state=1337
            )

        # Targets come first in `train_df`, so only the 
        # trailing labels must be set to 0 (decoy).
        train_df = pd.concat((train_t_df, train_d_df))
        train_label = np.ones(len(train_df),dtype=np.int32)
        train_label[len(train_t_df):] = 0

        self._ml_model.fit(
            train_df[self.feature_list].values, 
            train_label
        )

    def _predict(self, test_df):
        """
        Write the model output of `test_df` into its `ml_score` 
        column and return `test_df`.

        `decision_function()` is used if the model provides it; 
        otherwise the last column of `predict_proba()` is used, which 
        is the probability of label 1 (target) for models trained 
        by `_train`.
        """
        features = test_df[self.feature_list].values
        if hasattr(self._ml_model, 'decision_function'):
            scores = self._ml_model.decision_function(features)
        else:
            scores = np.asarray(
                self._ml_model.predict_proba(features)
            )
            # sklearn returns one column per class
            if scores.ndim == 2:
                scores = scores[:, -1]
        test_df['ml_score'] = scores
        return test_df

    def _train_and_score(self,
        df:pd.DataFrame
    )->pd.DataFrame:
        """
        Train the model on `df` and rescore the same PSMs 
        (no cross validation).

        `df` is returned unchanged if there are not enough 
        training samples. Otherwise a new DataFrame (targets 
        followed by decoys, re-indexed) with updated `ml_score` 
        is returned.
        """
        df_target = df[df.decoy == 0]
        df_decoy = df[df.decoy != 0]

        if not self._has_enough_training_samples(
            df_target, df_decoy
        ):
            return df
        
        self._train(df_target, df_decoy)
        test_df = pd.concat(
            [df_target, df_decoy],
            ignore_index=True
        )
    
        return self._predict(test_df)

    def _cv_score(self, df:pd.DataFrame)->pd.DataFrame:
        """
        Apply cross-validation for rescoring.

        It will split `df` into K folds. For each fold, 
        its ML scores are predicted by a model which 
        is trained by other K-1 folds.

        If `cv_fold <= 1`, the model is trained and applied 
        on the same PSMs. If there are not enough training 
        samples, the (shuffled) PSMs are returned without rescoring.

        Parameters
        ----------
        df : pd.DataFrame
            PSMs to be rescored

        Returns
        -------
        pd.DataFrame
            PSMs after rescoring
        """

        if self.cv_fold <= 1:
            return self._train_and_score(df)

        # Shuffle with a fixed seed so that the folds are 
        # random but reproducible
        df = df.sample(
            frac=1, random_state=1337
        ).reset_index(drop=True)

        df_target = df[df.decoy == 0]
        df_decoy = df[df.decoy != 0]

        if not self._has_enough_training_samples(
            df_target, df_decoy, n_fold=self.cv_fold
        ):
            return df
        
        # Row position modulo K assigns every K-th row to the same fold
        target_folds = np.arange(len(df_target))%self.cv_fold
        decoy_folds = np.arange(len(df_decoy))%self.cv_fold

        test_df_list = []
        for i in range(self.cv_fold):
            self._train(
                df_target[target_folds != i], 
                df_decoy[decoy_folds != i]
            )

            test_df = pd.concat((
                df_target[target_folds == i], 
                df_decoy[decoy_folds == i]
            ))
            test_df_list.append(self._predict(test_df))
    
        return pd.concat(test_df_list, ignore_index=True)
    
    

In [None]:
#| hide
from nbdev.showdoc import show_doc

### Properties of `Percolator`

In [None]:
# Render the API documentation of the `ml_model` property.
show_doc(Percolator.ml_model)

---

[source](https://github.com/MannLabs/alphabase/blob/main/alphabase/scoring/ml_scoring_base.py#L46){target="_blank" style="float:right; font-size:smaller"}

### Percolator.ml_model

>      Percolator.ml_model ()

ML model in Percolator.
It can be sklearn models or other models but implement 
the methods `fit()` and `decision_function()` (or `predict_proba()`) 
which are the same as sklearn models.

In [None]:
# Render the API documentation of the `feature_extractor` property.
show_doc(Percolator.feature_extractor)

---

[source](https://github.com/MannLabs/alphabase/blob/main/alphabase/scoring/ml_scoring_base.py#L66){target="_blank" style="float:right; font-size:smaller"}

### Percolator.feature_extractor

>      Percolator.feature_extractor ()

The feature extractor inherited from `BaseFeatureExtractor`

In [None]:
# Render the API documentation of the read-only `feature_list` property.
show_doc(Percolator.feature_list)

---

[source](https://github.com/MannLabs/alphabase/blob/main/alphabase/scoring/ml_scoring_base.py#L37){target="_blank" style="float:right; font-size:smaller"}

### Percolator.feature_list

>      Percolator.feature_list ()

Get extracted feature_list. Property, read-only

### Methods of `Percolator`

In [None]:
# Render the API documentation of `Percolator.extract_features`.
show_doc(Percolator.extract_features)

---

[source](https://github.com/MannLabs/alphabase/blob/main/alphabase/scoring/ml_scoring_base.py#L69){target="_blank" style="float:right; font-size:smaller"}

### Percolator.extract_features

>      Percolator.extract_features (psm_df:pandas.core.frame.DataFrame, *args,
>                                   **kwargs)

Extract features for rescoring.

*args and **kwargs are used for 
`self.feature_extractor.extract_features`.

|    | **Type** | **Details** |
| -- | -------- | ----------- |
| psm_df | DataFrame | PSM DataFrame |
| args |  |  |
| kwargs |  |  |
| **Returns** | **DataFrame** | **psm_df with feature columns appended inplace.** |

In [None]:
# Render the API documentation of `Percolator.rescore`.
show_doc(Percolator.rescore)

---

[source](https://github.com/MannLabs/alphabase/blob/main/alphabase/scoring/ml_scoring_base.py#L95){target="_blank" style="float:right; font-size:smaller"}

### Percolator.rescore

>      Percolator.rescore (df:pandas.core.frame.DataFrame)

Estimate ML scores and then FDRs (q-values)

|    | **Type** | **Details** |
| -- | -------- | ----------- |
| df | DataFrame | psm_df |
| **Returns** | **DataFrame** | **psm_df with `ml_score` and `fdr` columns updated inplace** |

In [None]:
# Render the API documentation of `Percolator.run_rescore_workflow`.
show_doc(Percolator.run_rescore_workflow)

---

[source](https://github.com/MannLabs/alphabase/blob/main/alphabase/scoring/ml_scoring_base.py#L171){target="_blank" style="float:right; font-size:smaller"}

### Percolator.run_rescore_workflow

>      Percolator.run_rescore_workflow (psm_df:pandas.core.frame.DataFrame,
>                                       *args, **kwargs)

Run percolator workflow:

- self.extract_features()
- self.rescore()

*args and **kwargs are used for 
`self.feature_extractor.extract_features`.

|    | **Type** | **Details** |
| -- | -------- | ----------- |
| psm_df | DataFrame | PSM DataFrame |
| args |  |  |
| kwargs |  |  |
| **Returns** | **DataFrame** | **psm_df with feature columns appended inplace.** |

In [None]:
# Render the API documentation of `Percolator.run_rerank_workflow`.
show_doc(Percolator.run_rerank_workflow)

---

[source](https://github.com/MannLabs/alphabase/blob/main/alphabase/scoring/ml_scoring_base.py#L117){target="_blank" style="float:right; font-size:smaller"}

### Percolator.run_rerank_workflow

>      Percolator.run_rerank_workflow (top_k_psm_df:pandas.core.frame.DataFrame,
>                                      rerank_column:str='spec_idx', *args,
>                                      **kwargs)

Run percolator workflow with reranking 
the peptides for each spectrum.

- self.extract_features()
- self.rescore()

*args and **kwargs are used for 
`self.feature_extractor.extract_features`.

|    | **Type** | **Default** | **Details** |
| -- | -------- | ----------- | ----------- |
| top_k_psm_df | DataFrame |  | PSM DataFrame |
| rerank_column | str | spec_idx | The column use to rerank PSMs. <br><br>For example, use the following code to select <br>the top-ranked peptide for each spectrum.<br>```<br>rerank_column = 'spec_idx' # scan_num<br>idx = top_k_psm_df.groupby(<br>    ['raw_name',rerank_column]<br>)['ml_score'].idxmax()<br>psm_df = top_k_psm_df.loc[idx].copy()<br>``` |
| args |  |  |  |
| kwargs |  |  |  |
| **Returns** | **DataFrame** |  |  |

## Simple Examples

In [None]:
# Toy data set: 100 targets with scores drawn from [0, 100) followed by
# 100 decoys with scores drawn from [0, 10). The random generator is not
# seeded, so the numbers differ between runs.
df = pd.DataFrame({
    'score': list(np.random.uniform(0,100,100))+list(np.random.uniform(0,10,100)),
    'nAA': list(np.random.randint(7,30,200)),
    'charge': list(np.random.randint(2,4,200)),
    'decoy': [0]*100+[1]*100,
    # every spec_idx occurs twice, i.e. two candidate PSMs per spectrum
    'spec_idx': np.repeat(np.arange(100),2),
    'raw_name': 'raw',
})
perc = Percolator()
# The default of 100 would need at least 100 confident targets and
# 100 decoys before any training happens; lower it for this small example.
perc.min_training_sample = 10
perc.run_rescore_workflow(df)

Unnamed: 0,score,nAA,charge,decoy,spec_idx,raw_name,ml_score,fdr
0,99.851979,26,3,0,18,raw,138.142766,0.000000
1,98.746052,7,3,0,12,raw,133.867779,0.000000
2,97.415167,16,2,0,16,raw,133.447761,0.000000
3,96.857314,14,3,0,15,raw,131.877318,0.000000
4,94.606208,17,3,0,48,raw,128.785713,0.000000
...,...,...,...,...,...,...,...,...
195,0.346523,18,2,1,89,raw,-17.008649,0.979798
196,0.703782,15,3,1,82,raw,-17.292748,0.989899
197,0.058571,22,3,1,77,raw,-17.352293,1.000000
198,0.901983,9,2,1,64,raw,-17.357704,1.000000


In [None]:
# Rerank the candidate PSMs and keep the best one for each (raw_name, spec_idx) group.
perc.run_rerank_workflow(df, rerank_column='spec_idx')

Unnamed: 0,score,nAA,charge,decoy,spec_idx,raw_name,ml_score,fdr
54,44.986000,25,2,0,0,raw,23.239871,0.000000
6,94.020658,7,3,0,1,raw,61.762973,0.000000
73,23.028068,14,2,0,2,raw,5.346026,0.000000
17,79.163537,28,3,0,3,raw,50.584693,0.000000
36,61.673923,23,2,0,4,raw,36.500728,0.000000
...,...,...,...,...,...,...,...,...
170,2.306086,7,3,1,95,raw,-11.475920,0.744898
105,8.107192,8,2,1,96,raw,-6.765049,0.191011
92,9.717331,10,3,1,97,raw,-5.459666,0.044944
143,4.381494,29,3,1,98,raw,-9.100027,0.565217
