In [1]:
#default_exp rescore.percolator

In [2]:
#export
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
import numpy as np
import pandas as pd
from tqdm import tqdm
import torch

from alphabase.peptide.fragment import get_charged_frag_types

from alphadeep.rescore.feature_extractor import (
    ScoreFeatureExtractor, ScoreFeatureExtractor_wo_MS2
)

from alphadeep.rescore.fdr import (
    fdr_from_ref, fdr_to_q_values, calc_fdr_for_df
)

from alphadeep.pretrained_models import ModelManager

In [3]:
#export

class DeepLearningScore(torch.nn.Module):
    """Skeleton of a neural-network scorer.

    Only the input projection (one linear layer followed by a ReLU) is
    built so far. `nlayer` is accepted but not used, and this class
    does not define a `forward` method.
    """
    def __init__(self, in_features, nlayer=8):
        """
        Args:
            in_features: Number of input features of the first linear layer.
            nlayer: Accepted by the signature but currently ignored.
        """
        super().__init__()
        n_hidden = 128  # output width of the input projection
        layers = [
            torch.nn.Linear(in_features, n_hidden),
            torch.nn.ReLU(),
        ]
        self.nn = torch.nn.Sequential(*layers)

class Percolator:
    def __init__(self,
        *,
        ml_type='logistic_regression', #or 'random_forest'
        cv_fold = 2,
        n_iteration = 5,
        use_ms2_features=True,
        ms2_ppm = True, ms2_tol=30,
        model_mgr:ModelManager = None,
        **sklearn_kwargs
    ):
        self.charged_frag_types = get_charged_frag_types(
            ['b','y'], 2
        )
        self.ms2_ppm = ms2_ppm
        self.ms2_tol = ms2_tol
        if ml_type == 'logistic_regression':
            ml_type = 'lr'
        self.ml_type = ml_type
        self.fdr_level = 'psm'
        self.fdr = 0.01
        self.cv_fold = cv_fold
        self.n_iter = n_iteration

        if ml_type == 'lr':
            self.model = LogisticRegression(
                solver='liblinear', **sklearn_kwargs
            )
        else:
            self.model = RandomForestClassifier(**sklearn_kwargs)
        if use_ms2_features:
            self.feature_extractor = ScoreFeatureExtractor(
                model_mgr=model_mgr,
                ppm=ms2_ppm, tol=ms2_tol
            )
        else:
            self.feature_extractor = ScoreFeatureExtractor_wo_MS2(
                model_mgr=model_mgr,
            )
        self.feature_list = self.feature_extractor.score_feature_list
        self.feature_list += ['score','nAA','charge']
        #self.feature_list.append('ml_score')

    def enable_model_fine_tuning(self):
        self.feature_extractor.model_fine_tuning = True
    def disable_model_fine_tuning(self):
        self.feature_extractor.model_fine_tuning = False

    def _estimate_fdr(self, df:pd.DataFrame)->pd.DataFrame:
        df = df.sort_values(['ml_score','decoy'], ascending=False)
        df = df.reset_index(drop=True)
        if self.fdr_level == 'psm':
            target_values = 1-df['decoy'].values
            decoy_cumsum = np.cumsum(df['decoy'].values)
            target_cumsum = np.cumsum(target_values)
            fdr_values = decoy_cumsum/target_cumsum
            df['fdr'] = fdr_to_q_values(fdr_values)
        else:
            if self.fdr_level == 'precursor':
                _df = df.groupby([
                    'sequence','mods','mod_sites','charge','decoy'
                ])['ml_score'].max()
            elif self.fdr_level == 'peptide':
                _df = df.groupby([
                    'sequence','mods','mod_sites','decoy'
                ])['ml_score'].max()
            else:
                _df = df.groupby(['sequence','decoy'])['ml_score'].max()
            _df = _df.reset_index()
            _df = _df.sort_values(['ml_score','decoy'], ascending=False)
            target_values = 1-_df['decoy'].values
            decoy_cumsum = np.cumsum(_df['decoy'].values)
            target_cumsum = np.cumsum(target_values)
            fdr_values = decoy_cumsum/target_cumsum
            _df['fdr'] = fdr_to_q_values(fdr_values)
            df['fdr'] = fdr_from_ref(
                df['ml_score'].values, _df['ml_score'].values, 
                _df['fdr'].values
            )
        return df

    def _cv_score(self, df:pd.DataFrame)->pd.DataFrame:
        df = df.sample(frac=1).reset_index(drop=True)
        df_target = df[df.decoy == 0]
        df_decoy = df[df.decoy != 0]
        
        if self.cv_fold > 1:
            test_df_list = []
            for i in range(self.cv_fold):
                t_mask = np.ones(len(df_target), dtype=bool)
                _slice = slice(i, len(df_target), self.cv_fold)
                t_mask[_slice] = False
                cv_df_target = df_target[t_mask]
                train_t_df = cv_df_target[
                    cv_df_target.fdr <= self.fdr
                ]
                test_t_df = df_target[_slice]
                
                d_mask = np.ones(len(df_decoy), dtype=bool)
                _slice = slice(i, len(df_decoy), self.cv_fold)
                d_mask[_slice] = False
                train_d_df = df_decoy[d_mask]
                test_d_df = df_decoy[_slice]

                train_df = pd.concat((train_t_df, train_d_df))
                train_label = np.ones(len(train_df),dtype=np.int32)
                train_label[len(train_t_df):] = 0
                test_df = pd.concat((test_t_df, test_d_df))

                self.model.fit(
                    train_df[
                        self.feature_list
                    ].values, train_label
                )
                if self.ml_type == 'lr':
                    test_df['ml_score'] = self.model.decision_function(
                        test_df[self.feature_list].values
                    )
                else:
                    test_df['ml_score'] = self.model.predict_proba(
                        test_df[self.feature_list].values
                    )[:,1]
                test_df_list.append(test_df)
        
            return pd.concat(test_df_list)
        else:
            train_t_df = df_target[df_target.fdr <= self.fdr]
            train_df = pd.concat((train_t_df, df_decoy))
            train_label = np.ones(len(train_df),dtype=np.int32)
            train_label[len(train_t_df):] = 0
            test_df = pd.concat((df_target, df_decoy))

            self.model.fit(train_df[self.feature_list].values, train_label)
            if self.ml_type == 'lr':
                test_df['ml_score'] = self.model.decision_function(
                    test_df[self.feature_list].values
                )
            else:
                test_df['ml_score'] = self.model.predict_proba(
                    test_df[self.feature_list].values
                )[:,1]
        
            return test_df

    def extract_features(self,
        psm_df:pd.DataFrame, ms2_file_dict:dict, ms2_file_type:str
    )->pd.DataFrame:
        psm_df['ml_score'] = psm_df.score
        psm_df = self._estimate_fdr(psm_df)
        print('Extracting features ...')
        psm_df = self.feature_extractor.extract_features(
            psm_df, ms2_file_dict, 
            ms2_file_type, 
            psm_tune_df=psm_df[(psm_df.fdr<0.01)&(psm_df.decoy==0)],
            frag_types_to_match=self.charged_frag_types, 
            ms2_ppm=self.ms2_ppm, ms2_tol=self.ms2_tol
        )
        print('End extracting features ...')
        return psm_df

    def re_score(self, df:pd.DataFrame)->pd.DataFrame:
        print(f'{len(df[(df.fdr<=self.fdr) & (df.decoy==0)])} target PSMs at {self.fdr} psm-level FDR')
        for i in range(self.n_iter):
            print(f'[RUN] Iteration {i+1} of Percolator ...')
            df = self._cv_score(df)
            df = self._estimate_fdr(df)
            print(f'[RUN] {len(df[(df.fdr<=self.fdr) & (df.decoy==0)])} target PSMs at {self.fdr} psm-level FDR')
        df = self._estimate_fdr(df)
        print(f'[END] {len(df[(df.fdr<=self.fdr) & (df.decoy==0)])} target PSMs at {self.fdr} {self.fdr_level}-level FDR')
        return df

    def run(self,
        psm_df:pd.DataFrame, ms2_file_dict:dict, ms2_file_type:str
    )->pd.DataFrame:
        df = self.extract_features(
            psm_df, ms2_file_dict, ms2_file_type
        )
        return self.re_score(df)

In [6]:
#hide
# Local smoke test of Percolator on pFind results. Both inputs live under the
# user's home directory; each part is skipped when its file is missing.
import os
from alphabase.io.psm_reader.pfind_reader import pFindReader
# MS2 features are switched off, so no MS2 files are needed further down.
perc = Percolator(use_ms2_features=False)
pfind_filtered = os.path.expanduser('~/Workspace/Data/peptidome/pFind-Filtered.spectra')
pfind_txt = os.path.expanduser('~/Workspace/Data/peptidome/pFind.spectra')
if os.path.isfile(pfind_filtered):
    # pFind's own filtered result, shown only for comparison.
    pfind_filtered_df = pFindReader().import_file(pfind_filtered)
    display(pfind_filtered_df)
if os.path.isfile(pfind_txt):
    # NOTE(review): fdr=1 / keep_decoy=True presumably load the unfiltered
    # PSMs including decoys - confirm against pFindReader.
    pfind_df = pFindReader(fdr=1, keep_decoy=True).import_file(pfind_txt)
    # Two rescoring iterations instead of the constructor default.
    perc.n_iter = 2
    # ms2_file_dict and ms2_file_type are passed as None.
    df = perc.run(pfind_df, None, None)
    display(df[df.fdr<0.01])

Unnamed: 0,sequence,charge,raw_name,query_id,spec_idx,score,proteins,uniprot_ids,fdr,decoy,nAA,mods,mod_sites,precursor_mz
0,ILLKVY,2,20140306_EXQ6_MiBa_SA_MM5-HLAp-2,20140306_EXQ6_MiBa_SA_MM5-HLAp-2.28034.28034.2...,28034,1.001345,REV_tr|F8VSC5|F8VSC5_HUMAN/tr|H0Y4R8|H0Y4R8_HU...,REV_tr|F8VSC5|F8VSC5_HUMAN/tr|H0Y4R8|H0Y4R8_HU...,0.009922,0,6,,,374.752008
1,VITEEM,1,20140306_EXQ6_MiBa_SA_MM5-HLAp-2,20140306_EXQ6_MiBa_SA_MM5-HLAp-2.10658.10658.1...,10658,2.635344,tr|E9PHY5|E9PHY5_HUMAN/tr|E9PK52|E9PK52_HUMAN/...,tr|E9PHY5|E9PHY5_HUMAN/tr|E9PK52|E9PK52_HUMAN/...,0.001474,0,6,Oxidation@M,6,737.338583
2,IARMLY,2,20140306_EXQ6_MiBa_SA_MM5-HLAp-2,20140306_EXQ6_MiBa_SA_MM5-HLAp-2.15623.15623.2...,15623,1.126552,tr|B0QZE2|B0QZE2_HUMAN/tr|E7ENI9|E7ENI9_HUMAN/...,tr|B0QZE2|B0QZE2_HUMAN/tr|E7ENI9|E7ENI9_HUMAN/...,0.007822,0,6,Oxidation@M,4,391.715099
3,KMIEKY,2,20140306_EXQ6_MiBa_SA_MM5-HLAp-2,20140306_EXQ6_MiBa_SA_MM5-HLAp-2.5402.5402.2.0...,5402,3.929352,tr|F8W6C1|F8W6C1_HUMAN/tr|G3XAE9|G3XAE9_HUMAN/...,tr|F8W6C1|F8W6C1_HUMAN/tr|G3XAE9|G3XAE9_HUMAN/...,0.000000,0,6,Oxidation@M,2,414.220214
4,VFPLAM,2,20140306_EXQ6_MiBa_SA_MM5-HLAp-2,20140306_EXQ6_MiBa_SA_MM5-HLAp-2.28555.28555.2...,28555,2.265183,tr|D6RDF8|D6RDF8_HUMAN/tr|H0Y8M4|H0Y8M4_HUMAN/...,tr|D6RDF8|D6RDF8_HUMAN/tr|H0Y8M4|H0Y8M4_HUMAN/...,0.001813,0,6,Oxidation@M,6,347.185643
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8821,TQPRGPPASSPAPAPKFSPVTPKFTPVASK,5,20140306_EXQ6_MiBa_SA_MM5-HLAp-2,20140306_EXQ6_MiBa_SA_MM5-HLAp-2.25523.25523.5...,25523,2.497027,tr|B4DQX7|B4DQX7_HUMAN/tr|H0Y2Y8|H0Y2Y8_HUMAN/...,tr|B4DQX7|B4DQX7_HUMAN/tr|H0Y2Y8|H0Y2Y8_HUMAN/...,0.001474,0,30,,,609.936225
8822,TAPVQAPPAPVIVTETPEPAMTSGVYRPPG,3,20140306_EXQ6_MiBa_SA_MM5-HLAp-2,20140306_EXQ6_MiBa_SA_MM5-HLAp-2.34545.34545.3...,34545,3.965872,tr|D6R9V8|D6R9V8_HUMAN/tr|D6RAV0|D6RAV0_HUMAN/...,tr|D6R9V8|D6R9V8_HUMAN/tr|D6RAV0|D6RAV0_HUMAN/...,0.000000,0,30,Oxidation@M,21,1016.189984
8823,HGNRGETGPSGPVGPAGAVGPRGPSGPQGIR,5,20140306_EXQ6_MiBa_SA_MM5-HLAp-2,20140306_EXQ6_MiBa_SA_MM5-HLAp-2.18726.18726.5...,18726,1.566479,sp|P08123|CO1A2_HUMAN/,sp|P08123|CO1A2_HUMAN/,0.003885,0,31,,,576.098273
8824,SGSSGTGSTGNQNPGSPRPGSTGTWNPGSSER,3,20140306_EXQ6_MiBa_SA_MM5-HLAp-2,20140306_EXQ6_MiBa_SA_MM5-HLAp-2.17657.17657.3...,17657,6.077528,sp|P02671|FIBA_HUMAN/sp|P02671-2|FIBA_HUMAN/,sp|P02671|FIBA_HUMAN/sp|P02671-2|FIBA_HUMAN/,0.000000,0,32,,,1021.124529


Extracting features ...
End extracting features ...
9388 target PSMs at 0.01 psm-level FDR
[RUN] Iteration 1 of Percolator ...
[RUN] 9409 target PSMs at 0.01 psm-level FDR
[RUN] Iteration 2 of Percolator ...
[RUN] 9404 target PSMs at 0.01 psm-level FDR
[END] 9404 target PSMs at 0.01 psm-level FDR


Unnamed: 0,sequence,charge,raw_name,query_id,spec_idx,score,proteins,uniprot_ids,fdr,decoy,nAA,mods,mod_sites,precursor_mz,ml_score,rt_delta,rt_delta_abs,mobility_delta,mobility_delta_abs
0,HQGVMVGMGQKDSYVGDEAQSK,3,20140306_EXQ6_MiBa_SA_MM5-HLAp-2,20140306_EXQ6_MiBa_SA_MM5-HLAp-2.17280.17280.3...,17280,16.724100,tr|A6NL76|A6NL76_HUMAN/tr|B8ZZJ2|B8ZZJ2_HUMAN/...,tr|A6NL76|A6NL76_HUMAN/tr|B8ZZJ2|B8ZZJ2_HUMAN/...,0.000000,0,22,Oxidation@M,5,789.694975,138.028371,0,0,0,0
1,HAAENPGKYNILGTNTIMDK,3,20140306_EXQ6_MiBa_SA_MM5-HLAp-2,20140306_EXQ6_MiBa_SA_MM5-HLAp-2.21634.21634.3...,21634,15.454800,sp|Q00839|HNRPU_HUMAN/sp|Q00839-2|HNRPU_HUMAN/,sp|Q00839|HNRPU_HUMAN/sp|Q00839-2|HNRPU_HUMAN/,0.000000,0,20,Oxidation@M,18,735.031915,125.713062,0,0,0,0
2,HRHPDEAAFFDTASTGK,2,20140306_EXQ6_MiBa_SA_MM5-HLAp-2,20140306_EXQ6_MiBa_SA_MM5-HLAp-2.17970.17970.2...,17970,15.065901,sp|P02671|FIBA_HUMAN/sp|P02671-2|FIBA_HUMAN/,sp|P02671|FIBA_HUMAN/sp|P02671-2|FIBA_HUMAN/,0.000000,0,17,,,943.942638,124.156803,0,0,0,0
3,TQEKNPLPSKETIEQEKQAGES,3,20140306_EXQ6_MiBa_SA_MM5-HLAp-2,20140306_EXQ6_MiBa_SA_MM5-HLAp-2.17147.17147.3...,17147,14.723999,sp|P62328|TYB4_HUMAN/,sp|P62328|TYB4_HUMAN/,0.000000,0,22,,,824.413518,120.502182,0,0,0,0
4,HQGVMVGMGQKDSYVGDEAQSK,3,20140306_EXQ6_MiBa_SA_MM5-HLAp-2,20140306_EXQ6_MiBa_SA_MM5-HLAp-2.18983.18983.3...,18983,14.390601,tr|A6NL76|A6NL76_HUMAN/tr|B8ZZJ2|B8ZZJ2_HUMAN/...,tr|A6NL76|A6NL76_HUMAN/tr|B8ZZJ2|B8ZZJ2_HUMAN/...,0.000000,0,22,,,784.363337,117.580729,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9493,HLDLGILYY,2,20140306_EXQ6_MiBa_SA_MM5-HLAp-2,20140306_EXQ6_MiBa_SA_MM5-HLAp-2.39754.39754.2...,39754,0.921379,tr|H7C2L6|H7C2L6_HUMAN/sp|Q9H3H5|GPT_HUMAN/sp|...,tr|H7C2L6|H7C2L6_HUMAN/sp|Q9H3H5|GPT_HUMAN/sp|...,0.009996,0,9,,,553.797675,1.259418,0,0,0,0
9494,EAWQRHKM,3,20140306_EXQ6_MiBa_SA_MM5-HLAp-2,20140306_EXQ6_MiBa_SA_MM5-HLAp-2.8324.8324.3.0...,8324,0.901887,tr|B5MEG5|B5MEG5_HUMAN/tr|E7EST9|E7EST9_HUMAN/...,tr|B5MEG5|B5MEG5_HUMAN/tr|E7EST9|E7EST9_HUMAN/...,0.009996,0,8,,,362.515154,1.258769,0,0,0,0
9495,DVVTGYLAL,1,20140306_EXQ6_MiBa_SA_MM5-HLAp-2,20140306_EXQ6_MiBa_SA_MM5-HLAp-2.40647.40647.1...,40647,0.919406,tr|K7EIY8|K7EIY8_HUMAN/tr|K7EQ79|K7EQ79_HUMAN/...,tr|K7EIY8|K7EIY8_HUMAN/tr|K7EQ79|K7EQ79_HUMAN/...,0.009996,0,9,,,950.519324,1.256428,0,0,0,0
9496,EGFLKAQAL,2,20140306_EXQ6_MiBa_SA_MM5-HLAp-2,20140306_EXQ6_MiBa_SA_MM5-HLAp-2.26448.26448.2...,26448,0.918090,tr|Q5T8D1|Q5T8D1_HUMAN/tr|Q5T8D2|Q5T8D2_HUMAN/...,tr|Q5T8D1|Q5T8D1_HUMAN/tr|Q5T8D2|Q5T8D2_HUMAN/...,0.009996,0,9,,,488.776742,1.255860,0,0,0,0
