In [None]:
#| default_exp rescore.fdr

# Functionalities to calculate FDRs

### In peptdeep dataframes, we refer to fdr values as q_values without loss of generality.

In [None]:
#| export
import numba
import numpy as np
import pandas as pd

@numba.njit
def fdr_to_q_values(
    fdr_values:np.ndarray
)->np.ndarray:
    """Convert FDR values to q_values.

    The q_value at position `i` is the smallest FDR found at position `i`
    or at any later position, so the returned values never decrease along
    the input order.

    Args:
        fdr_values (np.ndarray): FDR values, they should be 
          sorted according to the descending order of the `score`.

    Returns:
        np.ndarray: q_values with the same shape and dtype as `fdr_values`.
          An empty input yields an empty output.
    """
    q_values = np.zeros_like(fdr_values)
    # `np.max` raises on a zero-size array, so bail out early on empty input.
    if len(fdr_values) == 0:
        return q_values
    min_q_value = np.max(fdr_values)
    # Walk from the last position to the first, carrying the running minimum.
    for i in range(len(fdr_values) - 1, -1, -1):
        fdr = fdr_values[i]
        if fdr < min_q_value:
            min_q_value = fdr
        q_values[i] = min_q_value
    return q_values

def calc_fdr(
    df:pd.DataFrame, 
    score_column:str, 
    decoy_column:str='decoy'
)->pd.DataFrame:
    """Calculate FDR values (q_values in fact) for the given dataframe.

    Rows are ranked by descending score; at each rank the FDR is
    (cumulative decoys) / (cumulative targets), which is then converted
    to q_values with `fdr_to_q_values`.

    Args:
        df (pd.DataFrame): PSM dataframe to calculate FDRs. It is not
          modified; a re-indexed, sorted copy is returned.
        score_column (str): score column to sort in descending order.
        decoy_column (str, optional): decoy column in the dataframe. 
          1=decoy, 0=target. Defaults to 'decoy'.

    Returns:
        pd.DataFrame: PSM dataframe sorted by descending score,
          with 'fdr' column added.
    """
    # Sorting on the decoy column as a secondary descending key puts decoys
    # ahead of targets when scores tie.
    df = df.reset_index(drop=True).sort_values(
        [score_column,decoy_column], ascending=False
    )
    decoy_values = df[decoy_column].values
    target_values = 1-decoy_values
    decoy_cumsum = np.cumsum(decoy_values)
    target_cumsum = np.cumsum(target_values)
    # While only decoys have been seen, target_cumsum is 0 and the ratio is
    # inf; that value is kept, only the RuntimeWarning is silenced.
    with np.errstate(divide='ignore', invalid='ignore'):
        fdr_values = decoy_cumsum/target_cumsum
    df['fdr'] = fdr_to_q_values(fdr_values)
    return df

# Alias: `calc_fdr_for_df` is the same function object as `calc_fdr`.
calc_fdr_for_df = calc_fdr

@numba.njit
def fdr_from_ref(
    sorted_scores:np.ndarray, 
    ref_scores:np.ndarray, 
    ref_fdr_values:np.ndarray
)->np.ndarray:
    """ Calculate FDR values from the given reference scores and fdr_values. 
    It is used to extend peptide-level or sequence-level FDR (reference) 
    to each PSM, as PSMs are more useful for quantification.

    Each score receives the FDR of the first reference entry (in the given
    order) whose score is not greater than it. Scores smaller than every 
    reference score receive the last reference FDR.

    Args:
        sorted_scores (np.array): the scores to calculate FDRs, 
          they must be sorted in descending order.
        ref_scores (np.array): reference scores that were used to 
          calculate ref_fdr_values, also sorted in descending order.
        ref_fdr_values (np.array): fdr values corresponding to ref_scores.
          NOTE(review): presumably expected to be non-empty whenever 
          `sorted_scores` is non-empty, since the last element is read 
          as the fallback value; confirm with callers.

    Returns:
        np.array: fdr values corresponding to sorted_scores, with the
          dtype of `ref_fdr_values`.
    """
    # Allocate with the dtype of the FDR values, not of the scores, so that
    # integer-typed scores cannot truncate fractional FDRs to 0.
    q_values = np.zeros(len(sorted_scores), dtype=ref_fdr_values.dtype)
    i,j = 0,0
    # Two-pointer sweep: both arrays are descending, so `j` only moves forward.
    while i < len(sorted_scores) and j < len(ref_scores):
        if sorted_scores[i] >= ref_scores[j]:
            q_values[i] = ref_fdr_values[j]
            i += 1
        else:
            j += 1
    # Remaining scores are below every reference score.
    while i < len(sorted_scores):
        q_values[i] = ref_fdr_values[-1]
        i += 1
    return q_values

def calc_fdr_from_ref(
    df: pd.DataFrame,
    ref_scores:np.ndarray, 
    ref_fdr_values:np.ndarray,
    score_column:str, 
    decoy_column:str='decoy'
)->pd.DataFrame:
    """ Calculate FDR values for a PSM dataframe from the given reference
     scores and fdr_values. It is used to extend peptide-level or 
     sequence-level FDR (reference) to each PSM, as PSMs are more useful 
     for quantification.

    Args:
        df (pd.DataFrame): PSM dataframe. It is not modified; a 
          re-indexed, sorted copy is returned.
        ref_scores (np.array): reference scores that were used to 
          calculate ref_fdr_values. Paired element-wise with 
          `ref_fdr_values`; both are re-ordered here by ascending 
          fdr value before the lookup.
        ref_fdr_values (np.array): fdr values corresponding to ref_scores
        score_column (str): score column in the dataframe
        decoy_column (str, optional): decoy column in the dataframe. 
          1=decoy, 0=target. Only used as a secondary sort key. 
          Defaults to 'decoy'.

    Returns:
        pd.DataFrame: dataframe sorted by descending score, 
          with 'fdr' column added
    """
    df = df.reset_index(drop=True).sort_values(
        [score_column,decoy_column], ascending=False
    )
    # Order the reference pairs by ascending fdr before handing them to
    # `fdr_from_ref`.
    sorted_idxes = np.argsort(ref_fdr_values)
    ref_scores = ref_scores[sorted_idxes]
    ref_q_values = ref_fdr_values[sorted_idxes]
    # Use the caller-supplied score column rather than a hard-coded
    # `score` attribute.
    df['fdr'] = fdr_from_ref(
        df[score_column].values, ref_scores, ref_q_values
    )
    return df

# Alias: `calc_fdr_from_ref_for_df` is the same function object as `calc_fdr_from_ref`.
calc_fdr_from_ref_for_df = calc_fdr_from_ref

In [None]:
# Synthetic PSMs: 500 high-scoring true targets, 500 low-scoring
# decoy/false-target pairs, and 5 decoys scoring just below the true targets.
true_targets = pd.DataFrame(
    {
        'score': np.random.random(500)*10+11,
        'decoy': 0,
        'kind': True,
    }
)
f_score = np.random.random(500)*9.9
low_decoys = pd.DataFrame(
    {
        'score': f_score+0.01,
        'decoy': 1,
        'kind': False
    }
)
false_targets = pd.DataFrame(
    {
        'score': f_score,
        'decoy': 0,
        'kind': False
    }
)
high_decoys = pd.DataFrame(
    {
        'score': np.random.random(5)+10,
        'decoy': 1,
        'kind': False
    }
)
# `DataFrame.append` was removed in pandas 2.0; `pd.concat` stacks the
# frames the same way (original row labels are kept).
df = pd.concat([true_targets, low_decoys, false_targets, high_decoys])

df = calc_fdr(df, 'score', 'decoy')
df

Unnamed: 0,score,decoy,kind,fdr
347,20.999467,0,True,0.000000
366,20.978207,0,True,0.000000
266,20.976567,0,True,0.000000
498,20.900502,0,True,0.000000
213,20.893660,0,True,0.000000
...,...,...,...,...
1216,0.064770,0,False,0.504008
669,0.062011,1,False,0.504505
1169,0.052011,0,False,0.504505
614,0.038895,1,False,0.505000


In [None]:
# Show the non-decoy rows whose fdr is below 0.01.
df[(df.fdr < 0.01)&(df.decoy==0)]

Unnamed: 0,score,decoy,kind,fdr
347,20.999467,0,True,0.0
366,20.978207,0,True,0.0
266,20.976567,0,True,0.0
498,20.900502,0,True,0.0
213,20.893660,0,True,0.0
...,...,...,...,...
312,11.078605,0,True,0.0
493,11.052031,0,True,0.0
487,11.047116,0,True,0.0
363,11.044195,0,True,0.0


In [None]:
#| hide
# Expect exactly 500 non-decoy rows with fdr below 0.01.
assert len(df[(df.fdr < 0.01)&(df.decoy==0)]) == 500

In [None]:
#| hide
# Second synthetic PSM set with the same layout as `df`, scored against
# the reference scores/fdr values taken from `df`.
true_targets = pd.DataFrame(
    {
        'score': np.random.random(500)*10+11,
        'decoy': 0
    }
)
f_score = np.random.random(500)*9.9
low_decoys = pd.DataFrame(
    {
        'score': f_score+0.01,
        'decoy': 1
    }
)
false_targets = pd.DataFrame(
    {
        'score': f_score,
        'decoy': 0
    }
)
high_decoys = pd.DataFrame(
    {
        'score': np.random.random(5)+10,
        'decoy': 1
    }
)
# `DataFrame.append` was removed in pandas 2.0; use `pd.concat` instead.
dff = pd.concat([true_targets, low_decoys, false_targets, high_decoys])

# `fdr_from_ref` requires its scores in descending order, so sort first.
dff = dff.sort_values('score', ascending=False)

dff['fdr'] = fdr_from_ref(dff.score.values, df.score.values, df.fdr.values)

assert len(dff[(dff.fdr < 0.01)&(dff.decoy==0)]) == 500

In [None]:
#| hide
# Same check through the dataframe-level entry point, using `df` as reference.
dff = calc_fdr_from_ref(dff, df.score.values, df.fdr.values, 'score')
assert len(dff[(dff.fdr < 0.01)&(dff.decoy==0)]) == 500