In [None]:
# default_exp matching

# Matching

> Functions related to matching

In [None]:
#hide
from nbdev.showdoc import *

## Dataset Alignment

Align datasets by comparing shared precursors and calculating the median offset.
All files will be compared with each other, and a linear equation system is used to calculate the best offset.

The offset is applied either relatively (mz, mobility) or absolutely (rt).

In [None]:
#export
import logging
import pandas as pd
from itertools import combinations 
import numpy as np
import os
import alphapept.io
import functools
from sklearn.linear_model import LinearRegression


def calculate_distance(table_1, table_2, offset_cols, calib = False):
    """
    Calculate the median offset between two tables for every offset column.

    Only precursors that occur in both tables are compared. Precursors are
    taken from a `precursor` column when the table has one, otherwise from the
    index (which then has to be named `precursor` for the grouping to work).
    Multiple rows of the same precursor are averaged before the comparison.

    Args:
        table_1: First DataFrame.
        table_2: Second DataFrame.
        offset_cols: Dict mapping a column name to 'absolute' or 'relative'.
        calib: If True, the `<column>_calib` columns are compared instead of
            the plain columns.

    Returns:
        Tuple of (list with one median offset per entry of `offset_cols`,
        number of shared precursors). The offsets are `table_1 - table_2`;
        relative offsets are divided by the mean of both values.

    Raises:
        NotImplementedError: If a mode is neither 'absolute' nor 'relative'.

    TODO: We could use a weighting factor
    """

    def precursor_keys(table):
        # Prefer the explicit column; fall back to the index for tables that
        # are already indexed by precursor.
        return table['precursor'] if 'precursor' in table.columns else table.index

    keys_1 = precursor_keys(table_1)
    keys_2 = precursor_keys(table_2)

    shared_precursors = list(set(keys_1).intersection(set(keys_2)))

    # Boolean row masks work for both a key column and a key index.
    mask_1 = np.asarray(keys_1.isin(shared_precursors))
    mask_2 = np.asarray(keys_2.isin(shared_precursors))

    # numeric_only keeps text columns from breaking the mean.
    table_1_ = table_1[mask_1].groupby('precursor').mean(numeric_only=True)
    table_2_ = table_2[mask_2].groupby('precursor').mean(numeric_only=True)

    deltas = []

    for col, mode in offset_cols.items():
        col_ = col + '_calib' if calib else col

        if mode == 'absolute':
            deltas.append(np.nanmedian(table_1_[col_] - table_2_[col_]))
        elif mode == 'relative':
            deltas.append(np.nanmedian((table_1_[col_] - table_2_[col_]) / (table_1_[col_] + table_2_[col_]) * 2))
        else:
            # Look the mode up by the plain column name; the `_calib` name is
            # not a key of offset_cols.
            raise NotImplementedError(mode)

    return deltas, len(shared_precursors)

def calib_table(table, delta, offset_cols):
    """
    Apply an offset to a table in place.

    For every column in `offset_cols` a `<column>_calib` column is written
    (created if it does not exist yet). If the table has no `<column>` but a
    `<column>_apex` column, the `_apex` column is used as the source.

    Args:
        table: DataFrame that is modified in place.
        delta: Mapping / Series with one offset per key of `offset_cols`.
        offset_cols: Dict mapping a column name to 'absolute' or 'relative'.

    Raises:
        NotImplementedError: If a mode is neither 'absolute' nor 'relative'.
    """
    for col, mode in offset_cols.items():

        if (col not in table.columns) and (col+'_apex' in table.columns):
            source_col = col+'_apex'
        else:
            source_col = col

        # The offset is always keyed by the plain column name, the values are
        # always read from the resolved source column.
        if mode == 'absolute':
            table[col+'_calib'] = table[source_col] - delta[col]
        elif mode == 'relative':
            table[col+'_calib'] = (1 - delta[col]) * table[source_col]
        else:
            raise NotImplementedError(mode)
            
def align(deltas, filenames, weights=None):
    """
    Solve the equation system that relates pairwise deltas to offsets.

    Every row of `deltas` is indexed by a `(start, end)` pair of filenames. The
    design matrix has `len(filenames) - 1` unknowns; the row of a pair has ones
    in the positions `start_idx` up to (not including) `end_idx`, i.e. a
    pairwise delta is modelled as the sum of the steps between consecutive
    files that lie between the two files. Rows containing NaN are dropped
    before fitting.

    Args:
        deltas: DataFrame of pairwise deltas, indexed by (start, end) tuples.
        filenames: List of filenames; `start` is expected to come before `end`.
        weights: Optional per-row sample weights in the row order of `deltas`.

    Returns:
        Array of shape (len(filenames) - 1, n_columns) with the fitted value
        of every unknown for every column of `deltas`.
    """
    n_unknowns = len(filenames) - 1

    matrix = np.zeros((len(deltas), n_unknowns))

    for row, (start, end) in enumerate(deltas.index):
        matrix[row, filenames.index(start):filenames.index(end)] = 1

    # Remove nan values. A plain numpy mask is used so that the matrix, the
    # deltas and the weights are all filtered positionally.
    not_nan = (~deltas.isnull().any(axis=1)).values
    matrix = matrix[not_nan]
    deltas_ = deltas[not_nan]

    # Count the rows that actually enter the fit: the number of all pairs is
    # never smaller than the number of unknowns, so checking the unfiltered
    # deltas could not detect missing overlap.
    if len(deltas_) < n_unknowns:
        logging.info('Low overlap between datasets detected. Alignment may fail.')

    if weights is not None:
        reg = LinearRegression(fit_intercept=False).fit(matrix, deltas_.values, sample_weight = np.asarray(weights)[not_nan])
    else:
        reg = LinearRegression(fit_intercept=False).fit(matrix, deltas_.values)

    score = reg.score(matrix, deltas_.values)

    logging.info(f"Regression score is {score}")

    # Predicting on the identity matrix reads out one fitted value per unknown.
    x = reg.predict(np.eye(n_unknowns))

    #x = np.linalg.lstsq(matrix, deltas_.values, rcond=None)[0] #Alternative w/o weights

    return x


def calculate_deltas(combos, calib = False, callback=None):
    """
    Calculate offsets for multiple file pairs.

    For every pair the `peptide_fdr` dataset of the corresponding
    `<name>.ms_data.hdf` files is read and compared with `calculate_distance`.
    The offset columns are 'mz' (relative) and 'rt' (absolute), plus
    'mobility' (relative) if the first table read has a `mobility` column.

    Args:
        combos: Sequence of (file_1, file_2) pairs.
        calib: If True, the `_calib` columns are compared.
        callback: Optional callable; called after every pair with the
            fraction of pairs processed so far.

    Returns:
        Tuple of (DataFrame of deltas indexed by the pairs, array with the
        number of shared precursors per pair, dict of offset columns). For
        empty `combos` the DataFrame and the dict are empty.

    TODO: Parallelize
    """

    offset_cols = {}
    cols = []

    rows = []
    weights = []

    for i, combo in enumerate(combos):

        file1 = os.path.splitext(combo[0])[0] + '.ms_data.hdf'
        file2 = os.path.splitext(combo[1])[0] + '.ms_data.hdf'

        df_1 = alphapept.io.MS_Data_File(file1).read(dataset_name="peptide_fdr")
        df_2 = alphapept.io.MS_Data_File(file2).read(dataset_name="peptide_fdr")

        if not offset_cols:
            offset_cols = {'mz':'relative', 'rt':'absolute'}
            if 'mobility' in df_1.columns:
                logging.info("Also using mobility for calibration.")
                offset_cols['mobility'] = 'relative'
            cols = list(offset_cols.keys())

        dists, weight = calculate_distance(df_1, df_2, offset_cols, calib = calib)

        # Collect plain rows and build the frame once at the end instead of
        # growing a DataFrame row by row.
        rows.append(dists)
        weights.append(weight)

        if callback:
            callback((i+1)/len(combos))

    if rows:
        index = pd.MultiIndex.from_tuples([tuple(combo) for combo in combos])
        deltas = pd.DataFrame(rows, columns = cols, index = index)
    else:
        deltas = pd.DataFrame()

    return deltas, np.array(weights), offset_cols


def align_files(filenames, alignment, offset_cols):
    """
    Write calibrated columns to the ms_data files.

    For every file the `peptide_fdr` and `feature_table` datasets of the
    matching `<name>.ms_data.hdf` file are read, calibrated with the row of
    `alignment` at the file's position and written back.

    Args:
        filenames: List of file paths; the extension is replaced by
            `.ms_data.hdf`.
        alignment: DataFrame with one row of offsets per file, in the order
            of `filenames`.
        offset_cols: Dict mapping a column name to 'absolute' or 'relative'.
    """
    dataset_names = ('peptide_fdr', 'feature_table')

    for position, raw_path in enumerate(filenames):
        stem, _extension = os.path.splitext(raw_path)
        file = stem + '.ms_data.hdf'

        for column in dataset_names:
            table = alphapept.io.MS_Data_File(file).read(dataset_name=column)
            calib_table(table, alignment.iloc[position], offset_cols)

            logging.info(f"Saving {file} - {column}.")

            # A second handle is opened because writing needs is_overwritable.
            writer = alphapept.io.MS_Data_File(file, is_overwritable=True)
            writer.write(table, dataset_name=column)


def align_datasets(settings, callback=None):
    """
    Align all files of an experiment and log the deviation before and after.

    The pairwise deltas of all file combinations are calculated, the equation
    system is solved with `align`, the resulting offsets are written to the
    files with `align_files`, and the deltas are recalculated on the `_calib`
    columns to report the change. Nothing is done for fewer than two files.

    Args:
        settings: Settings dict; `settings['experiment']['file_paths']` is the
            list of files.
        callback: Optional callable that receives the overall progress. The
            two delta calculations each account for half of the range, so the
            reported values go from 0 to 1.
    """
    filenames = settings['experiment']['file_paths']

    if len(filenames) <= 1:
        logging.info('Only 1 dataset present. Skipping alignment.')
        return

    n_steps = 2  # deltas are calculated once before and once after calibration

    def progress_for(step):
        """Return a callback mapping the progress of one step to overall progress."""
        if not callback:
            return None

        def report(current):
            callback((step + current) / n_steps)

        return report

    def ratio(after, before):
        """Return after/before per key; nan where the deviation before was zero."""
        return {k: (v / before[k] if before[k] != 0 else np.nan) for k, v in after.items()}

    combos = list(combinations(filenames, 2))

    deltas, weights, offset_cols = calculate_deltas(combos, callback=progress_for(0))

    cols = list(offset_cols.keys())

    before_sum = deltas.abs().sum().to_dict()
    before_mean = deltas.abs().mean().to_dict()

    logging.info(f'Total deviation before calibration {before_sum}')
    logging.info(f'Mean deviation before calibration {before_mean}')

    logging.info('Solving equation system')

    # NOTE(review): each row returned by align() is the fitted value of one
    # unknown of its design matrix, i.e. the step between two consecutive
    # files. It is used below as the offset of a file without accumulating
    # the steps - confirm whether a cumulative sum is intended for more than
    # two files.
    alignment = pd.DataFrame(align(deltas, filenames, weights), columns = cols)
    # The first file is the reference and gets a zero row; afterwards the
    # offsets are centered around their mean.
    alignment = pd.concat([pd.DataFrame(np.zeros((1, alignment.shape[1])), columns= cols), alignment])
    alignment -= alignment.mean()

    logging.info('Solving equation system complete.')

    logging.info('Applying offset')

    align_files(filenames, -alignment, offset_cols)

    deltas, weights, offset_cols = calculate_deltas(combos, calib=True, callback=progress_for(1))

    after_sum = deltas.abs().sum().to_dict()
    after_mean = deltas.abs().mean().to_dict()

    logging.info(f'Total deviation after calibration {after_sum}')
    logging.info(f'Mean deviation after calibration {after_mean}')

    change_sum = ratio(after_sum, before_sum)
    change_mean = ratio(after_mean, before_mean)

    logging.info(f'Change (after/before) total deviation {change_sum}')
    logging.info(f'Change (after/before) mean deviation {change_mean}')

## Matching 

Transfer MS2 identifications to similar MS1 features.

Brief outline of the computational task

- Start with aligned datasets
- Combine all datasets in one dataframe
- Group by precursor and calculate expected location (rt, mz, mobility) and standard deviation to calculate a reference
- For each dataset, calculate the subset of precursors that were not identified but are present in the reference
- Search for the closest neighbor for each element in the subset in the identified features
- The distance is calculated using the median standard deviation of the reference
- Use the `Mahalanobis` distance to calculate a probability that one feature belongs to the distribution

For Mahalanobis distance, see here:
https://stats.stackexchange.com/questions/331283/how-to-calculate-the-probability-of-a-data-point-belonging-to-a-multivariate-nor

In [None]:
#export

from sklearn.neighbors import KDTree
from alphapept.utils import assemble_df
from scipy import stats

def get_probability(df, ref, sigma, index):
    """
    Chi-square CDF of the distance between a point and its reference.

    Row `index` of `sigma` is placed on the diagonal of a matrix; the inverse
    of that matrix is used to form the quadratic form of the difference
    between row `index` of `df` and row `index` of `ref`. The result is the
    chi-square CDF of this value with one degree of freedom per column.

    Args:
        df: DataFrame with the observed points.
        ref: DataFrame with the reference points (same row order as `df`).
        sigma: DataFrame with the per-row scale values.
        index: Positional row to evaluate.

    Returns:
        The CDF value, or nan if the calculation raises (e.g. the matrix
        cannot be inverted).
    """
    scale = sigma.iloc[index].values
    scale_matrix = scale*np.eye(len(scale))

    center = ref.iloc[index].values
    point = df.iloc[index].values

    try:
        offset = point - center
        projected = np.dot(offset.transpose(), np.linalg.inv(scale_matrix))
        squared_distance = np.dot(projected, offset)
        probability = stats.chi2.cdf(squared_distance, len(center))
    except Exception:
        probability = np.nan

    return probability

def match_datasets(settings, callback = None):
    """
    Transfer identifications between files by matching to nearby features.

    A reference is built from the `peptide_fdr` tables of all files (mean and
    standard deviation of the calibrated columns per precursor). For every
    file, the precursors of the reference that are not in the file's
    `peptide_fdr` table are looked up in the file's `feature_table` with a
    nearest-neighbor search on the calibrated columns, each scaled by the
    median standard deviation of the reference. Candidates are kept when the
    scaled distance is below `min_match_d` and `get_probability` is below
    `min_match_p`; they are appended to `peptide_fdr` with type 'matched' and
    the file is rewritten. Nothing is done for fewer than three files.

    Args:
        settings: Settings dict; uses `settings['experiment']['file_paths']`
            and `settings['matching']['min_match_p'/'min_match_d']`.
        callback: Accepted for interface compatibility; currently not used.
    """
    filenames = settings['experiment']['file_paths']

    if len(filenames) <= 2:
        logging.info('Less than 3 datasets present. Skipping matching.')
        return

    xx = alphapept.utils.assemble_df(settings, field='peptide_fdr')

    base_col = ['precursor']
    alignment_cols = ['mz_calib','rt_calib']
    extra_cols = ['score','decoy','target']

    use_mobility = 'mobility' in xx.columns
    if use_mobility:
        alignment_cols += ['mobility_calib']

    grouped = xx[base_col + alignment_cols + extra_cols].groupby('precursor').mean()
    std_ = xx[base_col + alignment_cols].groupby('precursor').std()

    for col in alignment_cols:
        grouped[col+'_std'] = std_[col]

    # One scale per alignment column, in the order of alignment_cols.
    std_range = np.nanmedian(std_.values, axis=0)

    min_match_p = settings['matching']['min_match_p']
    min_match_d = settings['matching']['min_match_d']

    lookup_dict = xx.set_index('precursor')[['sequence']].to_dict()

    for filename in filenames:
        file = os.path.splitext(filename)[0] + '.ms_data.hdf'

        df = alphapept.io.MS_Data_File(file).read(dataset_name='peptide_fdr')
        features = alphapept.io.MS_Data_File(file).read(dataset_name='feature_table')
        features['feature_idx'] = features.index

        # Sets are not valid indexers in current pandas, so the candidates
        # are kept as a list; its order defines the row order of everything
        # derived from `reference` below.
        candidates = list(set(grouped.index) - set(df['precursor']))
        logging.info(f'Trying to match file {file} with database of {len(candidates):,} unidentified candidates')

        reference = grouped.loc[candidates]

        if use_mobility:
            logging.info("Using mobility")

        # Scale every dimension by the median standard deviation so that the
        # distance threshold applies equally to all of them.
        tree_points = features[alignment_cols].values / std_range
        query_points = reference[alignment_cols].values / std_range

        matching_tree = KDTree(tree_points, metric="minkowski")

        dist, nearest = matching_tree.query(query_points, k=1)

        # copy() so that the assignments below do not write to a slice.
        matched = features.iloc[nearest[:,0]].copy()

        for col in extra_cols:
            matched[col] = reference[col].values

        # query() returns (n, 1) arrays for k=1; use a flat row mask.
        to_keep = dist[:,0] < min_match_d

        matched = matched[to_keep].copy()

        ref = reference[alignment_cols][to_keep]
        sigma = std_.loc[candidates][to_keep]

        logging.info(f'{len(matched):,} possible features for matching based on distance of {min_match_d}')

        matched_points = matched[alignment_cols]
        matched['matching_p'] = [get_probability(matched_points, ref, sigma, i) for i in range(len(matched))]
        matched['precursor'] = ref.index.values

        matched = matched[matched['matching_p']< min_match_p].copy()

        logging.info(f'{len(matched):,} possible features for matching based on probability of {min_match_p}')

        matched['type'] = 'matched'

        for key in lookup_dict.keys():
            matched[key] = [lookup_dict[key][x] for x in matched['precursor']]

        df['type'] = 'msms'
        df['matching_p'] = np.nan

        shared_columns = [col for col in df.columns if col in matched.columns]

        df_ = pd.concat([df, matched[shared_columns]], ignore_index=True)

        logging.info(f"Saving {file} - peptide_fdr.")
        ms_file = alphapept.io.MS_Data_File(file, is_overwritable=True)

        ms_file.write(df_, dataset_name='peptide_fdr')

In [None]:
#hide
# Run nbdev's notebook2script export; the cell output lists the notebooks
# that were converted.
from nbdev.export import *
notebook2script()

Converted 00_settings.ipynb.
Converted 01_chem.ipynb.
Converted 02_io.ipynb.
Converted 03_fasta.ipynb.
Converted 04_feature_finding.ipynb.
Converted 05_search.ipynb.
Converted 06_score.ipynb.
Converted 07_recalibration.ipynb.
Converted 08_quantification.ipynb.
Converted 09_matching.ipynb.
Converted 10_constants.ipynb.
Converted 11_interface.ipynb.
Converted 12_speed.ipynb.
Converted 13_export.ipynb.
Converted Alignment Revision.ipynb.
Converted Alignment_revision2-Copy1.ipynb.
Converted Alignment_revision2.ipynb.
Converted FF Fix.ipynb.
Converted index.ipynb.
Converted Untitled.ipynb.
Converted Untitled1.ipynb.
Converted Untitled2.ipynb.
Converted Untitled3.ipynb.
