In [1]:

import refquant.multi_run_table_creation as multi_run_table_creation
import refquant.table_import as table_import
import refquant.refquant_classes as refquant_classes
import refquant.loading.all_precursor_loading as all_precursor_loading

# Tab-separated DIA-NN test table used throughout this notebook; the path is
# relative, so it resolves against the notebook's working directory.
test_input = "../test_data/diann_test_input.tsv"



In [2]:
import pandas as pd
# Raw test table as a DataFrame; it is not referenced again in the cells shown here.
df_diann = pd.read_csv(test_input, sep="\t")
# Only the reformatter's `outfile_name` is kept; it is later handed to the
# precursor loader together with `test_input`.
# NOTE(review): presumably instantiating the reformatter writes the reformatted
# table(s) and prints the "using input type ..." lines below — confirm.
reference_table = table_import.TableReformatterDIANN(test_input).outfile_name

input file
/Users/constantin/workspace/refquant/test_data
../test_data/diann_test_input.tsv
using input type diann_fragion_isotopes_mDIA_raw
using input type diann_precursors_mDIA


### Append the relevant properties to the single-labelled precursors. We will compare these properties with the result dataframes written out by the software.

In [3]:
import numpy as np

class TargetPrecursorAnnotatorDIANN(refquant_classes.TargetPrecursorAnnotator):
    """Annotator that attaches additional ratio statistics to the target precursor.

    After the base-class initialiser has run, the following attributes are
    written onto ``self.target_precursor``:

    * ``ms1_ratio_to_reference``    - ratio of the single MS1 ion, or NaN if there is none
    * ``median_ratio_to_reference`` - median of all ratios
    * ``min_ratio_to_reference``    - value at the 10% position of the sorted ratios
    * ``ratio_to_reference``        - value at the 25% position of the sorted ratios

    ``self._ratios_to_reference`` and ``self._list_of_intersection_ions`` are not
    defined here. NOTE(review): they are expected to be set by the base class and
    to be aligned element by element - confirm.
    """

    def __init__(self,reference_precursor, target_precursor):
        """Run the base-class annotation, then add the MS1 and quantile-based ratios."""
        super().__init__(reference_precursor, target_precursor)
        self._annotate_ms1_ratio()
        self._annotate_derived_ratio()

    def _annotate_derived_ratio(self):
        """Write median and quantile ratios onto the target precursor.

        When the precursor reports zero used ratios there is nothing to take a
        quantile of, so the statistics defined by this class are set to NaN.
        """
        if self.target_precursor.number_of_ratios_used == 0:
            self.target_precursor.derived_ratio = np.nan
            # Define the attributes that the normal path below would create, so
            # that later reads of them do not fail with AttributeError.
            self.target_precursor.median_ratio_to_reference = np.nan
            self.target_precursor.min_ratio_to_reference = np.nan
            # ratio_to_reference is intentionally left as the base class set it.
            return
        self.target_precursor.median_ratio_to_reference = np.median(self._ratios_to_reference)
        sorted_ratios = np.sort(self._ratios_to_reference)
        idx_quantile_min = self._get_index_of_quantile(0.1)
        idx_quantile = self._get_index_of_quantile(0.25)
        self.target_precursor.min_ratio_to_reference = sorted_ratios[idx_quantile_min]
        self.target_precursor.ratio_to_reference = sorted_ratios[idx_quantile]

    def _get_index_of_quantile(self,quantile):
        """Return the (floored) position of ``quantile`` within the ratio list."""
        return int(quantile * len(self._ratios_to_reference))

    def _annotate_ms1_ratio(self):
        """Store the ratio of the one MS1 ion (NaN if absent) on the target precursor.

        Raises:
            ValueError: if more than one intersection ion contains "MS1".
        """
        # Explicit boolean mask, so the indexing below is unambiguous boolean
        # selection even when the ratios are not already a numpy array.
        ms1_mask = np.array(["MS1" in x for x in self._list_of_intersection_ions], dtype=bool)
        number_of_ms1_ions = int(ms1_mask.sum())
        if number_of_ms1_ions == 1:
            ms1_ratio = np.asarray(self._ratios_to_reference)[ms1_mask][0]
        elif number_of_ms1_ions == 0:
            ms1_ratio = np.nan
        else:
            raise ValueError("More than one MS1 ion in intersection")

        self.target_precursor.ms1_ratio_to_reference = ms1_ratio
    
    

In [4]:
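# Disabled monkey-patch: uncommenting the next line would replace refquant's
# TargetPrecursorAnnotator with the TargetPrecursorAnnotatorDIANN subclass defined above.
#refquant_classes.TargetPrecursorAnnotator = TargetPrecursorAnnotatorDIANN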

In [5]:
class PrecursorTableCreatorQuantityBySearchEngine(multi_run_table_creation.PrecursorQuantityTableCreator):
    """Table creator whose values are each precursor's ``search_engine_derived_quantity``."""

    def _get_quantitative_values(self, list_of_precursors):
        """Return ``search_engine_derived_quantity`` for every precursor, in input order."""
        quantities = []
        for single_precursor in list_of_precursors:
            quantities.append(single_precursor.search_engine_derived_quantity)
        return quantities

class PrecursorTableCreatorQuantityFromMS1(multi_run_table_creation.PrecursorQuantityTableCreator):
    """Table creator whose values are the static reference quantity plus the MS1 ratio."""

    def _get_quantitative_values(self, list_of_precursors):
        """Return ``derived_reference_quantity_static + ms1_ratio_to_reference`` per precursor."""
        quantities = []
        for single_precursor in list_of_precursors:
            reference_quantity = single_precursor.derived_reference_quantity_static
            quantities.append(reference_quantity + single_precursor.ms1_ratio_to_reference)
        return quantities

class PrecursorTableCreatorQuantityFromMedian(multi_run_table_creation.PrecursorQuantityTableCreator):
    """Table creator whose values are the static reference quantity plus the median ratio."""

    def _get_quantitative_values(self, list_of_precursors):
        """Return ``derived_reference_quantity_static + median_ratio_to_reference`` per precursor."""
        quantities = []
        for single_precursor in list_of_precursors:
            reference_quantity = single_precursor.derived_reference_quantity_static
            quantities.append(reference_quantity + single_precursor.median_ratio_to_reference)
        return quantities

class PrecursorTableCreatorQuantityFromMin(multi_run_table_creation.PrecursorQuantityTableCreator):
    """Table creator whose values are the static reference quantity plus the min ratio."""

    def _get_quantitative_values(self, list_of_precursors):
        """Return ``derived_reference_quantity_static + min_ratio_to_reference`` per precursor."""
        quantities = []
        for single_precursor in list_of_precursors:
            reference_quantity = single_precursor.derived_reference_quantity_static
            quantities.append(reference_quantity + single_precursor.min_ratio_to_reference)
        return quantities


class PrecursorTableCreatorQuantityDefault(multi_run_table_creation.PrecursorQuantityTableCreator):
    """Table creator that overrides nothing and uses the base class behaviour as-is."""


class PrecursorTableCreatorQuantityDefaultNonStatic(multi_run_table_creation.PrecursorQuantityTableCreator):
    """Table creator whose values are each precursor's ``comparison_derived_quantity``."""

    def _get_quantitative_values(self, list_of_precursors):
        """Return ``comparison_derived_quantity`` for every precursor, in input order."""
        quantities = []
        for single_precursor in list_of_precursors:
            quantities.append(single_precursor.comparison_derived_quantity)
        return quantities

In [6]:
# Load the precursor objects for the whole dataset from the reformatted reference
# table and the raw test input. These objects are the "ground truth" that the
# written tables are compared against further down.
single_labelled_precursors = all_precursor_loading.get_all_single_labelled_precursors_in_dataset_diann(reference_table, test_input)

processing 10 runs
run:  20220730_TIMS06_MCT_SA_HeLa_whi40_M07_Ref0s4s8_c1_AID8_01_S5-A1_1_3970
0/958


  self.target_precursor.summed_quantity_reference = np.log2(np.sum(2**sorted_intensities_descending[:5]))


run:  20220730_TIMS06_MCT_SA_HeLa_whi40_M07_Ref0s4s8_c1_AID8_02_S5-A2_1_3972
0/925


  self.target_precursor.summed_quantity_reference = np.log2(np.sum(2**sorted_intensities_descending[:5]))


run:  20220730_TIMS06_MCT_SA_HeLa_whi40_M07_Ref0s4s8_c1_AID8_03_S5-A3_1_3974
0/890


  self.target_precursor.summed_quantity_reference = np.log2(np.sum(2**sorted_intensities_descending[:5]))


run:  20220730_TIMS06_MCT_SA_HeLa_whi40_M07_Ref0s4s8_c1_AID8_04_S5-A4_1_3976
0/916


  self.target_precursor.summed_quantity_reference = np.log2(np.sum(2**sorted_intensities_descending[:5]))


run:  20220730_TIMS06_MCT_SA_HeLa_whi40_M07_Ref0s4s8_c1_AID8_05_S5-A5_1_3978
0/845


  self.target_precursor.summed_quantity_reference = np.log2(np.sum(2**sorted_intensities_descending[:5]))


run:  20220730_TIMS06_MCT_SA_HeLa_whi40_M07_Ref0s4s8_c2_AID8_01_S5-A6_1_3971
0/945


  self.target_precursor.summed_quantity_reference = np.log2(np.sum(2**sorted_intensities_descending[:5]))


run:  20220730_TIMS06_MCT_SA_HeLa_whi40_M07_Ref0s4s8_c2_AID8_02_S5-A7_1_3973
0/877


  self.target_precursor.summed_quantity_reference = np.log2(np.sum(2**sorted_intensities_descending[:5]))


run:  20220730_TIMS06_MCT_SA_HeLa_whi40_M07_Ref0s4s8_c2_AID8_03_S5-A8_1_3975
0/896


  self.target_precursor.summed_quantity_reference = np.log2(np.sum(2**sorted_intensities_descending[:5]))


run:  20220730_TIMS06_MCT_SA_HeLa_whi40_M07_Ref0s4s8_c2_AID8_04_S5-A9_1_3977
0/899


  self.target_precursor.summed_quantity_reference = np.log2(np.sum(2**sorted_intensities_descending[:5]))


run:  20220730_TIMS06_MCT_SA_HeLa_whi40_M07_Ref0s4s8_c2_AID8_05_S5-A10_1_3979
0/902


  self.target_precursor.summed_quantity_reference = np.log2(np.sum(2**sorted_intensities_descending[:5]))


In [7]:
# One quantity table per table-creator variant, all built from the same precursor objects.
# NOTE(review): the log output of this cell ("Trying to assign ... to N precursors")
# suggests the creators modify the shared precursors as a side effect, so the
# order of these statements may matter — confirm before reordering.
precursor_table_df_search_engine = PrecursorTableCreatorQuantityBySearchEngine(single_labelled_precursors).precursorquantitytable
precursor_table_df_ms1 = PrecursorTableCreatorQuantityFromMS1(single_labelled_precursors).precursorquantitytable
precursor_table_df_median = PrecursorTableCreatorQuantityFromMedian(single_labelled_precursors).precursorquantitytable
precursor_table_df_min = PrecursorTableCreatorQuantityFromMin(single_labelled_precursors).precursorquantitytable
precursor_table_df_default = PrecursorTableCreatorQuantityDefault(single_labelled_precursors).precursorquantitytable
precursor_table_df_default_nonstatic = PrecursorTableCreatorQuantityDefaultNonStatic(single_labelled_precursors).precursorquantitytable

Trying to assign ms1_quantity_reference to 14719 precursors
Trying to assign search_engine_derived_quantity_reference to 116 precursors
Trying to assign summed_quantity_reference to 258 precursors
Trying to assign ms1_quantity_reference to 0 precursors
Trying to assign search_engine_derived_quantity_reference to 0 precursors
Trying to assign summed_quantity_reference to 240 precursors
Trying to assign ms1_quantity_reference to 0 precursors
Trying to assign search_engine_derived_quantity_reference to 0 precursors
Trying to assign summed_quantity_reference to 240 precursors


  r, k = function_base._ureduce(a, func=_nanmedian, axis=axis, out=out,
  series_quantities = pd.Series(series_values, index = series_index).drop_duplicates()
  series_quantities = pd.Series(series_values, index = series_index).drop_duplicates()
  series_quantities = pd.Series(series_values, index = series_index).drop_duplicates()
  series_quantities = pd.Series(series_values, index = series_index).drop_duplicates()
  series_quantities = pd.Series(series_values, index = series_index).drop_duplicates()
  series_quantities = pd.Series(series_values, index = series_index).drop_duplicates()
  series_quantities = pd.Series(series_values, index = series_index).drop_duplicates()
  series_quantities = pd.Series(series_values, index = series_index).drop_duplicates()
  series_quantities = pd.Series(series_values, index = series_index).drop_duplicates()
  series_quantities = pd.Series(series_values, index = series_index).drop_duplicates()
  series_quantities = pd.Series(series_values, index = ser

Trying to assign ms1_quantity_reference to 0 precursors
Trying to assign search_engine_derived_quantity_reference to 0 precursors
Trying to assign summed_quantity_reference to 240 precursors
Trying to assign ms1_quantity_reference to 0 precursors
Trying to assign search_engine_derived_quantity_reference to 0 precursors
Trying to assign summed_quantity_reference to 240 precursors
Trying to assign ms1_quantity_reference to 0 precursors
Trying to assign search_engine_derived_quantity_reference to 0 precursors
Trying to assign summed_quantity_reference to 240 precursors


  series_quantities = pd.Series(series_values, index = series_index).drop_duplicates()
  series_quantities = pd.Series(series_values, index = series_index).drop_duplicates()
  series_quantities = pd.Series(series_values, index = series_index).drop_duplicates()
  series_quantities = pd.Series(series_values, index = series_index).drop_duplicates()
  series_quantities = pd.Series(series_values, index = series_index).drop_duplicates()
  series_quantities = pd.Series(series_values, index = series_index).drop_duplicates()
  series_quantities = pd.Series(series_values, index = series_index).drop_duplicates()
  series_quantities = pd.Series(series_values, index = series_index).drop_duplicates()
  series_quantities = pd.Series(series_values, index = series_index).drop_duplicates()
  series_quantities = pd.Series(series_values, index = series_index).drop_duplicates()
  series_quantities = pd.Series(series_values, index = series_index).drop_duplicates()
  series_quantities = pd.Series(series_valu

In [8]:
# Single source of truth for this cell: every entry is
# (table to write, output path, precursor attribute the table is compared with).
# Keeping the three together avoids repeating each path and pairing paths with
# attribute names purely by list position.
table_path_variable_triples = [
    (precursor_table_df_search_engine, "../test_data/precursor_table_df_search_engine.tsv", "search_engine_derived_quantity"),
    (precursor_table_df_ms1, "../test_data/precursor_table_df_ms1.tsv", "ms1_ratio_to_reference"),
    (precursor_table_df_median, "../test_data/precursor_table_df_median.tsv", "median_ratio_to_reference"),
    (precursor_table_df_min, "../test_data/precursor_table_df_min.tsv", "min_ratio_to_reference"),
    (precursor_table_df_default, "../test_data/precursor_table_df_default.tsv", "ratio_to_reference"),
    (precursor_table_df_default_nonstatic, "../test_data/precursor_table_df_default_nonstatic.tsv", "comparison_derived_quantity"),
]

# Write every table as a tab-separated file without the index column.
for table_to_write, output_location, compared_variable in table_path_variable_triples:
    table_to_write.to_csv(output_location, sep = "\t", index = False)


table_locations = [triple[1] for triple in table_path_variable_triples]

relevant_variables = [triple[2] for triple in table_path_variable_triples]

# Maps each written file to the precursor attribute it is checked against.
table2variable = dict(zip(table_locations, relevant_variables))

In [9]:
import numpy as np
def go_through_tables_and_check_consistency(tablelocation2variable, single_labelled_precursors):
    """Read every table from disk and compare it with the precursor objects.

    Args:
        tablelocation2variable: mapping of table file path -> name of the
            precursor attribute that table is compared with.
        single_labelled_precursors: precursor objects to compare against.

    The path of each table is printed before it is checked.
    """
    for location, attribute_name in tablelocation2variable.items():
        print(location)
        table_from_disk = pd.read_csv(location, sep = "\t")
        compare_similarities_between_singlelabelledprecursors_and_table(
            single_labelled_precursors, table_from_disk, attribute_name
        )


def compare_similarities_between_singlelabelledprecursors_and_table(single_labelled_precursors, precursor_df, relevant_variable):
    """Check every precursor row of a written table against the precursor objects.

    Args:
        single_labelled_precursors: precursor objects, grouped here by their ``name``.
        precursor_df: table with an "ion" column, a "protein" column and one
            quantity column per run/channel.
        relevant_variable: name of the precursor attribute to compare with.

    Raises:
        KeyError: if an ion in the table has no precursor object of that name.
        AssertionError: propagated from the per-precursor check.
    """
    precursor2singlelabelledprecursors = get_precursor2singlelabelledprecursors(single_labelled_precursors)
    # Zeros are turned into NaN before the log2 transform (log2(0) would be -inf);
    # the "protein" column is dropped so that only quantity columns are transformed.
    quantity_table = precursor_df.set_index("ion").replace(0, np.nan).drop(columns = ["protein"])
    log2_table = np.log2(quantity_table)

    for precursor_name in log2_table.index.unique():
        single_labelled_precursors_for_precursor = precursor2singlelabelledprecursors[precursor_name]
        # Double brackets keep the selection a DataFrame even for a single row.
        precursor_df_for_precursor = log2_table.loc[[precursor_name]]
        assert_that_the_reference_intensity_is_constant_over_runs(single_labelled_precursors_for_precursor, precursor_df_for_precursor, relevant_variable)



def get_precursor2singlelabelledprecursors(single_labelled_precursors):
    """Group precursor objects by their ``name`` attribute.

    Returns:
        dict mapping each name to the list of precursors carrying it, with
        names and list entries in order of first appearance.
    """
    grouped_by_name = {}
    for single_precursor in single_labelled_precursors:
        grouped_by_name.setdefault(single_precursor.name, []).append(single_precursor)
    return grouped_by_name



def assert_that_the_reference_intensity_is_constant_over_runs(singlelabelled_precursors_for_precursor, row_of_precursor_df, relevant_variable):
    """Assert that table values and precursor attributes differ by a constant offset.

    For every channel/run of the given precursors, the table value in column
    ``f"{run}_{channel}"`` is compared with the precursor's ``relevant_variable``.

    Args:
        singlelabelled_precursors_for_precursor: precursor objects sharing one name.
        row_of_precursor_df: table rows for that precursor; the first row is used.
        relevant_variable: name of the precursor attribute to compare with.

    Raises:
        AssertionError: if exactly one of the two values is NaN, or if the
            standard deviation of the differences is not below 0.0001.
    """
    channel2run2quantity  = get_channel2run2quantity(singlelabelled_precursors_for_precursor, relevant_variable)
    ratios = []
    for channel, run2quantity in channel2run2quantity.items():
        for run, quantity_precursor in run2quantity.items():
            run_channel = f"{run}_{channel}"
            quantity_df = row_of_precursor_df[run_channel].values[0]
            if np.isnan(quantity_precursor) or np.isnan(quantity_df):
                # A missing value must be missing on both sides.
                if not (np.isnan(quantity_precursor) and np.isnan(quantity_df)):
                    print(f"precursor {singlelabelled_precursors_for_precursor[0].name}, channel {channel}, run {run} supposedly has no nan value")
                assert np.isnan(quantity_df) and np.isnan(quantity_precursor), (
                    f"NaN mismatch in column {run_channel}: table value {quantity_df}, "
                    f"precursor {relevant_variable} {quantity_precursor}"
                )
                continue
            # Both values are subtracted directly, so a constant offset yields identical differences.
            ratios.append(quantity_df - quantity_precursor)
    if len(ratios)>0:
        spread = np.std(ratios)
        assert spread < 0.0001, (
            f"difference between table and {relevant_variable} is not constant (std {spread})"
        )



def get_channel2run2quantity(singlelabelled_precursors_for_peptide, relevant_variable):
    """Collect one attribute of the precursors into a channel -> run -> value mapping.

    Args:
        singlelabelled_precursors_for_peptide: objects with ``replicate_name``
            and ``channel_name`` attributes.
        relevant_variable: attribute name to read; 0 is used when it is missing.

    Returns:
        Nested dict ``{channel_name: {replicate_name: value}}``. If the same
        channel/run pair occurs more than once, the last value wins.
    """
    nested_quantities = {}

    for single_precursor in singlelabelled_precursors_for_peptide:
        run_name = single_precursor.replicate_name
        channel_name = single_precursor.channel_name
        run2quantity = nested_quantities.setdefault(channel_name, {})
        run2quantity[run_name] = getattr(single_precursor, relevant_variable, 0)

    return nested_quantities

In [10]:
# Check every written table against its precursor attribute. Each table path is
# printed as it is processed; a failed check raises AssertionError.
go_through_tables_and_check_consistency(table2variable, single_labelled_precursors)


../test_data/precursor_table_df_search_engine.tsv
../test_data/precursor_table_df_ms1.tsv
../test_data/precursor_table_df_median.tsv
../test_data/precursor_table_df_min.tsv
../test_data/precursor_table_df_default.tsv
../test_data/precursor_table_df_default_nonstatic.tsv
