# Ion exclusion from MS2 spectra

In [2]:
import sys
sys.path.append("../../..")

from ion_exclusion import *

from os.path import basename
import numpy as np

In [3]:
name = "20241023_Ex0016_HILICZ_DDAit"
oms_file_handler = OpenMS_File_Handler()

In [5]:
experiments = oms_file_handler.load_experiments_df( f"D:/mine2sirius_pipe/data/converted/{name}", file_ending=".mzML")

Loading names:


100%|██████████| 19/19 [00:00<?, ?it/s]


Loading experiments:


100%|██████████| 19/19 [01:12<00:00,  3.81s/it]


In [10]:
precursor_infos_files = {}
for i, row in experiments.iterrows():
    experiment = row["experiment"]
    ms2_spectra = [ spectrum for spectrum in experiment.getSpectra() if spectrum.getMSLevel() >= 2 ]
    retention_times = []
    precursor_mzs = []
    for ms2_spectrum in ms2_spectra:
        for precursor in ms2_spectrum.getPrecursors():
            retention_times.append( ms2_spectrum.getRT() )
            precursor_mzs.append( precursor.getMZ() )
    precursor_infos_files[basename(experiment.getLoadedFilePath())] = pd.DataFrame( {"m/z": precursor_mzs, "rt": retention_times} )
    

In [11]:
precursor_infos_files

{'AAmix10microM_01_HILICZ_DDA1_neg.mzML':              m/z       rt
 0     286.079024    2.728
 1     555.154220    2.928
 2     166.054533    3.144
 3     981.996403    3.578
 4     248.973371    3.779
 ...          ...      ...
 2072  166.983908  898.625
 2073  166.054533  898.840
 2074  981.996403  899.275
 2075  166.983908  899.476
 2076  806.218923  899.691
 
 [2077 rows x 2 columns],
 'AAmix1microM_01_HILICZ_DDA1_neg.mzML':               m/z       rt
 0      286.079176    2.786
 1      555.154328    2.987
 2      286.079176    3.202
 3      555.154328    3.637
 4      166.054643    3.837
 ...           ...      ...
 2366  1076.296383  898.042
 2367   594.963819  898.477
 2368  1035.955933  898.991
 2369   557.152089  899.508
 2370   136.991815  899.708
 
 [2371 rows x 2 columns],
 'AAmix1microM_02_HILICZ_DDA1_neg.mzML':              m/z       rt
 0     286.079187    2.683
 1     555.154380    2.883
 2     286.079187    3.099
 3     166.054651    3.534
 4     981.996349    3.735
 

In [5]:
quant_df = pd.read_csv(f"/mnt/d/mine2sirius_pipe/data/processed/{name}/{name}_iimn_fbmn_quant.csv")
quant_df = quant_df[[col for col in quant_df.columns if "Unnamed" not in col ]]

In [22]:
retention_time_tolerance = 30.0
mz_in_ms2 = {}
binary = False
for file_name, precursor_infos in precursor_infos_files.items():
    for i, feature in quant_df.iterrows():
        mz_matches = np.isclose( feature["row m/z"], precursor_infos["m/z"], rtol=1e-5, atol=5e-3)
        if retention_time_tolerance is not None:
            retention_time_matches = np.isclose( feature["row retention time"], precursor_infos["rt"],
                                                 rtol=0.0, atol=retention_time_tolerance)
            all_matches = mz_matches & retention_time_matches
        else:
            all_matches = mz_matches
        feature_found = int( np.any ( all_matches ) ) if binary else np.sum( all_matches ) 
        if file_name not in mz_in_ms2.keys():
            mz_in_ms2[file_name] = [feature_found]
        else:
            mz_in_ms2[file_name].append( feature_found )

row_info = pd.DataFrame( {"id": quant_df["row ID"], "m/z": quant_df["row m/z"], "rt": quant_df["row retention time"]} )
ms2_presence_df = pd.DataFrame( mz_in_ms2 )
ms2_presence_df = row_info.join(ms2_presence_df)


In [23]:
ms2_presence_df

Unnamed: 0,id,m/z,rt,AAmix10microM_01_HILICZ_DDA1_neg.mzML,AAmix1microM_01_HILICZ_DDA1_neg.mzML,AAmix1microM_02_HILICZ_DDA1_neg.mzML,E.coli_01_HILICZ_DDA1it_neg.mzML,E.coli_01_HILICZ_DDA1_neg.mzML,E.coli_02_HILICZ_DDA1it_neg.mzML,E.coli_02_HILICZ_DDA1_neg.mzML,...,E.coli_04_HILICZ_DDA1it_neg.mzML,E.coli_05_HILICZ_DDA1it_neg.mzML,E.coli_06_HILICZ_DDA1it_neg.mzML,E.coli_07_HILICZ_DDA1it_neg.mzML,E.coli_08_HILICZ_DDA1it_neg.mzML,E.coli_AAmix10microM_01_HILICZ_DDA1_neg.mzML,E.coli_AAmix1microM_01_HILICZ_DDA1_neg.mzML,E.coli_AAmix1microM_02_HILICZ_DDA1_neg.mzML,Quench_01_HILICZ_DDA1_neg.mzML,Quench_02_HILICZ_DDA1_neg.mzML
0,2661,56.995542,6.070944,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2176,59.014164,2.613818,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
2,2613,60.992957,5.789265,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2095,61.988528,2.413207,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2180,71.014069,2.618727,0,0,0,0,0,0,0,...,2,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
617,1715,1418.020774,0.782284,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
618,1696,1419.026698,0.775181,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
619,1682,1430.021943,0.773765,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
620,1694,1432.033975,0.779181,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Add annotations

In [24]:
local_annotations = pd.read_csv(f"/mnt/d/mine2sirius_pipe/data/processed/{name}/{name}_annotations")
local_annotations = local_annotations.loc[[i for i, add in local_annotations["adduct"].items() if "+" not in add.split("]")[-1]]]

In [25]:
annotated_ms2_presence = pd.merge(ms2_presence_df, local_annotations, left_on="id", right_on="id", how="left")

In [26]:
export_selection = annotated_ms2_presence[[col for col in annotated_ms2_presence.columns if "AAmix" not in col]]
export_selection.columns = [col if "it" not in col else f"Iter_{col}" for col in export_selection.columns]
export_selection = export_selection[[col for col in export_selection.columns if not col.endswith(".mzML")] +
                                    list(np.sort([col for col in export_selection.columns if col.endswith(".mzML")])) ]

export_path = f"/mnt/d/mine2sirius_pipe/substep_batches/ion_exclusion/{name}_annotated"
if retention_time_tolerance:
    export_path = f"{export_path}_rt"
if binary:
    export_path = f"{export_path}_bin"
export_selection.to_csv( f"{export_path}.tsv", sep="\t" )
export_selection

Unnamed: 0,id,m/z,rt_x,compound_name,adduct,score,precursor_mz,Iter_ion_mobility,ccs,rt_y,...,Iter_E.coli_01_HILICZ_DDA1it_neg.mzML,Iter_E.coli_02_HILICZ_DDA1it_neg.mzML,Iter_E.coli_03_HILICZ_DDA1it_neg.mzML,Iter_E.coli_04_HILICZ_DDA1it_neg.mzML,Iter_E.coli_05_HILICZ_DDA1it_neg.mzML,Iter_E.coli_06_HILICZ_DDA1it_neg.mzML,Iter_E.coli_07_HILICZ_DDA1it_neg.mzML,Iter_E.coli_08_HILICZ_DDA1it_neg.mzML,Quench_01_HILICZ_DDA1_neg.mzML,Quench_02_HILICZ_DDA1_neg.mzML
0,2661,56.995542,6.070944,,,,,,,,...,0,0,0,0,0,0,0,0,0,0
1,2176,59.014164,2.613818,Acetate; Glycolaldehyde,[M-H]-,0.660,59.013824,,,,...,0,0,0,0,0,1,0,0,0,0
2,2613,60.992957,5.789265,Bicarbonate,[M-H]-,0.833,60.993124,,,,...,0,0,0,0,0,0,0,0,0,0
3,2095,61.988528,2.413207,Nitrate,[M-H]-,0.796,61.988324,,,,...,0,0,0,0,0,0,0,0,0,0
4,2180,71.014069,2.618727,Methylglyoxal,[M-H]-,0.755,71.013824,,,,...,0,0,0,2,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
663,1715,1418.020774,0.782284,,,,,,,,...,0,0,0,0,0,0,0,0,0,0
664,1696,1419.026698,0.775181,,,,,,,,...,0,0,0,0,0,0,0,0,0,0
665,1682,1430.021943,0.773765,Cyclopropane phosphatidylethanolamine (dihexad...,[2M-H]-,0.835,1430.023124,,,,...,0,0,0,0,0,0,0,0,0,0
666,1694,1432.033975,0.779181,,,,,,,,...,0,0,0,0,0,0,0,0,0,0
