In [1]:
# Imports
import os
import datetime
import json
from typing import overload, Any, List, Dict, Tuple, Set, Sequence, Union, Optional
import numpy as np
import pandas as pd
import pyopenms as oms
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px
from tqdm import tqdm
import shutil
import requests
from sklearn.impute import KNNImputer
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline

# Ignore seaborn warning for future deprecation of module part
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# import methods from FIA python script
from FIA import *

# Set the OpenMS log level to "DEBUG" through the log configuration handler
oms.LogConfigHandler().setLogLevel("DEBUG")



## Read in

In [2]:
# Folder with the input mzML files and work folder of this run.
# Both are given relative to the current working directory and resolved to normalized paths.
data_dir = os.path.normpath(os.path.join(os.getcwd(), "../../data/example data/exampleA_ecolistrains"))
run_dir = os.path.normpath(os.path.join(os.getcwd(), "../../runs/fia_explorer"))

# clean_dir(run_dir)

## Centroiding
Data reduction with peak picking

In [3]:
# The centroiding call is commented out; centroid_dir is pointed at the "centroids"
# subfolder of the run directory instead.
# NOTE(review): presumably that folder holds the result of an earlier centroid_batch run - confirm.
# centroid_dir = centroid_batch(data_dir, run_dir, file_ending=".mzXML")
centroid_dir = os.path.join(run_dir, "centroids")

## Merging
Merging loses the temporal information. This may be undesirable, because the isotope spectra become less specific.

In [4]:
# merge_dir = merge_batch(centroid_dir, run_dir, block_size=None, mz_binning_width=5.0, mz_binning_width_unit="ppm", average_gaussian_cutoff=0.01, file_ending=".mzML")
# merge_dir = os.path.join(run_dir, "merged")

## Mass trace detection

In [7]:
# Batch mass trace detection on the centroided data in centroid_dir
mass_traces_all = mass_trace_detection_batch(
    experiments=centroid_dir,
    file_ending=".mzML",
    mass_error_ppm=10.0,
    noise_threshold_int=1000.0,
    reestimate_mt_sd="true",
    quant_method="median",
    trace_termination_criterion="outlier",
    trace_termination_outliers=3,
    min_trace_length=5.0,
    max_trace_length=-1.0,
)

100%|██████████| 6/6 [00:03<00:00,  1.55it/s]


## Elution peak detection
Each mass trace should mostly come out as one peak, but sometimes separation happens in the long pipe.

In [13]:
# Batch elution peak detection on the mass traces detected above
mass_traces_deconvol_all = elution_peak_detection_batch(
    mass_traces_all=mass_traces_all,
    chrom_fwhm=10.0,
    chrom_peak_snr=2.0,
    width_filtering="fixed",
    min_fwhm=1.0,
    max_fwhm=60.0,
    masstrace_snr_filtering="false",
)

100%|██████████| 6/6 [00:02<00:00,  2.48it/s]


## Feature detection

In [19]:
def feature_detection_untargeted(experiment: Union[oms.MSExperiment, str],
                                 mass_traces_deconvol: Optional[list] = None, isotope_filtering_model="metabolites (2% RMS)",
                                 local_rt_range:float=3.0, local_mz_range:float=5.0, 
                                 charge_lower_bound:int=1, charge_upper_bound:int=3,
                                 chrom_fwhm:float=10.0, report_summed_ints:str="true",
                                 enable_RT_filtering:str="false", mz_scoring_13C:str="false",
                                 use_smoothed_intensities:str="false", report_convex_hulls: str = "true",
                                 report_chromatograms:str="false", remove_single_traces: str = "true",
                                 mz_scoring_by_elements: str = "false", elements:str="CHNOPS") -> oms.FeatureMap:
    """
    Untargeted feature detection: runs OpenMS FeatureFindingMetabo on deconvoluted mass traces.

    All boolean-like options are passed to OpenMS as the strings "true" / "false".

    :param experiment: MSExperiment or path to the experiment file. If a path is given,
                       it is stored as primary MS run path of the returned feature map.
    :param mass_traces_deconvol: mass traces to assemble into features (None is treated as an empty list)
    :param isotope_filtering_model: model used to remove/score candidate assemblies based on isotope intensities
    :param local_rt_range: RT range in which to look for coeluting mass traces
    :param local_mz_range: m/z range in which to look for isotopic mass traces
    :param charge_lower_bound: lowest charge state to consider
    :param charge_upper_bound: highest charge state to consider
    :param chrom_fwhm: expected chromatographic peak width (in seconds)
    :param report_summed_ints: "true" to report the intensity summed over all traces instead of the monoisotopic trace alone
    :param enable_RT_filtering: "true" to require sufficient RT overlap while assembling mass traces
    :param mz_scoring_13C: value of the FeatureFindingMetabo parameter "mz_scoring_13C"
    :param use_smoothed_intensities: value of the FeatureFindingMetabo parameter "use_smoothed_intensities"
    :param report_convex_hulls: value of the FeatureFindingMetabo parameter "report_convex_hulls"
    :param report_chromatograms: value of the FeatureFindingMetabo parameter "report_chromatograms"
    :param remove_single_traces: value of the FeatureFindingMetabo parameter "remove_single_traces"
    :param mz_scoring_by_elements: value of the FeatureFindingMetabo parameter "mz_scoring_by_elements"
    :param elements: value of the FeatureFindingMetabo parameter "elements"
    :return: feature map with unique ids assigned to the detected features
    """
    # Use a fresh list per call: a literal `[]` default is created only once and would be
    # shared (and handed to ffm.run) by every call that omits the argument.
    if mass_traces_deconvol is None:
        mass_traces_deconvol = []

    feature_map = oms.FeatureMap()  # output features
    chrom_out = []  # output chromatograms (filled by ffm.run, not returned)
    ffm = oms.FeatureFindingMetabo()

    # Remember the source file of the features when the experiment is given as a path
    if isinstance(experiment, str):
        feature_map.setPrimaryMSRunPath([experiment.encode()])

    # NOTE(review): the loaded experiment is not used further below; the call is kept because
    # load_experiment is defined elsewhere and may be relied on for validation - confirm.
    experiment = load_experiment(experiment)

    ffm_par = ffm.getDefaults()
    ffm_par.setValue("local_rt_range", local_rt_range)          # RT range for coeluting mass traces (can be set low (3.0s ~ 2 frames/spectra), because only one peak is expected)
    ffm_par.setValue("local_mz_range", local_mz_range)          # m/z range for isotopic traces
    ffm_par.setValue("charge_lower_bound", charge_lower_bound)
    ffm_par.setValue("charge_upper_bound", charge_upper_bound)
    ffm_par.setValue("chrom_fwhm", chrom_fwhm)                  # Set expected chromatographic width according to the elution peak detection parameter
    ffm_par.setValue("report_summed_ints", report_summed_ints)  # Sum intensity over all traces ("true") or use the monoisotopic trace intensity alone ("false")
    ffm_par.setValue("enable_RT_filtering", enable_RT_filtering) # Require RT overlap; "false" for direct injection
    ffm_par.setValue("isotope_filtering_model", isotope_filtering_model) # SVM isotope models for metabolites were trained with either 2% or 5% RMS error
    ffm_par.setValue("mz_scoring_13C", mz_scoring_13C)  # Disable for general metabolomics
    ffm_par.setValue("use_smoothed_intensities", use_smoothed_intensities)  # Use Locally Weighted Scatterplot Smoothed intensities (useful if intensity is mass-dependent (Orbitraps))
    ffm_par.setValue("report_convex_hulls", report_convex_hulls)
    ffm_par.setValue("report_chromatograms", report_chromatograms)  # "false": chromatograms are not reported for flow injection
    # NOTE(review): the default "true" removes single traces; pass "false" to keep them
    # (long flow-injection traces may not get matched to isotopic traces) - confirm the intended default.
    ffm_par.setValue("remove_single_traces", remove_single_traces)
    ffm_par.setValue("mz_scoring_by_elements", mz_scoring_by_elements) # "true" to use expected element peaks to detect isotopes
    ffm_par.setValue("elements", elements) # Elements that are present in the sample, e.g. "CHNOPS"
    ffm.setParameters(ffm_par)

    ffm.run(mass_traces_deconvol, feature_map, chrom_out)
    feature_map.setUniqueIds()  # Assigns a new, valid unique id per feature

    return feature_map

def feature_detection_untargeted_batch(experiments:Union[List[oms.MSExperiment|str], str], file_ending:str=".mzML",
                                       mass_traces_deconvol_all: list[list] = [], isotope_filtering_model="metabolites (2% RMS)",
                                       local_rt_range:float=3.0, local_mz_range:float=5.0, 
                                       charge_lower_bound:int=1, charge_upper_bound:int=3,
                                       chrom_fwhm:float=10.0, report_summed_ints:str="true",
                                       enable_RT_filtering:str="false", mz_scoring_13C:str="false",
                                       use_smoothed_intensities:str="false", report_convex_hulls: str = "true",
                                       report_chromatograms:str="false", remove_single_traces: str = "true",
                                       mz_scoring_by_elements: str = "false", elements:str="CHNOPS") -> list[oms.FeatureMap]:
    """
    Untargeted feature detection for a batch of experiments.

    Calls feature_detection_untargeted once per experiment and pairs the experiment at
    position i with mass_traces_deconvol_all[i]. All remaining keyword arguments are
    forwarded unchanged to every call.

    :param experiments: list of experiments (objects or paths), or the path of a directory;
                        for a directory, all contained files ending with `file_ending` are used
    :param file_ending: file name ending used to select files when `experiments` is a directory
    :param mass_traces_deconvol_all: one list of mass traces per experiment, in the same order
    :return: list with one feature map per experiment
    """
    # A directory path is expanded to the paths of the matching files in it
    if isinstance(experiments, str):
        folder = experiments
        experiments = [os.path.join(folder, name) for name in os.listdir(folder) if name.endswith(file_ending)]

    # Settings that are identical for every experiment of the batch
    shared_settings = dict(
        isotope_filtering_model=isotope_filtering_model,
        local_rt_range=local_rt_range,
        local_mz_range=local_mz_range,
        charge_lower_bound=charge_lower_bound,
        charge_upper_bound=charge_upper_bound,
        chrom_fwhm=chrom_fwhm,
        report_summed_ints=report_summed_ints,
        enable_RT_filtering=enable_RT_filtering,
        mz_scoring_13C=mz_scoring_13C,
        use_smoothed_intensities=use_smoothed_intensities,
        report_convex_hulls=report_convex_hulls,
        report_chromatograms=report_chromatograms,
        remove_single_traces=remove_single_traces,
        mz_scoring_by_elements=mz_scoring_by_elements,
        elements=elements,
    )

    return [
        feature_detection_untargeted(experiment=single_experiment,
                                     mass_traces_deconvol=mass_traces_deconvol_all[position],
                                     **shared_settings)
        for position, single_experiment in enumerate(tqdm(experiments))
    ]

In [22]:
# Untargeted feature detection for all experiments in centroid_dir,
# using the deconvoluted mass traces from the elution peak detection
feature_maps = feature_detection_untargeted_batch(
    experiments=centroid_dir,
    mass_traces_deconvol_all=mass_traces_deconvol_all,
    isotope_filtering_model="metabolites (2% RMS)",
    local_rt_range=3.0,
    local_mz_range=5.0,
    charge_lower_bound=1,
    charge_upper_bound=3,
    chrom_fwhm=10.0,
    report_summed_ints="true",
    enable_RT_filtering="false",
    mz_scoring_13C="false",
    use_smoothed_intensities="false",
    report_convex_hulls="true",
    report_chromatograms="false",
    remove_single_traces="true",
    mz_scoring_by_elements="false",
    elements="CHNOPS",
)

100%|██████████| 6/6 [00:07<00:00,  1.18s/it]


In [30]:
# Show the features of the fourth feature map (index 3) as a table
feature_maps[3].get_df()

Unnamed: 0_level_0,peptide_sequence,peptide_score,ID_filename,ID_native_id,charge,RT,mz,RTstart,RTend,MZstart,MZend,quality,intensity
feature_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
14841806439465190535,,,,,1,3.278,84.080468,2.564,14.708,84.080363,85.084158,0.000421,92905.492188
2278316379206883097,,,,,1,14.708,105.028532,8.993,29.708,105.028084,106.032963,0.000541,117844.164062
11923495444197646664,,,,,1,14.708,114.091074,2.564,29.708,114.090557,115.096137,0.000150,33178.593750
7862005751953330614,,,,,1,15.422,130.158792,2.564,29.708,130.157784,131.163722,0.000062,14022.859375
8818712505812277634,,,,,1,10.422,148.060168,2.564,29.708,148.058831,149.065015,0.000315,69323.648438
...,...,...,...,...,...,...,...,...,...,...,...,...,...
8119065683242516267,,,,,3,20.422,1460.542414,8.279,29.708,1460.530501,1460.905277,0.000028,8166.180664
9698110462512209987,,,,,3,13.993,1462.016225,8.279,29.708,1461.999122,1462.348638,0.000031,7990.146484
12125663703087055322,,,,,3,12.565,1468.042431,8.279,29.708,1468.029331,1468.396629,0.000025,7643.970703
17529297411202260356,,,,,2,14.708,1492.030079,8.279,29.708,1492.026204,1492.534964,0.000048,11526.248047


In [21]:
# Presumably writes the feature maps into the "features" subfolder of the run directory.
# NOTE(review): the recorded output of this cell ends with
# "TypeError: 'NoneType' object is not subscriptable". store_feature_maps is not defined in
# this notebook, so the cause is not visible here - TODO confirm what it expects
# (e.g. the meaning of file_ending) before relying on this cell.
store_feature_maps(feature_maps, os.path.join(run_dir, "features"), file_ending=".mzML")

Storing feature maps:


  0%|          | 0/6 [00:00<?, ?it/s]


TypeError: 'NoneType' object is not subscriptable

In [18]:
# Print the parameters of a freshly constructed FeatureFindingMetabo for reference
# (names, values and descriptions are listed in the output below).
# NOTE: this binds the notebook-level names `ffm` and `ffm_par`.
ffm = oms.FeatureFindingMetabo()
ffm_par = ffm.getParameters()
print_params(ffm_par)

Param: b'local_rt_range' Value: 10.0 Description: RT range where to look for coeluting mass traces
Param: b'local_mz_range' Value: 6.5 Description: MZ range where to look for isotopic mass traces
Param: b'charge_lower_bound' Value: 1 Description: Lowest charge state to consider
Param: b'charge_upper_bound' Value: 3 Description: Highest charge state to consider
Param: b'chrom_fwhm' Value: 5.0 Description: Expected chromatographic peak width (in seconds).
Param: b'report_summed_ints' Value: false Description: Set to true for a feature intensity summed up over all traces rather than using monoisotopic trace intensity alone.
Param: b'enable_RT_filtering' Value: true Description: Require sufficient overlap in RT while assembling mass traces. Disable for direct injection data..
Param: b'isotope_filtering_model' Value: metabolites (5% RMS) Description: Remove/score candidate assemblies based on isotope intensities. SVM isotope models for metabolites were trained with either 2% or 5% RMS error