In [2]:
# Imports
import os
import datetime
import json
from typing import overload, Any, List, Dict, Tuple, Set, Sequence, Union, Optional
import numpy as np
import pandas as pd
import pyopenms as oms
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px
from tqdm import tqdm
import shutil
import requests
from sklearn.impute import KNNImputer
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline

# Silence FutureWarning messages (added because of a seaborn deprecation notice).
# NOTE: the filter is keyed on the warning category only, so FutureWarnings raised by
# any library are suppressed, not just seaborn's.
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# Star-import the helper methods from the local FIA python script.
# NOTE(review): helpers that are called without a module prefix further down
# (e.g. load_experiment, print_params) presumably come from here - confirm against FIA.
from FIA import *

# Set the OpenMS log level to "DEBUG".
oms.LogConfigHandler().setLogLevel("DEBUG")

## Read in

In [8]:
# Set the paths to your mzML files and to the work folder.
# Both are given relative to the current working directory and are resolved
# to normalized absolute paths in one step.
data_dir, run_dir = (
    os.path.normpath(os.path.join(os.getcwd(), relative_path))
    for relative_path in (
        "../../data/example data/exampleA_ecolistrains",  # input data folder
        "../../runs/fia_explorer",                        # work folder of this run
    )
)

# NOTE(review): presumably empties the work folder before a fresh run - confirm in FIA.
# clean_dir(run_dir)

## Centroiding
Data reduction with peak picking

In [9]:
# Centroiding is not re-run in this cell: the batch call is kept commented out and
# only the path of the "centroids" subfolder of run_dir is set.
# NOTE(review): presumably this is the folder centroid_batch writes to - confirm in FIA.
# To redo the centroiding, uncomment the following line:
# centroid_dir = centroid_batch(data_dir, run_dir, file_ending=".mzXML")
centroid_dir = os.path.join(run_dir, "centroids")

100%|██████████| 6/6 [00:39<00:00,  6.53s/it]


## Merging
Merging discards the temporal information. This may be undesirable, because the isotope spectra become less specific.

In [10]:
# Merging is not re-run in this cell: the batch call is kept commented out and
# only the path of the "merged" subfolder of run_dir is set.
# NOTE(review): presumably this is the folder merge_batch writes to - confirm in FIA.
# To redo the merging, uncomment the following line:
# merge_dir = merge_batch(centroid_dir, run_dir, block_size=None, mz_binning_width=5.0, mz_binning_width_unit="ppm", average_gaussian_cutoff=0.01, file_ending=".mzML")
merge_dir = os.path.join(run_dir, "merged")

100%|██████████| 6/6 [00:09<00:00,  1.53s/it]


## Mass trace detection

In [11]:
def mass_trace_detection(experiment_path: str, experiment: Optional[oms.MSExperiment] = None,
                         mass_error_ppm: float = 10.0, noise_threshold_int: float = 1000.0, reestimate_mt_sd:str="true",
                         quant_method:str="median", trace_termination_criterion:str="outlier", trace_termination_outliers:int=3,
                         min_trace_length:float=5.0, max_trace_length:float=-1.0) -> list:
    """
    Run the OpenMS ``MassTraceDetection`` algorithm on an experiment.

    The experiment is obtained through ``load_experiment(experiment_path, experiment)``
    (presumably the file is only read when no ``experiment`` object is passed -
    confirm in FIA). All remaining arguments are written onto the algorithm's
    default parameter set under the parameter name equal to the argument name.

    :param experiment_path: path handed to ``load_experiment``
    :param experiment: optional experiment object handed to ``load_experiment``
    :param mass_error_ppm: allowed mass deviation in ppm
    :param noise_threshold_int: intensity threshold below which peaks are removed as noise
    :param reestimate_mt_sd: "true" enables dynamic re-estimation of the m/z variance
    :param quant_method: quantification method for mass traces; "median" is
                         recommended for direct injection data
    :param trace_termination_criterion: termination criterion for the extension of mass traces
    :param trace_termination_outliers: outlier count used by the "outlier" termination criterion
    :param min_trace_length: forwarded unchanged to the OpenMS parameter of the same name
    :param max_trace_length: forwarded unchanged to the OpenMS parameter of the same name
    :return: list that ``MassTraceDetection.run`` filled with the detected mass traces
    """
    experiment = load_experiment(experiment_path, experiment)

    # Overrides for the algorithm defaults, keyed by the OpenMS parameter name.
    overrides = {
        "mass_error_ppm": mass_error_ppm,
        "noise_threshold_int": noise_threshold_int,
        "reestimate_mt_sd": reestimate_mt_sd,
        "quant_method": quant_method,
        "trace_termination_criterion": trace_termination_criterion,
        "trace_termination_outliers": trace_termination_outliers,
        "min_trace_length": min_trace_length,
        "max_trace_length": max_trace_length,
    }

    detector = oms.MassTraceDetection()
    detector_params = detector.getDefaults()
    for param_name, param_value in overrides.items():
        detector_params.setValue(param_name, param_value)
    detector.setParameters(detector_params)

    # run() fills the list passed as its second argument.
    # NOTE(review): the trailing 0 is presumably the "no limit" value for the
    # maximum number of traces - confirm against the pyOpenMS documentation.
    detected_traces = []
    detector.run(experiment, detected_traces, 0)

    return detected_traces


In [12]:
# Print the default MassTraceDetection parameters, for comparison with the values
# that mass_trace_detection passes to the algorithm.
# NOTE: this binds the module-level names `mtd` and `mtd_par`.
# NOTE(review): print_params is not defined in the visible cells; presumably it is
# provided by the FIA star import - confirm.
mtd = oms.MassTraceDetection()
mtd_par = mtd.getDefaults()
print_params(mtd_par)

Param: b'mass_error_ppm' Value: 20.0 Description: Allowed mass deviation (in ppm).
Param: b'noise_threshold_int' Value: 10.0 Description: Intensity threshold below which peaks are removed as noise.
Param: b'chrom_peak_snr' Value: 3.0 Description: Minimum intensity above noise_threshold_int (signal-to-noise) a peak should have to be considered an apex.
Param: b'reestimate_mt_sd' Value: true Description: Enables dynamic re-estimation of m/z variance during mass trace collection stage.
Param: b'quant_method' Value: area Description: Method of quantification for mass traces. For LC data 'area' is recommended, 'median' for direct injection data. 'max_height' simply uses the most intense peak in the trace.
Param: b'trace_termination_criterion' Value: outlier Description: Termination criterion for the extension of mass traces. In 'outlier' mode, trace extension cancels if a predefined number of consecutive outliers are found (see trace_termination_outliers parameter). In 'sample_rate' mode, t