In [1]:
# Imports
import os

# import methods from FIA python script
from FIA import *

# Switch the log level to "DEBUG" through the log config handler.
# NOTE(review): `oms` is not imported explicitly in this cell; presumably it is
# brought into scope by `from FIA import *` — confirm.
oms.LogConfigHandler().setLogLevel("DEBUG")



## Read in

In [2]:
# Locations of the mzML input files and of the work folder, given relative to
# the current working directory.
data_dir = "../../data/example data/exampleA_ecolistrains"
run_dir = "../../runs/fia_explorer"

# Anchor both locations at the current working directory and normalise them.
data_dir, run_dir = (
    os.path.normpath(os.path.join(os.getcwd(), relative_path))
    for relative_path in (data_dir, run_dir)
)

# clean_dir(run_dir)

## Centroiding
Data reduction with peak picking

In [3]:
# The centroiding call itself is left commented out; only the location of the
# centroided files is derived from the run directory.
# NOTE(review): this presumably relies on a "centroids" folder written by an
# earlier run — confirm it exists before running the cells below.
# centroid_dir = centroid_batch(data_dir, run_dir, file_ending=".mzXML")
centroid_dir = os.path.join(run_dir, "centroids")

## Merging
We lose temporal information. This may be undesirable, because the isotope spectra become less specific.

In [4]:
# merge_dir = merge_batch(centroid_dir, run_dir, block_size=None, mz_binning_width=5.0, mz_binning_width_unit="ppm", average_gaussian_cutoff=0.01, file_ending=".mzML")
# merge_dir = os.path.join(run_dir, "merged")

## Mass trace detection

In [5]:
# Batch mass trace detection on the centroided data (`centroid_dir`, files
# ending in ".mzML"); one keyword per line so single settings are easy to tune.
mass_traces_all = mass_trace_detection_batch(
    experiments=centroid_dir,
    file_ending=".mzML",
    mass_error_ppm=10.0,
    noise_threshold_int=1000.0,
    reestimate_mt_sd="true",
    quant_method="median",
    trace_termination_criterion="outlier",
    trace_termination_outliers=3,
    min_trace_length=5.0,
    max_trace_length=-1.0,
)

100%|██████████| 6/6 [00:08<00:00,  1.43s/it]


## Elution peak detection
This should mostly yield a single peak, but sometimes separation happens in a long pipe.

In [6]:
# Batch elution peak detection on the previously detected mass traces.
mass_traces_deconvol_all = elution_peak_detection_batch(
    mass_traces_all=mass_traces_all,
    chrom_fwhm=10.0,
    chrom_peak_snr=2.0,
    width_filtering="fixed",
    min_fwhm=1.0,
    max_fwhm=60.0,
    masstrace_snr_filtering="false",
)

100%|██████████| 6/6 [00:02<00:00,  2.83it/s]


In [None]:
# NOTE(review): this cell has no execution count and `px.line()` is called
# without any data, so it looks like a leftover placeholder. If a plot is
# intended, pass the data to plot and move the import to the first cell.
import plotly.express as px

px.line()

## Feature detection

In [7]:
# Untargeted batch feature detection, using the centroided data together with
# the deconvoluted mass traces from the elution peak detection step.
feature_maps = feature_detection_untargeted_batch(
    experiments=centroid_dir,
    mass_traces_deconvol_all=mass_traces_deconvol_all,
    isotope_filtering_model="metabolites (2% RMS)",
    local_rt_range=3.0,
    local_mz_range=5.0,
    charge_lower_bound=1,
    charge_upper_bound=3,
    chrom_fwhm=10.0,
    report_summed_ints="true",
    enable_RT_filtering="false",
    mz_scoring_13C="false",
    use_smoothed_intensities="false",
    report_convex_hulls="true",
    report_chromatograms="false",
    remove_single_traces="true",
    mz_scoring_by_elements="false",
    elements="CHNOPS",
)

100%|██████████| 6/6 [00:06<00:00,  1.02s/it]


In [8]:
# Store the detected feature maps in the "features" folder of the run directory.
store_feature_maps(
    feature_maps,
    os.path.join(run_dir, "features"),
    names=centroid_dir,
    file_ending=".mzML",
)

Storing feature maps:


100%|██████████| 6/6 [00:00<00:00, 25.72it/s]


## Feature refinement

In [9]:
# Assign polarities to the feature maps, then split them into a positive and a
# negative set that are processed separately in the following cells.
feature_maps_pol = assign_feature_maps_polarity(feature_maps)
feature_maps_pos, feature_maps_neg = separate_feature_maps_pos_neg(feature_maps=feature_maps_pol)

Assign polarity to feature maps:


100%|██████████| 6/6 [00:00<00:00, 201.37it/s]


Separating feature maps:


100%|██████████| 6/6 [00:00<?, ?it/s]


In [10]:
# Candidate adducts for positive mode, as bytes literals with three
# ":"-separated fields (presumably formula:charge:probability — confirm
# against the adduct detection documentation).
pos_adducts = [
    b"H:+:0.6",
    b"K:+:0.1",
    b"Na:+:0.1",
    b"NH4:+:0.1",
    b"H-1O-1:+:0.05",
    b"H-3O-2:+:0.05",
]

# Detect adducts in the positive feature maps.
feature_maps_pos_adducts = detect_adducts(
    feature_maps_pos,
    potential_adducts=pos_adducts,
    q_try="feature",
    mass_max_diff=10.0,
    unit="ppm",
    max_minority_bound=2,
    verbose_level=3,
)

# Store the adduct-annotated maps in the "adduct_features" folder of the run directory.
store_feature_maps(
    feature_maps_pos_adducts,
    os.path.join(run_dir, "adduct_features"),
    names=centroid_dir,
    file_ending=".mzML",
)

Detecting adducts:


100%|██████████| 3/3 [00:01<00:00,  2.22it/s]


Storing feature maps:


100%|██████████| 3/3 [00:00<00:00, 16.65it/s]


In [11]:
# Candidate adducts for negative mode, as bytes literals with three
# ":"-separated fields (presumably formula:charge:probability — confirm
# against the adduct detection documentation).
neg_adducts = [
    b"H-1:-:0.6",
    b"H-3O-1:-:0.2",
    b"Cl:-:0.1",
    b"Na-1:-:0.05",
    b"K-1:-:0.025",
    b"Br:-:0.025",
]

# Detect adducts in the negative feature maps.
feature_maps_neg_adducts = detect_adducts(
    feature_maps_neg,
    potential_adducts=neg_adducts,
    q_try="feature",
    mass_max_diff=10.0,
    unit="ppm",
    max_minority_bound=2,
    verbose_level=3,
)

# NOTE(review): this writes to the same "adduct_features" folder and passes the
# same `names=centroid_dir` as the positive-mode cell — confirm that the
# positive-mode files are not overwritten and that the names match the samples.
store_feature_maps(
    feature_maps_neg_adducts,
    os.path.join(run_dir, "adduct_features"),
    names=centroid_dir,
    file_ending=".mzML",
)

Detecting adducts:


100%|██████████| 3/3 [00:01<00:00,  2.73it/s]


Storing feature maps:


100%|██████████| 3/3 [00:00<00:00, 36.22it/s]


In [12]:
# Link the adduct-annotated feature maps of each polarity into one consensus
# map (grouper type "QT"), then tag that consensus map with its polarity.
# The polarity helper takes and returns a list, hence the wrap/unwrap with [..][0].
consensus_map_pos = consensus_features_linking(feature_maps_pos_adducts, feature_grouper_type="QT")
consensus_map_pos = assign_feature_maps_polarity([consensus_map_pos], "positive")[0]
consensus_map_neg = consensus_features_linking(feature_maps_neg_adducts, feature_grouper_type="QT")
consensus_map_neg = assign_feature_maps_polarity([consensus_map_neg], "negative")[0]

Assign polarity to feature maps:


100%|██████████| 1/1 [00:00<00:00, 504.30it/s]


Assign polarity to feature maps:


100%|██████████| 1/1 [00:00<?, ?it/s]


In [13]:
# Positive mode: consensus map -> table -> filtered table -> imputed table.
cm_pos_df = consensus_map_to_df(consensus_map_pos)
filtered_cm_pos_df = filter_consensus_map_df(
    cm_pos_df,
    max_missing_values=1,
    min_feature_quality=0.80,
)
imputed_cm_pos_df = impute_consensus_map_df(
    filtered_cm_pos_df,
    n_nearest_neighbours=2,
)

# Negative mode: same steps with the same settings.
cm_neg_df = consensus_map_to_df(consensus_map_neg)
filtered_cm_neg_df = filter_consensus_map_df(
    cm_neg_df,
    max_missing_values=1,
    min_feature_quality=0.80,
)
imputed_cm_neg_df = impute_consensus_map_df(
    filtered_cm_neg_df,
    n_nearest_neighbours=2,
)

## Centroiding

In [14]:
def extract_centroids_cm_df(cm_df:pd.DataFrame) -> pd.DataFrame:
    """
    Collapse the per-sample intensity columns of a consensus map table into one mean intensity.

    Every column other than "RT", "mz" and "quality" is treated as an intensity
    column. Intensities equal to 0.0 are regarded as missing and are ignored
    when the row-wise mean is computed.

    Args:
        cm_df: Table with the columns "RT", "mz" and "quality" plus the
            intensity columns. It is not modified.

    Returns:
        A copy of the "RT", "mz" and "quality" columns with an additional
        "centroided_intensity" column holding the row-wise mean of the
        non-zero, non-NaN intensities.
    """
    meta_columns = ["RT", "mz", "quality"]
    sample_columns = [name for name in cm_df.columns if name not in meta_columns]

    # Mask zero intensities as NaN so that np.nanmean skips them.
    sample_values = cm_df[sample_columns]
    masked_values = np.where(sample_values == 0.0, np.nan, sample_values)

    centroids = cm_df[meta_columns].copy()
    centroids["centroided_intensity"] = np.nanmean(masked_values, axis=1)
    return centroids

In [15]:
# Reduce each consensus map table to one mean intensity per consensus feature.
# NOTE(review): the centroids are computed from the unfiltered `cm_*_df`
# frames, not from `filtered_cm_*_df` / `imputed_cm_*_df` — confirm this is intended.
cm_neg_centroid_df = extract_centroids_cm_df(cm_neg_df)
cm_pos_centroid_df = extract_centroids_cm_df(cm_pos_df)

## Labeling

In [16]:
# Accurate mass search against the HMDB files for each polarity.
# NOTE(review): unlike `data_dir` and `run_dir`, the database path is passed as
# a plain relative path — confirm it is resolved against the expected directory.
mass_search_pos_df = accurate_mass_search(
    consensus_map_pos,
    "../../databases/HMDB/",
    os.path.join(run_dir, "tmp"),
    "PositiveAdducts.tsv",
    "NegativeAdducts.tsv",
    "HMDBMappingFile.tsv",
    "HMDB2StructMapping.tsv",
    ionization_mode="positive",
)
mass_search_neg_df = accurate_mass_search(
    consensus_map_neg,
    "../../databases/HMDB/",
    os.path.join(run_dir, "tmp"),
    "PositiveAdducts.tsv",
    "NegativeAdducts.tsv",
    "HMDBMappingFile.tsv",
    "HMDB2StructMapping.tsv",
    ionization_mode="negative",
)

## Annotation

In [17]:
# Clean the "results" folder, then annotate the centroided consensus features
# of each polarity with the mass search hits and write one TSV per polarity.
clean_dir(os.path.join(run_dir, "results"))
id_pos_df = annotate_consensus_map_df(
    cm_pos_centroid_df,
    mass_search_pos_df,
    os.path.join(run_dir, "results", "pos.tsv"),
    mz_tolerance=1e-04,
)
id_neg_df = annotate_consensus_map_df(
    cm_neg_centroid_df,
    mass_search_neg_df,
    os.path.join(run_dir, "results", "neg.tsv"),
    mz_tolerance=1e-04,
)

## Merge

In [18]:
# Combined row count of the positive and negative annotation tables (before merging).
len(id_pos_df.index) + len(id_neg_df.index)

1437

In [19]:
# Merge the positive and negative annotation tables by m/z and display the result.
# NOTE(review): the displayed output contains identifiers such as "nan;nan",
# which suggests missing identifiers are turned into strings while merging —
# check how `merge_by_mz` joins the identifier column.
merged_results_df = merge_by_mz(id_pos_df, id_neg_df)
merged_results_df

Unnamed: 0,mz,centroided_intensity,identifier
0,767.426869,44872.417969,
1,404.206718,863380.062500,HMDB:HMDB15589;HMDB:HMDB30810;HMDB:HMDB35404;H...
2,654.581318,24298.349609,HMDB:HMDB39232;HMDB:HMDB07021;HMDB:HMDB07049;H...
3,265.111163,68061.914062,HMDB:HMDB00235
4,482.335418,9448.641602,nan;nan
...,...,...,...
1424,885.161623,6197.661133,nan;HMDB:HMDB35504;HMDB:HMDB37672;HMDB:HMDB376...
1425,508.055308,8057.011230,
1426,340.203391,10670.057617,HMDB:HMDB37270
1427,567.248394,5064.908691,HMDB:HMDB31751;HMDB:HMDB36884


In [20]:
# Write the merged table as a tab-separated file, without the index, into the results folder.
merged_results_df.to_csv(os.path.join(run_dir, "results", "merged.tsv"), sep="\t", index=False)

## Measurements

In [21]:
# Root-mean-square deviation between experimental and calculated m/z values.
# NOTE(review): only the positive-mode search results enter this value although
# the message says "overall" — confirm whether negative mode should be included.
rmsd = float(
    np.mean(
        (
            mass_search_pos_df["exp_mass_to_charge"]
            - mass_search_pos_df["calc_mass_to_charge"]
        ) ** 2
    )
) ** 0.5
print(f"Overall mass deviation of metabolites by RMSD: {round(rmsd,5)} Da")

Overall mass deviation of metabolites by RMSD: 0.00162 Da
