In [None]:
# Imports
import datetime
import json
from numpy import *
import pandas as pd
import pyopenms as oms
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px
from tqdm import tqdm

# Ignore seaborn warning for future deprecation of module part
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# import methods from FIA python script
from FIA import *

## Loading
Reads in files

In [2]:
experiment_path = "../data/example data/exampleA_ecolistrains/mg1655_P3-A8_neg.mzXML"
experiment = read_mzxml(experiment_path)

## Data preparation

### Limit spectrum
Limits range and size by subsampling

In [3]:
# lim_exp = limit_experiment(experiment, 51, 1699, 10**6, deepcopy=True)

In [4]:
# quick_plot(spectrum=lim_exp[0], xlim=[60.975, 61.04], plottype="line")

### Smoothing
Can reduce noise, if peaks are non-Gaussian

In [5]:
# smooth_exp = smooth_spectra(lim_exp, 0.01, deepcopy=True)

In [6]:
# quick_plot(spectrum=smooth_exp[0], xlim=[60.975, 61.04], plottype="line")

$\Rightarrow$ Smoothing not needed, as peaks are sufficently on point and approximately gaussian in distribution

### Centroiding
Reduces data, while retaining peaks

In [7]:
# quick_plot(spectrum=smooth_exp[0], xlim=[60.975, 61.04], plottype="scatter")

In [8]:
centroid_exp = centroid_experiment(experiment, deepcopy=True)

In [9]:
oms.MzMLFile().store("../runs/centroided_experiment.MzML", centroid_exp)

In [10]:
# quick_plot(spectrum=centroid_exp[0], xlim=[60.975, 61.04], plottype="scatter")

### Merging
Merging spectra in a specified retention time can increase detection

In [11]:
# quick_plot(spectrum=centroid_exp[0])

In [12]:
# merge_exp = merge_spectra(centroid_exp, block_size=centroid_exp.getNrSpectra(), deepcopy=True)

In [13]:
# quick_plot(spectrum=merge_exp[0])

### Normalization
Useful to compare peaks over spectra

In [14]:
# norm_exp = normalize_spectra(merge_exp, deepcopy=True)

In [15]:
# quick_plot(spectrum=norm_exp[0], plottype="line")

### De-isotoping
Adjusting mass/charge-ratio (m/z) for charge and isotopes

In [16]:
# dynamic_plot(norm_exp)

In [17]:
#deisotop_exp = deisotope_experiment(norm_exp,
#                                    fragment_tolerance=0.1, fragment_unit_ppm=False, min_charge=1, max_charge=1,
#                                    keep_only_deisotoped=True, min_isopeaks=2, max_isopeaks=5, make_single_charged=True, annotate_charge=True,
#                                    annotate_iso_peak_count=True, use_decreasing_model=True, start_intensity_check=False, add_up_intensity=False,
#                                    deepcopy=True)

In [18]:
# dynamic_plot(deisotop_exp)

### Merging positive & negative spectra

In [19]:
# merge positive and negative spectra to amplify signals

## Spectrum Alignment

### Obtaining meabolites
All possible metabolites from BiGG, ChEBI, ModelSEED, MetaNetX

In [20]:
# from urllib.request import urlretrieve
# urlretrieve( "http://bigg.ucsd.edu/static/namespace/bigg_models_metabolites.txt", "../data/databases/BiGG/metabolites.txt")
# urlretrieve( "https://ftp.ebi.ac.uk/pub/databases/chebi/Flat_file_tab_delimited/chemical_data.tsv", "../data/databases/ChEBI/chemical_data.tsv")
# urlretrieve( "https://raw.githubusercontent.com/ModelSEED/ModelSEEDDatabase/master/Biochemistry/compounds.tsv", "../data/databases/SEED/compounds.tsv")
# urlretrieve( "https://www.metanetx.org/cgi-bin/mnxget/mnxref/chem_prop.tsv", "../data/databases/MetaNetX/chem_prop.tsv")

### Condense metabolite info
Parse data to obtain a smaller table with metabolite info

In [21]:
# mnx_df = read_mnx("../data/databases/MetaNetX/chem_prop.tsv")
# mnx_to_oms(mnx_df).to_csv("../data/databases/compounds.tsv", sep="\t", index=False)

In [22]:
# compounds = pd.read_csv("../data/databases/compounds.tsv", sep="\t")

In [23]:
# join_df_by(mnx_df.dropna().iloc[0:10000], "mass", "name")

MetaNetX is periodically refreshing their database from BiGG, ChEBI, enviPath, HMDB, KEGG, LipidMaps, MetaCyc, Reactome, SABIO-RK, SwissLipids and ModelSEED. Therefore it should hold the most complete database.

### Defining the theoretical spectrum
Assigning theoretical spectra to the metabolites that can be present

In [24]:
#seq = oms.EmpiricalFormula("H6C2H")
#seq_formula = oms.EmpiricalFormula("H6C2H-1")
#isotopes = seq_formula.getIsotopeDistribution(oms.CoarseIsotopePatternGenerator(6))

In [25]:
#print("[M-H]- weight:", seq_formula.getMonoWeight())

### Assign metabolites to spectrum
Detect peaks at metabolite masses and obtain relative prevalence

In [26]:
# 

## Feature Detection

### Untargeted

In [27]:
untargeted_feature_detection(experiment, filepath="../runs/untargeted.featureXML")

Progress of 'mass trace detection':
-- done [took 0.24 s (CPU), 0.25 s (Wall)] -- 
Progress of 'elution peak detection':
-- done [took 1.97 s (CPU), 0.31 s (Wall)] -- 
Progress of 'assembling mass traces to features':
-- done [took 22.62 s (CPU), 4.29 s (Wall)] -- 


<pyopenms._dataframes._FeatureMapDF at 0x7a413fb06f80>

In [28]:
import os

def untargeted_feature_detection(filepath):
    feature_maps = []
    for file in mzML_files:
        exp = oms.MSExperiment()
        oms.MzMLFile().load(file, exp)

        # mass trace detection
        mass_traces = ( [] )
        mtd = oms.MassTraceDetection()
        mtd_par = ( mtd.getDefaults() )
        mtd_par.setValue("mass_error_ppm", 10.0)  # high-res instrument, orbitraps
        mtd_par.setValue("noise_threshold_int", 1.0e04 )  # data-dependent (usually works for orbitraps)
        mtd.setParameters(mtd_par)  # set the new parameters
        mtd.run(exp, mass_traces, 0)  # run mass trace detection

        # elution peak detection
        mass_traces_deconvol = []
        epd = oms.ElutionPeakDetection()
        epd_par = epd.getDefaults()
        epd_par.setValue(
            "width_filtering", "fixed"
        )  # The fixed setting filters out mass traces outside the [min_fwhm: 1.0, max_fwhm: 60.0] interval
        epd.setParameters(epd_par)
        epd.detectPeaks(mass_traces, mass_traces_deconvol)

        # feature detection
        feature_map = oms.FeatureMap()  # output features
        chrom_out = []  # output chromatograms
        ffm = oms.FeatureFindingMetabo()
        ffm_par = ffm.getDefaults()
        ffm_par.setValue(
            "remove_single_traces", "true"
        )  # remove mass traces without satellite isotopic traces
        ffm.setParameters(ffm_par)
        ffm.run(mass_traces_deconvol, feature_map, chrom_out)
        feature_map.setUniqueIds()  # Assigns a new, valid unique id per feature
        feature_map.setPrimaryMSRunPath(
            [file.encode()]
        )  # Sets the file path to the primary MS run (usually the mzML file)
        feature_maps.append(feature_map)

### Targeted

In [31]:
def targeted_feature_detection(experiment:oms.MSExperiment, experiment_file:str, compound_library_file:str, 
                               mz_window:float=5.0, rt_window:float=20.0, peak_width:float=3.0) -> oms.FeatureMap:
    """
    @mz_window: ppm
    @rt_window: s
    @peak_width: s
    returns: pyopenms.FeatureMap
    """
    # read library generate a metabo table with compounds
    metab_df = pd.read_csv(compound_library_file, sep="\t", engine="pyarrow")
    print("Defining metabolite table...")
    metab_table = []
    metab_df.apply(lambda row: 
                    metab_table.append(oms.FeatureFinderMetaboIdentCompound(
                                       row["CompoundName"], row["SumFormula"], row["Mass"], [ int(row["Charge"]) ],
                                       [ row["RetentionTime"] ], [ row["RetentionTimeRange"] ], [ row["IsotopeDistribution"] ] ) ), axis=1)
    print("Metabolite table defined...")

    # FeatureMap to store results
    fm = oms.FeatureMap()

    # create FeatureFinderAlgorithmMetaboIdent and assign ms data
    ff = oms.FeatureFinderAlgorithmMetaboIdent()
    ff.setMSData(experiment)
    params = ff.getParameters()
    params[b"extract:mz_window"] = mz_window
    params[b"extract:rt_window"] = rt_window
    params[b"detect:peak_width"] = peak_width
    ff.setParameters(params)
    print("Feature finder parameters set...")

    # run the FeatureFinderMetaboIdent with the metabo_table and mzML file path -> store results in fm
    ff.run(metab_table, fm, experiment_file)
    print("Feature map created.")

    return fm

In [32]:
fm = targeted_feature_detection(centroid_exp, "", "../data/databases/compounds.tsv", mz_window=5.0, rt_window=500.0, peak_width=3.0)
oms.FeatureXMLFile().store("../runs/targeted.featureXML", fm)

Defining metabolite table...
Metabolite table defined...
Feature finder parameters set...
Path or file name of primary MS run is empty. This might be the result of incomplete conversion. Not that tracing back e.g. identification results to the original file might more difficult.
To ensure tracability of results please prefer mzML files as primary MS run.
<Path or file name of primary MS run is empty. This might be the result of incomplete conversion. Not that tracing back e.g. identification results to the original file might more difficult.> occurred 2 times
Filename: ''


RuntimeError: 'Ra' found. in: Unknown element 'Ra'

### Plot

In [None]:
feature_map = read_feature_map_XML("../runs/untargeted.featureXML")

In [None]:
def extract_feature_coord(feature, mzs, retention_times, intensities, labels, sub:bool=False):
    for hull in feature.getConvexHulls():
        hull_split = hsplit(hull.getHullPoints(), 2)
        if sub:
            mzs = append(mzs, tile(feature.getMZ(), len(hull_split[0])))
            retention_times = append(retention_times, hull_split[1])
            intensities = append(intensities, hull_split[0])
            labels = append(labels, tile(feature.getMetaValue("label"), len(hull_split[0])))
        else:
            mzs = append(mzs, feature.getMZ())
            retention_times = append(retention_times, feature.getRT())
            intensities = append(intensities, feature.getIntensity())
            labels = append(labels, feature.getMetaValue("label"))
        

    return [mzs, retention_times, intensities, labels]

def plot_features_3D(feature_map:oms.FeatureMap, plottype:str=None) -> None:
    """
    Represents found features in 3D
    """
    mzs = empty([0])
    retention_times = empty([0])
    intensities = empty([0])
    labels = empty([0])

    for feature in feature_map:
        if feature.getSubordinates():
            for i, sub_feat in enumerate(feature.getSubordinates()):
                mzs, retention_times, intensities, labels = extract_feature_coord(sub_feat, mzs, retention_times, intensities, labels, True)
        else:
            mzs, retention_times, intensities, labels = extract_feature_coord(feature, mzs, retention_times, intensities, labels)

    df = pd.DataFrame({"m/z": mzs, "rt": retention_times, "intensity": intensities, "labels": labels})
    
    if plottype == "surface":
        fig = go.Figure(data=[go.Surface(z=df)])
        fig.update_layout(title='3D plot of features', autosize=False,
                    width=500, height=500,
                    margin=dict(l=65, r=50, b=65, t=90))
    elif plottype == "line":
        fig = px.line_3d(data_frame=df, x="m/z", y="rt", z="intensity", color="labels")
    elif plottype == "scatter":
        fig = px.scatter_3d(data_frame=df, x="m/z", y="rt", z="intensity", color="labels", size_max=8)
 
    if plottype:
        fig.show()
    
    return df

In [None]:
df = plot_features_3D(feature_map, plottype="scatter")

In [None]:
df

Unnamed: 0,m/z,rt,intensity,feature
0,57.032701,3.017,57.032684,1.092705e+19
1,57.032701,3.732,57.032696,1.092705e+19
2,57.032701,4.446,57.032703,1.092705e+19
3,57.032701,5.161,57.032700,1.092705e+19
4,57.032701,5.875,57.032711,1.092705e+19
...,...,...,...,...
75397,1129.255824,12.305,1130.256592,6.417598e+18
75398,1129.255824,11.591,1130.257202,6.417598e+18
75399,1129.255824,10.876,1130.258179,6.417598e+18
75400,1129.255824,10.162,1130.257812,6.417598e+18


In [None]:
z_data = pd.read_csv('https://raw.githubusercontent.com/plotly/datasets/master/api_docs/mt_bruno_elevation.csv')
z_data.values.shape

(25, 25)

In [None]:
feature = feature_map[0]