In [4]:
# Imports
import datetime
import json
import os

import numpy as np
from numpy import *
import pandas as pd
import pyopenms as oms
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px
from tqdm import tqdm

# Ignore seaborn warning for future deprecation of module part
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# import methods from FIA python script
from FIA import *

## Loading
Reads in files

In [49]:
# Relative path to the example mzXML run analysed in this notebook.
experiment_path = "../data/example data/exampleA_ecolistrains/mg1655_P3-A8_neg.mzXML"
# read_mzxml is not defined in this notebook; presumably it comes from `from FIA import *`.
experiment = read_mzxml(experiment_path)

## Data preparation

### Limit spectrum
Limits range and size by subsampling

In [50]:
# lim_exp = limit_experiment(experiment, 51, 1699, 10**6, deepcopy=True)

In [51]:
# quick_plot(spectrum=lim_exp[0], xlim=[60.975, 61.04], plottype="line")

### Smoothing
Can reduce noise, if peaks are non-Gaussian

In [52]:
# smooth_exp = smooth_spectra(lim_exp, 0.01, deepcopy=True)

In [53]:
# quick_plot(spectrum=smooth_exp[0], xlim=[60.975, 61.04], plottype="line")

$\Rightarrow$ Smoothing is not needed, as the peaks are sufficiently on point and approximately Gaussian in distribution

### Centroiding
Reduces data, while retaining peaks

In [54]:
# quick_plot(spectrum=smooth_exp[0], xlim=[60.975, 61.04], plottype="scatter")

In [55]:
# Centroid the unmodified experiment; the limiting and smoothing cells above are commented out.
# NOTE(review): deepcopy=True presumably leaves `experiment` untouched - confirm in the FIA helper.
centroid_exp = centroid_experiment(experiment, deepcopy=True)

In [56]:
# Write the centroided experiment to disk as mzML; the untargeted and targeted
# feature detection cells further down pass this same path as their input file.
oms.MzMLFile().store("../runs/centroided_experiment.MzML", centroid_exp)

In [57]:
# quick_plot(spectrum=centroid_exp[0], xlim=[60.975, 61.04], plottype="scatter")

### Merging
Merging spectra within a specified retention time window can improve detection

In [58]:
# quick_plot(spectrum=centroid_exp[0])

In [59]:
# merge_exp = merge_spectra(centroid_exp, block_size=centroid_exp.getNrSpectra(), deepcopy=True)

In [60]:
# quick_plot(spectrum=merge_exp[0])

### Normalization
Useful to compare peaks over spectra

In [61]:
# norm_exp = normalize_spectra(merge_exp, deepcopy=True)

In [62]:
# quick_plot(spectrum=norm_exp[0], plottype="line")

### De-isotoping
Adjusting mass/charge-ratio (m/z) for charge and isotopes

In [63]:
# dynamic_plot(norm_exp)

In [64]:
#deisotop_exp = deisotope_experiment(norm_exp,
#                                    fragment_tolerance=0.1, fragment_unit_ppm=False, min_charge=1, max_charge=1,
#                                    keep_only_deisotoped=True, min_isopeaks=2, max_isopeaks=5, make_single_charged=True, annotate_charge=True,
#                                    annotate_iso_peak_count=True, use_decreasing_model=True, start_intensity_check=False, add_up_intensity=False,
#                                    deepcopy=True)

In [65]:
# dynamic_plot(deisotop_exp)

### Merging positive & negative spectra

In [66]:
# merge positive and negative spectra to amplify signals

## Spectrum Alignment

### Obtaining metabolites
All possible metabolites from BiGG, ChEBI, ModelSEED, MetaNetX

In [67]:
# from urllib.request import urlretrieve
# urlretrieve( "http://bigg.ucsd.edu/static/namespace/bigg_models_metabolites.txt", "../data/databases/BiGG/metabolites.txt")
# urlretrieve( "https://ftp.ebi.ac.uk/pub/databases/chebi/Flat_file_tab_delimited/chemical_data.tsv", "../data/databases/ChEBI/chemical_data.tsv")
# urlretrieve( "https://raw.githubusercontent.com/ModelSEED/ModelSEEDDatabase/master/Biochemistry/compounds.tsv", "../data/databases/SEED/compounds.tsv")
# urlretrieve( "https://www.metanetx.org/cgi-bin/mnxget/mnxref/chem_prop.tsv", "../data/databases/MetaNetX/chem_prop.tsv")

### Condense metabolite info
Parse data to obtain a smaller table with metabolite info

In [68]:
# mnx_df = read_mnx("../data/databases/MetaNetX/chem_prop.tsv")
# mnx_to_oms(mnx_df).to_csv("../data/databases/compounds.tsv", sep="\t", index=False)

In [69]:
# compounds = pd.read_csv("../data/databases/compounds.tsv", sep="\t")

In [70]:
# join_df_by(mnx_df.dropna().iloc[0:10000], "mass", "name")

MetaNetX periodically refreshes its database from BiGG, ChEBI, enviPath, HMDB, KEGG, LipidMaps, MetaCyc, Reactome, SABIO-RK, SwissLipids and ModelSEED. It should therefore be the most complete of these databases.

### Defining the theoretical spectrum
Assigning theoretical spectra to the metabolites that can be present. This is more suited to peptide detection!

In [71]:
#seq = oms.EmpiricalFormula("H6C2H")
#seq_formula = oms.EmpiricalFormula("H6C2H-1")
#isotopes = seq_formula.getIsotopeDistribution(oms.CoarseIsotopePatternGenerator(6))

In [72]:
#print("[M-H]- weight:", seq_formula.getMonoWeight())

### Assign metabolites to spectrum
Detect peaks at metabolite masses and obtain relative prevalence

In [73]:
# 

## Feature Detection

### Untargeted

In [74]:
# Untargeted feature detection for every file in `filepaths` (currently only the
# centroided run), followed by retention time alignment and adduct detection.
# NOTE(review): every input file is given the same feature_filepath; if the FIA helper
# writes to it, results of several input files would overwrite each other - confirm.
# NOTE(review): with a single input file the consensus_features cell below fails with
# "At least two maps must be given!"; feature linking needs further runs in `filepaths`.
filepaths = ["../runs/centroided_experiment.MzML"]
fms = [ untargeted_feature_detection(filepath=filepath,
                                     feature_filepath="../runs/untargeted.featureXML",
                                     mass_error_ppm=10.0,
                                     noise_threshold_int=10000.0,
                                     remove_single_traces="true") 
        for filepath in filepaths ]
fms_rt = align_retention_time(fms)
fms_a = detect_adducts(fms_rt)

Progress of 'mass trace detection':
-- done [took 0.01 s (CPU), 0.01 s (Wall)] -- 
Progress of 'elution peak detection':
-- done [took 0.09 s (CPU), 0.02 s (Wall)] -- 
Progress of 'assembling mass traces to features':
-- done [took 0.07 s (CPU), 0.01 s (Wall)] -- 
MassExplainer table size: 312
#Spectra that needed to and could be picked by MS-level:
<RT window size calculated as 240 seconds.> occurred 14 times
  MS-level 1: 38 / 38
38 spectra and 0 chromatograms stored.
Generating Masses with threshold: -6.90776 ...
done
2 of 4 valid net charge compomer results did not pass the feature charge constraints
Inferring edges raised edge count from 2 to 2
Found 2 putative edges (of 23) and avg hit-size of 2.5
Using solver 'coinor' ...
Optimal solution found!
 Branch and cut took 0.020667 seconds,  with objective value: 0.12.
ILP score is: 0.12
Agreeing charges: 4/4


In [75]:
# Store the feature maps obtained after adduct detection.
# NOTE(review): store_feature_maps is presumably a FIA helper; the output location is not visible here.
store_feature_maps(fms_a)

In [76]:
# Link the features of all maps into a consensus map and store it as consensusXML.
# NOTE: in the recorded run this cell raised "RuntimeError: At least two maps must be given!",
# presumably because `fms_a` only holds the single map built from `filepaths`.
cm = consensus_features(fms_a)
oms.ConsensusXMLFile().store("../runs/FeatureMatrix_.consensusXML", cm)

RuntimeError: At least two maps must be given!

In [ ]:
# Consensus map as a table. Depends on `cm` from the previous cell, which failed in the
# recorded run. Note that `df` is rebound to the plotted feature table further down.
df = cm.get_df()
df

#### Link metabolites to features

In [ ]:
# Link it 

### Targeted

**FIXME:** Can't find the error in the targeted feature detection below.

In [5]:
def feature_detection_targeted(filepath: str, metab_table:list, experiment: oms.MSExperiment = None,
                               mz_window:float=5.0, rt_window:float=20.0, peak_width:float=3.0) -> oms.FeatureMap:
    """
    Targeted feature detection with pyOpenMS' FeatureFinderAlgorithmMetaboIdent.

    @filepath: path of the mzML file; it is loaded when no experiment is given,
               handed to the feature finder and recorded as primary MS run path
    @metab_table: metabolite table handed to FeatureFinderAlgorithmMetaboIdent.run
    @experiment: already loaded MS data; when None the data are read from filepath
    @mz_window: ppm
    @rt_window: s
    @peak_width: s
    returns: pyopenms.FeatureMap
    """
    # Test against None explicitly instead of relying on the truthiness of the
    # experiment object.
    if experiment is None:
        experiment = oms.MSExperiment()
        oms.MzMLFile().load(filepath, experiment)

    # FeatureMap to store results
    feature_map = oms.FeatureMap()

    # create FeatureFinderAlgorithmMetaboIdent and assign ms data
    ff = oms.FeatureFinderAlgorithmMetaboIdent()
    ff.setMSData(experiment)

    # Parameter keys follow "<section>:<name>" without a trailing underscore; the
    # peak width belongs to the "detect" section, not to "extract". The previous
    # keys ("extract:mz_window_", ...) are not parameters of this algorithm.
    ff_par = ff.getDefaults()
    ff_par.setValue(b"extract:mz_window", mz_window)
    ff_par.setValue(b"extract:rt_window", rt_window)
    ff_par.setValue(b"detect:peak_width", peak_width)
    ff.setParameters(ff_par)

    # run the FeatureFinderMetaboIdent with the metab_table and mzML file path -> store results in feature_map
    ff.run(metab_table, feature_map, filepath)

    feature_map.setUniqueIds()  # Assigns a new, valid unique id per feature
    feature_map.setPrimaryMSRunPath([filepath.encode()])

    return feature_map

def targeted_feature_detection(filepath: str, experiment:oms.MSExperiment, compound_library_file:str, 
                               mz_window:float=5.0, rt_window:float=20.0, peak_width:float=3.0) -> oms.FeatureMap:
    """
    Builds the metabolite table from a compound library file and runs the
    targeted feature detection with it.

    @filepath: path of the mzML file belonging to the experiment
    @experiment: already loaded MS data; when None the data are read from filepath
    @compound_library_file: file handed to define_metabolite_table
    @mz_window: ppm
    @rt_window: s
    @peak_width: s
    returns: pyopenms.FeatureMap
    """
    # Test against None explicitly instead of relying on the truthiness of the
    # experiment object.
    if experiment is None:
        experiment = oms.MSExperiment()
        oms.MzMLFile().load(filepath, experiment)

    print("Defining metabolite table...")
    metab_table = define_metabolite_table(compound_library_file)
    print("Metabolite table defined...")

    # Forward the real file path (previously an empty string was passed), so that the
    # feature finder and the primary MS run path of the result refer to the source file.
    feature_map = feature_detection_targeted(filepath, metab_table, experiment, mz_window, rt_window, peak_width)
    print("Feature map created.")

    return feature_map

In [6]:
# Metabolite table built from the compound table file.
# NOTE(review): define_metabolite_table is not defined in this notebook; presumably a FIA helper.
mt = define_metabolite_table("../data/databases/compounds.tsv")

In [None]:
# Targeted detection on the stored centroided run. No experiment is passed, so
# feature_detection_targeted loads the MS data from the given file path itself.
fm = feature_detection_targeted("../runs/centroided_experiment.MzML", mt, mz_window=5.0, rt_window=200.0, peak_width=3.0)

In [None]:
# fm = targeted_feature_detection("../runs/centroided_experiment.MzML", centroid_exp, "../data/databases/compounds.tsv", mz_window=5.0, rt_window=500.0, peak_width=3.0)
# Write the targeted feature map `fm` from the previous cell to disk as featureXML.
oms.FeatureXMLFile().store("../runs/targeted.featureXML", fm)

### Plot

In [None]:
# Load the features from the path that was passed as feature_filepath to
# untargeted_feature_detection above (read_feature_map_XML is presumably a FIA helper).
feature_map = read_feature_map_XML("../runs/untargeted.featureXML")

In [None]:
def extract_feature_coord(feature, mzs, retention_times, intensities, labels, sub:bool=False):
    """
    Appends the plot coordinates of one feature to the given coordinate arrays.

    @feature: feature (or subordinate feature) to read from
    @mzs, retention_times, intensities, labels: arrays collected so far; they are
        not modified, extended copies are returned
    @sub: if True, every point of every convex hull is added, with the hull points
          read as (retention time, intensity) pairs and the m/z and label of the
          feature repeated for each point; if False, a single row with the m/z,
          retention time, intensity and label of the feature itself is added
    returns: list [mzs, retention_times, intensities, labels]
    """
    if not sub:
        # One row per feature. Previously the same row was appended once per convex
        # hull, which duplicated features with several hulls and dropped features
        # without any hull.
        mzs = np.append(mzs, feature.getMZ())
        retention_times = np.append(retention_times, feature.getRT())
        intensities = np.append(intensities, feature.getIntensity())
        labels = np.append(labels, feature.getMetaValue("label"))
        return [mzs, retention_times, intensities, labels]

    for hull in feature.getConvexHulls():
        # Force an (n, 2) layout so that hulls without points are handled as well.
        points = np.asarray(hull.getHullPoints()).reshape(-1, 2)
        n_points = len(points)

        mzs = np.append(mzs, np.tile(feature.getMZ(), n_points))
        # Column 0 holds the retention time and column 1 the intensity (layout used by
        # the pyOpenMS feature plotting example); the two columns were swapped before.
        retention_times = np.append(retention_times, points[:, 0])
        intensities = np.append(intensities, points[:, 1])
        labels = np.append(labels, np.tile(feature.getMetaValue("label"), n_points))

    return [mzs, retention_times, intensities, labels]

def plot_features_3D(feature_map:oms.FeatureMap, plottype:str=None) -> pd.DataFrame:
    """
    Represents found features in 3D

    @feature_map: features to plot; for features with subordinates the hull points
                  of the subordinates are used, otherwise the feature itself
    @plottype: "surface", "line" or "scatter"; if None (or empty) nothing is plotted
    returns: pandas.DataFrame with the columns "m/z", "rt", "intensity" and "labels"
    raises: ValueError for an unknown plottype
    """
    # Reject unknown plot types before doing any work; previously they ran into a
    # NameError at fig.show() because no figure had been created.
    if plottype and plottype not in ("surface", "line", "scatter"):
        raise ValueError(f"Unknown plottype '{plottype}', expected 'surface', 'line' or 'scatter'.")

    mzs = np.empty([0])
    retention_times = np.empty([0])
    intensities = np.empty([0])
    labels = np.empty([0])

    for feature in feature_map:
        subordinates = feature.getSubordinates()
        if subordinates:
            for sub_feat in subordinates:
                mzs, retention_times, intensities, labels = extract_feature_coord(sub_feat, mzs, retention_times, intensities, labels, True)
        else:
            mzs, retention_times, intensities, labels = extract_feature_coord(feature, mzs, retention_times, intensities, labels)

    df = pd.DataFrame({"m/z": mzs, "rt": retention_times, "intensity": intensities, "labels": labels})

    if plottype == "surface":
        # A surface needs a 2D grid of z values. Spread the intensities over an
        # rt x m/z grid instead of passing the long-format table (including the
        # label column) as z. Grid cells without a data point stay empty (NaN).
        grid = df.pivot_table(index="rt", columns="m/z", values="intensity", aggfunc="max")
        fig = go.Figure(data=[go.Surface(z=grid.to_numpy(), x=grid.columns.to_numpy(), y=grid.index.to_numpy())])
        fig.update_layout(title='3D plot of features', autosize=False,
                    width=500, height=500,
                    margin=dict(l=65, r=50, b=65, t=90),
                    scene=dict(xaxis_title="m/z", yaxis_title="rt", zaxis_title="intensity"))
    elif plottype == "line":
        fig = px.line_3d(data_frame=df, x="m/z", y="rt", z="intensity", color="labels")
    elif plottype == "scatter":
        fig = px.scatter_3d(data_frame=df, x="m/z", y="rt", z="intensity", color="labels", size_max=8)

    if plottype:
        fig.show()

    return df

In [None]:
# Show the features as a 3D scatter plot and keep the plotted table.
# Note: this rebinds `df`, which held the consensus table further up.
df = plot_features_3D(feature_map, plottype="scatter")

In [None]:
# Display the feature table returned by plot_features_3D.
df

In [None]:
# Plotly's mt_bruno_elevation example data, read directly from GitHub (needs network access).
# NOTE(review): presumably loaded to check which array shape a surface plot expects - confirm or remove.
z_data = pd.read_csv('https://raw.githubusercontent.com/plotly/datasets/master/api_docs/mt_bruno_elevation.csv')
z_data.values.shape

In [None]:
# First feature of the loaded untargeted feature map.
feature = feature_map[0]