# Metabolomics data filtering

In [None]:
# Install the acore package via the %pip magic so the import in the next cell works.
# NOTE(review): no version is pinned here — consider pinning for reproducibility.
%pip install acore

In [46]:
import acore
from acore import filter_metabolomics as fm

import pandas as pd
import os
import importlib

# Re-execute the already-imported acore package; the returned module object is the cell output.
# NOTE(review): this reloads only the top-level package — `fm` (imported above) presumably
# still refers to the previously loaded submodule; confirm, or reload `fm` as well.
importlib.reload(acore)

<module 'acore' from '/Users/fcasc/Documents/03_CORE/acore/src/acore/__init__.py'>

Load in your data. We will use an example data set from MetaboLights. It can be found in example_data/MTBLS733.

In [3]:
# Relative path to the MetaboLights MTBLS733 example table.
data_path = "../../example_data/MTBLS733/MetaboLights-MTBLS733-Nextflow4MS-DIAL.csv"
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk. The plain read_csv call appears to have emitted a
# mixed-type DtypeWarning for this file (see the echoed source line in the old
# cell output); this is the remedy pandas recommends for it.
data = pd.read_csv(data_path, low_memory=False)

  data = pd.read_csv(data_path)


Let's look more into the data.

- The `.dtypes` attribute shows which columns are numeric (int64, float64) and which are non-numeric (object, bool).

- The `.describe()` method summarises the numeric columns.

- With .columns, we can see which columns our data has.

In [4]:
# Show the dtype pandas assigned to each column of the freshly loaded table.
data.dtypes

Unnamed: 0                                int64
identifier                               object
Alignment ID                             object
Average Mz                               object
Average Rt(min)                          object
duplicate_flag                           object
adduct_flag                              object
isotope_flag                             object
isotope_phr                             float64
Metabolite name                          object
Adduct type                              object
Post curation result                     object
Fill %                                   object
MS/MS assigned                             bool
Reference RT                            float64
Reference m/z                           float64
Formula                                  object
Ontology                                 object
INCHIKEY                                 object
SMILES                                   object
Annotation tag (VS1.0)                  

In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
data.describe()

Unnamed: 0.1,Unnamed: 0,isotope_phr,Reference RT,Reference m/z,Comment,Isotope tracking parent ID,Isotope tracking weight number,Total score,RT similarity,Dot product,...,SA1,SA2,SA3,SA4,SA5,SB1,SB2,SB3,SB4,SB5
count,25414.0,1920.0,1030.0,2381.0,0.0,0.0,0.0,2381.0,1030.0,0.0,...,25414.0,25414.0,25414.0,25414.0,25414.0,25414.0,25414.0,25414.0,25414.0,25414.0
mean,12706.5,34.766,12.121,334.69,,,,96.61,94.222,,...,9957555.225,10335501.583,10077343.175,9825950.091,10211771.35,9977547.643,9104556.034,9016387.519,8570219.095,8266045.133
std,7336.534,288.94,9.533,135.854,,,,4.252,7.827,,...,108000249.16,98034114.909,97551159.385,95307243.82,94928044.814,97761012.632,94167685.885,95227828.327,91395034.615,91592654.566
min,0.0,0.002,0.57,102.091,,,,85.0,70.2,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,6353.25,0.358,4.48,239.074,,,,94.2,90.8,,...,168714.807,159768.617,155587.153,154291.057,157803.98,113235.423,86203.745,71135.36,60665.533,53483.863
50%,12706.5,0.951,8.15,318.243,,,,98.8,98.35,,...,503910.47,506800.64,485047.35,476528.2,500428.8,450998.47,378236.56,346933.22,316411.64,299009.81
75%,19059.75,2.766,19.683,416.172,,,,99.9,99.9,,...,1961267.675,2009861.375,1959383.625,1940029.05,2052489.775,1919519.975,1718390.75,1660114.975,1557695.875,1520460.4
max,25413.0,7120.68,34.31,1270.449,,,,100.0,100.0,,...,12498892000.0,11776076000.0,11890802000.0,11584689000.0,11384590000.0,11422494000.0,11702578000.0,11545868000.0,11181568000.0,11133594000.0


In [6]:
# Report the table dimensions, then list every column label on its own line.
print(f"There are {data.shape[0]} rows and {data.shape[1]} columns in our data.")
print("Our data has the following columns:")
for column_label in data.columns:
    print("\t", column_label)

There are 25414 rows and 48 columns in our data.
Our data has the following columns:
	 Unnamed: 0
	 identifier
	 Alignment ID
	 Average Mz
	 Average Rt(min)
	 duplicate_flag
	 adduct_flag
	 isotope_flag
	 isotope_phr
	 Metabolite name
	 Adduct type
	 Post curation result
	 Fill %
	 MS/MS assigned
	 Reference RT
	 Reference m/z
	 Formula
	 Ontology
	 INCHIKEY
	 SMILES
	 Annotation tag (VS1.0)
	 RT matched
	 m/z matched
	 MS/MS matched
	 Comment
	 Manually modified for quantification
	 Manually modified for annotation
	 Isotope tracking parent ID
	 Isotope tracking weight number
	 Total score
	 RT similarity
	 Dot product
	 Reverse dot product
	 Fragment presence %
	 S/N average
	 Spectrum reference file name
	 MS1 isotopic spectrum
	 MS/MS spectrum
	 SA1
	 SA2
	 SA3
	 SA4
	 SA5
	 SB1
	 SB2
	 SB3
	 SB4
	 SB5


The m/z and RT columns (`Average Mz` and `Average Rt(min)`) were read as dtype `object` rather than as numbers, so we need to convert them to numeric values before we can filter.

In [17]:
# Convert the m/z and RT columns to numeric dtypes; print_na_summary=True asks
# for a per-column message about NaN values after conversion.
numeric_data = fm.make_numeric.convert_to_numeric(
    data,
    ["Average Mz", "Average Rt(min)"],
    print_na_summary=True,
)

Column Average Mz has been converted successfully, no NaN values.
Column Average Rt(min) has been converted successfully, no NaN values.


In [18]:
# Display the converted table (pandas renders a truncated preview of the rows and columns).
numeric_data

Unnamed: 0.1,Unnamed: 0,identifier,Alignment ID,Average Mz,Average Rt(min),duplicate_flag,adduct_flag,isotope_flag,isotope_phr,Metabolite name,...,SA1,SA2,SA3,SA4,SA5,SB1,SB2,SB3,SB4,SB5
0,0,0.03_171.99,1848,171.994,0.031,,,,,Unknown,...,4075218.800,3107714.500,3224671.200,3100795.200,3218963.500,4109884.800,3473687.000,3555931.000,3700617.800,4714087.500
1,1,0.04_182.99,2151,182.986,0.038,,,,,Unknown,...,8745378.000,6435527.500,6564399.000,6728224.000,6689435.000,8370828.000,6570529.000,6653673.500,6556864.500,7956248.500
2,2,0.04_196.15,2554,196.155,0.042,,,,,Unknown,...,3511845.200,2745459.000,2600677.000,2587100.200,2586849.200,2931167.500,2311969.000,2228402.200,2217159.000,2885036.200
3,3,0.41_118.44,554,118.440,0.407,,,,,Unknown,...,35294.020,30813.000,50259.050,24106.030,33674.960,31959.850,25801.860,21137.220,19737.760,18970.890
4,4,0.43_118.09,539,118.086,0.427,Match #065: Possible duplicate of 0.52_118.09 ...,,,,w/o MS2:Valine; LC-ESI-QTOF; MS2; CE,...,57559904.000,47708448.000,47418344.000,46128584.000,47533796.000,57822920.000,47149300.000,46561192.000,46621160.000,44220552.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25409,25409,37.14_157.04,1483,157.036,37.136,,,,,Unknown,...,26608880.000,27224586.000,38846140.000,38032408.000,34086672.000,25443546.000,34003720.000,39564276.000,43354640.000,38120636.000
25410,25410,37.33_157.04,1479,157.036,37.326,,,,,Unknown,...,29026352.000,30503354.000,36343592.000,37094708.000,38317424.000,25160338.000,33506156.000,32792946.000,35234460.000,31790370.000
25411,25411,37.50_157.04,1482,157.036,37.501,,,,,Unknown,...,24405672.000,24932406.000,35285380.000,38054020.000,30952190.000,31250330.000,29900362.000,38608168.000,35456848.000,33066372.000
25412,25412,37.76_157.04,1477,157.035,37.764,,,,,Unknown,...,22355124.000,23754568.000,33393302.000,37931356.000,29980842.000,22872974.000,28784320.000,30042418.000,35670736.000,35484512.000


In [19]:
# Re-check the column dtypes after the conversion step.
numeric_data.dtypes

Unnamed: 0                                int64
identifier                               object
Alignment ID                             object
Average Mz                              float64
Average Rt(min)                         float64
duplicate_flag                           object
adduct_flag                              object
isotope_flag                             object
isotope_phr                             float64
Metabolite name                          object
Adduct type                              object
Post curation result                     object
Fill %                                   object
MS/MS assigned                             bool
Reference RT                            float64
Reference m/z                           float64
Formula                                  object
Ontology                                 object
INCHIKEY                                 object
SMILES                                   object
Annotation tag (VS1.0)                  

Now the Mz and RT columns are dtype float, so they are numeric. That means that now we can proceed with filtering.

We want to filter based on RT by removing all features with an RT below a certain time, in order to exclude features that elute in the dead volume. In our case, the cut-off is 0.8 minutes.

We also want to filter out features that have an m/z value below 600 and have m/z decimals between 0.3 and 0.9.

We can do both of those things with the filtering function.

In [21]:
# Apply the RT dead-volume cut-off (0.8 min) and the m/z-based filter
# (mz_low=600, decimal window 0.3-0.9) in a single call. The second return
# value is presumably the table of removed features requested via save_removed.
filtered_data, removed_features = fm.filter_mz_rt(
    numeric_data,
    "Average Rt(min)",
    "Average Mz",
    rt_dead_volume=0.8,
    mz_low=600,
    mz_decimals=(0.3, 0.9),
    save_removed=True,
)



Column Average Rt(min) has been converted successfully, no NaN values.
Column Average Mz has been converted successfully, no NaN values.
Filtering based on RT completed.
Filtering based on m/z was completed.


Let's look at our filtered data.

In [26]:
# Compare the dimensions of the original and filtered tables and report how many
# rows ended up in the removed-features table.
print(f"There are {data.shape[0]} rows and {data.shape[1]} columns in our original data.")
print(f"There are {filtered_data.shape[0]} rows and {filtered_data.shape[1]} columns in our filtered data.")
print(f"{removed_features.shape[0]} features were removed from our data.")

There are 25414 rows and 48 columns in our original data.
There are 18917 rows and 48 columns in our filtered data.
6497 features were removed from our data.


We can also look into our removed features.

In [45]:
# Shapes of the removed-feature subsets, split by the value recorded in the
# "RemovalReason" column; index [0] of each shape is the row count.
bdv = removed_features.loc[removed_features["RemovalReason"].eq("BelowDeadVolume")].shape
nbr = removed_features.loc[removed_features["RemovalReason"].eq("NotBiologicallyRelevant")].shape
print(f"{bdv[0]} features were removed because they were below the dead volume.")
print(f"{nbr[0]} features were removed because they were deemed not biologically relevant (m/z-based filtering).")

# Show the removed-features table as the cell output.
removed_features

1247 features were removed because they were below the dead volume.
5250 features were removed because they were deemed not biologically relevant (m/z-based filtering).


Unnamed: 0.1,Unnamed: 0,identifier,Alignment ID,Average Mz,Average Rt(min),duplicate_flag,adduct_flag,isotope_flag,isotope_phr,Metabolite name,...,SA2,SA3,SA4,SA5,SB1,SB2,SB3,SB4,SB5,RemovalReason
0,0,0.03_171.99,1848,171.994,0.031,,,,,Unknown,...,3107714.500,3224671.200,3100795.200,3218963.500,4109884.800,3473687.000,3555931.000,3700617.800,4714087.500,BelowDeadVolume
1,1,0.04_182.99,2151,182.986,0.038,,,,,Unknown,...,6435527.500,6564399.000,6728224.000,6689435.000,8370828.000,6570529.000,6653673.500,6556864.500,7956248.500,BelowDeadVolume
2,2,0.04_196.15,2554,196.155,0.042,,,,,Unknown,...,2745459.000,2600677.000,2587100.200,2586849.200,2931167.500,2311969.000,2228402.200,2217159.000,2885036.200,BelowDeadVolume
3,3,0.41_118.44,554,118.440,0.407,,,,,Unknown,...,30813.000,50259.050,24106.030,33674.960,31959.850,25801.860,21137.220,19737.760,18970.890,BelowDeadVolume
4,4,0.43_118.09,539,118.086,0.427,Match #065: Possible duplicate of 0.52_118.09 ...,,,,w/o MS2:Valine; LC-ESI-QTOF; MS2; CE,...,47708448.000,47418344.000,46128584.000,47533796.000,57822920.000,47149300.000,46561192.000,46621160.000,44220552.000,BelowDeadVolume
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6492,25397,36.34_361.84,10872,361.841,36.344,,,,,Unknown,...,41851.130,51202.730,33478.470,47338.650,47860.230,32742.440,28718.350,37940.480,34328.280,NotBiologicallyRelevant
6493,25399,36.40_304.89,7687,304.891,36.398,Match #1247,,,,Unknown,...,2669204.200,3097163.200,2243482.800,2396253.500,2772311.000,2555962.000,2751671.800,2578197.000,2751743.000,NotBiologicallyRelevant
6494,25400,36.43_199.83,2668,199.827,36.432,,,,,Unknown,...,25709.320,51583.420,37137.120,0.000,27534.100,12466.370,14449.100,0.000,19478.900,NotBiologicallyRelevant
6495,25401,36.44_182.31,2144,182.308,36.435,,,,,Unknown,...,34318.300,14549.960,11305.220,14131.370,18496.300,14793.800,12596.850,14604.950,23033.660,NotBiologicallyRelevant
