Merge pull request #275 from Mesnage-Org/ns-rse/273-further-column-pa…

…rametrisation
Mesnage-Org · Mar 20, 2024 · f2d1e74 · f2d1e74
2 parents f0920f6 + 9efcf2f
commit f2d1e74
Show file tree

Hide file tree

Showing 13 changed files with 42 additions and 77 deletions.
diff --git a/lib/docs/data_dictionary.md b/lib/docs/data_dictionary.md
@@ -1,11 +1,5 @@
 # Data Dictionary
 
----
-
-*We are currently updating the nomenclature of many of the data in `pgfinder`. For now, use this page with caution.*
-
----
-
 Effective use of `pgfinder` requires an understanding of the inputs and outputs of the software.
 
 ## Inputs
@@ -60,46 +54,20 @@ with a `.csv` extension. `pgfinder` has built-in mass lists for *Escherichia col
 
 `pgfinder` outputs `CSV` (`.csv`) files. The columns in these files depend on the input file format.
 
-### FTRS Output Files
-
-The column name of the first column contains [embedded metadata](#embedded-metadata) on the provenance of the file. Subsequent columns are defined as follows:
-
-| Column              | Description                                                                    | Unit |
-|---------------------|--------------------------------------------------------------------------------|------|
-| ID                  | Feature identified from ions corresponding to the same mass and retention time | NA   |
-| xicStart            | Extracted ion chromatogram starting time point                                 | min  |
-| xicEnd              | Extracted ion chromatogram starting time point                                 | min  |
-| ionCount            | Number of occurrence for ions corresponding to the same feature                | NA   |
-| chargeOrder         | Observed ion charge states                                                     | NA   |
-| rt                  | Retention time                                                                 | min  |
-| mwMonoisotopic      | Observed monoisotopic mass                                                     | Da   |
-| theo_mwMonoisotopic | Theoretical monoisotopic mass                                                  | Da   |
-| inferredStructure   | Inferred muropeptide structure                                                 | NA   |
-| maxIntensity        | Signal intensity calculated from Extracted Ion Chromatograms                   | NA   |
-
-### MaxQuant Output Files
-
-The column name of the first column contains [embedded metadata](#embedded-metadata) on the provenance of the file. Subsequent columns are defined as follows:
-
-| Column | Description | Unit |
-|---|---|---|
-| ID | Feature identified from ions corresponding to the same mass and retention time | NA |
-| rt | Retention time | min |
-| rt_length | Time window used to quantify signal intensity based on Extracted Ion Chromatograms | min |
-| mwMonoisotopic | Observed monoisotopic mass | Da |
-| theo_mwMonoisotopic | Theoretical monoisotopic mass | Da |
-| inferredStructure | Inferred muropeptide structure | NA |
-| maxIntensity | Signal intensity calculated from Extracted Ion Chromatograms  | NA |
 
 ### Embedded Metadata
 
+The first column contains the following metadata:
+
 | Data | Description |
 |---|---|
-| file | Input data file |
-| masses_file | Mass list file |
-| modifications | List of [modifications](#modifications) |
-| ppm | ppm tolerance |
-| rt_window | Window used for in-source decay correction (min) |
+| `file` | Input data file |
+| `masses_file` | Mass list file |
+| `rt_window` | Retention time window |
+| `modifications` | List of [modifications](#modifications) |
+| `ppm` | ppm tolerance |
+| `consolidation_ppm` | ppm tolerance for consolidation |
+| `version` | PGFinder version used in analysis |
 
 ### PGFinder Output
 

diff --git a/lib/pgfinder/config/columns.yaml b/lib/pgfinder/config/columns.yaml
@@ -21,11 +21,13 @@ maxquant:
 pgfinder:
   # Columns from files are renamed to these values
   input:
-    - ID
-    - RT (min)
-    - Charge
-    - Obs (Da)
-    - Intensity
+    id: ID
+    rt: RT (min)
+    charge: Charge
+    obs: Obs (Da)
+    intensity: Intensity
+  # Name of the column for the difference between observed and theoretical masses in parts per million
+  delta: Delta (ppm)
   # The inferred mass and structure columns
   inferred:
     mass: Theo (Da)
@@ -39,4 +41,4 @@ pgfinder:
     Inferred structure (best match): Structure
     RT (min): Consolidated RT (min)
     Theo (Da): Consolidated Theo (Da)
-    Delta ppm: Consolidated Delta ppm
+    Delta (ppm): Consolidated Delta (ppm)
diff --git a/lib/pgfinder/matching.py b/lib/pgfinder/matching.py
@@ -416,9 +416,9 @@ def apply_modification(mod):
 
 def calculate_ppm_delta(
     df: pd.DataFrame,
-    observed: str = "Obs (Da)",
-    theoretical: str = "Theo (Da)",
-    diff: str = "Delta ppm",
+    observed: str = COLUMNS["input"]["obs"],
+    theoretical: str = COLUMNS["inferred"]["mass"],
+    diff: str = COLUMNS["delta"],
 ) -> pd.DataFrame:
     """
     Calculate the difference in Parts Per Million between observed and theoretical masses.
@@ -494,19 +494,18 @@ def add_most_likely_structure(group) -> pd.DataFrame:
         """
         # Sort by lowest absolute ppm first, then break ties with structures (short to long)
         group.sort_values(
-            by=["Delta ppm", columns["inferred"]["structure"]],
+            by=[columns["delta"], columns["inferred"]["structure"]],
             ascending=[True, False],
             key=lambda k: abs(k) if is_numeric_dtype(k) else k,
             inplace=True,
             kind="stable",
         )
         group.reset_index(drop=True, inplace=True)
 
-        # The hard coded 'Delta ppm' and 'Intensity column names aren't yet handled dynamically
-        abs_min_ppm = group["Delta ppm"].loc[0]
-        abs_min_intensity = group["Intensity"].loc[0]
+        abs_min_ppm = group[columns["delta"]].loc[0]
+        abs_min_intensity = group[columns["input"]["intensity"]].loc[0]
 
-        min_ppm_structure_idxs = abs(abs(abs_min_ppm) - abs(group["Delta ppm"])) < consolidation_ppm
+        min_ppm_structure_idxs = abs(abs(abs_min_ppm) - abs(group[columns["delta"]])) < consolidation_ppm
         min_ppm_structures = ",   ".join(group[columns["inferred"]["structure"]].loc[min_ppm_structure_idxs])
 
         group.at[0, f"Inferred structure ({columns['best_match_suffix']})"] = min_ppm_structures
@@ -529,13 +528,12 @@ def consolidate_results(
     df: pd.DataFrame,
     intensity_column: str = f"Intensity ({COLUMNS['best_match_suffix']})",
     structure_column: str = f"Inferred structure ({COLUMNS['best_match_suffix']})",
-    rt_column: str = "RT (min)",
-    theo_column: str = "Theo (Da)",
-    ppm_column: str = "Delta ppm",
-    abundance_column: str = "Abundance (%)",
+    rt_column: str = COLUMNS["input"]["rt"],
+    theo_column: str = COLUMNS["inferred"]["mass"],
+    ppm_column: str = COLUMNS["delta"],
+    abundance_column: str = COLUMNS["consolidation"]["Abundance (%)"],
     oligomer_column: str = "Oligomerisation",
-    total_column: str = "Total Intensity",
-    suffix: str = "candidate",
+    total_column: str = COLUMNS["consolidation"]["Total Intensity"],
     columns: dict = COLUMNS,
 ) -> pd.DataFrame:
     """
@@ -561,9 +559,6 @@ def consolidate_results(
         Oligomer column.
     total_column : str
         Total column.
-    suffix : str
-        Suffix appended to all consolidation columns to avoid duplicate column names. If a column has '(consolidated)'
-        in it already this suffix is included within the parentheses.
 
     Returns
     -------

diff --git a/lib/pgfinder/pgio.py b/lib/pgfinder/pgio.py
@@ -87,12 +87,12 @@ def ftrs_reader(file: str | Path, columns: dict = COLUMNS) -> pd.DataFrame:
 
         if is_ftrs_52:
             ff.rename(
-                columns=dict(zip(columns["ftrs_52"], columns["pgfinder"]["input"])),
+                columns=dict(zip(columns["ftrs_52"], columns["pgfinder"]["input"].values())),
                 inplace=True,
             )
         elif is_ftrs_311:
             ff.rename(
-                columns=dict(zip(columns["ftrs_311"], columns["pgfinder"]["input"])),
+                columns=dict(zip(columns["ftrs_311"], columns["pgfinder"]["input"].values())),
                 inplace=True,
             )
         else:
@@ -122,7 +122,7 @@ def _select_and_order_columns(df: pd.DataFrame, columns: dict = COLUMNS) -> pd.D
     pd.DataFrame
         Subset of data frame with selected columns in specified order.
     """
-    cols_order = columns["pgfinder"]["input"] + list(columns["pgfinder"]["inferred"].values())
+    cols_order = list(columns["pgfinder"]["input"].values()) + list(columns["pgfinder"]["inferred"].values())
     # Move Intensity column to the end to match required order
     cols_order.append(cols_order.pop(cols_order.index("Intensity")))
     return df[cols_order].copy()

diff --git a/lib/pgfinder/validation.py b/lib/pgfinder/validation.py
@@ -55,7 +55,7 @@ def validate_raw_data_df(raw_data_df: pd.DataFrame, columns: dict = COLUMNS["pgf
     if not raw_data_df.attrs["file"]:
         raise ValueError("raw_data_df must have a file attribute.")
 
-    colnames = columns["input"] + list(columns["inferred"].values())
+    colnames = list(columns["input"].values()) + list(columns["inferred"].values())
 
     if not set(colnames).issubset(set(raw_data_df.columns.to_list())):
         raise ValueError("raw_data_df column names are incorrect")

diff --git a/...regtest_outputs/test_regression.test_matching_ftrs_baseline[ftrs_test_data_3.11.ftrs].out b/...regtest_outputs/test_regression.test_matching_ftrs_baseline[ftrs_test_data_3.11.ftrs].out
@@ -1,4 +1,4 @@
-Metadata,ID,RT (min),Charge,Obs (Da),Theo (Da),Delta ppm,Inferred structure,Intensity,Inferred structure (best match),Intensity (best match),Total Intensity,Structure,Abundance (%),Consolidated RT (min),Consolidated Theo (Da),Consolidated Delta ppm
+Metadata,ID,RT (min),Charge,Obs (Da),Theo (Da),Delta (ppm),Inferred structure,Intensity,Inferred structure (best match),Intensity (best match),Total Intensity,Structure,Abundance (%),Consolidated RT (min),Consolidated Theo (Da),Consolidated Delta (ppm)
 file : ftrs_test_data_3.11.ftrs,518,6.4482,2;1,941.413,941.4077,5.6422,gm-AEJA|1,435325463,gm-AEJA|1,435325463.0,1770046705.0,"gm-gm|0,   gm (x2)|0",0.0048,7.19,976.386,0.4
 masses_file : e_coli_monomer_masses.csv,651,9.0145,2;3,1864.8089,1864.8046,2.3238,gm-AEJAA=gm-AEJ|2,154801130,"gm-AEJAA=gm-AEJ|2,   gm-AEJA=gm-AEJA|2",154801130.0,,gm|0,0.0021,7.2,498.2061,-1.4
 rt_window : 0.5,651,9.0145,2;3,1864.8089,1864.8046,2.3238,gm-AEJA=gm-AEJA|2,154801130,,,,gm (Anh)|0,0.0019,7.37,478.1799,0.6

diff --git a/..._regtest_outputs/test_regression.test_matching_ftrs_baseline[ftrs_test_data_5.2.ftrs].out b/..._regtest_outputs/test_regression.test_matching_ftrs_baseline[ftrs_test_data_5.2.ftrs].out
@@ -1,4 +1,4 @@
-Metadata,ID,RT (min),Charge,Obs (Da),Theo (Da),Delta ppm,Inferred structure,Intensity,Inferred structure (best match),Intensity (best match),Total Intensity,Structure,Abundance (%),Consolidated RT (min),Consolidated Theo (Da),Consolidated Delta ppm
+Metadata,ID,RT (min),Charge,Obs (Da),Theo (Da),Delta (ppm),Inferred structure,Intensity,Inferred structure (best match),Intensity (best match),Total Intensity,Structure,Abundance (%),Consolidated RT (min),Consolidated Theo (Da),Consolidated Delta (ppm)
 file : ftrs_test_data_5.2.ftrs,1,6.7636,2;1,941.4046,941.4077,-3.3049,gm-AEJA|1,609868880.0,gm-AEJA|1,609868880.0,2335433116.0,gm (-Ac)|0,0.0017,2.45,456.1956,-1.7
 masses_file : e_coli_monomer_masses.csv,2,9.5372,2;3;1,1864.7989,1864.8046,-3.0439,gm-AEJAA=gm-AEJ|2,567751842.0,"gm-AEJAA=gm-AEJ|2,   gm-AEJA=gm-AEJA|2",567751842.0,,gm|0,0.0013,3.57,498.2061,-1.4
 rt_window : 0.5,2,9.5372,2;3;1,1864.7989,1864.8046,-3.0439,gm-AEJA=gm-AEJA|2,567751842.0,,,,gm (Anh)|0,0.0002,5.96,478.1799,-2.1

diff --git a/lib/tests/_regtest_outputs/test_regression.test_matching_mq_baseline.out b/lib/tests/_regtest_outputs/test_regression.test_matching_mq_baseline.out
@@ -1,4 +1,4 @@
-Metadata,ID,RT (min),Charge,Obs (Da),Theo (Da),Delta ppm,Inferred structure,Intensity,Inferred structure (best match),Intensity (best match),Total Intensity,Structure,Abundance (%),Consolidated RT (min),Consolidated Theo (Da),Consolidated Delta ppm
+Metadata,ID,RT (min),Charge,Obs (Da),Theo (Da),Delta (ppm),Inferred structure,Intensity,Inferred structure (best match),Intensity (best match),Total Intensity,Structure,Abundance (%),Consolidated RT (min),Consolidated Theo (Da),Consolidated Delta (ppm)
 file : maxquant_test_data.txt,3323,6.526,2,1144.4879,1144.4855,2.097,gm-AEJFF (Anh)|1,2666553380.0,gm-AEJFF (Anh)|1,2666553380.0,31353126785.9,gm (x2) (-Ac)|0,0.0036,4.82,934.3755,-1.6
 masses_file : e_coli_monomer_masses.csv,3323,6.526,2,1144.4879,1144.4932,-4.6309,gm-AEJHH|1,2666553380.0,,,,"gm-gm|0,   gm (x2)|0",0.0031,7.19,976.386,-0.5
 rt_window : 0.5,7485,10.734,2,1570.6633,1570.6536,6.1758,Lac-AEJYG=Lac-AEJ=Lac-AEJ|3,2032876150.0,"Lac-AEJYG=Lac-AEJ=Lac-AEJ|3,   Lac-AEJGQ=gm-AEJA|2,   gm-AEJGQ=Lac-AEJA|2",2032876150.0,,gm (Anh)|0,0.0007,7.19,478.1799,-0.0

diff --git a/lib/tests/conftest.py b/lib/tests/conftest.py
@@ -141,7 +141,7 @@ def df_diff_ppm(sample_df: pd.DataFrame) -> pd.DataFrame:
     """Return a target data frame for tests with diff_pm included."""
     DELTA_DF = pd.DataFrame(
         {
-            "Delta ppm": [
+            "Delta (ppm)": [
                 -500000.0,
                 427577.82345296827,
                 -0.5742528357381609,
@@ -154,7 +154,7 @@ def df_diff_ppm(sample_df: pd.DataFrame) -> pd.DataFrame:
     )
     DELTA_DF = pd.concat([sample_df, DELTA_DF], axis=1)
     DELTA_DF = DELTA_DF.convert_dtypes()
-    return DELTA_DF[["id", "obs", "exp", "Delta ppm", "inferred", "intensity"]]
+    return DELTA_DF[["id", "obs", "exp", "Delta (ppm)", "inferred", "intensity"]]
 
 
 @pytest.fixture

diff --git a/lib/tests/resources/consolidated.csv b/lib/tests/resources/consolidated.csv
@@ -1,4 +1,4 @@
-RT (min),Theo (Da),Delta ppm,Inferred structure (best match),Intensity (best match),Total Intensity,Structure,Abundance (%),Consolidated RT (min),Consolidated Theo (Da),Consolidated Delta ppm
+RT (min),Theo (Da),Delta (ppm),Inferred structure (best match),Intensity (best match),Total Intensity,Structure,Abundance (%),Consolidated RT (min),Consolidated Theo (Da),Consolidated Delta (ppm)
 1,100,0.1,gm|0,100,360,gm|0,0.5278,1,100,0.1
 2,200,0.2,"gm-AEJN|1,   gm-AEJGG|1,   gm-AEJX|1",50,,"gm-AEJN|1,   gm-AEJGG|1,   gm-AEJX|1",0.4167,4,400,0.4
 3,300,0.3,gm-AEJR|1,20,,gm-AEJR|1,0.0556,3,300,0.3

diff --git a/lib/tests/resources/long_results.csv b/lib/tests/resources/long_results.csv
@@ -1,4 +1,4 @@
-ID,RT (min),Charge,Obs (Da),Theo (Da),Delta ppm,Inferred structure,Intensity
+ID,RT (min),Charge,Obs (Da),Theo (Da),Delta (ppm),Inferred structure,Intensity
 919,3.5943,1,498.2054,498.2061,-1.404,gm|0,1679841
 1442,5.4292,2;1,984.4119,984.4145,-2.6412,gm-AEJX|1,718331
 1442,5.4292,2;1,984.4119,984.4136,1.7272,gm-AEJGG|1,718331

diff --git a/lib/tests/resources/unconsolidated.csv b/lib/tests/resources/unconsolidated.csv
@@ -1,4 +1,4 @@
-RT (min),Theo (Da),Delta ppm,Inferred structure (best match),Intensity (best match)
+RT (min),Theo (Da),Delta (ppm),Inferred structure (best match),Intensity (best match)
 1,100,0.1,gm|0,100
 2,200,0.2,"gm-AEJN|1,   gm-AEJGG|1,   gm-AEJX|1",50
 3,300,0.3,gm-AEJR|1,20

diff --git a/lib/tests/resources/wide_results.csv b/lib/tests/resources/wide_results.csv
@@ -1,4 +1,4 @@
-ID,RT (min),Charge,Obs (Da),Theo (Da),Delta ppm,Inferred structure,Intensity,Inferred structure (best match),Intensity (best match)
+ID,RT (min),Charge,Obs (Da),Theo (Da),Delta (ppm),Inferred structure,Intensity,Inferred structure (best match),Intensity (best match)
 919,3.5943,1,498.2054,498.2061,-1.404,gm|0,1679841,gm|0,1679841
 1442,5.4292,2;1,984.4119,984.4136,-1.7272,gm-AEJN|1,718331,"gm-AEJN|1,   gm-AEJGG|1,   gm-AEJX|1",718331
 1442,5.4292,2;1,984.4119,984.4136,1.7272,gm-AEJGG|1,718331,,