# Parallelized Data Filtering Pipeline
## Prerequisites

In [1]:
import glob
import os
import re
from typing import List, Dict

import pandas as pd
import wget
from openpyxl import load_workbook

from mmproteo.utils import log
from mmproteo.utils.formats.mz import filter_files
from mmproteo.utils.utils import ensure_dir_exists

In [2]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 2000)

In [3]:
logger = log.DummyLogger(verbose=True)

INFO: Printing to Stdout


## Data Import

In [4]:
pwd

'/tf/workspace/notebooks'

Code to run before running this pipeline to get/create all relevant data files:

```
cd ../datasets
mkdir PXD010000
cd PXD010000
mmproteo -p PXD010000 -e mzml,mzid --thread-count 0 download extract mz2parquet
```

In [5]:
PROJECT = "PXD010000"
DATA_PATH = os.path.join("..", "datasets", PROJECT)
DUMP_PATH = os.path.join("..", "dumps", PROJECT)
TRAINING_COLUMNS_DUMP_PATH = os.path.join(DUMP_PATH, "training_columns")
MZMLID_FILES_PATH = os.path.join(DATA_PATH, "*_mzmlid.parquet")

In [6]:
# taken from https://www.biorxiv.org/content/10.1101/428334v2.supplementary-material
SPECIES_MAPPING_FILE_DOWNLOAD_URL = 'https://www.biorxiv.org/highwire/filestream/128716/field_highwire_adjunct_files/1/428334-2.xlsx'
SPECIES_MAPPING_FILE_PATH = os.path.join(DUMP_PATH, 'file_to_species_mapping.xlsx')

In [7]:
ensure_dir_exists(TRAINING_COLUMNS_DUMP_PATH)

In [8]:
MZMLID_FILE_PATHS = glob.glob(MZMLID_FILES_PATH)
len(MZMLID_FILE_PATHS)

40

In [9]:
def download_species_mapping(download_url: str, output_file: str):
    if os.path.exists(output_file):
        print(f"skipping, because '{output_file}' already exists")
    else:
        print(wget.download(download_url, out=output_file))

def read_xlsx(file: str) -> pd.DataFrame:
    workbook = load_workbook(file)
    worksheet = workbook[workbook.worksheets[0].title]
    data = worksheet.values
    columns = next(data)[0:]
    df = pd.DataFrame(data=data, columns=columns)
    return df

def get_species_mapping(download_url: str = SPECIES_MAPPING_FILE_DOWNLOAD_URL,
                        output_file: str = SPECIES_MAPPING_FILE_PATH) -> pd.DataFrame:
    download_species_mapping(download_url=download_url, output_file=output_file)
    df = read_xlsx(file=output_file)
    df = df.drop(columns='id')
    df = df.set_index('mgf_file')
    return df

def join_species_mapping(data_df: pd.DataFrame, species_df: pd.DataFrame) -> pd.DataFrame:
    df = data_df.copy()
    
    # remove '.mzml' extension
    df['mgf_file'] = df['mzml_filename'].str.slice(stop=-5)
    df = df.join(species_df, on='mgf_file')
    
    return df

In [10]:
species_df = get_species_mapping()
species_df.head(1)

skipping, because '../dumps/PXD010000/file_to_species_mapping.xlsx' already exists


Unnamed: 0_level_0,species,num_PSM,istrain
mgf_file,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Biodiversity_A_cryptum_FeTSB_anaerobic_1_01Jun16_Pippin_16-03-39,Acidiphilium_cryptum_JF-5,6659,Train


In [11]:
PEPTIDE_SEQUENCE = 'SpectrumIdentificationItem__1__PeptideSequence'

In [12]:
output_columns = [
    'peptide_sequence',
    'mz_array',
    'intensity_array',
    'species',
    'istrain'
]

In [13]:
column_patterns_to_drop = [
    "SpectrumIdentificationItem__(?!1__)",
    "SpectrumIdentificationItem__1__PeptideEvidenceRef__(?!0__)"  # they seem to be filled with NaN and None anyway
]

In [14]:
def drop_column_patterns(df: pd.DataFrame, column_patterns: List[str]) -> pd.DataFrame:
    for column_pattern in column_patterns:
        re_pattern = re.compile(column_pattern)
        columns_to_drop = [col for col in df.columns if re_pattern.match(col) is not None]
        df = df.drop(columns=columns_to_drop)
    return df

In [15]:
def join_species(df: pd.DataFrame) -> pd.DataFrame:
    df = join_species_mapping(data_df=df, species_df=species_df)
    return df

In [16]:
def build_modification_columns_dict(modification_columns: List[str], separator: str = '__') -> List[Dict[str, str]]:
    modification_columns_dict: Dict[str, Dict[str, str]] = dict()

    # this assumes that there is only one 'SpectrumIdentificationItem', otherwise the 
    # later ones overwrite the previous ones
    for col in modification_columns:
        parts = col.split(separator)
        modification = modification_columns_dict.get(parts[-2], dict())
        modification_columns_dict[parts[-2]] = modification
        modification[parts[-1]] = col

    modification_columns_dicts: List[Dict[str, str]] = [
        column_dict for _, column_dict in sorted(modification_columns_dict.items())
    ]
    return modification_columns_dicts

class ModificationAnnotator:
    def __init__(self,
                 modification_columns_dicts: List[Dict[str, str]],
                 modification_location_suffix: str = 'location',
                 modification_name_suffix: str = 'name'):
        self.modification_columns_dicts = modification_columns_dicts
        self.modification_location_suffix = modification_location_suffix
        self.modification_name_suffix = modification_name_suffix

    def __call__(self, col: pd.Series) -> None:
        is_na = col.isna()
        for modification_columns_dict in self.modification_columns_dicts:
            if is_na[modification_columns_dict[self.modification_location_suffix]]:
                break
            modification_location = int(
                col[modification_columns_dict[self.modification_location_suffix]]
            ) - 1  # 1-indexed
            modification_name = col[modification_columns_dict[self.modification_name_suffix]]
            col[PEPTIDE_SEQUENCE][modification_location] += f"({modification_name})"
        return None

def annotate_modifications(df: pd.DataFrame,
                           modification_location_suffix: str = 'location',
                           modification_name_suffix: str = 'name',
                           peptide_sequence_col: str = PEPTIDE_SEQUENCE) -> pd.DataFrame:
    modification_columns = [
        col for col in df.columns 
        if 'modification' in col.lower()
        and 'peptideevidenceref' not in col.lower()
    ]
    modification_columns_dicts = build_modification_columns_dict(modification_columns)
    modification_annotator = ModificationAnnotator(
        modification_columns_dicts=modification_columns_dicts,
        modification_location_suffix=modification_location_suffix,
        modification_name_suffix=modification_name_suffix
    )

    df[peptide_sequence_col] = df[peptide_sequence_col].apply(list)
    df.apply(func=modification_annotator, axis=1)
    return df

In [17]:
def rename_columns(df: pd.DataFrame) -> pd.DataFrame:
    df = df.rename(columns={PEPTIDE_SEQUENCE: 'peptide_sequence'})
    return df

In [18]:
def post_processor(df: pd.DataFrame) -> pd.DataFrame:
    df = drop_column_patterns(df=df, column_patterns=column_patterns_to_drop)
    df = join_species(df=df)
    df = annotate_modifications(df=df)
    df = rename_columns(df)
    return df

In [19]:
output_files = filter_files(input_file_paths=MZMLID_FILE_PATHS,
                            output_path=TRAINING_COLUMNS_DUMP_PATH,
                            fdr=0.01,
                            skip_existing=False,
                            output_columns=output_columns,
                            post_processor=post_processor,
                            thread_count=0,
                            logger=logger)
output_files[:1]

DEBUG: Processing items with 8 subprocesses
DEBUG: Trying to fdr-filter 40 mzmlid files
INFO: Finished filtering '../datasets/PXD010000/Biodiversity_B_fragilis_LIB_anaerobic_01_08Feb16_Arwen_15-07-13_mzmlid.parquet' -> '../dumps/PXD010000/training_columns/Biodiversity_B_fragilis_LIB_anaerobic_01_08Feb16_Arwen_15-07-13_mzmlid.parquet'
INFO: Finished filtering '../datasets/PXD010000/Biodiversity_A_cryptum_FeTSB_anaerobic_1_01Jun16_Pippin_16-03-39_mzmlid.parquet' -> '../dumps/PXD010000/training_columns/Biodiversity_A_cryptum_FeTSB_anaerobic_1_01Jun16_Pippin_16-03-39_mzmlid.parquet'
INFO: Finished filtering '../datasets/PXD010000/Biodiversity_A_tumefaciens_R2A_aerobic_1_23Nov16_Pippin_16-09-11_mzmlid.parquet' -> '../dumps/PXD010000/training_columns/Biodiversity_A_tumefaciens_R2A_aerobic_1_23Nov16_Pippin_16-09-11_mzmlid.parquet'
INFO: Finished filtering '../datasets/PXD010000/Biodiversity_B_fragilis_LIB_aerobic_01_01Feb16_Arwen_15-07-13_mzmlid.parquet' -> '../dumps/PXD010000/training_column

INFO: Finished filtering '../datasets/PXD010000/Biodiversity_A_cryptum_FeTSB_anaerobic_2_01Jun16_Pippin_16-03-39_mzmlid.parquet' -> '../dumps/PXD010000/training_columns/Biodiversity_A_cryptum_FeTSB_anaerobic_2_01Jun16_Pippin_16-03-39_mzmlid.parquet'
INFO: Finished filtering '../datasets/PXD010000/Biodiversity_B_fragilis_CMgluc_anaerobic_03_01Feb16_Arwen_15-07-13_mzmlid.parquet' -> '../dumps/PXD010000/training_columns/Biodiversity_B_fragilis_CMgluc_anaerobic_03_01Feb16_Arwen_15-07-13_mzmlid.parquet'
INFO: Finished filtering '../datasets/PXD010000/Biodiversity_B_subtilis_NCIB3610_24h_plates_2_13Jun16_Pippin_16-03-39_mzmlid.parquet' -> '../dumps/PXD010000/training_columns/Biodiversity_B_subtilis_NCIB3610_24h_plates_2_13Jun16_Pippin_16-03-39_mzmlid.parquet'
INFO: Finished filtering '../datasets/PXD010000/Biodiversity_A_tumefaciens_R2A_aerobic_3_23Nov16_Pippin_16-09-11_mzmlid.parquet' -> '../dumps/PXD010000/training_columns/Biodiversity_A_tumefaciens_R2A_aerobic_3_23Nov16_Pippin_16-09-11_mz

[{'input_file_path': '../datasets/PXD010000/Biodiversity_B_fragilis_01_28Jul15_Arwen_14-12-03_mzmlid.parquet',
  'output_file_path': '../dumps/PXD010000/training_columns/Biodiversity_B_fragilis_01_28Jul15_Arwen_14-12-03_mzmlid.parquet',
  'original_sequence_count': 41598,
  'NaN_decoy_count': 0,
  'above_fdr_count': 14394,
  'left_decoys': 261,
  'left_targets': 26943,
  'fdr': 0.009687117247522548,
  'removed_decoys': 261,
  'removed_by_post_processor': 0,
  'final_sequence_count': 26943}]

In [20]:
processing_result = pd.DataFrame(data=output_files)
print(f"number of processing results = {len(processing_result)}")
processing_result

number of processing results = 40


Unnamed: 0,input_file_path,output_file_path,original_sequence_count,NaN_decoy_count,above_fdr_count,left_decoys,left_targets,fdr,removed_decoys,removed_by_post_processor,final_sequence_count
0,../datasets/PXD010000/Biodiversity_B_fragilis_01_28Jul15_Arwen_14-12-03_mzmlid.parquet,../dumps/PXD010000/training_columns/Biodiversity_B_fragilis_01_28Jul15_Arwen_14-12-03_mzmlid.parquet,41598,0,14394,261,26943,0.009687,261,0,26943
1,../datasets/PXD010000/Biodiversity_B_fragilis_LIB_anaerobic_03_01Feb16_Arwen_15-07-13_mzmlid.parquet,../dumps/PXD010000/training_columns/Biodiversity_B_fragilis_LIB_anaerobic_03_01Feb16_Arwen_15-07-13_mzmlid.parquet,45881,0,19873,246,25762,0.009549,246,0,25762
2,../datasets/PXD010000/Biodiversity_A_cryptum_FeTSB_anaerobic_1_01Jun16_Pippin_16-03-39_mzmlid.parquet,../dumps/PXD010000/training_columns/Biodiversity_A_cryptum_FeTSB_anaerobic_1_01Jun16_Pippin_16-03-39_mzmlid.parquet,29447,0,21422,77,7948,0.009688,77,0,7948
3,../datasets/PXD010000/Biodiversity_B_fragilis_LIB_aerobic_01_01Feb16_Arwen_15-07-13_mzmlid.parquet,../dumps/PXD010000/training_columns/Biodiversity_B_fragilis_LIB_aerobic_01_01Feb16_Arwen_15-07-13_mzmlid.parquet,44744,0,19617,240,24887,0.009644,240,0,24887
4,../datasets/PXD010000/Biodiversity_B_fragilis_LIB_anaerobic_02_01Feb16_Arwen_15-07-13_mzmlid.parquet,../dumps/PXD010000/training_columns/Biodiversity_B_fragilis_LIB_anaerobic_02_01Feb16_Arwen_15-07-13_mzmlid.parquet,46494,0,19946,253,26295,0.009622,253,0,26295
5,../datasets/PXD010000/Biodiversity_B_fragilis_LIB_anaerobic_01_08Feb16_Arwen_15-07-13_mzmlid.parquet,../dumps/PXD010000/training_columns/Biodiversity_B_fragilis_LIB_anaerobic_01_08Feb16_Arwen_15-07-13_mzmlid.parquet,33706,0,32228,14,1464,0.009563,14,0,1464
6,../datasets/PXD010000/Biodiversity_B_fragilis_LIB_aerobic_02_01Feb16_Arwen_15-07-13_mzmlid.parquet,../dumps/PXD010000/training_columns/Biodiversity_B_fragilis_LIB_aerobic_02_01Feb16_Arwen_15-07-13_mzmlid.parquet,43717,0,19597,229,23891,0.009585,229,0,23891
7,../datasets/PXD010000/Biodiversity_B_fragilis_LIB_aerobic_03_01Feb16_Arwen_15-07-13_mzmlid.parquet,../dumps/PXD010000/training_columns/Biodiversity_B_fragilis_LIB_aerobic_03_01Feb16_Arwen_15-07-13_mzmlid.parquet,45547,0,19503,249,25795,0.009653,249,0,25795
8,../datasets/PXD010000/Biodiversity_A_tumefaciens_R2A_aerobic_1_23Nov16_Pippin_16-09-11_mzmlid.parquet,../dumps/PXD010000/training_columns/Biodiversity_A_tumefaciens_R2A_aerobic_1_23Nov16_Pippin_16-09-11_mzmlid.parquet,44899,0,28626,159,16114,0.009867,159,0,16114
9,../datasets/PXD010000/Biodiversity_B_cereus_ATCC14579_LB_aerobic_2_17July16_Samwise_16-04-10_mzmlid.parquet,../dumps/PXD010000/training_columns/Biodiversity_B_cereus_ATCC14579_LB_aerobic_2_17July16_Samwise_16-04-10_mzmlid.parquet,41267,0,12662,276,28329,0.009743,276,0,28329


In [21]:
pd.read_parquet(processing_result['output_file_path'][0]).head(2)

Unnamed: 0,peptide_sequence,mz_array,intensity_array,species,istrain
0,"[T, T, T, P, K, K, P, N, S, A, M(Oxidation), R]","[100.07641, 100.087074, 101.07137, 101.1077, 102.05501, 110.07133, 110.631386, 112.05055, 112.07596, 112.08711, 113.07111, 115.05069, 115.08657, 116.07058, 127.0866, 129.10239, 130.08636, 130.09476, 130.10594, 131.0815, 131.09012, 132.54994, 136.06177, 136.07571, 139.08629, 139.90489, 140.0708, 141.06584, 141.10217, 143.08157, 145.0981, 147.11227, 147.96593, 149.02304, 152.05658, 153.10185, 155.08127, 155.11765, 157.09727, 157.10991, 158.09113, 159.07635, 164.05647, 167.05591, 167.08096, 169.05196, 171.07687, 171.11266, 175.10767, 175.11974, 176.11064, 180.07697, 181.09694, 181.13272, 185.05566, 185.09209, 186.09564, 196.10736, 197.09218, 198.08804, 199.0704, 199.10811, 203.10245, 204.10599, 205.1368, 209.1289, 212.10365, 214.11859, 222.16032, 224.10338, 226.15494, 227.1583, 238.15607, 240.13449, 240.17038, 242.11392, 257.19702, 258.15533, 258.20068, 264.13522, 265.1653, 268.12894, 283.17685, 286.13928, 300.11865, 305.15933, 305.65024, 309.192, 312.04562, 314.16446, 321.8492, 327.2031, 327.5248, 327.8595, 329.19373, 336.19986, 337.64832, 343.18893, 346.16183, 346.5337, ...]","[1044.7223, 3649.1797, 1102.0496, 1357.6985, 844.1217, 4310.1567, 701.75555, 3388.4607, 14474.438, 3396.4175, 3271.5308, 785.9828, 3989.8667, 1080.443, 8620.463, 115111.52, 60642.484, 892.7931, 6232.6123, 1503.7406, 1695.8452, 621.4343, 8136.3633, 3622.0579, 3527.2075, 690.5978, 3141.1062, 9701.572, 1072.0393, 4542.582, 913.5738, 1275.7793, 904.80707, 1183.5809, 1307.1254, 3257.456, 6890.353, 782.2476, 22261.06, 948.4679, 1374.1157, 8412.553, 1015.7166, 942.1996, 4452.6733, 818.5687, 7237.765, 11416.154, 45089.855, 7394.4355, 1465.4384, 3969.8708, 1652.0625, 1171.807, 927.53595, 48796.72, 3459.731, 1612.3837, 884.8939, 1009.1056, 948.3301, 5930.4526, 20944.246, 1297.6293, 876.90686, 699.8927, 2823.239, 4986.422, 1002.72095, 6868.054, 48777.496, 5358.242, 1159.44, 6050.066, 1177.8954, 4301.8447, 13318.132, 1678.9065, 896.4516, 1518.1096, 1104.4894, 3720.5735, 1503.6326, 7302.4346, 1507.2914, 853.93396, 1312.6992, 4806.0015, 1169.0146, 3889.657, 1483.5475, 3854.5623, 6731.8696, 3342.8206, 5410.943, 1120.8875, 888.22064, 883.156, 4437.399, 1748.3259, ...]",Bacteroides_fragilis_638R,Train
3,"[R, G, Q, V, E, G, M(Oxidation), E, S, S, R]","[101.07124, 102.0553, 110.07134, 112.05085, 112.08691, 113.07111, 115.086685, 116.0705, 116.97228, 119.58438, 124.03943, 127.086365, 129.0665, 129.1023, 130.04953, 130.05978, 130.08626, 130.10585, 136.06165, 136.07605, 141.06583, 143.08133, 147.11246, 149.02335, 152.05638, 155.08112, 157.09698, 157.10889, 158.09206, 159.0761, 163.08455, 171.07649, 171.59804, 173.09224, 173.12799, 175.07077, 175.11919, 176.15643, 183.1128, 186.12308, 187.07169, 188.03653, 189.61603, 197.10431, 207.13461, 214.12898, 231.15611, 245.12381, 252.09872, 256.12704, 262.15063, 268.13138, 270.10626, 273.13388, 281.1357, 283.14725, 300.64038, 309.6448, 314.16162, 314.19324, 316.09637, 325.16174, 325.79483, 331.17108, 342.1879, 343.19034, 343.80283, 346.97397, 349.18295, 350.18475, 356.15732, 359.20905, 384.5217, 393.83148, 396.23593, 399.00772, 413.26212, 414.2659, 417.0347, 417.2444, 417.2967, 417.74738, 418.0339, 418.24805, 418.99487, 424.23, 441.25668, 442.25604, 460.2152, 461.21298, 478.22495, 479.226, 488.20978, 520.249, 532.253, 542.3029, 544.235, 561.26013, 570.2995, 571.30084, ...]","[1498.0873, 20642.28, 3844.8838, 1120.5063, 4205.9717, 874.3143, 2515.5444, 786.77234, 926.58484, 730.192, 5138.3696, 2751.0, 1000.0747, 15183.164, 2514.9229, 688.74, 2551.5332, 1013.9507, 6667.6606, 1194.0012, 3992.4336, 1274.8231, 1276.667, 838.5754, 669.43054, 1216.2158, 901.391, 1217.8413, 4599.2, 1217.2018, 970.0999, 3354.2034, 1381.6448, 652.9564, 1240.8739, 806.5449, 12598.689, 618.4468, 1167.6907, 2215.4497, 1226.4055, 1331.2247, 2589.5076, 1103.1534, 3314.474, 5952.45, 3313.3496, 3223.3242, 925.97974, 805.99945, 5254.821, 920.71, 1146.4827, 962.8078, 1564.1729, 842.9065, 1520.9548, 5055.0137, 1073.2836, 9688.531, 794.2665, 1371.2823, 1058.4569, 1372.5068, 36481.03, 4376.838, 958.2829, 5189.961, 14978.933, 2890.5093, 954.04565, 884.45465, 907.48895, 868.85864, 1533.7688, 816.29004, 22754.748, 4653.88, 3068.491, 4746.0464, 883.5928, 810.7221, 887.24255, 1424.3857, 2932.1465, 2775.8826, 21891.256, 3109.1223, 10081.893, 1342.2267, 19438.086, 3235.9631, 2758.4998, 720.628, 1013.52405, 5870.137, 952.2475, 2910.8145, 6807.415, 1152.3481, ...]",Bacteroides_fragilis_638R,Train


In [22]:
processing_result.describe()

Unnamed: 0,original_sequence_count,NaN_decoy_count,above_fdr_count,left_decoys,left_targets,fdr,removed_decoys,removed_by_post_processor,final_sequence_count
count,40.0,40.0,40.0,40.0,40.0,40.0,40.0,40.0,40.0
mean,40638.95,0.0,19931.275,193.025,20514.65,0.009341,193.025,0.0,20514.65
std,5112.383618,0.0,4383.092001,67.728147,6622.403555,0.000612,67.728147,0.0,6622.403555
min,29170.0,0.0,11948.0,14.0,1464.0,0.007884,14.0,0.0,1464.0
25%,36831.25,0.0,18154.5,138.0,16094.0,0.009366,138.0,0.0,16094.0
50%,41494.0,0.0,19909.5,231.5,23881.0,0.009592,231.5,0.0,23881.0
75%,44840.5,0.0,21430.0,248.25,25977.5,0.009708,248.25,0.0,25977.5
max,46909.0,0.0,32228.0,281.0,28437.0,0.009881,281.0,0.0,28437.0


In [23]:
processing_result.drop(columns=['input_file_path', 'output_file_path']).sum()

original_sequence_count      1.625558e+06
NaN_decoy_count              0.000000e+00
above_fdr_count              7.972510e+05
left_decoys                  7.721000e+03
left_targets                 8.205860e+05
fdr                          3.736360e-01
removed_decoys               7.721000e+03
removed_by_post_processor    0.000000e+00
final_sequence_count         8.205860e+05
dtype: float64

In [24]:
processing_result.to_parquet(os.path.join(TRAINING_COLUMNS_DUMP_PATH, "processing_result.parquet"))