# Prototyping a Data Filtering Pipeline
## Prerequisites

In [1]:
import glob
import pandas as pd
from mmproteo.utils.utils import ensure_dir_exists
from mmproteo.utils import log
from mmproteo.utils.formats.mz import FilteringProcessor, filter_files
from mmproteo.utils.processing import ItemProcessor
import os

In [2]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 1000)

In [3]:
logger = log.DummyLogger(verbose=False)

INFO: Printing to Stdout


## Data Import

In [4]:
pwd

'/tf/workspace/notebooks'

In [5]:
PROJECT = "PXD010000"
DATA_PATH = f"../datasets/{PROJECT}"
DUMP_PATH = f"../dumps/{PROJECT}"
TRAINING_COLUMNS_DUMP_PATH = DUMP_PATH + "/training_columns"
MZMLID_FILES_PATH = f"{DATA_PATH}/*_mzmlid.parquet"

In [6]:
ensure_dir_exists(TRAINING_COLUMNS_DUMP_PATH)

In [7]:
MZMLID_FILE_PATHS = glob.glob(MZMLID_FILES_PATH)
len(MZMLID_FILE_PATHS)

235

In [8]:
output_files = filter_files(input_file_paths=MZMLID_FILE_PATHS,
                            output_path=TRAINING_COLUMNS_DUMP_PATH,
                            fdr=0.01,
                            skip_existing=True,
                            thread_count=0,
                            logger=logger)
output_files

INFO: Finished filtering '../datasets/PXD010000/Biodiversity_S_thermosulf_FeYE_anaerobic_1_01Jun16_Pippin_16-03-39_mzmlid.parquet' -> '../dumps/PXD010000/training_columns/Biodiversity_S_thermosulf_FeYE_anaerobic_1_01Jun16_Pippin_16-03-39_mzmlid.parquet'
INFO: Finished filtering '../datasets/PXD010000/Biodiversity_M_smegmatis_BHI_aerobic_3_05Oct16_Pippin_16-05-06_mzmlid.parquet' -> '../dumps/PXD010000/training_columns/Biodiversity_M_smegmatis_BHI_aerobic_3_05Oct16_Pippin_16-05-06_mzmlid.parquet'
INFO: Finished filtering '../datasets/PXD010000/Biodiversity_P_ruminicola_MDM_anaerobic_2_09Jun16_Pippin_16-03-39_mzmlid.parquet' -> '../dumps/PXD010000/training_columns/Biodiversity_P_ruminicola_MDM_anaerobic_2_09Jun16_Pippin_16-03-39_mzmlid.parquet'
INFO: Finished filtering '../datasets/PXD010000/Biodiversity_B_fragilis_LIB_anaerobic_03_01Feb16_Arwen_15-07-13_mzmlid.parquet' -> '../dumps/PXD010000/training_columns/Biodiversity_B_fragilis_LIB_anaerobic_03_01Feb16_Arwen_15-07-13_mzmlid.parquet'


INFO: Finished filtering '../datasets/PXD010000/Biodiversity_S_aurantiaca_CYE_aerobic_1_17July16_Samwise_16-04-10_mzmlid.parquet' -> '../dumps/PXD010000/training_columns/Biodiversity_S_aurantiaca_CYE_aerobic_1_17July16_Samwise_16-04-10_mzmlid.parquet'
INFO: Finished filtering '../datasets/PXD010000/Biodiversity_C_ljungdahlii_CO_anaerobic_2_04Oct16_Pippin_16-05-06_mzmlid.parquet' -> '../dumps/PXD010000/training_columns/Biodiversity_C_ljungdahlii_CO_anaerobic_2_04Oct16_Pippin_16-05-06_mzmlid.parquet'
INFO: Finished filtering '../datasets/PXD010000/Biodiversity_S_agalactiae_LIB_aerobic_03_26Feb16_Arwen_16-01-01_mzmlid.parquet' -> '../dumps/PXD010000/training_columns/Biodiversity_S_agalactiae_LIB_aerobic_03_26Feb16_Arwen_16-01-01_mzmlid.parquet'
INFO: Finished filtering '../datasets/PXD010000/Biodiversity_C_freundii_LIB_01_28Oct15_Arwen_15-07-13_mzmlid.parquet' -> '../dumps/PXD010000/training_columns/Biodiversity_C_freundii_LIB_01_28Oct15_Arwen_15-07-13_mzmlid.parquet'
INFO: Finished filte

INFO: Finished filtering '../datasets/PXD010000/Biodiversity_S_griseorubens_HSM_aerobic_3_23Nov16_Pippin_16-09-11_mzmlid.parquet' -> '../dumps/PXD010000/training_columns/Biodiversity_S_griseorubens_HSM_aerobic_3_23Nov16_Pippin_16-09-11_mzmlid.parquet'
INFO: Finished filtering '../datasets/PXD010000/M_alcali_copp_MeOH_B2_T2_04_QE_23Mar18_Oak_18-01-07_mzmlid.parquet' -> '../dumps/PXD010000/training_columns/M_alcali_copp_MeOH_B2_T2_04_QE_23Mar18_Oak_18-01-07_mzmlid.parquet'
INFO: Finished filtering '../datasets/PXD010000/Biodiversity_P_hydrogenalis_CMgluc_anaerobic_02_26Feb16_Arwen_16-01-01_mzmlid.parquet' -> '../dumps/PXD010000/training_columns/Biodiversity_P_hydrogenalis_CMgluc_anaerobic_02_26Feb16_Arwen_16-01-01_mzmlid.parquet'
INFO: Finished filtering '../datasets/PXD010000/Biodiversity_B_fragilis_CMcarb_anaerobic_03_01Feb16_Arwen_15-07-13_mzmlid.parquet' -> '../dumps/PXD010000/training_columns/Biodiversity_B_fragilis_CMcarb_anaerobic_03_01Feb16_Arwen_15-07-13_mzmlid.parquet'
INFO: Fi

INFO: Finished filtering '../datasets/PXD010000/P_putida_18Nov15_1_21Mar16_Arwen_16-01-03_mzmlid.parquet' -> '../dumps/PXD010000/training_columns/P_putida_18Nov15_1_21Mar16_Arwen_16-01-03_mzmlid.parquet'
INFO: Finished filtering '../datasets/PXD010000/Biodiversity_B_fragilis_LB_anaerobic_03_01Feb16_Arwen_15-07-13_mzmlid.parquet' -> '../dumps/PXD010000/training_columns/Biodiversity_B_fragilis_LB_anaerobic_03_01Feb16_Arwen_15-07-13_mzmlid.parquet'
INFO: Finished filtering '../datasets/PXD010000/Biodiversity_B_cereus_ATCC14579_LB_aerobic_1_17July16_Samwise_16-04-10_mzmlid.parquet' -> '../dumps/PXD010000/training_columns/Biodiversity_B_cereus_ATCC14579_LB_aerobic_1_17July16_Samwise_16-04-10_mzmlid.parquet'
INFO: Finished filtering '../datasets/PXD010000/Biodiversity_HL69_HLA_aerobic_1_05Oct16_Pippin_16-05-06_mzmlid.parquet' -> '../dumps/PXD010000/training_columns/Biodiversity_HL69_HLA_aerobic_1_05Oct16_Pippin_16-05-06_mzmlid.parquet'
INFO: Finished filtering '../datasets/PXD010000/Biodiver

INFO: Finished filtering '../datasets/PXD010000/Biodiversity_M_xanthus_DZ2_plates_1_03May16_Samwise_16-03-32_mzmlid.parquet' -> '../dumps/PXD010000/training_columns/Biodiversity_M_xanthus_DZ2_plates_1_03May16_Samwise_16-03-32_mzmlid.parquet'
INFO: Finished filtering '../datasets/PXD010000/Biodiversity_D_acidovorans_TGY_aerobic_01_29Apr16_Samwise_16-03-32_renamed_mzmlid.parquet' -> '../dumps/PXD010000/training_columns/Biodiversity_D_acidovorans_TGY_aerobic_01_29Apr16_Samwise_16-03-32_renamed_mzmlid.parquet'
INFO: Finished filtering '../datasets/PXD010000/Biodiversity_B_fragilis_Carb_01_28Oct15_Arwen_15-07-13_mzmlid.parquet' -> '../dumps/PXD010000/training_columns/Biodiversity_B_fragilis_Carb_01_28Oct15_Arwen_15-07-13_mzmlid.parquet'
INFO: Finished filtering '../datasets/PXD010000/M_alcali_copp_CH4_B3_T1_11_QE_23Mar18_Oak_18-01-07_mzmlid.parquet' -> '../dumps/PXD010000/training_columns/M_alcali_copp_CH4_B3_T1_11_QE_23Mar18_Oak_18-01-07_mzmlid.parquet'
INFO: Finished filtering '../dataset

INFO: Finished filtering '../datasets/PXD010000/Biodiversity_A_cryptum_FeTSB_anaerobic_2_01Jun16_Pippin_16-03-39_mzmlid.parquet' -> '../dumps/PXD010000/training_columns/Biodiversity_A_cryptum_FeTSB_anaerobic_2_01Jun16_Pippin_16-03-39_mzmlid.parquet'
INFO: Finished filtering '../datasets/PXD010000/Biodiversity_M_xanthus_DZ2_48h_plates_2_13Jun16_Pippin_16-03-39_mzmlid.parquet' -> '../dumps/PXD010000/training_columns/Biodiversity_M_xanthus_DZ2_48h_plates_2_13Jun16_Pippin_16-03-39_mzmlid.parquet'
INFO: Finished filtering '../datasets/PXD010000/Biodiversity_S_thermosulf_FeYE_anaerobic_2_01Jun16_Pippin_16-03-39_mzmlid.parquet' -> '../dumps/PXD010000/training_columns/Biodiversity_S_thermosulf_FeYE_anaerobic_2_01Jun16_Pippin_16-03-39_mzmlid.parquet'
INFO: Finished filtering '../datasets/PXD010000/M_alcali_copp_MeOH_B3_T1_05_QE_23Mar18_Oak_18-01-07_mzmlid.parquet' -> '../dumps/PXD010000/training_columns/M_alcali_copp_MeOH_B3_T1_05_QE_23Mar18_Oak_18-01-07_mzmlid.parquet'
INFO: Finished filtering

INFO: Finished filtering '../datasets/PXD010000/Biodiversity_R_palustris_PM_aerobic_3_01Jun16_Pippin_16-03-39_mzmlid.parquet' -> '../dumps/PXD010000/training_columns/Biodiversity_R_palustris_PM_aerobic_3_01Jun16_Pippin_16-03-39_mzmlid.parquet'
INFO: Finished filtering '../datasets/PXD010000/Biodiversity_M_xanthus_DZ2_pellet_1_03May16_Samwise_16-03-32_mzmlid.parquet' -> '../dumps/PXD010000/training_columns/Biodiversity_M_xanthus_DZ2_pellet_1_03May16_Samwise_16-03-32_mzmlid.parquet'
INFO: Finished filtering '../datasets/PXD010000/Biodiversity_HL111_HLHglutamate_aerobic_2_14July16_Pippin_16-05-01_mzmlid.parquet' -> '../dumps/PXD010000/training_columns/Biodiversity_HL111_HLHglutamate_aerobic_2_14July16_Pippin_16-05-01_mzmlid.parquet'
INFO: Finished filtering '../datasets/PXD010000/Biodiversity_B_thet_CMcarb_anaerobic_03_01Feb16_Arwen_15-07-13_mzmlid.parquet' -> '../dumps/PXD010000/training_columns/Biodiversity_B_thet_CMcarb_anaerobic_03_01Feb16_Arwen_15-07-13_mzmlid.parquet'
INFO: Finished

[{'input_file_path': '../datasets/PXD010000/Biodiversity_B_fragilis_01_28Jul15_Arwen_14-12-03_mzmlid.parquet',
  'output_file_path': '../dumps/PXD010000/training_columns/Biodiversity_B_fragilis_01_28Jul15_Arwen_14-12-03_mzmlid.parquet',
  'original_sequence_count': 41598,
  'NaN_decoy_count': 217,
  'above_fdr_count': 14291,
  'left_decoys': 260,
  'left_targets': 26830,
  'fdr': 0.009690644800596348,
  'final_sequence_count': 26830},
 {'input_file_path': '../datasets/PXD010000/Biodiversity_B_thet_CMgluc_anaerobic_01_01Feb16_Arwen_15-07-13_mzmlid.parquet',
  'output_file_path': '../dumps/PXD010000/training_columns/Biodiversity_B_thet_CMgluc_anaerobic_01_01Feb16_Arwen_15-07-13_mzmlid.parquet',
  'original_sequence_count': 44969,
  'NaN_decoy_count': 306,
  'above_fdr_count': 20199,
  'left_decoys': 235,
  'left_targets': 24229,
  'fdr': 0.009699120888191836,
  'final_sequence_count': 24229},
 {'input_file_path': '../datasets/PXD010000/Biodiversity_M_smegmatis_BHI_aerobic_3_05Oct16_Pippi

In [9]:
processing_result = pd.DataFrame(data=output_files)
processing_result

Unnamed: 0,input_file_path,output_file_path,original_sequence_count,NaN_decoy_count,above_fdr_count,left_decoys,left_targets,fdr,final_sequence_count
0,../datasets/PXD010000/Biodiversity_B_fragilis_01_28Jul15_Arwen_14-12-03_mzmlid.parquet,../dumps/PXD010000/training_columns/Biodiversity_B_fragilis_01_28Jul15_Arwen_14-12-03_mzmlid.parquet,41598,217,14291,260,26830,0.009691,26830
1,../datasets/PXD010000/Biodiversity_B_thet_CMgluc_anaerobic_01_01Feb16_Arwen_15-07-13_mzmlid.parquet,../dumps/PXD010000/training_columns/Biodiversity_B_thet_CMgluc_anaerobic_01_01Feb16_Arwen_15-07-13_mzmlid.parquet,44969,306,20199,235,24229,0.009699,24229
2,../datasets/PXD010000/Biodiversity_M_smegmatis_BHI_aerobic_3_05Oct16_Pippin_16-05-06_mzmlid.parquet,../dumps/PXD010000/training_columns/Biodiversity_M_smegmatis_BHI_aerobic_3_05Oct16_Pippin_16-05-06_mzmlid.parquet,41343,273,13344,256,27470,0.009319,27470
3,../datasets/PXD010000/Biodiversity_B_fragilis_LIB_anaerobic_03_01Feb16_Arwen_15-07-13_mzmlid.parquet,../dumps/PXD010000/training_columns/Biodiversity_B_fragilis_LIB_anaerobic_03_01Feb16_Arwen_15-07-13_mzmlid.parquet,45881,230,19750,244,25657,0.00951,25657
4,../datasets/PXD010000/Biodiversity_Lactobacillus_casei_MRS_01_27Dec15_Arwen_15-07-13_mzmlid.parquet,../dumps/PXD010000/training_columns/Biodiversity_Lactobacillus_casei_MRS_01_27Dec15_Arwen_15-07-13_mzmlid.parquet,44383,667,29411,137,14168,0.00967,14168
5,../datasets/PXD010000/Biodiversity_P_ruminicola_MDM_anaerobic_2_09Jun16_Pippin_16-03-39_mzmlid.parquet,../dumps/PXD010000/training_columns/Biodiversity_P_ruminicola_MDM_anaerobic_2_09Jun16_Pippin_16-03-39_mzmlid.parquet,36615,168,17033,186,19228,0.009673,19228
6,../datasets/PXD010000/Biodiversity_S_agalactiae_LIB_aerobic_01_26Feb16_Arwen_16-01-01_mzmlid.parquet,../dumps/PXD010000/training_columns/Biodiversity_S_agalactiae_LIB_aerobic_01_26Feb16_Arwen_16-01-01_mzmlid.parquet,36436,92,21795,115,14434,0.007967,14434
7,../datasets/PXD010000/Biodiversity_S_thermosulf_FeYE_anaerobic_1_01Jun16_Pippin_16-03-39_mzmlid.parquet,../dumps/PXD010000/training_columns/Biodiversity_S_thermosulf_FeYE_anaerobic_1_01Jun16_Pippin_16-03-39_mzmlid.parquet,32318,234,14978,160,16946,0.009442,16946
8,../datasets/PXD010000/Biodiversity_A_cryptum_FeTSB_anaerobic_1_01Jun16_Pippin_16-03-39_mzmlid.parquet,../dumps/PXD010000/training_columns/Biodiversity_A_cryptum_FeTSB_anaerobic_1_01Jun16_Pippin_16-03-39_mzmlid.parquet,29447,571,20938,72,7866,0.009153,7866
9,../datasets/PXD010000/Biodiversity_M_xanthus_DZ2_48h_plates_1_13Jun16_Pippin_16-03-39_mzmlid.parquet,../dumps/PXD010000/training_columns/Biodiversity_M_xanthus_DZ2_48h_plates_1_13Jun16_Pippin_16-03-39_mzmlid.parquet,37305,383,12382,231,24309,0.009503,24309


In [11]:
len(processing_result)

235

In [15]:
processing_result.describe()

Unnamed: 0,original_sequence_count,NaN_decoy_count,above_fdr_count,left_decoys,left_targets,fdr,final_sequence_count
count,235.0,235.0,235.0,235.0,235.0,235.0,235.0
mean,41468.46383,905.446809,17329.348936,220.706383,23012.961702,0.009568,23012.961702
std,7603.08072,1974.191891,6004.568654,72.524229,7259.472179,0.000556,7259.472179
min,28756.0,36.0,7036.0,14.0,1456.0,0.007845,1456.0
25%,36647.5,203.0,13438.0,171.0,18327.5,0.00943,18327.5
50%,41598.0,350.0,16512.0,233.0,24001.0,0.009659,24001.0
75%,45195.5,587.0,20786.5,261.0,26852.5,0.009824,26852.5
max,86594.0,14491.0,36596.0,570.0,57972.0,0.012974,57972.0


In [17]:
processing_result.drop(columns=['input_file_path', 'output_file_path']).sum()

original_sequence_count    9.745089e+06
NaN_decoy_count            2.127800e+05
above_fdr_count            4.072397e+06
left_decoys                5.186600e+04
left_targets               5.408046e+06
fdr                        2.248445e+00
final_sequence_count       5.408046e+06
dtype: float64

In [13]:
processing_result.to_parquet(f"{TRAINING_COLUMNS_DUMP_PATH}/processing_result.parquet")