# Prototyping a Data Filtering Pipeline
## Prerequisites

In [1]:
import glob
import pandas as pd
from mmproteo.utils.utils import ensure_dir_exists
from mmproteo.utils import log
from mmproteo.utils.formats.mz import FilteringProcessor, filter_files
from mmproteo.utils.processing import ItemProcessor
import os

In [2]:
# Notebook-wide pandas display settings: show every column of a frame and
# allow up to 1000 rows to be rendered.
pd.options.display.max_columns = None
pd.options.display.max_rows = 1000

In [3]:
# Logger handed to filter_files later in this notebook. Creating it emits the
# "INFO: Printing to Stdout" line shown in this cell's output.
# NOTE(review): verbose=False presumably mutes debug-level messages - confirm
# against mmproteo.utils.log.
logger = log.DummyLogger(verbose=False)

INFO: Printing to Stdout


## Data Import

In [4]:
# Show the kernel's working directory: the relative paths configured in the
# next cell ("../datasets/...", "../dumps/...") are resolved against it.
# os.getcwd() is used instead of the bare `pwd` automagic, which only works
# while IPython's automagic is enabled and no variable named `pwd` exists.
os.getcwd()

'/tf/workspace/notebooks'

In [5]:
# Project identifier; used as the directory name below ../datasets and ../dumps.
PROJECT = "PXD010000"

# Input directory holding the project's files (relative to the working directory).
DATA_PATH = "/".join(("..", "datasets", PROJECT))
# Root directory for everything this notebook writes for the project.
DUMP_PATH = "/".join(("..", "dumps", PROJECT))
# Sub-directory of the dump root that is passed to filter_files as output_path.
TRAINING_COLUMNS_DUMP_PATH = "/".join((DUMP_PATH, "training_columns"))
# Glob pattern selecting the "*_mzmlid.parquet" files inside the input directory.
MZMLID_FILES_PATH = "/".join((DATA_PATH, "*_mzmlid.parquet"))

In [6]:
# Prepare the directory that filter_files receives as output_path.
# NOTE(review): presumably creates the directory when it is missing and is a
# no-op otherwise - confirm in mmproteo.utils.utils.
ensure_dir_exists(TRAINING_COLUMNS_DUMP_PATH)

In [7]:
# Collect every file matching the mzmlid parquet pattern. glob.glob() returns
# its matches in arbitrary (filesystem-dependent) order, so the list is sorted
# to keep the processing order - and the row order of the result table -
# reproducible across runs and machines.
MZMLID_FILE_PATHS = sorted(glob.glob(MZMLID_FILES_PATH))
# Number of input files found (cell output).
len(MZMLID_FILE_PATHS)

235

In [8]:
# Run the filtering step over all collected input files, writing into the
# training-columns dump directory.
# In the recorded run this returned [] together with the log line
# "No mzmlid files were fdr-filtered" - presumably because skip_existing=True
# skipped outputs that were already present; confirm.
# NOTE(review): the meaning of thread_count=0 is not visible from this
# notebook - check the filter_files documentation.
FDR_THRESHOLD = 0.01  # value passed as the `fdr` argument
output_files = filter_files(
    input_file_paths=MZMLID_FILE_PATHS,
    output_path=TRAINING_COLUMNS_DUMP_PATH,
    fdr=FDR_THRESHOLD,
    skip_existing=True,
    thread_count=0,
    logger=logger,
)
output_files

INFO: No mzmlid files were fdr-filtered


[]

In [13]:
# Tabulate the results returned by filter_files, one row per processed file.
# NOTE(review): the table rendered under this cell stems from a previous run.
# In the recorded session output_files is [] and the execution counter jumps
# from 8 to 13, so re-running on a fresh kernel may yield an empty frame here.
# The "Unnamed: 0" column in the rendered table suggests it was reloaded from
# a CSV rather than built by this code - confirm.
processing_result = pd.DataFrame(output_files)
processing_result

Unnamed: 0,input_file_path,output_file_path,original_sequence_count,NaN_decoy_count,above_fdr_count,left_decoys,left_targets,fdr,final_sequence_count
0,../datasets/PXD010000/Biodiversity_M_smegmatis_BHI_aerobic_3_05Oct16_Pippin_16-05-06_mzmlid.parquet,../dumps/PXD010000/Biodiversity_M_smegmatis_BHI_aerobic_3_05Oct16_Pippin_16-05-06_mzmlid.parquet,41343,273,-27453,256,27470,0.009319,27470
1,../datasets/PXD010000/Biodiversity_B_fragilis_LIB_anaerobic_03_01Feb16_Arwen_15-07-13_mzmlid.parquet,../dumps/PXD010000/Biodiversity_B_fragilis_LIB_anaerobic_03_01Feb16_Arwen_15-07-13_mzmlid.parquet,45881,230,-25671,244,25657,0.00951,25657
2,../datasets/PXD010000/Biodiversity_Lactobacillus_casei_MRS_01_27Dec15_Arwen_15-07-13_mzmlid.parquet,../dumps/PXD010000/Biodiversity_Lactobacillus_casei_MRS_01_27Dec15_Arwen_15-07-13_mzmlid.parquet,44383,667,-13638,137,14168,0.00967,14168
3,../datasets/PXD010000/Biodiversity_P_ruminicola_MDM_anaerobic_2_09Jun16_Pippin_16-03-39_mzmlid.parquet,../dumps/PXD010000/Biodiversity_P_ruminicola_MDM_anaerobic_2_09Jun16_Pippin_16-03-39_mzmlid.parquet,36615,168,-19246,186,19228,0.009673,19228
4,../datasets/PXD010000/Biodiversity_S_agalactiae_LIB_aerobic_01_26Feb16_Arwen_16-01-01_mzmlid.parquet,../dumps/PXD010000/Biodiversity_S_agalactiae_LIB_aerobic_01_26Feb16_Arwen_16-01-01_mzmlid.parquet,36436,92,-14457,115,14434,0.007967,14434
5,../datasets/PXD010000/Biodiversity_S_thermosulf_FeYE_anaerobic_1_01Jun16_Pippin_16-03-39_mzmlid.parquet,../dumps/PXD010000/Biodiversity_S_thermosulf_FeYE_anaerobic_1_01Jun16_Pippin_16-03-39_mzmlid.parquet,32318,234,-16872,160,16946,0.009442,16946
6,../datasets/PXD010000/Biodiversity_A_cryptum_FeTSB_anaerobic_1_01Jun16_Pippin_16-03-39_mzmlid.parquet,../dumps/PXD010000/Biodiversity_A_cryptum_FeTSB_anaerobic_1_01Jun16_Pippin_16-03-39_mzmlid.parquet,29447,571,-7367,72,7866,0.009153,7866
7,../datasets/PXD010000/Biodiversity_M_xanthus_DZ2_48h_plates_1_13Jun16_Pippin_16-03-39_mzmlid.parquet,../dumps/PXD010000/Biodiversity_M_xanthus_DZ2_48h_plates_1_13Jun16_Pippin_16-03-39_mzmlid.parquet,37305,383,-24157,231,24309,0.009503,24309
8,../datasets/PXD010000/M_alcali_copp_MeOH_B1_T2_02_QE_23Mar18_Oak_18-01-07_mzmlid.parquet,../dumps/PXD010000/M_alcali_copp_MeOH_B1_T2_02_QE_23Mar18_Oak_18-01-07_mzmlid.parquet,53875,6111,-14441,199,20353,0.009777,20353
9,../datasets/PXD010000/Biodiversity_D_acidovorans_TGY_aerobic_03_29Apr16_Samwise_16-03-32_renamed_mzmlid.parquet,../dumps/PXD010000/Biodiversity_D_acidovorans_TGY_aerobic_03_29Apr16_Samwise_16-03-32_renamed_mzmlid.parquet,31944,404,-19616,198,19822,0.009989,19822


In [14]:
# Total of the 'final_sequence_count' column over all rows of the result table.
# When no file was processed the frame is empty and has no such column, so
# report 0 instead of raising KeyError; a non-empty frame that lacks the
# column still raises, so schema problems stay visible.
0 if processing_result.empty else processing_result['final_sequence_count'].sum()

5356987