# Prototyping a Data Filtering Pipeline
## Prerequisites

In [1]:
import glob
import pandas as pd
import mmproteo
import os
from typing imp

Matplotlib created a temporary config/cache directory at /tmp/matplotlib-z_vhfzwh because the default path (/.config/matplotlib) is not a writable directory; it is highly recommended to set the MPLCONFIGDIR environment variable to a writable directory, in particular to speed up the import of Matplotlib and to better support multiprocessing.


In [2]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 1000)

## Data Import

In [3]:
pwd

'/tf/workspace/notebooks'

In [4]:
PROJECT = "PXD010000"
DATA_PATH = f"../datasets/{PROJECT}"
DUMP_PATH = f"../dumps/{PROJECT}"
TRAINING_COLUMNS_DUMP_PATH = DUMP_PATH + "/training_columns"
MZMLID_FILES_PATH = f"{DATA_PATH}/*_mzmlid.parquet"

In [5]:
mmproteo.utils.utils.ensure_dir_exists(TRAINING_COLUMNS_DUMP_PATH)

In [8]:
MZMLID_FILE_PATHS = glob.glob(MZMLID_FILES_PATH)
MZMLID_FILE_PATHS

['../datasets/PXD010000/Biodiversity_B_fragilis_01_28Jul15_Arwen_14-12-03_mzmlid.parquet',
 '../datasets/PXD010000/Biodiversity_B_thet_CMgluc_anaerobic_01_01Feb16_Arwen_15-07-13_mzmlid.parquet',
 '../datasets/PXD010000/Biodiversity_M_smegmatis_BHI_aerobic_3_05Oct16_Pippin_16-05-06_mzmlid.parquet',
 '../datasets/PXD010000/Biodiversity_B_fragilis_LIB_anaerobic_03_01Feb16_Arwen_15-07-13_mzmlid.parquet',
 '../datasets/PXD010000/Biodiversity_Lactobacillus_casei_MRS_01_27Dec15_Arwen_15-07-13_mzmlid.parquet',
 '../datasets/PXD010000/Biodiversity_P_ruminicola_MDM_anaerobic_2_09Jun16_Pippin_16-03-39_mzmlid.parquet',
 '../datasets/PXD010000/Biodiversity_S_agalactiae_LIB_aerobic_01_26Feb16_Arwen_16-01-01_mzmlid.parquet',
 '../datasets/PXD010000/Biodiversity_S_thermosulf_FeYE_anaerobic_1_01Jun16_Pippin_16-03-39_mzmlid.parquet',
 '../datasets/PXD010000/Biodiversity_A_cryptum_FeTSB_anaerobic_1_01Jun16_Pippin_16-03-39_mzmlid.parquet',
 '../datasets/PXD010000/Biodiversity_M_xanthus_DZ2_48h_plates_1_13

In [9]:
len(MZMLID_FILE_PATHS)

235

In [10]:
# for testing
MZMLID_FILE_PATHS = MZMLID_FILE_PATHS[:5]

In [15]:
class FilteringProcessor:
    def __init__(self, dump_path: str, fdr: float = 0.01, skip_existing: bool = True):
        self.is_decoy_column_name = 'SpectrumIdentificationItem__1__PeptideEvidenceRef__isDecoy'
        self.fdr_column_name = 'SpectrumIdentificationItem__1__MSGFQValue'
        self.fdr = fdr
        self.output_columns = [
            'SpectrumIdentificationItem__1__PeptideEvidenceRef__PeptideSequence',
            'mz_array',
            'intensity_array',
        ]
        self.dump_path = dump_path.rstrip(os.path.sep)
        self.skip_existing = skip_existing
    
    def __call__(self, input_file_path: str) -> Optional[str]:
        output_file_path = self.dump_path + os.path.sep + input_file_path.split(os.path.sep)[-1]
        if self.skip_existing and os.path.exists(output_file_path):
            return None
        
        res = {
            'input_file_path' : input_file_path,
            'output_file_path': output_file_path,
        }
        df = pd.read_parquet(current_input_file_path)
        res['original_sequence_count'] = len(df)
        df = df.dropna(subset=[self.is_decoy_column_name])
        res['NaN_decoy_count'] = res['original_sequence_count'] - len(df)
        df = df[df[self.fdr_column_name] <= self.fdr]
        res['above_fdr_count'] = res['NaN_decoy_count'] - len(df)
        decoy_counts = df[IS_DECOY].value_counts()
        res['left_decoys'] = decoy_counts.get(True, 0)
        res['left_targets'] = decoy_counts.get(False, 0)
        res['fdr'] = res['left_decoys'] / res['left_targets']
        # filter out Decoys
        df = df[~df[self.is_decoy_column_name].astype(bool)]
        res['final_sequence_count'] = len(df)
        df = df[self.output_columns]
        df.to_parquet(output_file_path)
        return res

NameError: name 'Optional' is not defined