# Parallelized Data Filtering Pipeline
## Prerequisites

In [1]:
import glob
import os
import re
from typing import List, Dict

import pandas as pd
import wget
from openpyxl import load_workbook

from mmproteo.utils import log
from mmproteo.utils.formats.mz import filter_files
from mmproteo.utils.utils import ensure_dir_exists

In [2]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 2000)

In [3]:
logger = log.DummyLogger(verbose=True)

INFO: Printing to Stdout


## Data Import

In [4]:
pwd

'/hpi/fs00/home/mirko.krause/masterthesis/pride-downloader/notebooks'

Code to run before running this pipeline to get/create all relevant data files:

```
cd ../datasets
mkdir PXD010000
cd PXD010000
mmproteo -p PXD010000 -e mzml,mzid --thread-count 0 download extract mz2parquet
```

In [5]:
PROJECT = "PXD010000"

In [6]:
DATA_PATH = '/scratch/mirko.krause/PXD010000'
DUMP_PATH = '/scratch/mirko.krause/dumps/PXD010000'
THREAD_COUNT=32

In [7]:
TRAINING_COLUMNS_DUMP_PATH = os.path.join(DUMP_PATH, "training_columns")
MZMLID_FILES_PATH = os.path.join(DATA_PATH, "*_mzmlid.parquet")

In [8]:
# taken from https://www.biorxiv.org/content/10.1101/428334v2.supplementary-material
SPECIES_MAPPING_FILE_DOWNLOAD_URL = 'https://www.biorxiv.org/highwire/filestream/128716/field_highwire_adjunct_files/1/428334-2.xlsx'
SPECIES_MAPPING_FILE_PATH = os.path.join(DUMP_PATH, 'file_to_species_mapping.xlsx')

In [9]:
ensure_dir_exists(TRAINING_COLUMNS_DUMP_PATH)

In [10]:
MZMLID_FILE_PATHS = glob.glob(MZMLID_FILES_PATH)
len(MZMLID_FILE_PATHS)

235

In [11]:
def download_species_mapping(download_url: str, output_file: str):
    if os.path.exists(output_file):
        print(f"skipping, because '{output_file}' already exists")
    else:
        print(wget.download(download_url, out=output_file))

def read_xlsx(file: str) -> pd.DataFrame:
    workbook = load_workbook(file)
    worksheet = workbook[workbook.worksheets[0].title]
    data = worksheet.values
    columns = next(data)[0:]
    df = pd.DataFrame(data=data, columns=columns)
    return df

def get_species_mapping(download_url: str = SPECIES_MAPPING_FILE_DOWNLOAD_URL,
                        output_file: str = SPECIES_MAPPING_FILE_PATH) -> pd.DataFrame:
    download_species_mapping(download_url=download_url, output_file=output_file)
    df = read_xlsx(file=output_file)
    df = df.drop(columns='id')
    df = df.set_index('mgf_file')
    return df

def join_species_mapping(data_df: pd.DataFrame, species_df: pd.DataFrame) -> pd.DataFrame:
    df = data_df.copy()
    
    # remove '.mzml' extension
    df['mgf_file'] = df['mzml_filename'].str.slice(stop=-5)
    df = df.join(species_df, on='mgf_file')
    
    return df

In [12]:
species_df = get_species_mapping()
species_df.head(1)

skipping, because '/scratch/mirko.krause/dumps/PXD010000/file_to_species_mapping.xlsx' already exists


Unnamed: 0_level_0,species,num_PSM,istrain
mgf_file,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Biodiversity_A_cryptum_FeTSB_anaerobic_1_01Jun16_Pippin_16-03-39,Acidiphilium_cryptum_JF-5,6659,Train


In [13]:
PEPTIDE_SEQUENCE = 'SpectrumIdentificationItem__1__PeptideSequence'

In [14]:
output_columns = [
    'peptide_sequence',
    'mz_array',
    'intensity_array',
    'species',
    'istrain'
]

In [15]:
column_patterns_to_drop = [
    "SpectrumIdentificationItem__(?!1__)",
    "SpectrumIdentificationItem__1__PeptideEvidenceRef__(?!0__)"  # they seem to be filled with NaN and None anyway
]

In [16]:
def drop_column_patterns(df: pd.DataFrame, column_patterns: List[str]) -> pd.DataFrame:
    for column_pattern in column_patterns:
        re_pattern = re.compile(column_pattern)
        columns_to_drop = [col for col in df.columns if re_pattern.match(col) is not None]
        df = df.drop(columns=columns_to_drop)
    return df

In [17]:
def join_species(df: pd.DataFrame) -> pd.DataFrame:
    df = join_species_mapping(data_df=df, species_df=species_df)
    return df

In [18]:
def build_modification_columns_dict(modification_columns: List[str], separator: str = '__') -> List[Dict[str, str]]:
    modification_columns_dict: Dict[str, Dict[str, str]] = dict()

    # this assumes that there is only one 'SpectrumIdentificationItem', otherwise the 
    # later ones overwrite the previous ones
    for col in modification_columns:
        parts = col.split(separator)
        modification = modification_columns_dict.get(parts[-2], dict())
        modification_columns_dict[parts[-2]] = modification
        modification[parts[-1]] = col

    modification_columns_dicts: List[Dict[str, str]] = [
        column_dict for _, column_dict in sorted(modification_columns_dict.items())
    ]
    return modification_columns_dicts

class ModificationAnnotator:
    def __init__(self,
                 modification_columns_dicts: List[Dict[str, str]],
                 modification_location_suffix: str = 'location',
                 modification_name_suffix: str = 'name'):
        self.modification_columns_dicts = modification_columns_dicts
        self.modification_location_suffix = modification_location_suffix
        self.modification_name_suffix = modification_name_suffix

    def __call__(self, col: pd.Series) -> None:
        is_na = col.isna()
        for modification_columns_dict in self.modification_columns_dicts:
            if is_na[modification_columns_dict[self.modification_location_suffix]]:
                break
            modification_location = int(
                col[modification_columns_dict[self.modification_location_suffix]]
            ) - 1  # 1-indexed
            modification_name = col[modification_columns_dict[self.modification_name_suffix]]
            col[PEPTIDE_SEQUENCE][modification_location] += f"({modification_name})"
        return None

def annotate_modifications(df: pd.DataFrame,
                           modification_location_suffix: str = 'location',
                           modification_name_suffix: str = 'name',
                           peptide_sequence_col: str = PEPTIDE_SEQUENCE) -> pd.DataFrame:
    modification_columns = [
        col for col in df.columns 
        if 'modification' in col.lower()
        and 'peptideevidenceref' not in col.lower()
    ]
    modification_columns_dicts = build_modification_columns_dict(modification_columns)
    modification_annotator = ModificationAnnotator(
        modification_columns_dicts=modification_columns_dicts,
        modification_location_suffix=modification_location_suffix,
        modification_name_suffix=modification_name_suffix
    )

    df[peptide_sequence_col] = df[peptide_sequence_col].apply(list)
    df.apply(func=modification_annotator, axis=1)
    return df

In [19]:
def rename_columns(df: pd.DataFrame) -> pd.DataFrame:
    df = df.rename(columns={PEPTIDE_SEQUENCE: 'peptide_sequence'})
    return df

In [20]:
def post_processor(df: pd.DataFrame) -> pd.DataFrame:
    df = drop_column_patterns(df=df, column_patterns=column_patterns_to_drop)
    df = join_species(df=df)
    df = annotate_modifications(df=df)
    df = rename_columns(df)
    return df

In [21]:
output_files = filter_files(input_file_paths=MZMLID_FILE_PATHS,
                            output_path=TRAINING_COLUMNS_DUMP_PATH,
                            fdr=0.01,
                            skip_existing=True,
                            output_columns=output_columns,
                            post_processor=post_processor,
                            thread_count=THREAD_COUNT,
                            logger=logger)
output_files[:1]

DEBUG: Processing items with 32 subprocesses
DEBUG: Trying to fdr-filter 235 mzmlid files
INFO: Finished filtering '/scratch/mirko.krause/PXD010000/Biodiversity_A_cryptum_FeTSB_anaerobic_1_01Jun16_Pippin_16-03-39_mzmlid.parquet' -> '/scratch/mirko.krause/dumps/PXD010000/training_columns/Biodiversity_A_cryptum_FeTSB_anaerobic_1_01Jun16_Pippin_16-03-39_mzmlid.parquet'
INFO: Finished filtering '/scratch/mirko.krause/PXD010000/Biodiversity_A_cryptum_FeTSB_anaerobic_2_01Jun16_Pippin_16-03-39_mzmlid.parquet' -> '/scratch/mirko.krause/dumps/PXD010000/training_columns/Biodiversity_A_cryptum_FeTSB_anaerobic_2_01Jun16_Pippin_16-03-39_mzmlid.parquet'
INFO: Finished filtering '/scratch/mirko.krause/PXD010000/Biodiversity_B_infantis_CMcarb_anaerobic_01_26Feb16_Arwen_16-01-01_mzmlid.parquet' -> '/scratch/mirko.krause/dumps/PXD010000/training_columns/Biodiversity_B_infantis_CMcarb_anaerobic_01_26Feb16_Arwen_16-01-01_mzmlid.parquet'
INFO: Finished filtering '/scratch/mirko.krause/PXD010000/Biodiversit

INFO: Finished filtering '/scratch/mirko.krause/PXD010000/Biodiversity_S_elongatus_BG11_aerobic_1_14July16_Pippin_16-05-01_mzmlid.parquet' -> '/scratch/mirko.krause/dumps/PXD010000/training_columns/Biodiversity_S_elongatus_BG11_aerobic_1_14July16_Pippin_16-05-01_mzmlid.parquet'
INFO: Finished filtering '/scratch/mirko.krause/PXD010000/Biodiversity_S_elongatus_BG11NaCl_aerobic_2_05Oct16_Pippin_16-05-06_mzmlid.parquet' -> '/scratch/mirko.krause/dumps/PXD010000/training_columns/Biodiversity_S_elongatus_BG11NaCl_aerobic_2_05Oct16_Pippin_16-05-06_mzmlid.parquet'
INFO: Finished filtering '/scratch/mirko.krause/PXD010000/M_alcali_copp_MeOH_B1_T2_02_QE_23Mar18_Oak_18-01-07_mzmlid.parquet' -> '/scratch/mirko.krause/dumps/PXD010000/training_columns/M_alcali_copp_MeOH_B1_T2_02_QE_23Mar18_Oak_18-01-07_mzmlid.parquet'
INFO: Finished filtering '/scratch/mirko.krause/PXD010000/Biodiversity_F_prausnitzii_Carb_01_28Oct15_Arwen_15-07-13_mzmlid.parquet' -> '/scratch/mirko.krause/dumps/PXD010000/training_

INFO: Finished filtering '/scratch/mirko.krause/PXD010000/Biodiversity_HL49_HLHYE_aerobic_1_05Oct16_Pippin_16-05-06_mzmlid.parquet' -> '/scratch/mirko.krause/dumps/PXD010000/training_columns/Biodiversity_HL49_HLHYE_aerobic_1_05Oct16_Pippin_16-05-06_mzmlid.parquet'
INFO: Finished filtering '/scratch/mirko.krause/PXD010000/M_alcali_copp_MeOH_B3_T1_05_QE_23Mar18_Oak_18-01-07_mzmlid.parquet' -> '/scratch/mirko.krause/dumps/PXD010000/training_columns/M_alcali_copp_MeOH_B3_T1_05_QE_23Mar18_Oak_18-01-07_mzmlid.parquet'
INFO: Finished filtering '/scratch/mirko.krause/PXD010000/Biodiversity_P_denitrificans_LIB_aerobic_01_29Apr16_Samwise_16-03-32_renamed_mzmlid.parquet' -> '/scratch/mirko.krause/dumps/PXD010000/training_columns/Biodiversity_P_denitrificans_LIB_aerobic_01_29Apr16_Samwise_16-03-32_renamed_mzmlid.parquet'
INFO: Finished filtering '/scratch/mirko.krause/PXD010000/P_putida_18Nov15_2_21Mar16_Arwen_16-01-03_mzmlid.parquet' -> '/scratch/mirko.krause/dumps/PXD010000/training_columns/P_pu

INFO: Finished filtering '/scratch/mirko.krause/PXD010000/Biodiversity_B_subtilis_NCIB3610_plates_2_03May16_Samwise_16-03-32_mzmlid.parquet' -> '/scratch/mirko.krause/dumps/PXD010000/training_columns/Biodiversity_B_subtilis_NCIB3610_plates_2_03May16_Samwise_16-03-32_mzmlid.parquet'
INFO: Finished filtering '/scratch/mirko.krause/PXD010000/Biodiversity_R_palustris_PMnitro_anaerobic_3_01Jun16_Pippin_16-03-39_mzmlid.parquet' -> '/scratch/mirko.krause/dumps/PXD010000/training_columns/Biodiversity_R_palustris_PMnitro_anaerobic_3_01Jun16_Pippin_16-03-39_mzmlid.parquet'
INFO: Finished filtering '/scratch/mirko.krause/PXD010000/QC_Shew_13_05_500ng_2_5hr_24Mar14_Samwise_13-07-17_mzmlid.parquet' -> '/scratch/mirko.krause/dumps/PXD010000/training_columns/QC_Shew_13_05_500ng_2_5hr_24Mar14_Samwise_13-07-17_mzmlid.parquet'
INFO: Finished filtering '/scratch/mirko.krause/PXD010000/Biodiversity_P_polymyxa_TBS_aerobic_1_17July16_Samwise_16-04-10_mzmlid.parquet' -> '/scratch/mirko.krause/dumps/PXD010000

INFO: Finished filtering '/scratch/mirko.krause/PXD010000/Biodiversity_S_agalactiae_LIB_aerobic_02_26Feb16_Arwen_16-01-01_mzmlid.parquet' -> '/scratch/mirko.krause/dumps/PXD010000/training_columns/Biodiversity_S_agalactiae_LIB_aerobic_02_26Feb16_Arwen_16-01-01_mzmlid.parquet'
INFO: Finished filtering '/scratch/mirko.krause/PXD010000/QC_Shew_13_05_500ng_2_100uL_5hr_30Mar14_Samwise_13-07-17_mzmlid.parquet' -> '/scratch/mirko.krause/dumps/PXD010000/training_columns/QC_Shew_13_05_500ng_2_100uL_5hr_30Mar14_Samwise_13-07-17_mzmlid.parquet'
INFO: Finished filtering '/scratch/mirko.krause/PXD010000/P_putida_01Dec15_1_21Mar16_Arwen_16-01-03_mzmlid.parquet' -> '/scratch/mirko.krause/dumps/PXD010000/training_columns/P_putida_01Dec15_1_21Mar16_Arwen_16-01-03_mzmlid.parquet'
INFO: Finished filtering '/scratch/mirko.krause/PXD010000/Biodiversity_R_jostii_R2A_aerobic_3_23Nov16_Pippin_16-09-11_mzmlid.parquet' -> '/scratch/mirko.krause/dumps/PXD010000/training_columns/Biodiversity_R_jostii_R2A_aerobic_

INFO: Finished filtering '/scratch/mirko.krause/PXD010000/Biodiversity_B_fragilis_LIB_anaerobic_03_01Feb16_Arwen_15-07-13_mzmlid.parquet' -> '/scratch/mirko.krause/dumps/PXD010000/training_columns/Biodiversity_B_fragilis_LIB_anaerobic_03_01Feb16_Arwen_15-07-13_mzmlid.parquet'
INFO: Finished filtering '/scratch/mirko.krause/PXD010000/Biodiversity_R_palustris_PMnitro_anaerobic_1_01Jun16_Pippin_16-03-39_mzmlid.parquet' -> '/scratch/mirko.krause/dumps/PXD010000/training_columns/Biodiversity_R_palustris_PMnitro_anaerobic_1_01Jun16_Pippin_16-03-39_mzmlid.parquet'
INFO: Finished filtering '/scratch/mirko.krause/PXD010000/Biodiversity_A_faecalis_LB_aerobic_02_26Feb16_Arwen_16-01-01_mzmlid.parquet' -> '/scratch/mirko.krause/dumps/PXD010000/training_columns/Biodiversity_A_faecalis_LB_aerobic_02_26Feb16_Arwen_16-01-01_mzmlid.parquet'
INFO: Finished filtering '/scratch/mirko.krause/PXD010000/Biodiversity_B_thet_CMgluc_anaerobic_03_01Feb16_Arwen_15-07-13_mzmlid.parquet' -> '/scratch/mirko.krause/du

INFO: Finished filtering '/scratch/mirko.krause/PXD010000/Biodiversity_S_thermosulf_FeYE_anaerobic_2_01Jun16_Pippin_16-03-39_mzmlid.parquet' -> '/scratch/mirko.krause/dumps/PXD010000/training_columns/Biodiversity_S_thermosulf_FeYE_anaerobic_2_01Jun16_Pippin_16-03-39_mzmlid.parquet'
INFO: Finished filtering '/scratch/mirko.krause/PXD010000/Biodiversity_M_xanthus_DZ2_plates_2_03May16_Samwise_16-03-32_mzmlid.parquet' -> '/scratch/mirko.krause/dumps/PXD010000/training_columns/Biodiversity_M_xanthus_DZ2_plates_2_03May16_Samwise_16-03-32_mzmlid.parquet'
INFO: Finished filtering '/scratch/mirko.krause/PXD010000/Biodiversity_B_fragilis_01_28Jul15_Arwen_14-12-03_mzmlid.parquet' -> '/scratch/mirko.krause/dumps/PXD010000/training_columns/Biodiversity_B_fragilis_01_28Jul15_Arwen_14-12-03_mzmlid.parquet'INFO: Finished filtering '/scratch/mirko.krause/PXD010000/Biodiversity_P_polymyxa_TBS_aerobic_3_17July16_Samwise_16-04-10_mzmlid.parquet' -> '/scratch/mirko.krause/dumps/PXD010000/training_columns/B

INFO: Finished filtering '/scratch/mirko.krause/PXD010000/Biodiversity_B_cereus_PN_L_CL_2_09Oct16_Pippin_16-05-06_mzmlid.parquet' -> '/scratch/mirko.krause/dumps/PXD010000/training_columns/Biodiversity_B_cereus_PN_L_CL_2_09Oct16_Pippin_16-05-06_mzmlid.parquet'
INFO: Finished filtering '/scratch/mirko.krause/PXD010000/Biodiversity_B_subtilis_NCIB3610_24h_plates_1_13Jun16_Pippin_16-03-39_mzmlid.parquet' -> '/scratch/mirko.krause/dumps/PXD010000/training_columns/Biodiversity_B_subtilis_NCIB3610_24h_plates_1_13Jun16_Pippin_16-03-39_mzmlid.parquet'
INFO: Finished filtering '/scratch/mirko.krause/PXD010000/Biodiversity_F_prausnitzii_LIB_01_28Oct15_Arwen_15-07-13_mzmlid.parquet' -> '/scratch/mirko.krause/dumps/PXD010000/training_columns/Biodiversity_F_prausnitzii_LIB_01_28Oct15_Arwen_15-07-13_mzmlid.parquet'
INFO: Finished filtering '/scratch/mirko.krause/PXD010000/Biodiversity_D_acidovorans_TGY_aerobic_01_29Apr16_Samwise_16-03-32_renamed_mzmlid.parquet' -> '/scratch/mirko.krause/dumps/PXD010

[{'input_file_path': '/scratch/mirko.krause/PXD010000/M_alcali_copp_CH4_B3_T1_11_QE_23Mar18_Oak_18-01-07_mzmlid.parquet',
  'output_file_path': '/scratch/mirko.krause/dumps/PXD010000/training_columns/M_alcali_copp_CH4_B3_T1_11_QE_23Mar18_Oak_18-01-07_mzmlid.parquet',
  'original_sequence_count': 53901,
  'NaN_decoy_count': 0,
  'above_fdr_count': 28799,
  'left_decoys': 246,
  'left_targets': 24856,
  'fdr': 0.009897006758931445,
  'removed_decoys': 246,
  'removed_by_post_processor': 0,
  'final_sequence_count': 24856}]

In [22]:
processing_result = pd.DataFrame(data=output_files)
print(f"number of processing results = {len(processing_result)}")
processing_result

number of processing results = 235


Unnamed: 0,input_file_path,output_file_path,original_sequence_count,NaN_decoy_count,above_fdr_count,left_decoys,left_targets,fdr,removed_decoys,removed_by_post_processor,final_sequence_count
0,/scratch/mirko.krause/PXD010000/M_alcali_copp_CH4_B3_T1_11_QE_23Mar18_Oak_18-01-07_mzmlid.parquet,/scratch/mirko.krause/dumps/PXD010000/training_columns/M_alcali_copp_CH4_B3_T1_11_QE_23Mar18_Oak_18-01-07_mzmlid.parquet,53901,0,28799,246,24856,0.009897,246,0,24856
1,/scratch/mirko.krause/PXD010000/Biodiversity_A_tumefaciens_R2A_aerobic_1_23Nov16_Pippin_16-09-11_mzmlid.parquet,/scratch/mirko.krause/dumps/PXD010000/training_columns/Biodiversity_A_tumefaciens_R2A_aerobic_1_23Nov16_Pippin_16-09-11_mzmlid.parquet,44899,0,28626,159,16114,0.009867,159,0,16114
2,/scratch/mirko.krause/PXD010000/Biodiversity_A_cryptum_FeTSB_anaerobic_2_01Jun16_Pippin_16-03-39_mzmlid.parquet,/scratch/mirko.krause/dumps/PXD010000/training_columns/Biodiversity_A_cryptum_FeTSB_anaerobic_2_01Jun16_Pippin_16-03-39_mzmlid.parquet,30057,0,19501,100,10456,0.009564,100,0,10456
3,/scratch/mirko.krause/PXD010000/Biodiversity_M_xanthus_DZ2_48h_plates_2_13Jun16_Pippin_16-03-39_mzmlid.parquet,/scratch/mirko.krause/dumps/PXD010000/training_columns/Biodiversity_M_xanthus_DZ2_48h_plates_2_13Jun16_Pippin_16-03-39_mzmlid.parquet,36028,0,13151,212,22665,0.009354,212,0,22665
4,/scratch/mirko.krause/PXD010000/M_alcali_copp_CH4_B1_T1_07_QE_23Mar18_Oak_18-01-07_mzmlid.parquet,/scratch/mirko.krause/dumps/PXD010000/training_columns/M_alcali_copp_CH4_B1_T1_07_QE_23Mar18_Oak_18-01-07_mzmlid.parquet,52241,0,26530,251,25460,0.009859,251,0,25460
5,/scratch/mirko.krause/PXD010000/Biodiversity_B_bifidum_CMcarb_anaerobic_03_26Feb16_Arwen_16-01-01_mzmlid.parquet,/scratch/mirko.krause/dumps/PXD010000/training_columns/Biodiversity_B_bifidum_CMcarb_anaerobic_03_26Feb16_Arwen_16-01-01_mzmlid.parquet,36793,0,20580,128,16085,0.007958,128,0,16085
6,/scratch/mirko.krause/PXD010000/Biodiversity_C_gilvus_GS2_anaerobic_03_01Feb16_Arwen_15-07-13_mzmlid.parquet,/scratch/mirko.krause/dumps/PXD010000/training_columns/Biodiversity_C_gilvus_GS2_anaerobic_03_01Feb16_Arwen_15-07-13_mzmlid.parquet,47819,0,17192,293,30334,0.009659,293,0,30334
7,/scratch/mirko.krause/PXD010000/Biodiversity_B_infantis_CMcarb_anaerobic_03_26Feb16_Arwen_16-01-01_mzmlid.parquet,/scratch/mirko.krause/dumps/PXD010000/training_columns/Biodiversity_B_infantis_CMcarb_anaerobic_03_26Feb16_Arwen_16-01-01_mzmlid.parquet,38149,0,24375,115,13659,0.008419,115,0,13659
8,/scratch/mirko.krause/PXD010000/Biodiversity_B_thet_LIB_anaerobic_01_01Feb16_Arwen_15-07-13_mzmlid.parquet,/scratch/mirko.krause/dumps/PXD010000/training_columns/Biodiversity_B_thet_LIB_anaerobic_01_01Feb16_Arwen_15-07-13_mzmlid.parquet,45370,0,18091,260,27019,0.009623,260,0,27019
9,/scratch/mirko.krause/PXD010000/Biodiversity_B_thetaiotaomicron_Carb_01_26Aug15_Arwen_15-07-13_mzmlid.parquet,/scratch/mirko.krause/dumps/PXD010000/training_columns/Biodiversity_B_thetaiotaomicron_Carb_01_26Aug15_Arwen_15-07-13_mzmlid.parquet,46099,0,12653,317,33129,0.009569,317,0,33129


In [23]:
pd.read_parquet(processing_result['output_file_path'][0]).head(2)

Unnamed: 0,peptide_sequence,mz_array,intensity_array,species,istrain
10,"[A, Q, D, A, N, S, K]","[55.52651, 57.070667, 58.65301, 59.04999, 70.49079, 71.36482, 71.36916, 81.070595, 84.04509, 84.0814, 87.055885, 89.071335, 99.08091, 101.060555, 104.07124, 105.03414, 110.07161, 116.99716, 122.0814, 129.10211, 130.08636, 132.2341, 144.98212, 147.11276, 149.02406, 156.05164, 166.04987, 168.26451, 174.9927, 176.97217, 183.07704, 184.94261, 186.08669, 187.07144, 200.10307, 214.9894, 216.1335, 222.95937, 234.14436, 240.96979, 252.96886, 268.97363, 270.9805, 274.95737, 276.9208, 282.9257, 288.99133, 290.89578, 292.96707, 294.9256, 296.91827, 300.9344, 301.11282, 312.8554, 312.93576, 312.9736, 320.9031, 322.65353, 325.82892, 330.94656, 330.98865, 331.1608, 348.1871, 348.8575, 348.90347, 348.95468, 349.00125, 349.87643, 349.92053, 350.1523, 358.35028, 366.8451, 366.89886, 367.1221, 367.83646, 384.84717, 385.84552, 419.22418, 534.25574, 545.37744, 732.1827]","[659.5671, 608.75525, 546.56464, 3819.5784, 591.54675, 2477.9727, 2694.7358, 686.86145, 1165.8091, 5537.022, 1078.8009, 10548.873, 791.52155, 703.69684, 881.34076, 1278.1882, 1130.8687, 587.8636, 14346.429, 4513.947, 2267.6965, 709.75635, 3117.74, 4489.533, 4378.6626, 757.805, 3406.3015, 663.79913, 4043.6924, 3147.7522, 5508.182, 676.4029, 999.5737, 984.10834, 2814.8125, 865.2825, 1080.5931, 9253.03, 6978.9414, 4747.2095, 1276.2789, 808.17487, 15744.225, 773.06506, 1434.1737, 919.12585, 969.67896, 835.977, 832.26984, 1275.6095, 673.7256, 1199.8829, 1071.8888, 1225.1465, 814.68445, 811.11993, 811.0509, 1426.3348, 700.696, 4342.713, 4328.0737, 1413.877, 3291.378, 725.9424, 928.89594, 6499.051, 5342.61, 1010.7088, 1527.7334, 789.90283, 4134.1743, 7074.271, 669.7933, 752.65875, 3421.3188, 2765.7603, 3741.364, 3163.1875, 1311.3015, 793.21173, 826.27454]",Methylomicrobium_alcaliphilum,Train
33,"[S, A, S, E, N, T, A, K]","[57.070744, 60.04527, 69.03429, 69.045105, 70.02956, 70.06591, 71.08615, 71.211266, 71.21855, 71.221565, 72.08146, 74.06073, 77.407425, 81.034134, 84.04502, 84.08143, 86.06049, 86.096985, 87.055786, 88.03971, 90.27, 92.253174, 95.04971, 95.085785, 97.02881, 101.07128, 101.1077, 102.05558, 110.0716, 112.05097, 112.08718, 113.054375, 113.07137, 115.08703, 116.07098, 122.08141, 126.05502, 129.0181, 129.06593, 129.10228, 130.04999, 130.08621, 131.08157, 132.07645, 132.08487, 133.06136, 135.05412, 136.06178, 137.05672, 137.0654, 141.06548, 143.08174, 143.11841, 145.09706, 147.07652, 147.1127, 148.06021, 152.05685, 155.08142, 157.01323, 157.06085, 158.09258, 159.0765, 162.06645, 164.0568, 166.07222, 167.08153, 169.06088, 170.04439, 171.07643, 171.11325, 172.0717, 173.0549, 173.09204, 173.12842, 175.11903, 176.1222, 176.99461, 178.0608, 183.07596, 185.07935, 185.09355, 186.08638, 186.12349, 187.07202, 187.10764, 189.08665, 190.06097, 198.08807, 199.07053, 200.10257, 200.13898, 201.12276, 202.07233, 203.10222, 204.09785, 204.13414, 208.07191, 212.1028, 213.08766, ...]","[1143.0697, 19260.041, 887.2041, 833.74133, 1524.0825, 5631.586, 1719.0298, 5572.0083, 1243.2659, 753.30536, 9365.847, 7974.683, 645.18524, 951.01245, 9802.182, 10888.421, 993.8404, 1001.7659, 9623.724, 950.3592, 692.3416, 680.32166, 807.13574, 817.4192, 5426.999, 4071.7788, 728.4322, 5123.5, 1141.1661, 55794.45, 2588.2976, 2880.6003, 1191.2072, 18423.299, 1539.7909, 7598.1143, 864.39417, 1656.529, 3182.4639, 17328.502, 2983.5916, 7229.348, 55641.586, 1290.6182, 2519.254, 2551.9092, 1185.7681, 84646.45, 1045.3793, 4001.9243, 839.3363, 8263.848, 1120.7463, 944.0327, 5280.4336, 15860.762, 847.4068, 6684.9756, 1249.5118, 1130.3918, 1031.5548, 9398.223, 19254.945, 11463.114, 10621.498, 2931.642, 988.80475, 925.57324, 1157.3889, 4976.7427, 819.0945, 690.0549, 1180.6656, 2898.2095, 1608.5857, 24398.0, 1291.9, 1076.0074, 6400.83, 1155.3615, 931.04694, 1131.7196, 1484.8499, 1056.363, 5125.423, 1051.5304, 3585.3267, 4460.2773, 2388.3562, 1677.0857, 746.99915, 1320.594, 1244.1313, 888.46686, 816.92346, 2644.899, 4443.9604, 1361.2762, 5345.5728, 1158.5089, ...]",Methylomicrobium_alcaliphilum,Train


In [24]:
processing_result.describe()

Unnamed: 0,original_sequence_count,NaN_decoy_count,above_fdr_count,left_decoys,left_targets,fdr,removed_decoys,removed_by_post_processor,final_sequence_count
count,235.0,235.0,235.0,235.0,235.0,235.0,235.0,235.0,235.0
mean,41468.46383,0.0,17783.255319,224.846809,23460.361702,0.00955,224.846809,0.0,23460.361702
std,7603.08072,0.0,6466.651584,71.819172,7222.015317,0.000436,71.819172,0.0,7222.015317
min,28756.0,0.0,7170.0,14.0,1464.0,0.007884,14.0,0.0,1464.0
25%,36647.5,0.0,13557.5,178.0,19210.0,0.009495,178.0,0.0,19210.0
50%,41598.0,0.0,16592.0,236.0,24516.0,0.009698,236.0,0.0,24516.0
75%,45195.5,0.0,20961.5,261.5,27283.0,0.009804,261.5,0.0,27283.0
max,86594.0,0.0,36829.0,574.0,58897.0,0.009994,574.0,0.0,58897.0


In [25]:
processing_result.drop(columns=['input_file_path', 'output_file_path']).sum()

original_sequence_count      9.745089e+06
NaN_decoy_count              0.000000e+00
above_fdr_count              4.179065e+06
left_decoys                  5.283900e+04
left_targets                 5.513185e+06
fdr                          2.244270e+00
removed_decoys               5.283900e+04
removed_by_post_processor    0.000000e+00
final_sequence_count         5.513185e+06
dtype: float64

In [26]:
processing_result.to_parquet(os.path.join(TRAINING_COLUMNS_DUMP_PATH, "processing_result.parquet"))