# Parallelized Data Filtering Pipeline
## Prerequisites

In [1]:
import glob
import os
import re
from typing import List, Dict

import pandas as pd
import wget
from openpyxl import load_workbook

from mmproteo.utils import log
from mmproteo.utils.formats.mz import filter_files
from mmproteo.utils.utils import ensure_dir_exists

In [2]:
# Notebook display settings: never truncate columns, show up to 2000 rows.
pd.options.display.max_columns = None
pd.options.display.max_rows = 2000

In [3]:
# Project logger that is handed to `filter_files` further down.  The recorded
# run contains both INFO and DEBUG lines (presumably enabled by verbose=True --
# confirm in `mmproteo.utils.log`).
logger = log.DummyLogger(verbose=True)

INFO: Printing to Stdout


## Data Import

In [4]:
# All paths in this notebook are relative ("../datasets", "../dumps"), so show
# the directory they are resolved against.  `os.getcwd()` is used instead of
# the bare `pwd` automagic, which is not Python and only works in a single-line
# cell while automagic is enabled.
os.getcwd()

'/tf/workspace/notebooks'

Run the following shell commands before executing this notebook to download and convert all required data files:

```
cd ../datasets
mkdir PXD010000
cd PXD010000
mmproteo -p PXD010000 -e mzml,mzid --thread-count 0 download extract mz2parquet
```

In [5]:
# Project accession and the directory layout derived from it.  Everything is
# relative to the notebook's working directory.
PROJECT = "PXD010000"
DATA_PATH = os.path.join(os.pardir, "datasets", PROJECT)  # pipeline inputs
DUMP_PATH = os.path.join(os.pardir, "dumps", PROJECT)  # pipeline outputs
TRAINING_COLUMNS_DUMP_PATH = os.path.join(DUMP_PATH, "training_columns")
# glob pattern matching the per-run input parquet files
MZMLID_FILES_PATH = os.path.join(DATA_PATH, "*_mzmlid.parquet")

In [6]:
# Spreadsheet that maps each raw file to its species and train/test split,
# taken from https://www.biorxiv.org/content/10.1101/428334v2.supplementary-material
SPECIES_MAPPING_FILE_DOWNLOAD_URL = 'https://www.biorxiv.org/highwire/filestream/128716/field_highwire_adjunct_files/1/428334-2.xlsx'
# Local copy of the spreadsheet; `download_species_mapping` skips the download
# when this file already exists.
SPECIES_MAPPING_FILE_PATH = os.path.join(DUMP_PATH, 'file_to_species_mapping.xlsx')

In [7]:
# Make sure the output directory exists before anything is written into it
# (presumably this also creates the parent DUMP_PATH that the species mapping
# is downloaded to -- confirm in `ensure_dir_exists`).
ensure_dir_exists(TRAINING_COLUMNS_DUMP_PATH)

In [8]:
# `glob.glob` returns its matches in an arbitrary, file-system dependent order.
# Sorting makes the order in which files are handed to the pipeline the same on
# every run and every machine.
MZMLID_FILE_PATHS = sorted(glob.glob(MZMLID_FILES_PATH))
len(MZMLID_FILE_PATHS)

235

In [9]:
def download_species_mapping(download_url: str, output_file: str):
    """Fetch the species mapping file unless a local copy is already present.

    :param download_url: URL the file is downloaded from
    :param output_file: local path the download is stored at; if something
        already exists at this path, nothing is downloaded
    """
    if not os.path.exists(output_file):
        # wget.download returns a value (printed here for the notebook log)
        print(wget.download(download_url, out=output_file))
        return
    print(f"skipping, because '{output_file}' already exists")

def read_xlsx(file: str) -> pd.DataFrame:
    """Load the first worksheet of an xlsx file into a DataFrame.

    The first row of the sheet provides the column names, every following row
    becomes one record.

    :param file: path of the xlsx file
    :return: the sheet content as a DataFrame
    """
    first_sheet = load_workbook(file).worksheets[0]
    rows = first_sheet.values
    header = next(rows)  # consumes the header row, the rest is data
    return pd.DataFrame(data=rows, columns=header)

def get_species_mapping(download_url: str = SPECIES_MAPPING_FILE_DOWNLOAD_URL,
                        output_file: str = SPECIES_MAPPING_FILE_PATH) -> pd.DataFrame:
    """Download (if necessary) and load the file-to-species mapping.

    :param download_url: URL of the mapping spreadsheet
    :param output_file: local path the spreadsheet is cached at
    :return: the mapping without its 'id' column, indexed by 'mgf_file'
    """
    download_species_mapping(download_url=download_url, output_file=output_file)
    return (
        read_xlsx(file=output_file)
        .drop(columns='id')
        .set_index('mgf_file')
    )

def join_species_mapping(data_df: pd.DataFrame, species_df: pd.DataFrame) -> pd.DataFrame:
    """Attach the species mapping columns to every row of ``data_df``.

    A helper column 'mgf_file' is derived from 'mzml_filename' and used as the
    join key against the index of ``species_df``.  ``data_df`` itself is not
    modified.

    :param data_df: frame with an 'mzml_filename' string column
    :param species_df: mapping frame indexed by the file name without extension
    :return: copy of ``data_df`` with 'mgf_file' and all ``species_df`` columns
        added; rows without a mapping entry get NaN in the added columns
    """
    df = data_df.copy()

    # Strip a trailing '.mzml' extension in any capitalisation.  Unlike cutting
    # off the last five characters, this leaves names without that extension
    # intact instead of truncating them.
    df['mgf_file'] = df['mzml_filename'].str.replace(r'\.mzml$', '', case=False, regex=True)
    df = df.join(species_df, on='mgf_file')

    return df

In [10]:
# Module-level mapping used by `join_species`; indexed by 'mgf_file' (the raw
# file name without extension).  Show one row as a sanity check.
species_df = get_species_mapping()
species_df.head(1)

skipping, because '../dumps/PXD010000/file_to_species_mapping.xlsx' already exists


Unnamed: 0_level_0,species,num_PSM,istrain
mgf_file,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Biodiversity_A_cryptum_FeTSB_anaerobic_1_01Jun16_Pippin_16-03-39,Acidiphilium_cryptum_JF-5,6659,Train


In [11]:
# Input column that holds the peptide sequence; `rename_columns` renames it to
# 'peptide_sequence' and `ModificationAnnotator` reads it directly.
PEPTIDE_SEQUENCE = 'SpectrumIdentificationItem__1__PeptideSequence'

In [12]:
# Columns passed to `filter_files` as `output_columns`; they match the columns
# of the output parquet file inspected further down.
output_columns = [
    'peptide_sequence',
    'mz_array',
    'intensity_array',
    'species',
    'istrain'
]

In [13]:
# Regular expressions for columns that are removed by `drop_column_patterns`.
# They are applied with `re.match`, i.e. anchored at the start of the column
# name.
column_patterns_to_drop = [
    # every SpectrumIdentificationItem column that does not belong to item 1
    "SpectrumIdentificationItem__(?!1__)",
    # within item 1: every PeptideEvidenceRef column except those of reference 0
    "SpectrumIdentificationItem__1__PeptideEvidenceRef__(?!0__)"  # they seem to be filled with NaN and None anyway
]

In [14]:
def drop_column_patterns(df: pd.DataFrame, column_patterns: List[str]) -> pd.DataFrame:
    """Remove every column whose name matches one of the given regexes.

    Matching uses ``re.match``, so a pattern has to match from the beginning of
    the column name.

    :param df: input frame (left untouched)
    :param column_patterns: regular expressions, applied one after another
    :return: frame without the matching columns; ``df`` itself if no pattern
        is given
    """
    for raw_pattern in column_patterns:
        matches = re.compile(raw_pattern).match
        df = df.drop(columns=[name for name in df.columns if matches(name)])
    return df

In [15]:
def join_species(df: pd.DataFrame) -> pd.DataFrame:
    """Join the module-level ``species_df`` mapping onto ``df``.

    Thin single-argument wrapper around ``join_species_mapping`` so that it can
    be used as a pipeline step.
    """
    return join_species_mapping(data_df=df, species_df=species_df)

In [16]:
def build_modification_columns_dict(modification_columns: List[str], separator: str = '__') -> List[Dict[str, str]]:
    """Group modification column names by modification slot.

    Every column name is split at ``separator``; the second to last part
    identifies the modification slot and the last part the attribute (for
    example 'location' or 'name').

    This assumes that there is only one 'SpectrumIdentificationItem'; columns
    of later items would overwrite those of earlier ones.

    :param modification_columns: column names, each containing ``separator``
        at least once
    :param separator: string that separates the parts of a column name
    :return: one ``{attribute: column name}`` dict per slot.  Purely numeric
        slot identifiers are ordered by their numeric value (so '2' comes
        before '10'), all other identifiers follow in lexicographic order.
    :raises IndexError: if a column name does not contain ``separator``
    """
    columns_by_slot: Dict[str, Dict[str, str]] = dict()
    for col in modification_columns:
        parts = col.split(separator)
        columns_by_slot.setdefault(parts[-2], dict())[parts[-1]] = col

    def _slot_order(slot: str):
        # Plain string sorting would put '10' before '2'.  Consumers that stop
        # at the first empty slot would then miss slots 2..9.
        if slot.isdecimal():
            return 0, int(slot), slot
        return 1, 0, slot

    return [columns_by_slot[slot] for slot in sorted(columns_by_slot, key=_slot_order)]

class ModificationAnnotator:
    """Row-wise callable that writes modification names into a peptide sequence.

    Intended for ``DataFrame.apply(..., axis=1)``.  For every modification of a
    row, ``"(<name>)"`` is appended to the affected residue of the residue list
    stored in the ``PEPTIDE_SEQUENCE`` column (a module-level constant).  The
    list is changed in place.
    """

    def __init__(self,
                 modification_columns_dicts: List[Dict[str, str]],
                 modification_location_suffix: str = 'location',
                 modification_name_suffix: str = 'name'):
        """
        :param modification_columns_dicts: one dict per modification slot that
            maps the two suffixes below to the corresponding column names
        :param modification_location_suffix: key of the location column
        :param modification_name_suffix: key of the name column
        """
        self.modification_columns_dicts = modification_columns_dicts
        self.modification_location_suffix = modification_location_suffix
        self.modification_name_suffix = modification_name_suffix

    def __call__(self, col: pd.Series) -> None:
        """Annotate the residue list of one row in place.

        Locations follow the mzIdentML convention: 1..n address the residues,
        0 is the N-terminus and n + 1 the C-terminus.  Terminal modifications
        are attached to the first and last residue respectively.

        :param col: one DataFrame row (despite the name), containing the
            residue list and all modification columns
        :raises IndexError: if a location lies outside of 0..n + 1
        """
        is_na = col.isna()
        for modification_columns_dict in self.modification_columns_dicts:
            location_column = modification_columns_dict[self.modification_location_suffix]
            if is_na[location_column]:
                # assumes that slots are filled front to back, so the first
                # empty slot ends the list of modifications
                break
            residues = col[PEPTIDE_SEQUENCE]
            residue_count = len(residues)
            location = int(col[location_column])
            if residue_count == 0 or not 0 <= location <= residue_count + 1:
                # previously negative locations silently wrapped around
                raise IndexError(
                    f"modification location {location} is outside of a peptide "
                    f"with {residue_count} residues"
                )
            # 1-indexed; 0 (N-term) and n + 1 (C-term) are clamped onto the
            # terminal residues instead of wrapping around or overflowing
            residue_index = min(max(location, 1), residue_count) - 1
            modification_name = col[modification_columns_dict[self.modification_name_suffix]]
            residues[residue_index] += f"({modification_name})"
        return None

def annotate_modifications(df: pd.DataFrame,
                           modification_location_suffix: str = 'location',
                           modification_name_suffix: str = 'name',
                           peptide_sequence_col: str = PEPTIDE_SEQUENCE) -> pd.DataFrame:
    """Turn peptide sequences into residue lists annotated with modifications.

    The sequence column is converted from strings to lists of residues and a
    ``ModificationAnnotator`` is then applied to every row.  ``df`` is modified
    in place and also returned.

    NOTE(review): ``ModificationAnnotator`` reads the module-level
    ``PEPTIDE_SEQUENCE`` column, so ``peptide_sequence_col`` only has the
    intended effect when it equals that constant.

    :param df: frame with the sequence column and the modification columns
    :param modification_location_suffix: suffix of the location columns
    :param modification_name_suffix: suffix of the name columns
    :param peptide_sequence_col: column that is converted to residue lists
    :return: the same ``df`` object
    """
    def _is_modification_column(name: str) -> bool:
        lowered = name.lower()
        return 'modification' in lowered and 'peptideevidenceref' not in lowered

    annotator = ModificationAnnotator(
        modification_columns_dicts=build_modification_columns_dict(
            [name for name in df.columns if _is_modification_column(name)]
        ),
        modification_location_suffix=modification_location_suffix,
        modification_name_suffix=modification_name_suffix
    )

    df[peptide_sequence_col] = df[peptide_sequence_col].apply(list)
    # The result of apply is discarded on purpose: the annotator edits the
    # residue lists in place and those list objects are the ones stored in df.
    df.apply(func=annotator, axis=1)
    return df

In [17]:
def rename_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` whose ``PEPTIDE_SEQUENCE`` column is called 'peptide_sequence'."""
    return df.rename(columns={PEPTIDE_SEQUENCE: 'peptide_sequence'})

In [18]:
def post_processor(df: pd.DataFrame) -> pd.DataFrame:
    """Per-file post-processing step that is handed to ``filter_files``.

    Stages, in order: drop unneeded columns, join the species mapping,
    annotate modifications, rename the sequence column.
    """
    return (
        df
        .pipe(drop_column_patterns, column_patterns=column_patterns_to_drop)
        .pipe(join_species)
        .pipe(annotate_modifications)
        .pipe(rename_columns)
    )

In [19]:
# Run the filtering pipeline over all input files and write one output parquet
# file per input into TRAINING_COLUMNS_DUMP_PATH.
# - fdr=0.01: the per-file 'fdr' values reported below all stay under this bound.
# - skip_existing=True: inputs whose output file already exists are skipped
#   (see the "Skipping filtering ... already exists" log lines).
# - thread_count=0: the recorded run used 8 subprocesses, so 0 presumably means
#   "choose automatically" -- confirm in `filter_files`.
# The result is a list with one statistics dict per processed file.
output_files = filter_files(input_file_paths=MZMLID_FILE_PATHS,
                            output_path=TRAINING_COLUMNS_DUMP_PATH,
                            fdr=0.01,
                            skip_existing=True,
                            output_columns=output_columns,
                            post_processor=post_processor,
                            thread_count=0,
                            logger=logger)
output_files[:1]

DEBUG: Processing items with 8 subprocesses
DEBUG: Skipping filtering '../datasets/PXD010000/Biodiversity_B_fragilis_LIB_anaerobic_03_01Feb16_Arwen_15-07-13_mzmlid.parquet' -> '../dumps/PXD010000/training_columns/Biodiversity_B_fragilis_LIB_anaerobic_03_01Feb16_Arwen_15-07-13_mzmlid.parquet' already exists
DEBUG: Skipping filtering '../datasets/PXD010000/Biodiversity_B_fragilis_01_28Jul15_Arwen_14-12-03_mzmlid.parquet' -> '../dumps/PXD010000/training_columns/Biodiversity_B_fragilis_01_28Jul15_Arwen_14-12-03_mzmlid.parquet' already exists
DEBUG: Skipping filtering '../datasets/PXD010000/Biodiversity_A_cryptum_FeTSB_anaerobic_1_01Jun16_Pippin_16-03-39_mzmlid.parquet' -> '../dumps/PXD010000/training_columns/Biodiversity_A_cryptum_FeTSB_anaerobic_1_01Jun16_Pippin_16-03-39_mzmlid.parquet' already exists
DEBUG: Trying to fdr-filter 235 mzmlid files
INFO: Finished filtering '../datasets/PXD010000/Biodiversity_P_ruminicola_MDM_anaerobic_2_09Jun16_Pippin_16-03-39_mzmlid.parquet' -> '../dumps/PX

INFO: Finished filtering '../datasets/PXD010000/Cj_media_MH_R1_23Feb15_Arwen_14-12-03_mzmlid.parquet' -> '../dumps/PXD010000/training_columns/Cj_media_MH_R1_23Feb15_Arwen_14-12-03_mzmlid.parquet'
DEBUG: Skipping filtering '../datasets/PXD010000/Biodiversity_B_fragilis_LIB_aerobic_02_01Feb16_Arwen_15-07-13_mzmlid.parquet' -> '../dumps/PXD010000/training_columns/Biodiversity_B_fragilis_LIB_aerobic_02_01Feb16_Arwen_15-07-13_mzmlid.parquet' already exists
INFO: Finished filtering '../datasets/PXD010000/Biodiversity_S_aurantiaca_CYE_aerobic_1_17July16_Samwise_16-04-10_mzmlid.parquet' -> '../dumps/PXD010000/training_columns/Biodiversity_S_aurantiaca_CYE_aerobic_1_17July16_Samwise_16-04-10_mzmlid.parquet'
INFO: Finished filtering '../datasets/PXD010000/Biodiversity_S_agalactiae_LIB_aerobic_03_26Feb16_Arwen_16-01-01_mzmlid.parquet' -> '../dumps/PXD010000/training_columns/Biodiversity_S_agalactiae_LIB_aerobic_03_26Feb16_Arwen_16-01-01_mzmlid.parquet'
INFO: Finished filtering '../datasets/PXD010

INFO: Finished filtering '../datasets/PXD010000/Biodiversity_M_xanthus_DZ2_plates_2_03May16_Samwise_16-03-32_mzmlid.parquet' -> '../dumps/PXD010000/training_columns/Biodiversity_M_xanthus_DZ2_plates_2_03May16_Samwise_16-03-32_mzmlid.parquet'
INFO: Finished filtering '../datasets/PXD010000/Biodiversity_F_succinogenes_MDM_02_27Dec15_Arwen_15-07-13_mzmlid.parquet' -> '../dumps/PXD010000/training_columns/Biodiversity_F_succinogenes_MDM_02_27Dec15_Arwen_15-07-13_mzmlid.parquet'
INFO: Finished filtering '../datasets/PXD010000/M_alcali_copp_MeOH_B2_T2_04_QE_23Mar18_Oak_18-01-07_mzmlid.parquet' -> '../dumps/PXD010000/training_columns/M_alcali_copp_MeOH_B2_T2_04_QE_23Mar18_Oak_18-01-07_mzmlid.parquet'
INFO: Finished filtering '../datasets/PXD010000/Biodiversity_HL111_HLHglutamate_aerobic_1_14July16_Pippin_16-05-01_mzmlid.parquet' -> '../dumps/PXD010000/training_columns/Biodiversity_HL111_HLHglutamate_aerobic_1_14July16_Pippin_16-05-01_mzmlid.parquet'
INFO: Finished filtering '../datasets/PXD010

INFO: Finished filtering '../datasets/PXD010000/Biodiversity_M_xanthus_pellet_set2_3_13Jun16_Pippin_16-03-39_mzmlid.parquet' -> '../dumps/PXD010000/training_columns/Biodiversity_M_xanthus_pellet_set2_3_13Jun16_Pippin_16-03-39_mzmlid.parquet'
INFO: Finished filtering '../datasets/PXD010000/M_alcali_copp_CH4_B2_T2_10_QE_23Mar18_Oak_18-01-07_mzmlid.parquet' -> '../dumps/PXD010000/training_columns/M_alcali_copp_CH4_B2_T2_10_QE_23Mar18_Oak_18-01-07_mzmlid.parquet'
INFO: Finished filtering '../datasets/PXD010000/Biodiversity_M_smegmatis_BHI_aerobic_2_05Oct16_Pippin_16-05-06_mzmlid.parquet' -> '../dumps/PXD010000/training_columns/Biodiversity_M_smegmatis_BHI_aerobic_2_05Oct16_Pippin_16-05-06_mzmlid.parquet'
INFO: Finished filtering '../datasets/PXD010000/Biodiversity_B_subtilis_NCIB3610_pellet_2_03May16_Samwise_16-03-32_mzmlid.parquet' -> '../dumps/PXD010000/training_columns/Biodiversity_B_subtilis_NCIB3610_pellet_2_03May16_Samwise_16-03-32_mzmlid.parquet'
INFO: Finished filtering '../dataset

INFO: Finished filtering '../datasets/PXD010000/LP_LS_Phi_Stat_R3_30Sep14_Pippin_13-04-12_mzmlid.parquet' -> '../dumps/PXD010000/training_columns/LP_LS_Phi_Stat_R3_30Sep14_Pippin_13-04-12_mzmlid.parquet'
INFO: Finished filtering '../datasets/PXD010000/QC_Shew_13_05_500ng_2_100uL_5hr_30Mar14_Samwise_13-07-17_mzmlid.parquet' -> '../dumps/PXD010000/training_columns/QC_Shew_13_05_500ng_2_100uL_5hr_30Mar14_Samwise_13-07-17_mzmlid.parquet'
INFO: Finished filtering '../datasets/PXD010000/Biodiversity_S_griseorubens_HSM_aerobic_1_23Nov16_Pippin_16-09-11_mzmlid.parquet' -> '../dumps/PXD010000/training_columns/Biodiversity_S_griseorubens_HSM_aerobic_1_23Nov16_Pippin_16-09-11_mzmlid.parquet'
INFO: Finished filtering '../datasets/PXD010000/Biodiversity_D_acidovorans_TGY_aerobic_01_29Apr16_Samwise_16-03-32_renamed_mzmlid.parquet' -> '../dumps/PXD010000/training_columns/Biodiversity_D_acidovorans_TGY_aerobic_01_29Apr16_Samwise_16-03-32_renamed_mzmlid.parquet'
INFO: Finished filtering '../datasets/PX

INFO: Finished filtering '../datasets/PXD010000/Biodiversity_R_gnavus_Carb_01_28Oct15_Arwen_15-07-13_mzmlid.parquet' -> '../dumps/PXD010000/training_columns/Biodiversity_R_gnavus_Carb_01_28Oct15_Arwen_15-07-13_mzmlid.parquet'
DEBUG: Skipping filtering '../datasets/PXD010000/Biodiversity_A_cryptum_FeTSB_anaerobic_2_01Jun16_Pippin_16-03-39_mzmlid.parquet' -> '../dumps/PXD010000/training_columns/Biodiversity_A_cryptum_FeTSB_anaerobic_2_01Jun16_Pippin_16-03-39_mzmlid.parquet' already exists
INFO: Finished filtering '../datasets/PXD010000/Biodiversity_P_ruminicola_MDM_anaerobic_1_09Jun16_Pippin_16-03-39_mzmlid.parquet' -> '../dumps/PXD010000/training_columns/Biodiversity_P_ruminicola_MDM_anaerobic_1_09Jun16_Pippin_16-03-39_mzmlid.parquet'
INFO: Finished filtering '../datasets/PXD010000/Biodiversity_F_novicida_TSB_aerobic_02_01Feb16_Arwen_15-07-13_mzmlid.parquet' -> '../dumps/PXD010000/training_columns/Biodiversity_F_novicida_TSB_aerobic_02_01Feb16_Arwen_15-07-13_mzmlid.parquet'
INFO: Finish

INFO: Finished filtering '../datasets/PXD010000/Biodiversity_M_xanthus_DZ2_48h_plates_3_13Jun16_Pippin_16-03-39_mzmlid.parquet' -> '../dumps/PXD010000/training_columns/Biodiversity_M_xanthus_DZ2_48h_plates_3_13Jun16_Pippin_16-03-39_mzmlid.parquet'
INFO: Finished filtering '../datasets/PXD010000/Biodiversity_C_indologenes_LIB_aerobic_01_03May16_Samwise_16-03-32_mzmlid.parquet' -> '../dumps/PXD010000/training_columns/Biodiversity_C_indologenes_LIB_aerobic_01_03May16_Samwise_16-03-32_mzmlid.parquet'
INFO: Finished filtering '../datasets/PXD010000/S_venezuelae_MYM_1_21Mar16_Arwen_16-01-03_mzmlid.parquet' -> '../dumps/PXD010000/training_columns/S_venezuelae_MYM_1_21Mar16_Arwen_16-01-03_mzmlid.parquet'
INFO: Finished filtering '../datasets/PXD010000/Biodiversity_HL49_HLHYE_aerobic_1_05Oct16_Pippin_16-05-06_mzmlid.parquet' -> '../dumps/PXD010000/training_columns/Biodiversity_HL49_HLHYE_aerobic_1_05Oct16_Pippin_16-05-06_mzmlid.parquet'
INFO: Finished filtering '../datasets/PXD010000/Biodiversi

[{'input_file_path': '../datasets/PXD010000/Biodiversity_B_thet_CMgluc_anaerobic_01_01Feb16_Arwen_15-07-13_mzmlid.parquet',
  'output_file_path': '../dumps/PXD010000/training_columns/Biodiversity_B_thet_CMgluc_anaerobic_01_01Feb16_Arwen_15-07-13_mzmlid.parquet',
  'original_sequence_count': 44969,
  'NaN_decoy_count': 0,
  'above_fdr_count': 20389,
  'left_decoys': 238,
  'left_targets': 24342,
  'fdr': 0.00977733957768466,
  'removed_decoys': 238,
  'removed_by_post_processor': 0,
  'final_sequence_count': 24342}]

In [20]:
# One row of filtering statistics per processed file.
# NOTE(review): the recorded run lists 195 results for 235 input files;
# presumably skipped files are not reported -- confirm in `filter_files`.
processing_result = pd.DataFrame(data=output_files)
print(f"number of processing results = {len(processing_result)}")
processing_result

number of processing results = 195


Unnamed: 0,input_file_path,output_file_path,original_sequence_count,NaN_decoy_count,above_fdr_count,left_decoys,left_targets,fdr,removed_decoys,removed_by_post_processor,final_sequence_count
0,../datasets/PXD010000/Biodiversity_B_thet_CMgluc_anaerobic_01_01Feb16_Arwen_15-07-13_mzmlid.parquet,../dumps/PXD010000/training_columns/Biodiversity_B_thet_CMgluc_anaerobic_01_01Feb16_Arwen_15-07-13_mzmlid.parquet,44969,0,20389,238,24342,0.009777,238,0,24342
1,../datasets/PXD010000/Biodiversity_M_smegmatis_BHI_aerobic_3_05Oct16_Pippin_16-05-06_mzmlid.parquet,../dumps/PXD010000/training_columns/Biodiversity_M_smegmatis_BHI_aerobic_3_05Oct16_Pippin_16-05-06_mzmlid.parquet,41343,0,13471,258,27614,0.009343,258,0,27614
2,../datasets/PXD010000/Biodiversity_Lactobacillus_casei_MRS_01_27Dec15_Arwen_15-07-13_mzmlid.parquet,../dumps/PXD010000/training_columns/Biodiversity_Lactobacillus_casei_MRS_01_27Dec15_Arwen_15-07-13_mzmlid.parquet,44383,0,29959,140,14284,0.009801,140,0,14284
3,../datasets/PXD010000/Biodiversity_P_ruminicola_MDM_anaerobic_2_09Jun16_Pippin_16-03-39_mzmlid.parquet,../dumps/PXD010000/training_columns/Biodiversity_P_ruminicola_MDM_anaerobic_2_09Jun16_Pippin_16-03-39_mzmlid.parquet,36615,0,17137,187,19291,0.009694,187,0,19291
4,../datasets/PXD010000/Biodiversity_S_agalactiae_LIB_aerobic_01_26Feb16_Arwen_16-01-01_mzmlid.parquet,../dumps/PXD010000/training_columns/Biodiversity_S_agalactiae_LIB_aerobic_01_26Feb16_Arwen_16-01-01_mzmlid.parquet,36436,0,21863,115,14458,0.007954,115,0,14458
5,../datasets/PXD010000/Biodiversity_S_thermosulf_FeYE_anaerobic_1_01Jun16_Pippin_16-03-39_mzmlid.parquet,../dumps/PXD010000/training_columns/Biodiversity_S_thermosulf_FeYE_anaerobic_1_01Jun16_Pippin_16-03-39_mzmlid.parquet,32318,0,15143,166,17009,0.00976,166,0,17009
6,../datasets/PXD010000/Biodiversity_M_xanthus_DZ2_48h_plates_1_13Jun16_Pippin_16-03-39_mzmlid.parquet,../dumps/PXD010000/training_columns/Biodiversity_M_xanthus_DZ2_48h_plates_1_13Jun16_Pippin_16-03-39_mzmlid.parquet,37305,0,12523,235,24547,0.009573,235,0,24547
7,../datasets/PXD010000/M_alcali_copp_MeOH_B1_T2_02_QE_23Mar18_Oak_18-01-07_mzmlid.parquet,../dumps/PXD010000/training_columns/M_alcali_copp_MeOH_B1_T2_02_QE_23Mar18_Oak_18-01-07_mzmlid.parquet,53875,0,31362,222,22291,0.009959,222,0,22291
8,../datasets/PXD010000/Biodiversity_D_acidovorans_TGY_aerobic_03_29Apr16_Samwise_16-03-32_renamed_mzmlid.parquet,../dumps/PXD010000/training_columns/Biodiversity_D_acidovorans_TGY_aerobic_03_29Apr16_Samwise_16-03-32_renamed_mzmlid.parquet,31944,0,11629,199,20116,0.009893,199,0,20116
9,../datasets/PXD010000/Biodiversity_M_xanthus_pellet_set2_2_13Jun16_Pippin_16-03-39_mzmlid.parquet,../dumps/PXD010000/training_columns/Biodiversity_M_xanthus_pellet_set2_2_13Jun16_Pippin_16-03-39_mzmlid.parquet,37236,0,9760,261,27215,0.00959,261,0,27215


In [21]:
# Spot check: read the first output file back.  'peptide_sequence' is stored as
# a list of residues, next to the spectrum arrays and the joined species columns.
pd.read_parquet(processing_result['output_file_path'][0]).head(2)

Unnamed: 0,peptide_sequence,mz_array,intensity_array,species,istrain
13,"[K, N, Y, P, P, G, Q, H, G, N, S, R]","[101.07112, 101.10742, 107.021416, 110.07149, 110.20564, 112.05074, 112.08743, 116.9714, 126.05578, 129.09389, 129.10258, 129.80075, 136.05199, 136.06181, 136.07568, 139.08679, 146.05948, 158.09203, 164.71826, 166.5727, 172.10834, 175.11887, 181.09839, 191.08078, 195.08641, 197.3471, 198.12485, 208.10797, 209.0921, 221.10052, 226.11795, 233.09122, 233.12791, 243.14487, 250.11877, 261.0862, 262.1505, 269.1606, 269.95697, 283.13937, 285.6411, 286.14096, 287.96625, 348.0716, 375.16342, 380.1939, 388.1928, 389.18234, 406.20917, 416.1904, 418.19452, 418.69345, 426.70694, 427.20596, 427.70856, 433.21497, 434.21732, 466.22662, 466.71808, 469.83746, 475.23376, 475.73486, 476.23047, 542.0604, 553.2464, 570.277, 571.2696, 698.3293, 738.3299, 755.35425, 756.3499, 778.90155, 835.37695, 852.4092, 853.4059, 949.454, 950.4538, 1272.6879]","[1494.7335, 1330.0582, 555.6968, 12300.612, 671.8086, 6780.3574, 751.6792, 748.8547, 927.55804, 873.4868, 17632.912, 638.6927, 1446.98, 23024.256, 9145.9375, 881.254, 869.34875, 935.7128, 692.7625, 820.34534, 733.90424, 2487.5261, 644.194, 722.5788, 943.80945, 593.9168, 804.5731, 1469.4506, 1470.86, 705.2241, 3405.9792, 1499.5656, 1526.7897, 1187.5823, 9305.036, 829.0959, 5530.5317, 744.4488, 5240.722, 1281.8816, 3153.7954, 1113.0565, 931.864, 1560.404, 988.2837, 991.70807, 916.1088, 777.2383, 2335.9285, 841.3272, 4616.214, 1248.9667, 12370.505, 7089.8315, 1014.40234, 10816.706, 929.97485, 741.0236, 849.171, 3891.2617, 13207.523, 6247.789, 1057.9454, 843.8056, 809.49664, 6597.267, 1286.7241, 1126.5024, 1089.6128, 5042.437, 1114.732, 669.27545, 1074.6268, 11747.381, 4883.1143, 4003.6157, 1596.3551, 833.33765]",Bacteroides_thetaiotaomicron_VPI-5482,Train
15,"[K, N, Y, P, P, G, Q, H, G, N, S, R]","[101.10806, 129.10223, 136.06146, 159.04768, 167.34846, 174.89876, 388.19592, 389.17804, 406.20837, 426.70538, 433.21164, 460.49826, 475.2321, 475.73428, 755.3489, 852.39984, 853.38873, 863.1118, 949.4544, 950.4531, 1112.5189]","[797.35864, 4481.5327, 778.8775, 644.96454, 660.11194, 670.0452, 1139.2582, 826.70264, 801.45056, 2769.278, 628.2064, 687.6226, 6952.3896, 1265.5562, 1002.8281, 5185.783, 907.1982, 696.1757, 8979.854, 2546.05, 1225.1301]",Bacteroides_thetaiotaomicron_VPI-5482,Train


In [22]:
# Distribution of the per-file statistics (counts, decoys, achieved FDR).
processing_result.describe()

Unnamed: 0,original_sequence_count,NaN_decoy_count,above_fdr_count,left_decoys,left_targets,fdr,removed_decoys,removed_by_post_processor,final_sequence_count
count,195.0,195.0,195.0,195.0,195.0,195.0,195.0,195.0,195.0
mean,41638.620513,0.0,17342.635897,231.374359,24064.610256,0.009593,231.374359,0.0,24064.610256
std,8018.762765,0.0,6740.265267,71.04738,7206.875627,0.000378,71.04738,0.0,7206.875627
min,28756.0,0.0,7170.0,64.0,6971.0,0.007954,64.0,0.0,6971.0
25%,36476.5,0.0,12816.5,191.5,20210.0,0.009516,191.5,0.0,20210.0
50%,41605.0,0.0,15733.0,236.0,24610.0,0.009715,236.0,0.0,24610.0
75%,45264.0,0.0,20655.0,269.0,27894.0,0.009821,269.0,0.0,27894.0
max,86594.0,0.0,36829.0,574.0,58897.0,0.009994,574.0,0.0,58897.0


In [23]:
# Totals over all processed files.
# NOTE(review): the 'fdr' entry is a sum of per-file rates and has no
# meaningful interpretation; only the count columns should be read here.
processing_result.drop(columns=['input_file_path', 'output_file_path']).sum()

original_sequence_count      8.119531e+06
NaN_decoy_count              0.000000e+00
above_fdr_count              3.381814e+06
left_decoys                  4.511800e+04
left_targets                 4.692599e+06
fdr                          1.870634e+00
removed_decoys               4.511800e+04
removed_by_post_processor    0.000000e+00
final_sequence_count         4.692599e+06
dtype: float64

In [24]:
# Persist the statistics of this run.
# NOTE(review): the summary is written into the same directory as the per-file
# outputs, so a later "*.parquet" glob over that directory would pick it up as
# well.  With skip_existing=True a re-run presumably overwrites it with the
# statistics of the newly processed files only -- confirm before relying on it.
processing_result.to_parquet(os.path.join(TRAINING_COLUMNS_DUMP_PATH, "processing_result.parquet"))