# Data Filtering Pipeline
## Prerequisites

In [1]:
import glob
import pandas as pd
from mmproteo.utils.utils import ensure_dir_exists
from mmproteo.utils import log
from mmproteo.utils.formats.mz import FilteringProcessor, filter_files
from mmproteo.utils.processing import ItemProcessor
import wget
from openpyxl import load_workbook
import os

In [2]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 1000)

In [3]:
logger = log.DummyLogger(verbose=True)

INFO: Printing to Stdout


## Data Import

In [4]:
pwd

'/tf/workspace/notebooks'

Code to run before running this pipeline to get/create all relevant data files:

```
cd ../datasets
mkdir PXD010000
cd PXD010000
mmproteo -p PXD010000 -e mzml,mzid --thread-count 0 download extract mz2parquet
```

In [5]:
PROJECT = "PXD010000"
DATA_PATH = f"../datasets/{PROJECT}"
DUMP_PATH = f"../dumps/{PROJECT}"
TRAINING_COLUMNS_DUMP_PATH = DUMP_PATH + "/training_columns"
MZMLID_FILES_PATH = f"{DATA_PATH}/*_mzmlid.parquet"

In [6]:
# taken from https://www.biorxiv.org/content/10.1101/428334v2.supplementary-material
SPECIES_MAPPING_FILE_DOWNLOAD_URL = 'https://www.biorxiv.org/highwire/filestream/128716/field_highwire_adjunct_files/1/428334-2.xlsx'
SPECIES_MAPPING_FILE_PATH = os.path.join(DUMP_PATH, 'file_to_species_mapping.xlsx')

In [7]:
ensure_dir_exists(TRAINING_COLUMNS_DUMP_PATH)

In [8]:
MZMLID_FILE_PATHS = glob.glob(MZMLID_FILES_PATH)
len(MZMLID_FILE_PATHS)

17

In [9]:
def download_species_mapping(download_url: str, output_file: str):
    if os.path.exists(output_file):
        print(f"skipping, because '{output_file}' already exists")
    else:
        print(wget.download(download_url, out=output_file))

def read_xlsx(file: str) -> pd.DataFrame:
    workbook = load_workbook(SPECIES_MAPPING_FILE_PATH)
    worksheet = workbook[workbook.worksheets[0].title]
    data = worksheet.values
    columns = next(data)[0:]
    df = pd.DataFrame(data=data, columns=columns)
    return df

In [10]:
def get_species_mapping(download_url: str = SPECIES_MAPPING_FILE_DOWNLOAD_URL,
                       output_file: str = SPECIES_MAPPING_FILE_PATH) -> pd.DataFrame:
    download_species_mapping(download_url=download_url, output_file=output_file)
    df = read_xlsx(file=output_file)
    df = df.drop(columns='id')
    df = df.set_index('mgf_file')
    return df

def join_species_mapping(data_df: pd.DataFrame, species_df: pd.DataFrame) -> pd.DataFrame:
    df = data_df.copy()
    
    # remove '.mzml' extension
    df['mgf_file'] = df['mzml_filename'].str.slice(stop=-5)
    df = df.join(species_df, on='mgf_file')
    
    return df

In [11]:
species_df = get_species_mapping()
species_df.head(1)

skipping, because '../dumps/PXD010000/file_to_species_mapping.xlsx' already exists


Unnamed: 0_level_0,species,num_PSM,istrain
mgf_file,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Biodiversity_A_cryptum_FeTSB_anaerobic_1_01Jun16_Pippin_16-03-39,Acidiphilium_cryptum_JF-5,6659,Train


In [12]:
output_columns = FilteringProcessor.get_default_output_columns()
output_columns.append('species')
output_columns.append('istrain')
output_columns

['SpectrumIdentificationItem__1__PeptideEvidenceRef__PeptideSequence',
 'mz_array',
 'intensity_array',
 'species',
 'istrain']

In [13]:
def join_species(df: pd.DataFrame) -> pd.DataFrame:
    return join_species_mapping(data_df=df, species_df=species_df)

In [14]:
output_files = filter_files(input_file_paths=MZMLID_FILE_PATHS,
                            output_path=TRAINING_COLUMNS_DUMP_PATH,
                            fdr=0.01,
                            skip_existing=False,
                            output_columns=output_columns,
                            post_processor=join_species,
                            thread_count=0,
                            logger=logger)
output_files[:1]

DEBUG: Processing items with 8 subprocesses
DEBUG: Trying to fdr-filter 17 mzmlid files
INFO: Finished filtering '../datasets/PXD010000/Biodiversity_A_cryptum_FeTSB_anaerobic_1_01Jun16_Pippin_16-03-39_mzmlid.parquet' -> '../dumps/PXD010000/training_columns/Biodiversity_A_cryptum_FeTSB_anaerobic_1_01Jun16_Pippin_16-03-39_mzmlid.parquet'
INFO: Finished filtering '../datasets/PXD010000/Biodiversity_A_tumefaciens_R2A_aerobic_2_23Nov16_Pippin_16-09-11_mzmlid.parquet' -> '../dumps/PXD010000/training_columns/Biodiversity_A_tumefaciens_R2A_aerobic_2_23Nov16_Pippin_16-09-11_mzmlid.parquet'
INFO: Finished filtering '../datasets/PXD010000/Biodiversity_A_tumefaciens_R2A_aerobic_1_23Nov16_Pippin_16-09-11_mzmlid.parquet' -> '../dumps/PXD010000/training_columns/Biodiversity_A_tumefaciens_R2A_aerobic_1_23Nov16_Pippin_16-09-11_mzmlid.parquet'
INFO: Finished filtering '../datasets/PXD010000/Biodiversity_A_faecalis_LB_aerobic_01_26Feb16_Arwen_16-01-01_mzmlid.parquet' -> '../dumps/PXD010000/training_colum

[{'input_file_path': '../datasets/PXD010000/Biodiversity_A_cryptum_FeTSB_anaerobic_1_01Jun16_Pippin_16-03-39_mzmlid.parquet',
  'output_file_path': '../dumps/PXD010000/training_columns/Biodiversity_A_cryptum_FeTSB_anaerobic_1_01Jun16_Pippin_16-03-39_mzmlid.parquet',
  'original_sequence_count': 29447,
  'NaN_decoy_count': 571,
  'above_fdr_count': 20938,
  'left_decoys': 72,
  'left_targets': 7866,
  'fdr': 0.009153318077803204,
  'removed_decoys': 72,
  'removed_by_post_processor': 0,
  'final_sequence_count': 7866}]

In [18]:
processing_result = pd.DataFrame(data=output_files)
print(f"number of processing results = {len(processing_result)}")
processing_result

number of processing results = 17


Unnamed: 0,input_file_path,output_file_path,original_sequence_count,NaN_decoy_count,above_fdr_count,left_decoys,left_targets,fdr,removed_decoys,removed_by_post_processor,final_sequence_count
0,../datasets/PXD010000/Biodiversity_A_cryptum_FeTSB_anaerobic_1_01Jun16_Pippin_16-03-39_mzmlid.parquet,../dumps/PXD010000/training_columns/Biodiversity_A_cryptum_FeTSB_anaerobic_1_01Jun16_Pippin_16-03-39_mzmlid.parquet,29447,571,20938,72,7866,0.009153,72,0,7866
1,../datasets/PXD010000/Biodiversity_A_tumefaciens_R2A_aerobic_1_23Nov16_Pippin_16-09-11_mzmlid.parquet,../dumps/PXD010000/training_columns/Biodiversity_A_tumefaciens_R2A_aerobic_1_23Nov16_Pippin_16-09-11_mzmlid.parquet,44899,14491,21273,117,9018,0.012974,117,0,9018
2,../datasets/PXD010000/Biodiversity_B_cereus_ATCC14579_LB_aerobic_2_17July16_Samwise_16-04-10_mzmlid.parquet,../dumps/PXD010000/training_columns/Biodiversity_B_cereus_ATCC14579_LB_aerobic_2_17July16_Samwise_16-04-10_mzmlid.parquet,41267,638,12571,273,27785,0.009825,273,0,27785
3,../datasets/PXD010000/Biodiversity_B_cereus_PN_L_CL_1_09Oct16_Pippin_16-05-06_mzmlid.parquet,../dumps/PXD010000/training_columns/Biodiversity_B_cereus_PN_L_CL_1_09Oct16_Pippin_16-05-06_mzmlid.parquet,40647,451,14018,247,25931,0.009525,247,0,25931
4,../datasets/PXD010000/Biodiversity_B_cereus_ATCC14579_LB_aerobic_1_17July16_Samwise_16-04-10_mzmlid.parquet,../dumps/PXD010000/training_columns/Biodiversity_B_cereus_ATCC14579_LB_aerobic_1_17July16_Samwise_16-04-10_mzmlid.parquet,41390,651,12595,279,27865,0.010013,279,0,27865
5,../datasets/PXD010000/Biodiversity_B_bifidum_CMcarb_anaerobic_03_26Feb16_Arwen_16-01-01_mzmlid.parquet,../dumps/PXD010000/training_columns/Biodiversity_B_bifidum_CMcarb_anaerobic_03_26Feb16_Arwen_16-01-01_mzmlid.parquet,36793,40,20566,128,16059,0.007971,128,0,16059
6,../datasets/PXD010000/Biodiversity_A_tumefaciens_R2A_aerobic_2_23Nov16_Pippin_16-09-11_mzmlid.parquet,../dumps/PXD010000/training_columns/Biodiversity_A_tumefaciens_R2A_aerobic_2_23Nov16_Pippin_16-09-11_mzmlid.parquet,44379,14323,21237,104,8715,0.011933,104,0,8715
7,../datasets/PXD010000/Biodiversity_A_faecalis_LB_aerobic_01_26Feb16_Arwen_16-01-01_mzmlid.parquet,../dumps/PXD010000/training_columns/Biodiversity_A_faecalis_LB_aerobic_01_26Feb16_Arwen_16-01-01_mzmlid.parquet,37414,331,19204,145,17734,0.008176,145,0,17734
8,../datasets/PXD010000/Biodiversity_B_bifidum_CMcarb_anaerobic_02_26Feb16_Arwen_16-01-01_mzmlid.parquet,../dumps/PXD010000/training_columns/Biodiversity_B_bifidum_CMcarb_anaerobic_02_26Feb16_Arwen_16-01-01_mzmlid.parquet,36746,36,20501,138,16071,0.008587,138,0,16071
9,../datasets/PXD010000/Biodiversity_B_bifidum_CMcarb_anaerobic_01_26Feb16_Arwen_16-01-01_mzmlid.parquet,../dumps/PXD010000/training_columns/Biodiversity_B_bifidum_CMcarb_anaerobic_01_26Feb16_Arwen_16-01-01_mzmlid.parquet,36680,37,20049,130,16464,0.007896,130,0,16464


In [19]:
pd.read_parquet(processing_result.output_file_path[0])

Unnamed: 0,SpectrumIdentificationItem__1__PeptideEvidenceRef__PeptideSequence,mz_array,intensity_array,species,istrain
28,KTANGKPTSAARPTPTRR,"[119.026276, 129.10242, 130.106, 136.06168, 145.0969, 156.76949, 164.85573, 170.10326, 173.0915, 175.11906, 186.12355, 192.22217, 283.13882, 354.1733, 397.3297, 437.57962, 437.91693, 438.25174, 448.74512, 455.22256, 458.75394, 474.01434, 474.27203, 495.83508, 531.6236, 548.31146, 556.8167, 557.3126, 568.8131, 569.3073, 577.3114, 577.81494, 578.3125, 584.3174, 595.3419, 598.8284, 599.3284, 600.345, 604.34985, 607.34326, 607.8346, 612.3402, 624.85504, 625.8565, 626.349, 626.85126, 633.8519, 634.3542, 646.85846, 647.3572, 647.85675, 655.8684, 656.3654, 656.8717, 739.9118, 748.41516, 748.92145, 1094.5874, 1136.6008, 1198.9551]","[392.24573, 12147.2705, 666.3889, 823.4881, 972.2865, 361.78607, 417.08383, 456.71222, 444.43277, 6938.9653, 2749.4, 362.23166, 3266.9854, 705.06445, 421.15955, 2514.085, 1735.2051, 406.9606, 445.42236, 577.852, 402.63007, 934.3149, 581.1236, 4186.4634, 817.3599, 485.8316, 1748.2828, 535.7135, 724.7904, 587.59784, 487.24197, 2499.4954, 494.83868, 487.2329, 582.5176, 702.2936, 681.7647, 558.97064, 571.6264, 3133.6794, 740.22906, 611.37427, 613.9938, 2905.3833, 2306.8975, 933.85876, 2563.092, 1895.7808, 2862.7026, 4704.8896, 2356.066, 7795.0273, 2066.8728, 536.8591, 464.84116, 488.13596, 565.3722, 663.11945, 1888.6259, 485.75937]",Acidiphilium_cryptum_JF-5,Train
67,TANGKPTSAARPTPTRR,"[100.076126, 101.07142, 102.65847, 112.08731, 116.07152, 127.05014, 127.08706, 129.10248, 130.09898, 131.08194, 131.0964, 136.06255, 141.06573, 141.10223, 144.07684, 145.09726, 146.101, 147.09102, 149.04485, 151.08597, 155.08131, 157.10779, 158.09236, 167.05525, 168.05492, 169.06157, 169.09734, 171.11298, 173.09233, 175.11914, 176.12292, 178.10922, 179.10727, 181.09666, 186.12383, 187.12572, 189.115, 196.10718, 199.10756, 208.95291, 238.11821, 248.10284, 255.14467, 255.65672, 265.12866, 266.1132, 268.12704, 282.15506, 283.13962, 284.145, 285.13333, 286.1314, 287.12854, 318.92264, 328.43604, 336.16617, 342.01868, 342.53687, 354.17944, 359.02774, 360.029, 361.02628, 364.21445, 374.71375, 377.2116, 398.96924, 399.22525, 399.47473, 399.56046, 403.22864, 403.47565, 403.72675, 403.97348, 405.22763, 405.5638, 408.55597, 412.47546, 416.72723, 416.98312, 421.2324, 421.48282, 422.90533, 423.23853, 426.24008, 430.08963, 431.57574, 431.9065, 432.23834, 437.58102, 437.91638, 438.25027, 441.7452, 441.993, 446.23978, 446.4949, 446.776, 455.2206, 474.27643, 477.7837, 478.76593, ...]","[970.92377, 8024.293, 438.97754, 2044.9241, 561.6645, 562.46136, 1025.5193, 45620.453, 2429.1597, 493.67728, 5760.535, 564.12805, 725.96246, 687.16113, 480.4321, 39380.633, 923.3975, 599.9227, 941.59937, 700.46454, 1127.3604, 981.13477, 618.1566, 3090.163, 643.3061, 551.7212, 1692.1259, 3665.422, 11353.051, 25869.855, 682.6219, 493.28824, 1065.8511, 647.7293, 11052.513, 726.6507, 550.3089, 4186.7534, 2652.8933, 2889.251, 3592.6729, 2128.6946, 842.402, 431.40222, 2923.3171, 2151.774, 488.2678, 2129.3726, 13352.417, 636.7075, 699.53015, 617.92163, 745.91907, 2386.8274, 621.1262, 1157.5568, 2568.4563, 544.1001, 858.95496, 7178.6084, 18405.688, 1932.2197, 828.2166, 608.3685, 1130.984, 1097.5002, 610.0769, 602.9149, 541.29913, 2489.858, 4889.5854, 1031.2015, 566.7529, 2333.2449, 1020.2636, 1052.1848, 648.9998, 882.9873, 708.9695, 551.6752, 640.3782, 629.0327, 565.07196, 635.3501, 986.94135, 2477.0735, 2930.9094, 731.61957, 28104.584, 11376.41, 3264.2295, 2105.532, 1697.2511, 2107.8147, 1077.1783, 2534.8892, 538.2479, 680.5165, 727.0574, 972.1554, ...]",Acidiphilium_cryptum_JF-5,Train
69,KTANGKPTSAARPTPTR,"[101.07131, 101.10792, 110.071556, 112.05059, 116.07049, 123.09159, 129.1025, 130.09755, 130.1063, 131.0815, 131.09616, 131.98347, 136.06207, 141.10103, 145.09756, 148.18243, 151.08664, 153.24802, 155.08102, 156.0769, 169.09775, 171.11278, 173.0918, 175.11919, 185.12874, 186.0875, 186.12366, 196.10757, 199.10687, 210.32182, 238.11945, 252.09691, 255.14525, 265.1292, 266.11246, 266.1494, 268.13034, 274.64883, 282.1554, 283.13882, 283.6505, 284.145, 301.61932, 309.1554, 311.13095, 336.16602, 344.156, 353.18613, 354.17584, 356.19254, 373.21915, 383.17136, 385.54767, 385.87955, 386.21463, 398.2023, 410.2029, 432.20508, 435.2489, 437.2149, 439.739, 439.84042, 454.24023, 455.2237, 456.2305, 472.257, 478.7668, 479.26852, 485.2708, 485.5975, 491.9224, 498.76068, 507.27588, 507.7817, 508.94415, 509.2799, 509.61517, 520.7774, 529.29193, 529.791, 530.2924, 534.284, 534.7795, 536.95544, 537.28815, 542.62787, 542.9622, 555.8041, 556.30524, 560.30493, 568.81213, 569.30817, 569.81244, 571.32025, 577.81793, 578.3194, 578.8209, 583.3069, 584.3135, 600.341, ...]","[3675.893, 1327.4326, 1998.2828, 1035.8948, 1361.1746, 960.07794, 61628.18, 1104.9736, 1312.0294, 990.87524, 1477.9116, 797.60004, 962.75903, 866.8694, 10745.13, 903.43787, 1562.9825, 786.6563, 1280.7485, 946.3724, 843.2562, 7021.8486, 4936.954, 14158.6, 1711.8472, 1017.9612, 10636.432, 3522.6406, 3725.4905, 805.0644, 1495.0476, 898.31134, 1522.1528, 3500.2314, 1040.0103, 1140.1289, 900.6165, 1022.31836, 1829.0881, 12759.503, 4354.917, 1152.0646, 736.79205, 1097.8534, 1221.7074, 1557.757, 1441.5685, 1518.2312, 3276.1157, 1005.85223, 1826.9573, 1717.5045, 8335.707, 4662.998, 1704.3671, 3739.997, 1174.8475, 1289.6838, 3820.1873, 1894.3021, 1493.5336, 1130.0587, 2028.7438, 4746.6514, 1753.4117, 1181.098, 24190.582, 8845.266, 1237.6666, 1482.4384, 938.04315, 960.78876, 3885.5269, 1325.6501, 3920.322, 3737.038, 1007.90894, 1157.7865, 31539.12, 14270.232, 3573.9949, 4771.238, 1026.3484, 2085.493, 1120.2954, 1349.3654, 1406.5541, 12642.806, 3641.8975, 1407.5908, 13306.275, 11358.311, 1802.8945, 12360.576, 95025.36, 48127.754, 11082.771, 995.10504, 2075.6462, 3429.524, ...]",Acidiphilium_cryptum_JF-5,Train
73,KTANGKPTSAARPTPTR,"[129.10246, 150.25829, 175.11871, 178.11052, 186.12349, 212.13824, 235.28714, 255.57036, 283.13977, 485.2607, 503.60635, 509.27728, 536.61835, 536.95166, 542.62616, 542.9619, 562.7823, 563.2835, 563.78503, 569.30676, 570.6517, 571.3202, 575.7858, 576.28925, 577.8181, 578.3166, 578.82007, 579.3221, 579.651, 580.31995, 584.7992, 585.302, 585.6635, 585.7968, 585.99335, 586.299, 633.104, 641.86127, 642.3607, 670.378, 670.8736, 718.8772, 719.38165, 724.0245, 727.39746, 727.89874, 728.39557, 754.394, 762.9142, 763.4161, 763.91284, 772.4123, 804.92413, 813.44116, 813.94025, 814.433, 818.92957, 867.9579, 1132.2631, 1137.5854, 1269.1558, 1510.962, 1572.8278]","[8827.272, 392.54102, 2725.1372, 689.052, 771.0132, 477.76483, 421.05676, 479.78464, 773.68726, 630.4267, 654.27734, 1001.8967, 679.4603, 584.86584, 713.1974, 2057.493, 1044.0569, 2205.9475, 604.872, 1085.0674, 620.50354, 1637.4172, 990.6302, 2376.0312, 12824.357, 6410.964, 1749.057, 1935.5543, 3383.7886, 1731.0715, 7338.683, 21974.021, 3322.3145, 10177.434, 3989.1467, 634.05164, 495.1127, 1071.4807, 730.33417, 6570.293, 1852.2682, 950.55096, 472.3612, 567.6517, 5457.6865, 939.3391, 705.0005, 830.3275, 7488.0757, 3625.1287, 1942.6853, 523.9595, 2269.1858, 6757.8276, 3179.453, 1962.7655, 763.0397, 447.8305, 458.29007, 1785.5914, 513.5198, 630.78674, 549.80334]",Acidiphilium_cryptum_JF-5,Train
78,KTANGKPTSAARPTPTR,"[101.10759, 103.8768, 112.08745, 126.3069, 129.10251, 130.10608, 145.09723, 151.08563, 173.09221, 175.11923, 178.11049, 179.10776, 185.12856, 186.12352, 230.14928, 231.97632, 245.96835, 252.45296, 255.09952, 257.0266, 269.16248, 283.13962, 358.3223, 381.62943, 414.64362, 455.22495, 485.2504, 509.28098, 529.29205, 536.9576, 542.63055, 542.9605, 554.28766, 555.8075, 562.7851, 563.2879, 569.30835, 569.80804, 571.3217, 573.64966, 573.9802, 575.7943, 576.28986, 576.78656, 577.81726, 578.3171, 579.3236, 579.65533, 579.9863, 584.32556, 584.79895, 585.3008, 585.6611, 585.79645, 600.3461, 641.86096, 642.3697, 670.3764, 670.8761, 710.3787, 718.896, 719.38226, 727.4001, 727.8979, 740.9034, 754.4016, 754.9054, 762.9166, 763.4152, 790.4424, 791.42474, 791.93005, 804.434, 804.9364, 805.4285, 813.4397, 813.9415, 818.9333, 863.40643, 1137.5941, 1138.6056]","[509.4069, 447.36395, 551.3344, 418.13742, 11915.7295, 528.72, 921.3412, 476.30368, 572.6511, 3001.9648, 794.9798, 900.2417, 565.0211, 2157.4626, 686.00397, 490.04626, 462.16513, 447.71368, 426.0191, 416.6034, 471.19543, 1859.7793, 438.00885, 465.9474, 531.4674, 578.4225, 520.3449, 1735.2427, 792.67883, 895.9605, 2230.8743, 1129.7719, 669.7742, 901.574, 2374.9045, 2178.1575, 853.2339, 704.40826, 2392.2078, 532.5578, 953.27496, 2567.393, 2706.815, 752.5698, 17834.365, 11315.101, 2924.7268, 5614.188, 3298.3057, 558.77576, 33194.438, 24507.344, 13464.7, 9423.735, 602.5168, 3096.7085, 1018.6424, 5958.025, 6191.2227, 514.6899, 740.4064, 1937.1185, 6380.0283, 5063.115, 521.0642, 1880.4785, 2172.8142, 10168.834, 8652.061, 618.5556, 1861.4052, 835.06165, 1849.8495, 3922.71, 2901.438, 11136.014, 10550.944, 822.2212, 556.5142, 2836.104, 1693.784]",Acidiphilium_cryptum_JF-5,Train
...,...,...,...,...,...
29322,NMITGAAQMDGAILVVSAADGPMPQTR,"[246.09, 349.5577, 368.24615, 501.27988, 617.19275, 786.39453, 787.397, 901.41095, 905.4696, 950.5483, 1130.5216, 1161.4972, 1328.646, 1404.6503, 1619.7834, 1834.6278, 2304.6409, 2454.6526]","[662.0856, 483.2599, 463.03635, 917.5805, 532.2196, 3648.228, 793.558, 684.743, 2851.121, 531.5502, 802.08295, 636.6504, 575.53754, 546.033, 539.8515, 606.91455, 560.0548, 627.1756]",Acidiphilium_cryptum_JF-5,Train
29365,ADLANLVEQLSSLTVLEAAELSK,"[147.11281, 151.98907, 173.12782, 177.07407, 201.12357, 223.42484, 228.13434, 234.14479, 300.15417, 347.22827, 366.17664, 371.19217, 376.25922, 377.26144, 388.21912, 390.33197, 440.20935, 476.2691, 477.27435, 502.4604, 528.82336, 547.30597, 549.28845, 618.3493, 747.3798, 860.46954, 861.46326, 878.4847, 1060.582, 1152.922, 1362.0847, 1825.0463]","[449.54553, 441.76135, 664.3595, 457.61807, 834.9699, 485.63382, 528.66656, 637.3353, 919.5532, 859.0, 427.4786, 1600.0372, 4236.418, 439.71646, 432.887, 435.30505, 615.7236, 872.3656, 455.98038, 468.75375, 437.80948, 2022.0881, 490.24045, 877.6733, 2016.3541, 2230.4202, 762.4395, 527.0859, 640.8084, 479.47208, 605.67035, 544.02545]",Acidiphilium_cryptum_JF-5,Train
29380,EIPTIAIASALLGGALNLLSASLPDEAILR,"[108.33525, 162.91159, 194.02742, 217.49213, 321.86224, 543.3059, 601.3698, 736.58203, 767.0342, 813.4452, 814.44916, 926.5255, 1171.6285, 1172.6273]","[425.07245, 503.77744, 435.06436, 466.0152, 534.77405, 496.8213, 681.6504, 472.63382, 483.8626, 5013.854, 638.431, 704.5605, 709.261, 541.1003]",Acidiphilium_cryptum_JF-5,Train
29391,ADLANLVEQLSSLTVLEAAELSK,"[115.767006, 120.16908, 205.66785, 234.14247, 240.09795, 300.15356, 476.2731, 598.3306, 618.34406, 667.75507, 747.3855, 778.6241, 860.4664]","[445.98532, 412.40884, 457.01, 819.4613, 461.7322, 558.59094, 576.91473, 663.95776, 887.3325, 503.81818, 575.3625, 554.8363, 2180.4207]",Acidiphilium_cryptum_JF-5,Train


In [17]:
processing_result.describe()

Unnamed: 0,original_sequence_count,NaN_decoy_count,above_fdr_count,left_decoys,left_targets,fdr,removed_decoys,removed_by_post_processor,final_sequence_count
count,234.0,234.0,234.0,234.0,234.0,234.0,234.0,234.0,234.0
mean,41519.837607,906.876068,17313.92735,221.34188,23077.692308,0.00957,221.34188,0.0,23077.692308
std,7578.393316,1978.301972,6012.774545,72.020999,7206.748798,0.000557,72.020999,0.0,7206.748798
min,28756.0,36.0,7036.0,14.0,1456.0,0.007845,14.0,0.0,1456.0
25%,36696.5,202.0,13414.0,172.75,18354.25,0.009432,172.75,0.0,18354.25
50%,41601.5,348.5,16493.0,233.0,24053.0,0.00966,233.0,0.0,24053.0
75%,45195.75,587.5,20724.25,261.5,26863.75,0.009825,261.5,0.0,26863.75
max,86594.0,14491.0,36596.0,570.0,57972.0,0.012974,570.0,0.0,57972.0


In [18]:
processing_result.drop(columns=['input_file_path', 'output_file_path']).sum()

original_sequence_count      9.715642e+06
NaN_decoy_count              2.122090e+05
above_fdr_count              4.051459e+06
left_decoys                  5.179400e+04
left_targets                 5.400180e+06
fdr                          2.239291e+00
removed_decoys               5.179400e+04
removed_by_post_processor    0.000000e+00
final_sequence_count         5.400180e+06
dtype: float64

In [19]:
processing_result.to_parquet(f"{TRAINING_COLUMNS_DUMP_PATH}/processing_result.parquet")