# Prototyping an ML Model
## Prerequisites

In [12]:
import glob
import pandas as pd
from mmproteo.utils.utils import ensure_dir_exists
from mmproteo.utils import log
from mmproteo.utils.formats.mz import FilteringProcessor, filter_files
from mmproteo.utils.processing import ItemProcessor
import os
import tensorflow as tf
import numpy as np
from typing import Iterable, Callable, Dict, Any, Tuple
import gc

In [13]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 1000)

In [14]:
logger = log.DummyLogger(verbose=False)

INFO: Printing to Stdout


## Configuration

In [15]:
pwd

'/tf/workspace/notebooks'

In [16]:
PROJECT = "PXD010000"
DUMP_PATH = os.path.join("..", "dumps", PROJECT)
TRAINING_COLUMNS_DUMP_PATH = os.path.join(DUMP_PATH, "training_columns")
FILES_PATH = os.path.join(TRAINING_COLUMNS_DUMP_PATH, "*_mzmlid.parquet")
STATISTICS_FILE_PATH = os.path.join(TRAINING_COLUMNS_DUMP_PATH, "statistics.parquet")
DATASET_DUMP_PATH = os.path.join(TRAINING_COLUMNS_DUMP_PATH, "tf_datasets")

In [17]:
MZMLID_FILE_PATHS = glob.glob(FILES_PATH)
len(MZMLID_FILE_PATHS)

235

In [18]:
MZMLID_FILE_PATHS[0]

'../dumps/PXD010000/training_columns/Biodiversity_S_agalactiae_LIB_aerobic_02_26Feb16_Arwen_16-01-01_mzmlid.parquet'

In [20]:
pd.read_parquet(MZMLID_FILE_PATHS[1])

Unnamed: 0,SpectrumIdentificationItem__1__PeptideEvidenceRef__PeptideSequence,mz_array,intensity_array,species,istrain
0,TTTPKKPNSAMR,"[100.07641, 100.087074, 101.07137, 101.1077, 102.05501, 110.07133, 110.631386, 112.05055, 112.07596, 112.08711, 113.07111, 115.05069, 115.08657, 116.07058, 127.0866, 129.10239, 130.08636, 130.09476, 130.10594, 131.0815, 131.09012, 132.54994, 136.06177, 136.07571, 139.08629, 139.90489, 140.0708, 141.06584, 141.10217, 143.08157, 145.0981, 147.11227, 147.96593, 149.02304, 152.05658, 153.10185, 155.08127, 155.11765, 157.09727, 157.10991, 158.09113, 159.07635, 164.05647, 167.05591, 167.08096, 169.05196, 171.07687, 171.11266, 175.10767, 175.11974, 176.11064, 180.07697, 181.09694, 181.13272, 185.05566, 185.09209, 186.09564, 196.10736, 197.09218, 198.08804, 199.0704, 199.10811, 203.10245, 204.10599, 205.1368, 209.1289, 212.10365, 214.11859, 222.16032, 224.10338, 226.15494, 227.1583, 238.15607, 240.13449, 240.17038, 242.11392, 257.19702, 258.15533, 258.20068, 264.13522, 265.1653, 268.12894, 283.17685, 286.13928, 300.11865, 305.15933, 305.65024, 309.192, 312.04562, 314.16446, 321.8492, 327.2031, 327.5248, 327.8595, 329.19373, 336.19986, 337.64832, 343.18893, 346.16183, 346.5337, ...]","[1044.7223, 3649.1797, 1102.0496, 1357.6985, 844.1217, 4310.1567, 701.75555, 3388.4607, 14474.438, 3396.4175, 3271.5308, 785.9828, 3989.8667, 1080.443, 8620.463, 115111.52, 60642.484, 892.7931, 6232.6123, 1503.7406, 1695.8452, 621.4343, 8136.3633, 3622.0579, 3527.2075, 690.5978, 3141.1062, 9701.572, 1072.0393, 4542.582, 913.5738, 1275.7793, 904.80707, 1183.5809, 1307.1254, 3257.456, 6890.353, 782.2476, 22261.06, 948.4679, 1374.1157, 8412.553, 1015.7166, 942.1996, 4452.6733, 818.5687, 7237.765, 11416.154, 45089.855, 7394.4355, 1465.4384, 3969.8708, 1652.0625, 1171.807, 927.53595, 48796.72, 3459.731, 1612.3837, 884.8939, 1009.1056, 948.3301, 5930.4526, 20944.246, 1297.6293, 876.90686, 699.8927, 2823.239, 4986.422, 1002.72095, 6868.054, 48777.496, 5358.242, 1159.44, 6050.066, 1177.8954, 4301.8447, 13318.132, 1678.9065, 896.4516, 1518.1096, 1104.4894, 3720.5735, 1503.6326, 7302.4346, 1507.2914, 853.93396, 1312.6992, 4806.0015, 1169.0146, 3889.657, 1483.5475, 3854.5623, 6731.8696, 3342.8206, 5410.943, 1120.8875, 888.22064, 883.156, 4437.399, 1748.3259, ...]",Bacteroides_fragilis_638R,Train
3,RGQVEGMESSR,"[101.07124, 102.0553, 110.07134, 112.05085, 112.08691, 113.07111, 115.086685, 116.0705, 116.97228, 119.58438, 124.03943, 127.086365, 129.0665, 129.1023, 130.04953, 130.05978, 130.08626, 130.10585, 136.06165, 136.07605, 141.06583, 143.08133, 147.11246, 149.02335, 152.05638, 155.08112, 157.09698, 157.10889, 158.09206, 159.0761, 163.08455, 171.07649, 171.59804, 173.09224, 173.12799, 175.07077, 175.11919, 176.15643, 183.1128, 186.12308, 187.07169, 188.03653, 189.61603, 197.10431, 207.13461, 214.12898, 231.15611, 245.12381, 252.09872, 256.12704, 262.15063, 268.13138, 270.10626, 273.13388, 281.1357, 283.14725, 300.64038, 309.6448, 314.16162, 314.19324, 316.09637, 325.16174, 325.79483, 331.17108, 342.1879, 343.19034, 343.80283, 346.97397, 349.18295, 350.18475, 356.15732, 359.20905, 384.5217, 393.83148, 396.23593, 399.00772, 413.26212, 414.2659, 417.0347, 417.2444, 417.2967, 417.74738, 418.0339, 418.24805, 418.99487, 424.23, 441.25668, 442.25604, 460.2152, 461.21298, 478.22495, 479.226, 488.20978, 520.249, 532.253, 542.3029, 544.235, 561.26013, 570.2995, 571.30084, ...]","[1498.0873, 20642.28, 3844.8838, 1120.5063, 4205.9717, 874.3143, 2515.5444, 786.77234, 926.58484, 730.192, 5138.3696, 2751.0, 1000.0747, 15183.164, 2514.9229, 688.74, 2551.5332, 1013.9507, 6667.6606, 1194.0012, 3992.4336, 1274.8231, 1276.667, 838.5754, 669.43054, 1216.2158, 901.391, 1217.8413, 4599.2, 1217.2018, 970.0999, 3354.2034, 1381.6448, 652.9564, 1240.8739, 806.5449, 12598.689, 618.4468, 1167.6907, 2215.4497, 1226.4055, 1331.2247, 2589.5076, 1103.1534, 3314.474, 5952.45, 3313.3496, 3223.3242, 925.97974, 805.99945, 5254.821, 920.71, 1146.4827, 962.8078, 1564.1729, 842.9065, 1520.9548, 5055.0137, 1073.2836, 9688.531, 794.2665, 1371.2823, 1058.4569, 1372.5068, 36481.03, 4376.838, 958.2829, 5189.961, 14978.933, 2890.5093, 954.04565, 884.45465, 907.48895, 868.85864, 1533.7688, 816.29004, 22754.748, 4653.88, 3068.491, 4746.0464, 883.5928, 810.7221, 887.24255, 1424.3857, 2932.1465, 2775.8826, 21891.256, 3109.1223, 10081.893, 1342.2267, 19438.086, 3235.9631, 2758.4998, 720.628, 1013.52405, 5870.137, 952.2475, 2910.8145, 6807.415, 1152.3481, ...]",Bacteroides_fragilis_638R,Train
4,LENKTQEETPKK,"[102.05543, 110.07127, 112.05052, 112.08741, 113.26558, 116.97188, 126.05521, 127.08683, 129.06586, 129.10239, 130.08635, 130.10635, 136.06181, 136.07162, 138.05504, 143.08089, 144.0655, 147.11261, 149.96207, 152.05714, 155.08131, 158.09383, 171.07564, 173.0548, 175.11902, 184.10844, 186.12361, 197.12816, 208.10777, 209.092, 212.1386, 213.08578, 215.1391, 216.14172, 226.0833, 226.11894, 226.1544, 227.1579, 230.11263, 243.13763, 244.16649, 275.20703, 277.0742, 302.68124, 327.20148, 330.05783, 337.14822, 354.17728, 354.24994, 371.84048, 372.26053, 373.26447, 387.97827, 405.98837, 411.19608, 412.8003, 429.2813, 437.2194, 442.30505, 453.2528, 454.2585, 455.2248, 469.76245, 473.30725, 481.25052, 482.25558, 483.22757, 536.29694, 544.80273, 545.30225, 549.7871, 583.28204, 593.3081, 593.8031, 601.81934, 602.3311, 635.3454, 657.3349, 657.83356, 658.3393, 666.3356, 731.38635, 741.8714, 842.42236, 859.44934, 942.4724, 943.48285, 960.4999, 961.4917]","[7361.546, 4003.6309, 1037.2148, 723.81433, 606.61865, 2556.95, 2491.26, 893.6975, 1125.2485, 39392.168, 7063.4062, 2478.573, 11351.012, 931.9453, 1122.6821, 1584.9333, 1268.8372, 9353.861, 699.645, 1086.4026, 1334.4708, 633.4053, 1118.564, 801.9487, 2864.7756, 1293.2511, 812.2518, 1086.9083, 1080.293, 5375.067, 745.5624, 837.1468, 16401.15, 973.3552, 906.8999, 8097.995, 20569.99, 3237.4421, 1127.831, 4394.0767, 1001.46686, 1153.5564, 928.743, 670.3051, 1032.8434, 1599.6625, 1242.6686, 1248.217, 1154.2308, 776.60266, 14888.237, 1380.3208, 853.85266, 787.8747, 934.9394, 804.01294, 912.0707, 1263.0475, 720.4426, 2900.0017, 3216.855, 1335.8528, 821.01514, 3882.2896, 4459.4644, 5607.7744, 967.2915, 931.81976, 2454.1892, 1041.5533, 985.2618, 771.207, 1321.2294, 994.69904, 2480.1604, 1130.4955, 1078.6575, 4050.678, 1235.27, 1340.4612, 1068.5262, 3231.0366, 830.1816, 1066.2532, 2534.9922, 1220.7839, 797.5198, 6403.1646, 4023.3003]",Bacteroides_fragilis_638R,Train
5,GQVEGMESSR,"[101.07126, 102.055214, 112.0506, 116.9719, 124.03969, 126.05546, 127.08668, 129.066, 129.10228, 136.06177, 141.06581, 143.08275, 147.1128, 147.96664, 152.0342, 152.05635, 155.08093, 158.09244, 159.07611, 169.06073, 170.06487, 171.07625, 175.11905, 183.11278, 186.08734, 187.07199, 187.0905, 201.12344, 203.28166, 211.10703, 213.08865, 213.87234, 222.12297, 223.10677, 229.11826, 233.56068, 240.1341, 245.12473, 250.11824, 252.09691, 256.99036, 262.15186, 268.12793, 285.15585, 315.131, 316.09412, 332.15756, 349.1824, 357.144, 365.16092, 369.18015, 405.6895, 414.19638, 478.22476, 478.72095, 479.22058, 486.006, 502.03748, 503.03552, 504.21036, 524.7781, 532.51874, 548.04315, 561.2638, 591.2553, 601.26154, 618.28516, 619.2834, 625.2573, 661.2948, 682.2819, 683.28345, 720.2947, 729.3124, 730.30804, 747.32776, 748.3295, 794.3107, 811.3213, 812.3244, 846.24695, 846.39594, 847.39417, 848.39453, 849.8904, 895.379, 910.3924, 911.3853]","[1187.8904, 2972.269, 3574.3386, 1470.6052, 1060.8564, 968.8467, 2852.4521, 13417.714, 6234.5923, 7266.2993, 7269.694, 1061.9021, 652.39105, 1362.4257, 8712.281, 738.97974, 828.99554, 7607.2554, 957.90076, 31864.178, 1560.6099, 1244.2101, 6113.7915, 6111.2607, 80816.53, 1189.7592, 5395.001, 6197.933, 637.1054, 1296.5093, 691.6643, 747.781, 1429.0768, 922.32526, 3577.6533, 766.46027, 6604.7974, 928.7183, 1150.2063, 842.42194, 788.4884, 3352.585, 1150.4005, 4845.871, 1278.4639, 2253.1917, 1090.0879, 8567.199, 670.1812, 3148.8154, 883.5839, 770.94495, 929.0264, 7297.799, 1597.305, 1232.5024, 1395.1279, 13496.809, 4400.9097, 942.3943, 2970.1582, 862.437, 2982.6267, 2357.0806, 1460.0151, 901.1616, 27604.975, 8098.4688, 1594.6074, 2817.6833, 17989.469, 3790.375, 1121.8053, 3915.7847, 3046.1497, 36968.617, 12118.891, 1264.2759, 13526.702, 4452.0835, 1094.6704, 25940.133, 9532.036, 1289.1351, 850.582, 769.4591, 10357.301, 3156.138]",Bacteroides_fragilis_638R,Train
10,NYPPGQHGNSR,"[101.071144, 102.05501, 110.07145, 112.08698, 115.08625, 116.97175, 120.08064, 126.05502, 127.05047, 127.08645, 129.10266, 130.08638, 131.64159, 136.06154, 136.07573, 143.08168, 146.05975, 147.11282, 152.0556, 155.08174, 158.09244, 169.09773, 171.07596, 175.11922, 188.07106, 191.08171, 193.09695, 205.09721, 233.09288, 235.97621, 246.06747, 246.0984, 250.11809, 251.12149, 262.15088, 263.1538, 273.91876, 283.139, 285.6393, 286.135, 291.92825, 292.0396, 307.13824, 310.0511, 314.97687, 332.9512, 332.9916, 333.1557, 333.99158, 341.15674, 341.80493, 351.00534, 352.0044, 359.81766, 367.90652, 368.9035, 385.9152, 386.9163, 386.9885, 391.8446, 394.8357, 409.22934, 409.85403, 410.85144, 418.19186, 418.68945, 426.70538, 427.20096, 427.69992, 433.21326, 434.20157, 475.23282, 475.72748, 565.06964, 570.27045, 571.2576, 592.2501, 618.54535, 627.3158, 628.3194, 755.3529, 756.34106, 852.3976, 853.3929, 1196.2797]","[1603.7665, 638.4123, 6107.103, 941.5097, 1175.4966, 811.9929, 730.29083, 1511.2113, 603.3916, 1508.2676, 7183.3037, 629.8367, 615.5881, 6985.7354, 17924.873, 638.4245, 5556.8794, 1300.7413, 1037.5245, 2682.1401, 1203.2234, 604.9504, 1215.2642, 4572.924, 1065.3403, 1164.4872, 913.2287, 652.28705, 5168.6157, 725.83044, 691.0832, 570.4452, 10907.39, 1125.0837, 8004.555, 701.70276, 920.7317, 734.48486, 854.7602, 986.76654, 1283.0204, 2428.319, 834.96796, 647.4563, 635.8429, 763.3874, 1110.7059, 716.5199, 1581.7374, 1269.8304, 673.21027, 4426.5513, 1270.0804, 2762.1084, 3465.0676, 1504.4236, 1189.7056, 866.33765, 973.1809, 726.89703, 681.06024, 988.659, 18628.006, 3468.356, 1000.37573, 750.336, 7367.0864, 3955.991, 2528.9841, 2842.129, 1201.0422, 1546.1396, 1136.6676, 697.7678, 1524.4285, 1115.8948, 820.75574, 739.3746, 2534.232, 1025.1155, 1090.133, 2654.338, 984.80524, 2602.24, 718.72046]",Bacteroides_fragilis_638R,Train
...,...,...,...,...,...
41527,KCDMVEDAEMLELVEMEMR,"[101.05935, 103.020645, 104.05255, 104.586845, 107.08496, 109.0642, 109.100746, 111.04349, 111.08, 113.0591, 113.08182, 115.07468, 117.09013, 117.61649, 118.997826, 119.08494, 120.07991, 121.10063, 123.07969, 123.11663, 124.08373, 125.05891, 125.095474, 127.07443, 131.06929, 133.08513, 133.09985, 134.99284, 135.07893, 137.09514, 139.0741, 139.11081, 141.09024, 149.09563, 153.09059, 155.10571, 157.08511, 157.12067, 159.11604, 161.09464, 163.11053, 165.08986, 169.08496, 171.0999, 175.11794, 175.14807, 177.11163, 177.1266, 179.10521, 181.12146, 183.10028, 195.09993, 196.21724, 201.19516, 212.70088, 214.2516, 215.74602, 225.6624, 243.13188, 245.07568, 267.1582, 306.15775, 320.84552, 329.15085, 395.18127, 399.22293, 408.5439, 441.24878, 460.18903, 518.23425, 559.26056, 566.2391, 633.19824, 647.27295, 695.27576, 748.5672, 766.59735, 767.5057, 768.5121, 794.3417, 797.4525, 803.322, 874.3649, 907.4288, 1003.38824, 1327.7345, 1723.6533]","[8430.887, 731.2961, 2968.9656, 758.86145, 1318.9164, 1680.7355, 4514.129, 916.266, 5231.8555, 10082.403, 1132.7122, 6410.336, 1024.0154, 833.3183, 1194.0488, 1503.0662, 1764.7716, 5905.8374, 21239.508, 816.922, 1278.6604, 1214.3956, 1084.3068, 1223.183, 1670.9976, 7210.673, 895.606, 712.7596, 1051.5955, 4375.5815, 1363.8634, 1694.8792, 4508.612, 771.2844, 1368.0442, 4632.6387, 3052.2427, 729.27026, 1320.2073, 744.693, 1040.017, 1229.9174, 3537.703, 1086.5143, 878.55316, 624.47626, 962.44794, 1466.3921, 2839.6135, 3823.5632, 760.3185, 771.7214, 878.9504, 1179.1927, 704.0861, 925.87384, 749.07465, 1384.2212, 1127.6223, 3169.4258, 836.0461, 920.83386, 676.20557, 3378.6143, 1480.4438, 931.5564, 709.0285, 1644.6859, 3391.3098, 764.6054, 799.0509, 1233.6478, 789.6947, 1066.5266, 2898.648, 919.44104, 1208.9327, 28379.393, 1354.0396, 1628.7786, 1522.0673, 1279.2909, 1454.1941, 941.1085, 1172.7185, 817.3121, 1068.0411]",Bacteroides_fragilis_638R,Train
41537,LGANEAPPAILSIFLGSQLSATLDEIVR,"[101.05928, 103.0386, 103.07508, 105.06966, 109.10055, 111.07988, 113.05906, 115.07456, 117.053925, 117.09034, 120.08012, 121.10057, 123.07969, 125.05893, 127.07441, 130.45824, 131.06966, 133.08488, 136.8515, 137.0952, 141.09027, 147.10022, 153.08957, 157.08469, 157.70097, 161.07918, 161.11601, 163.1109, 167.05391, 169.0838, 175.09488, 175.118, 175.1329, 175.14758, 181.12093, 203.14336, 205.13776, 227.12607, 240.23091, 282.276, 295.27197, 300.05954, 301.05673, 343.2027, 350.85037, 367.266, 371.19528, 379.23413, 387.26395, 409.619, 459.337, 492.31305, 558.8013, 585.29254, 603.3217, 611.2054, 789.4149, 790.4259, 822.39197, 842.2726, 882.4496, 916.50757, 934.4882, 965.67285, 966.5361, 1003.532, 1116.5934, 1117.5991, 1331.6897, 1388.7151, 1389.7256, 1432.65, 1445.7666, 1454.1693, 1529.2706, 1865.6277, 2688.5247, 2825.4365]","[11718.903, 5609.024, 5741.102, 1012.5553, 1198.1283, 1475.6608, 1743.6683, 5389.0093, 17218.988, 3866.4636, 3929.384, 1407.1747, 9035.613, 783.1749, 935.6047, 693.6734, 10747.78, 4333.1646, 760.11346, 3094.9995, 1368.5172, 1806.6206, 1007.9336, 1790.9323, 669.6594, 1355.8413, 1233.1633, 928.12036, 632.6646, 2509.276, 898.91077, 1450.326, 767.0175, 693.757, 1260.9175, 787.7376, 757.9388, 809.57886, 1108.1156, 926.1734, 794.8231, 902.6264, 964.56244, 1174.5214, 742.0727, 923.73315, 1577.6155, 819.56445, 803.195, 757.5075, 930.73035, 1026.7606, 1100.2051, 905.52435, 886.0623, 829.8621, 3433.995, 877.7219, 889.8532, 839.18353, 765.29034, 1277.6627, 895.79865, 1449.689, 832.23303, 3239.5828, 1497.7186, 1258.4955, 896.76807, 1307.3108, 1171.4875, 813.18286, 947.9447, 927.9004, 997.20905, 991.1947, 958.0716, 1128.3889]",Bacteroides_fragilis_638R,Train
41538,VKLPRMLRK,"[101.05929, 105.06917, 107.08486, 109.06424, 109.10072, 111.04372, 111.07987, 113.059074, 115.07485, 117.090225, 118.99769, 119.084625, 121.10038, 123.04131, 123.079765, 123.11594, 124.08289, 125.05898, 125.0954, 127.074005, 131.06981, 133.08505, 134.9923, 135.07959, 135.11589, 137.09479, 139.07431, 139.1106, 147.1149, 149.09525, 155.10516, 161.13097, 163.07422, 163.11063, 163.96437, 167.05357, 167.10521, 169.08472, 177.10994, 179.1051, 189.10948, 227.12515, 228.87413, 249.40475, 290.14368, 313.2285, 355.06387, 357.0641, 429.48358, 578.39044, 579.17285, 579.3396, 579.467, 580.16644, 607.8414, 624.84827, 719.3299, 816.376, 959.64905, 1009.34644]","[3486.7505, 3543.5776, 2814.126, 1183.5564, 3790.3281, 848.05865, 2729.3174, 3988.5771, 2600.4077, 864.8714, 1055.0972, 979.766, 3805.1326, 955.7853, 8903.636, 1091.127, 657.33563, 836.5202, 727.4069, 953.50793, 920.2187, 2880.3945, 1533.6614, 793.9096, 2845.7075, 3195.3496, 1085.788, 928.81885, 757.66833, 1070.3203, 927.416, 845.5595, 709.41205, 1468.5775, 695.3783, 796.4109, 656.9765, 1261.9446, 661.06775, 2504.5942, 738.7829, 691.5681, 651.9965, 714.22516, 1115.553, 4159.0435, 654.1919, 1448.007, 743.66345, 1298.1304, 1486.2747, 7821.1763, 1208.793, 1014.1638, 762.4752, 697.9636, 1153.5125, 2571.4985, 704.1029, 787.9887]",Bacteroides_fragilis_638R,Train
41547,AIIEFPAVAEILSLMK,"[101.0594, 109.06431, 113.081474, 117.09094, 120.07982, 121.100784, 123.07968, 133.0846, 139.11093, 140.18805, 141.09013, 147.11353, 152.10332, 157.13243, 169.09612, 179.31271, 185.1273, 199.13107, 201.08557, 215.13774, 217.84146, 243.13303, 253.19173, 268.1645, 287.173, 296.1939, 298.2088, 299.30478, 314.16895, 316.16333, 339.20004, 390.20242, 427.251, 468.2432, 478.26202, 581.3234, 591.35046, 629.0922, 872.5944, 904.50494, 1171.6661, 1172.6421, 1655.1566]","[1138.2548, 653.75006, 3223.4802, 761.4799, 2881.2695, 1453.9004, 4705.9937, 856.48785, 824.2165, 710.07416, 1240.0293, 753.2827, 695.72955, 6462.374, 1052.6003, 798.1131, 8560.68, 4661.5454, 669.95654, 1006.8313, 678.3419, 6398.125, 911.96454, 2808.452, 1343.5549, 881.70245, 4311.497, 3230.7822, 1559.077, 802.6447, 798.5933, 844.9109, 3042.8967, 4070.3499, 1413.0841, 3888.8042, 790.0908, 677.8018, 3207.2578, 938.45074, 3926.0208, 1105.0334, 1049.019]",Bacteroides_fragilis_638R,Train


In [8]:
SEQ = FilteringProcessor.default_peptide_sequence_column_name
MZ = FilteringProcessor.default_mz_array_column_name
INT = FilteringProcessor.default_intensity_array_column_name

## Calculating Statistics over all MZMLID Files

In [10]:
file_path_count = len(MZMLID_FILE_PATHS)

def get_mzmlid_file_stats(item: Tuple[int, str]) -> Dict[str, Any]:
    idx, path = item
    info_text = f"Processing item {idx + 1}/{file_path_count} '{path}'"
    if idx % 10 == 0:
        logger.info(info_text)
    else:
        logger.debug(info_text)
    df = pd.read_parquet(path)
    max_sequence_length = df[SEQ].str.replace(r"[^A-Z]",'').str.len().max()
    max_array_length = df[INT].str.len().max()
    alphabet = set.union(*df[SEQ].apply(set))
    item_count = len(df)
    del df
    gc.collect()
    
    return {
        "file_path": path,
        "max_sequence_length": max_sequence_length,
        "max_array_length": max_array_length,
        "alphabet": alphabet,
        "item_count": item_count
    }

if os.path.exists(STATISTICS_FILE_PATH):
    file_stats = pd.read_parquet(STATISTICS_FILE_PATH)
    file_stats.alphabet = file_stats.alphabet.apply(set)
    print("loaded previous statistics")
else:
    file_stats = pd.DataFrame(
        ItemProcessor(
            items=enumerate(MZMLID_FILE_PATHS),
            item_processor=get_mzmlid_file_stats,
            action_name="analyse",
            subject_name="mzmlid file",
            thread_count=0,
            logger=logger
        ).process()
    )
    
    file_stats_writable = file_stats.copy()
    file_stats_writable.alphabet = file_stats_writable.alphabet.apply(list) # cannot store sets
    file_stats_writable.write_parquet(STATISTICS_FILE_PATH)

loaded previous statistics


In [11]:
file_stats.head(2)

Unnamed: 0,file_path,max_sequence_length,max_array_length,alphabet,item_count
0,../dumps/PXD010000/training_columns/Biodiversity_S_agalactiae_LIB_aerobic_02_26Feb16_Arwen_16-01-01_mzmlid.parquet,50,1267,"{K, S, L, C, A, H, R, D, Q, F, G, I, W, T, V, Y, P, E, N, M}",13279
1,../dumps/PXD010000/training_columns/Biodiversity_B_fragilis_01_28Jul15_Arwen_14-12-03_mzmlid.parquet,50,1845,"{K, S, L, C, A, H, R, D, Q, F, G, I, W, T, V, Y, P, E, N, M}",26830


In [12]:
MAX_SEQUENCE_LENGTH = file_stats.max_sequence_length.max()
print(f"MAX_SEQUENCE_LENGTH = {MAX_SEQUENCE_LENGTH}")

MAX_ARRAY_LENGTH = file_stats.max_array_length.max()
print(f"MAX_ARRAY_LENGTH = {MAX_ARRAY_LENGTH}")

TOTAL_ITEM_COUNT = file_stats.item_count.sum()
print(f"TOTAL_ITEM_COUNT = {TOTAL_ITEM_COUNT}")

ALPHABET = set.union(*file_stats.alphabet)
print(f"ALPHABET = {', '.join(sorted(ALPHABET))}")

MAX_SEQUENCE_LENGTH = 50
MAX_ARRAY_LENGTH = 2354
TOTAL_ITEM_COUNT = 5408046
ALPHABET = A, C, D, E, F, G, H, I, K, L, M, N, P, Q, R, S, T, V, W, Y


## Data Normalization, Padding, and Conversion to Tensorflow Datasets

In [13]:
def l2_normalize(values: np.ndarray) -> np.ndarray:
    return tf.keras.utils.normalize(x=values, order=2)

In [14]:
def base_peak_normalize(values: np.ndarray) -> np.ndarray:
    return values / values.max()

In [15]:
# by Tom, probably
# don't know, what it's based on
def ion_current_normalize(intensities):
    total_sum = np.sum(intensities**2)
    normalized = intensities/total_sum
    return normalized

In [16]:
NORMALIZATION=base_peak_normalize

In [17]:
PADDING_CHARACTERS = {
    SEQ: '_',
    MZ: 0.0,
    INT: 0.0,
}

In [18]:
ALPHABET.add(PADDING_CHARACTERS[SEQ])

In [19]:
char_to_idx = {char: idx for idx, char in enumerate(sorted(ALPHABET))}
idx_to_char = {idx: char for char, idx in char_to_idx.items()}
INDEX_ALPHABET = idx_to_char.keys()
char_to_idx

{'A': 0,
 'C': 1,
 'D': 2,
 'E': 3,
 'F': 4,
 'G': 5,
 'H': 6,
 'I': 7,
 'K': 8,
 'L': 9,
 'M': 10,
 'N': 11,
 'P': 12,
 'Q': 13,
 'R': 14,
 'S': 15,
 'T': 16,
 'V': 17,
 'W': 18,
 'Y': 19,
 '_': 20}

In [20]:
ARRAY_COLS = [MZ, INT]

In [21]:
def normalize_intensities(df: pd.DataFrame):
    df[INT] = df[INT].apply(NORMALIZATION)

def pad_sequence_column(df: pd.DataFrame):
    df[SEQ] = df[SEQ].str.pad(
        width=MAX_SEQUENCE_LENGTH, 
        fillchar=PADDING_CHARACTERS[SEQ], 
        side='right'
    )

def pad_array_columns(df: pd.DataFrame):
    for col in ARRAY_COLS:
        if len(df[col]) == 0:
            continue
        item_dtype = df[col].iloc[0].dtype

        df[col] = list(tf.keras.preprocessing.sequence.pad_sequences(
            sequences=df[col], 
            maxlen=MAX_ARRAY_LENGTH, 
            padding='post', 
            value=PADDING_CHARACTERS[col],
            dtype=item_dtype
        ))

def _sequence_to_indices(sequence: Iterable[str], 
                char_to_idx_mapping_fun: Callable[[str], int] = char_to_idx.get) -> np.ndarray:
    return np.array([char_to_idx_mapping_fun(char) for char in sequence], dtype=np.int8)

def sequence_column_to_indices(df: pd.DataFrame):
    df[SEQ] = df[SEQ].apply(list).apply(_sequence_to_indices)

def stack_numpy_arrays_in_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    return df.apply(lambda item: [np.stack(item)])

def preprocess_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    normalize_intensities(df)
    pad_sequence_column(df)
    pad_array_columns(df)
    sequence_column_to_indices(df)
    return stack_numpy_arrays_in_dataframe(df)

def df2dataset(stacked_df: pd.DataFrame) -> tf.data.Dataset:
    training_data = tuple(stacked_df[ARRAY_COLS].iloc[0])
    target_data = tuple(stacked_df[[SEQ]].iloc[0])
    dataset = tf.data.Dataset.from_tensor_slices((training_data, target_data))
    return dataset
    
def parquet_file_to_dataset_file_converter(item: Tuple[int, str]) -> str:
    idx, path = item
    tf_dataset_path = os.path.join(
        DATASET_DUMP_PATH, 
        path[len(TRAINING_COLUMNS_DUMP_PATH)+len(os.path.sep):])
    if os.path.exists(tf_dataset_path):
        logger.debug(f"Skipped '{path}' because '{tf_dataset_path}' already exists")
        return None
    
    info_text = f"Processing item {idx + 1}/{len(MZMLID_FILE_PATHS)}: '{path}'"
    if idx % 10 == 0:
        logger.info(info_text)
    else:
        logger.debug(info_text)
    df = pd.read_parquet(path)
    df = preprocess_dataframe(df)
    dataset = df2dataset(df)
    logger.debug(dataset.element_spec)
    
    tf.data.experimental.save(dataset=dataset, path=tf_dataset_path, compression='GZIP')
    
    del dataset
    del df
    gc.collect()
    
    return tf_dataset_path

In [22]:
dataset_file_paths = list(ItemProcessor(
    items=enumerate(MZMLID_FILE_PATHS),
    item_processor=parquet_file_to_dataset_file_converter,
    action_name="parquet2tf_dataset-process",
    subject_name="mzmlid parquet file",
    thread_count=2,
    logger=logger
).process())
dataset_file_paths[:3]

INFO: No mzmlid parquet files were parquet2tf_dataset-processed


[]

## Loading Tensorflow Datasets

In [23]:
dataset_file_paths = glob.glob(os.path.join(DATASET_DUMP_PATH, '*'))
dataset_file_paths[:3]

['../dumps/PXD010000/training_columns/tf_datasets/Biodiversity_HL48_HLHxylose_aerobic_2_09Jun16_Pippin_16-03-39_mzmlid.parquet',
 '../dumps/PXD010000/training_columns/tf_datasets/Biodiversity_P_ruminicola_MDM_anaerobic_1_09Jun16_Pippin_16-03-39_mzmlid.parquet',
 '../dumps/PXD010000/training_columns/tf_datasets/Biodiversity_C_Baltica_T240_R3_C_27Jan16_Arwen_15-07-13_mzmlid.parquet']

In [24]:
element_spec = ((tf.TensorSpec(shape=(MAX_ARRAY_LENGTH,), dtype=tf.float32), 
  tf.TensorSpec(shape=(MAX_ARRAY_LENGTH,), dtype=tf.float32)),
(tf.TensorSpec(shape=(MAX_SEQUENCE_LENGTH,), dtype=tf.int8)))
element_spec

((TensorSpec(shape=(2354,), dtype=tf.float32, name=None),
  TensorSpec(shape=(2354,), dtype=tf.float32, name=None)),
 TensorSpec(shape=(50,), dtype=tf.int8, name=None))

In [25]:
datasets = [tf.data.experimental.load(path=path, element_spec=element_spec, compression='GZIP') 
            for path in dataset_file_paths]

In [26]:
datasets[:3]

[<_LoadDataset shapes: (((2354,), (2354,)), (50,)), types: ((tf.float32, tf.float32), tf.int8)>,
 <_LoadDataset shapes: (((2354,), (2354,)), (50,)), types: ((tf.float32, tf.float32), tf.int8)>,
 <_LoadDataset shapes: (((2354,), (2354,)), (50,)), types: ((tf.float32, tf.float32), tf.int8)>]

## Concatenating Tensorflow Datasets

In [27]:
BATCH_SIZE = 256

dataset = datasets[0]
for ds in datasets[1:]:
    dataset = dataset.concatenate(ds)

dataset = dataset.batch(BATCH_SIZE)

## Building the Tensorflow Model

In [28]:
input_layers = {col: tf.keras.layers.Input(shape=(MAX_ARRAY_LENGTH,)) for col in ARRAY_COLS}
input_layers

{'mz_array': <KerasTensor: shape=(None, 2354) dtype=float32 (created by layer 'input_1')>,
 'intensity_array': <KerasTensor: shape=(None, 2354) dtype=float32 (created by layer 'input_2')>}

In [29]:
x = input_layers[MZ] + input_layers[INT]

x = tf.keras.layers.Flatten()(x)

for _ in range(1):
    x = tf.keras.layers.Dense(16*MAX_SEQUENCE_LENGTH*len(ALPHABET))(x)
    x = tf.keras.layers.Dropout(0.5)(x)

x = tf.keras.layers.Dense(MAX_SEQUENCE_LENGTH*len(ALPHABET))(x)

x = tf.reshape(x,(-1, MAX_SEQUENCE_LENGTH, len(ALPHABET)))

x = tf.keras.activations.softmax(x)

model = tf.keras.Model([input_layers[MZ],input_layers[INT]],x)
model.compile(optimizer=tf.keras.optimizers.Adam(),
              loss=tf.keras.losses.SparseCategoricalCrossentropy())
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 2354)]       0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, 2354)]       0                                            
__________________________________________________________________________________________________
tf.__operators__.add (TFOpLambd (None, 2354)         0           input_1[0][0]                    
                                                                 input_2[0][0]                    
__________________________________________________________________________________________________
flatten (Flatten)               (None, 2354)         0           tf.__operators__.add[0][0]   

## Training the Tensorflow Model

In [30]:
def split_dataset(dataset, fraction):
    split_value = int(len(dataset) * fraction)
    a = dataset.take(split_value)
    b = dataset.skip(split_value)
    return a, b

In [31]:
dataset = dataset.shuffle(buffer_size=int(10000 / BATCH_SIZE))

In [32]:
model.fit(dataset, epochs=10)

Epoch 1/10
Epoch 2/10
  209/21126 [..............................] - ETA: 15:32 - loss: 5.4155

KeyboardInterrupt: 

## Evaluating the Tensorflow Model