In [1]:
#default_exp mass_spec.ms_reader

In [2]:
#export

import numpy as np

class MS2ReaderBase:
    """Base class for MS2 readers that keep all peaks in two flat arrays.

    Subclasses are expected to fill ``masses`` and ``intens`` with the peaks
    of every spectrum laid out one after another, and to record for each scan
    the ``(start, end)`` slice bounds into those arrays in ``scan_idx_dict``.
    """

    def __init__(self):
        # scan identifier -> (start_idx, end_idx) bounds into the arrays below
        self.scan_idx_dict = {}
        # Both arrays stay None until a subclass has loaded peak data.
        # (`np.ndarray` is the array type; `np.array` is only a factory.)
        self.masses: np.ndarray = None
        self.intens: np.ndarray = None

    def get_peaks(self, scan):
        """Return the ``(masses, intensities)`` slices stored for ``scan``.

        Raises:
            KeyError: if ``scan`` is not present in ``scan_idx_dict``.
        """
        start_idx, end_idx = self.scan_idx_dict[scan]
        return (
            self.masses[start_idx:end_idx],
            self.intens[start_idx:end_idx],
        )

class AlphaPeptHDFReader(MS2ReaderBase):
    """MS2 reader that pulls peak data from the "Raw/MS2_scans" HDF group."""

    def load_peaks(self, filename):
        """Read every dataset of "Raw/MS2_scans" and index the peaks by scan.

        All datasets found in the group are cached in ``self.ms_data`` under
        their dataset names. The entries 'mass_list_ms2' and 'int_list_ms2'
        become the flat peak arrays, while 'scan_list_ms2' and 'indices_ms2'
        are combined into ``scan_idx_dict``: the i-th scan maps to
        ``(indices_ms2[i], indices_ms2[i + 1])``.
        """
        # Imported lazily so alphapept is only required when this reader is used.
        from alphapept.io import HDF_File

        group = "Raw/MS2_scans"
        hdf = HDF_File(filename)

        self.ms_data = {}
        for name in hdf.read(group_name=group):
            self.ms_data[name] = hdf.read(dataset_name=name, group_name=group)

        self.scan_idx_dict = {}
        bounds = self.ms_data['indices_ms2']
        self.masses = self.ms_data['mass_list_ms2']
        self.intens = self.ms_data['int_list_ms2']
        # Consecutive entries of `bounds` delimit one scan's peaks.
        self.scan_idx_dict.update(
            (scan, (bounds[pos], bounds[pos + 1]))
            for pos, scan in enumerate(self.ms_data['scan_list_ms2'])
        )

def read_until(file, until):
    """Collect stripped lines from ``file`` up to a line starting with ``until``.

    Lines are read with ``file.readline()`` and stripped of surrounding
    whitespace before being tested and stored. The terminating line is
    consumed from ``file`` but not included in the result.

    If the end of the file is reached before a terminating line is found,
    the lines collected so far are returned instead of looping forever.

    Args:
        file: an object with a ``readline()`` method returning ``str``.
        until: prefix that marks the terminating line.

    Returns:
        list of stripped lines that preceded the terminating line.
    """
    lines = []
    while True:
        raw = file.readline()
        # readline() returns '' only at EOF (a blank line is still '\n'),
        # so test the raw value before stripping.
        if not raw:
            break
        line = raw.strip()
        if line.startswith(until):
            break
        lines.append(line)
    return lines

def find_line(lines, start):
    """Return the first entry of ``lines`` that begins with ``start``.

    Returns ``None`` when no entry matches.
    """
    matches = (candidate for candidate in lines if candidate.startswith(start))
    return next(matches, None)

def parse_pfind_scan_from_TITLE(pfind_title):
    """Return the fourth-from-last dot-separated field of the title as an int."""
    # Only the last four separators matter, so split from the right.
    tail_fields = pfind_title.rsplit('.', 4)
    scan_field = tail_fields[-4]
    return int(scan_field)

class MGFReader(MS2ReaderBase):
    """MS2 reader for MGF text files.

    Each ``BEGIN IONS`` ... ``END IONS`` section is parsed into one spectrum.
    The scan number is taken from a ``SCAN=`` line when present, otherwise
    from the ``TITLE=`` line via ``parse_pfind_scan_from_TITLE``. When the
    same scan number occurs more than once, only the first spectrum is kept.
    """

    @staticmethod
    def _parse_ion_block(lines):
        """Parse the stripped lines of one ion block into (scan, masses, intens)."""
        masses = []
        intens = []
        scan = None
        for line in lines:
            if not line:
                # Blank lines inside a block carry no data; skipping them
                # also avoids indexing into an empty string below.
                continue
            if line[0].isdigit():
                # A peak line is "mass intensity", optionally followed by
                # further columns (e.g. charge); only the first two are used.
                mass, inten = line.split()[:2]
                masses.append(float(mass))
                intens.append(float(inten))
            elif line.startswith('SCAN='):
                scan = int(line.split('=')[1])
        # Compare against None so that an explicit "SCAN=0" is respected.
        if scan is None:
            title = find_line(lines, 'TITLE=')
            if title is None:
                raise ValueError(
                    'MGF ion block has neither a SCAN= nor a TITLE= line'
                )
            scan = parse_pfind_scan_from_TITLE(title)
        return scan, masses, intens

    def load_peaks(self, mgf):
        """Load all spectra from ``mgf`` into the flat peak arrays.

        Args:
            mgf: passed unchanged to the module-level ``open``.

        Raises:
            ValueError: if an ion block has neither ``SCAN=`` nor ``TITLE=``,
                or a peak line has fewer than two columns.
        """
        seen_scans = set()
        scan_list = []
        masses_list = []
        intens_list = []
        with open(mgf) as f:
            # readline() is used throughout (here and in read_until) so both
            # consumers advance the same file position.
            for line in iter(f.readline, ''):
                if not line.startswith('BEGIN IONS'):
                    continue
                scan, masses, intens = self._parse_ion_block(
                    read_until(f, 'END IONS')
                )
                if scan in seen_scans:
                    continue
                seen_scans.add(scan)
                scan_list.append(scan)
                masses_list.append(np.array(masses))
                intens_list.append(np.array(intens))

        # indices[i]:indices[i+1] delimits the peaks of the i-th kept spectrum.
        indices = np.zeros(len(masses_list) + 1, dtype=np.int64)
        indices[1:] = [len(peaks) for peaks in masses_list]
        indices = np.cumsum(indices)
        self.scan_idx_dict = {}
        for i, scan in enumerate(scan_list):
            self.scan_idx_dict[scan] = (indices[i], indices[i + 1])

        if masses_list:
            self.masses = np.concatenate(masses_list)
            self.intens = np.concatenate(intens_list)
        else:
            # np.concatenate rejects an empty sequence; a file without any
            # spectra simply yields empty peak arrays.
            self.masses = np.array([], dtype=np.float64)
            self.intens = np.array([], dtype=np.float64)

In [3]:
#hide
import io
def open(s):
    """Notebook test shim: expose the string ``s`` as an in-memory text file.

    Shadows the builtin ``open`` in this notebook so that code calling
    ``open(...)`` reads from the given text instead of the filesystem.
    """
    text_stream = io.StringIO(initial_value=s)
    return text_stream

In [4]:
#hide
mgf = """
BEGIN IONS
TITLE=02445a_BA7-TUM_HLA_7_01_01-DDA-1h-R1.8.8.3.0.dta
CHARGE=3+
RTINSECONDS=0.5418930
PEPMASS=272.276336
103.92207 5457.3
104.20045 5051.4
108.70090 5891.7
113.94175 6442.6
116.92975 40506.3
116.93716 8945.5
128.37773 6427.8
131.95308 288352.6
133.93259 7344.6
138.44611 7326.1
139.00072 41556.8
140.00319 16738.8
140.99719 9493.8
145.93156 10209.3
145.94897 10497.8
147.94559 8206.3
147.96396 30552.8
148.95543 14654.7
149.96338 234207.8
150.95096 8306.0
157.01089 84638.9
158.01357 27925.7
159.00627 16084.7
163.94281 24751.1
163.95915 32203.3
165.95605 44458.0
165.97186 11530.2
166.99500 26432.2
167.97302 9216.7
181.95230 13858.8
191.95448 66152.7
192.95538 8408.9
193.07185 9092.8
193.95313 660574.9
194.95674 23452.8
194.99008 143940.9
200.00568 19510.8
200.99942 23678.7
204.30894 9406.1
209.96466 21853.6
211.96245 65351.0
218.90355 9149.6
223.91072 11300.2
238.89684 12108.8
243.93825 10150.2
243.97040 10987.7
244.94121 8744.2
246.90314 11556.3
271.93225 29430.0
271.99219 51184.4
272.19150 31960.4
272.98602 35844.1
273.94431 11031.8
284.47998 8191.3
290.00125 66212.4
290.99539 54064.7
293.89490 10005.0
407.06372 10838.2
464.36697 9715.4
698.81390 9711.7
END IONS
BEGIN IONS
TITLE=02445a_BA7-TUM_HLA_7_01_01-DDA-1h-R1.11.11.2.0.dta
CHARGE=2+
RTINSECONDS=0.6455220
PEPMASS=287.427959
103.34669 5304.0
104.66884 5639.7
113.42419 6258.3
118.84039 5837.5
119.93203 13977.3
130.69589 6876.2
133.94824 43094.3
134.30524 7671.5
135.96359 9031.3
138.99994 8329.7
146.95573 31143.9
147.96323 12176.5
150.95151 65859.3
151.95818 24384.2
157.01105 19241.5
157.34985 7532.5
161.08838 7843.9
161.94234 20119.7
162.95146 60110.4
163.95877 183305.5
164.96657 13647.5
174.95139 150331.9
175.95258 21393.4
178.94460 11433.1
179.95316 13650.5
180.96204 15353.5
190.94572 30418.9
191.95422 61914.1
192.61461 8642.1
192.94395 12331.4
192.96207 132342.5
193.96318 19303.0
209.04164 25149.6
209.96368 154185.0
209.98361 12353.5
213.86244 11541.3
224.93071 12903.0
228.92879 8773.6
241.86043 135357.5
242.86113 20805.2
242.94327 26679.4
243.95219 29569.9
244.92361 12153.5
246.90300 16650.3
252.96521 73484.3
253.96646 11527.5
286.85858 10166.4
287.94186 18763.2
303.87665 39189.3
304.88116 11976.0
321.89087 97122.5
322.88867 28020.8
370.28696 9008.2
389.82578 13277.0
407.83545 12220.4
425.84872 13236.5
482.54852 10940.2
END IONS
"""
# Parse the in-memory MGF fixture above. The `open` shim defined in an earlier
# cell makes `load_peaks` read this string through `io.StringIO` rather than
# from a file on disk.
reader = MGFReader()
reader.load_peaks(mgf)
# Neither spectrum has a SCAN= line, so scan numbers come from the TITLE;
# the second TITLE (`...R1.11.11.2.0.dta`) yields scan 11.
reader.get_peaks(11)


(array([103.34669, 104.66884, 113.42419, 118.84039, 119.93203, 130.69589,
        133.94824, 134.30524, 135.96359, 138.99994, 146.95573, 147.96323,
        150.95151, 151.95818, 157.01105, 157.34985, 161.08838, 161.94234,
        162.95146, 163.95877, 164.96657, 174.95139, 175.95258, 178.9446 ,
        179.95316, 180.96204, 190.94572, 191.95422, 192.61461, 192.94395,
        192.96207, 193.96318, 209.04164, 209.96368, 209.98361, 213.86244,
        224.93071, 228.92879, 241.86043, 242.86113, 242.94327, 243.95219,
        244.92361, 246.903  , 252.96521, 253.96646, 286.85858, 287.94186,
        303.87665, 304.88116, 321.89087, 322.88867, 370.28696, 389.82578,
        407.83545, 425.84872, 482.54852]),
 array([  5304. ,   5639.7,   6258.3,   5837.5,  13977.3,   6876.2,
         43094.3,   7671.5,   9031.3,   8329.7,  31143.9,  12176.5,
         65859.3,  24384.2,  19241.5,   7532.5,   7843.9,  20119.7,
         60110.4, 183305.5,  13647.5, 150331.9,  21393.4,  11433.1,
         13650.5,  