In [5]:
#default_exp mass_spec.ms_reader

## Basic MS2 spectrum file readers. Should be replace by AlphaRaw in the near future

In [6]:
#export
import os
import numpy as np
import pandas as pd

class MSReaderBase:
    def __init__(self):
        self.spectrum_df:pd.DataFrame = pd.DataFrame()
        self.mzs: np.array = np.array([])
        self.intensities: np.array = np.array([])

    def load(self, file_path):
        raise NotImplementedError('load()')

    def build_spectrum_df(self, scan_list, scan_indices, rt_list, mobility_list = None):
        if mobility_list is None: mobility_list = np.nan
        def set_col(col, indexes, values, dtype, na_value):
            self.spectrum_df.loc[indexes, col] = values
            self.spectrum_df[col].fillna(na_value, inplace=True)
            self.spectrum_df[col] = self.spectrum_df[col].astype(dtype)

        idx_len = np.max(scan_list)+1
        self.spectrum_df = pd.DataFrame(index=np.arange(idx_len, dtype=np.int64))
        self.spectrum_df['spec_idx'] = self.spectrum_df.index.values
        set_col('peak_start_idx', scan_list, scan_indices[:-1], np.int64, -1)
        set_col('peak_end_idx', scan_list, scan_indices[1:], np.int64, -1)
        set_col('rt', scan_list, rt_list, np.float64, np.nan)
        set_col('mobility', scan_list, mobility_list, np.float64, np.nan)

    def get_peaks(self, spec_idx):
        """Get peak (mz and intensity) values by `spec_idx`

        Args:
            spec_idx (object): indicator for a spectrum, could be scan no for thermo data.

        Returns:
            np.array: mz values for the given spec_idx (scan)
            np.array: intensity values for the given spec_idx
        """
        if spec_idx not in self.spectrum_df.index:
            return None, None
        start_idx, end_idx = self.spectrum_df.loc[
            spec_idx, ['peak_start_idx','peak_end_idx']
        ].values.astype(np.int64)
        return (
            self.mzs[start_idx:end_idx],
            self.intensities[start_idx:end_idx]
        )

class AlphaPept_HDF_MS1_Reader(MSReaderBase):
    def load(self, file_path):
        from alphapept.io import HDF_File
        hdf_file = HDF_File(file_path)
        self.ms_data = {}
        for dataset_name in hdf_file.read(group_name="Raw/MS1_scans"):
            values = hdf_file.read(
                dataset_name=dataset_name,
                group_name="Raw/MS1_scans",
            )
            self.ms_data[dataset_name] = values
        self.mzs = self.ms_data['mass_list_ms1']
        self.intensities = self.ms_data['int_list_ms1']
        self.build_spectrum_df(
            scan_list=self.ms_data['scan_list_ms1'], 
            scan_indices=self.ms_data['indices_ms1'],
            rt_list=self.ms_data['rt_list_ms1'],
            mobility_list=self.ms_data['mobility'] if 'mobility' in self.ms_data else None,
        )

class AlphaPept_HDF_MS2_Reader(MSReaderBase):
    def load(self, file_path):
        from alphapept.io import HDF_File
        hdf_file = HDF_File(file_path)
        self.ms_data = {}
        for dataset_name in hdf_file.read(group_name="Raw/MS2_scans"):
            values = hdf_file.read(
                dataset_name=dataset_name,
                group_name="Raw/MS2_scans",
            )
            self.ms_data[dataset_name] = values
        self.mzs = self.ms_data['mass_list_ms2']
        self.intensities = self.ms_data['int_list_ms2']
        if 'mobility2' in self.ms_data:
            scan_list = np.arange(len(self.ms_data['rt_list_ms2']))
        else:
            scan_list = self.ms_data['scan_list_ms2']
        self.build_spectrum_df(
            scan_list=scan_list, 
            scan_indices=self.ms_data['indices_ms2'],
            rt_list=self.ms_data['rt_list_ms2'],
            mobility_list=self.ms_data['mobility2'] if 'mobility2' in self.ms_data else None,
        )

def read_until(file, until):
    lines = []
    while True:
        line = file.readline().strip()
        if line.startswith(until):
            break
        else:
            lines.append(line)
    return lines

def find_line(lines, start):
    for line in lines:
        if line.startswith(start):
            return line
    return None

def parse_pfind_scan_from_TITLE(pfind_title):
    return int(pfind_title.split('.')[-4])

def is_pfind_mgf(mgf):
    return mgf.upper().endswith('_HCDFT.MGF')

def index_ragged_list(ragged_list: list)  -> np.ndarray:
    """Create lookup indices for a list of arrays for concatenation.

    Args:
        value (list): Input list of arrays.

    Returns:
        indices: A numpy array with indices.
    """
    indices = np.zeros(len(ragged_list) + 1, np.int64)
    indices[1:] = [len(i) for i in ragged_list]
    indices = np.cumsum(indices)

    return indices

class MGFReader(MSReaderBase):

    def load(self, mgf):
        if isinstance(mgf, str):
            f = open(mgf)
        else:
            f = mgf
        scanset = set()
        masses_list = []
        intens_list = []
        scan_list = []
        rt_list = []
        while True:
            line = f.readline()
            if not line: break
            if line.startswith('BEGIN IONS'):
                lines = read_until(f, 'END IONS')
                masses = []
                intens = []
                scan = None
                RT = 0
                for line in lines:
                    if line[0].isdigit():
                        mass,inten = [float(i) for i in line.strip().split()]
                        masses.append(mass)
                        intens.append(inten)
                    elif line.startswith('SCAN='):
                        scan = int(line.split('=')[1])
                    elif line.startswith('RTINSECOND'):
                        RT = float(line.split('=')[1])/60
                if not scan:
                    title = find_line(lines, 'TITLE=')
                    scan = parse_pfind_scan_from_TITLE(title)
                if scan in scanset: continue
                scanset.add(scan)
                scan_list.append(scan)
                rt_list.append(RT)
                masses_list.append(np.array(masses))
                intens_list.append(np.array(intens))
        if isinstance(mgf, str): 
            f.close()
        self.build_spectrum_df(
            scan_list, 
            index_ragged_list(masses_list), 
            rt_list
        )
        self.mzs = np.concatenate(masses_list)
        self.intensities = np.concatenate(intens_list)
    

class MSReaderProvider:
    def __init__(self):
        self.reader_dict = {}
    def register_reader(self, ms2_type, reader_class):
        self.reader_dict[ms2_type.lower()] = reader_class

    def get_reader(self, file_type)->MSReaderBase:
        if file_type not in self.reader_dict: return None
        else: return self.reader_dict[file_type.lower()]()

ms2_reader_provider = MSReaderProvider()
ms2_reader_provider.register_reader('mgf', MGFReader)
ms2_reader_provider.register_reader('alphapept', AlphaPept_HDF_MS2_Reader)
ms2_reader_provider.register_reader('alphapept_hdf', AlphaPept_HDF_MS2_Reader)

ms1_reader_provider = MSReaderProvider()
ms1_reader_provider.register_reader('alphapept', AlphaPept_HDF_MS1_Reader)
ms1_reader_provider.register_reader('alphapept_hdf', AlphaPept_HDF_MS1_Reader)

In [7]:
#export

try:
    from alphapept.pyrawfilereader import RawFileReader
    class ThermoRawMS1Reader(MSReaderBase):
        def __init__(self):
            super().__init__()
            self.profile_mode = False

        def load(self, raw_path):
            rawfile = RawFileReader(raw_path)
 
            spec_indices = np.array(
                range(rawfile.FirstSpectrumNumber, rawfile.LastSpectrumNumber + 1)
            )
            scan_list = []
            rt_list = []
            masses_list = []
            intens_list = []
            for i in spec_indices:
                try:
                    ms_order = rawfile.GetMSOrderForScanNum(i)

                    if ms_order == 1:
                        if self.profile_mode:
                            masses, intens = rawfile.GetProfileMassListFromScanNum(i)
                        else:
                            masses, intens = rawfile.GetCentroidMassListFromScanNum(i)
                        scan_list.append(i)
                        rt_list.append(rawfile.RTInSecondsFromScanNum(i))
                        masses_list.append(masses)
                        intens_list.append(intens)

                except KeyboardInterrupt as e:
                    raise e
                except SystemExit as e:
                    raise e
                except Exception as e:
                    print(f"Bad scan={i} in raw file '{raw_path}'")
            
            self.build_spectrum_df(
                scan_list,
                index_ragged_list(masses_list),
                rt_list,
            )
            self.mzs = np.concatenate(masses_list)
            self.intensities = np.concatenate(intens_list)
            rawfile.Close()

    class ThermoRawMS2Reader(MSReaderBase):
        def __init__(self):
            super().__init__()
            self.profile_mode = False

        def load(self, raw_path):
            rawfile = RawFileReader(raw_path)
 
            spec_indices = np.array(
                range(rawfile.FirstSpectrumNumber, rawfile.LastSpectrumNumber + 1)
            )
            scan_list = []
            rt_list = []
            masses_list = []
            intens_list = []
            for i in spec_indices:
                try:
                    ms_order = rawfile.GetMSOrderForScanNum(i)

                    if ms_order == 2:
                        if self.profile_mode:
                            masses, intens = rawfile.GetProfileMassListFromScanNum(i)
                        else:
                            masses, intens = rawfile.GetCentroidMassListFromScanNum(i)
                        scan_list.append(i)
                        rt_list.append(rawfile.RTFromScanNum(i))
                        masses_list.append(masses)
                        intens_list.append(intens)

                except KeyboardInterrupt as e:
                    raise e
                except SystemExit as e:
                    raise e
                # except Exception as e:
                #     print(f"Bad scan={i} in raw file '{raw_path}'")
            
            self.build_spectrum_df(
                scan_list,
                index_ragged_list(masses_list),
                rt_list,
            )
            self.mzs = np.concatenate(masses_list)
            self.intensities = np.concatenate(intens_list)
            rawfile.Close()
    
    ms2_reader_provider.register_reader('thermo', ThermoRawMS2Reader)
    ms2_reader_provider.register_reader('thermo_raw', ThermoRawMS2Reader)
    ms1_reader_provider.register_reader('thermo', ThermoRawMS1Reader)
    ms1_reader_provider.register_reader('thermo_raw', ThermoRawMS1Reader)
except Exception as e:
    # alphapept or RawFileReader is not installed
    print('alphapept or RawFileReader is not installed')
    print(e)

In [8]:
#hide
import io
mgf = io.StringIO("""
BEGIN IONS
TITLE=02445a_BA7-TUM_HLA_7_01_01-DDA-1h-R1.8.8.3.0.dta
CHARGE=3+
RTINSECONDS=0.5418930
PEPMASS=272.276336
103.92207 5457.3
104.20045 5051.4
108.70090 5891.7
113.94175 6442.6
116.92975 40506.3
116.93716 8945.5
128.37773 6427.8
131.95308 288352.6
133.93259 7344.6
138.44611 7326.1
139.00072 41556.8
140.00319 16738.8
140.99719 9493.8
145.93156 10209.3
145.94897 10497.8
147.94559 8206.3
147.96396 30552.8
148.95543 14654.7
149.96338 234207.8
150.95096 8306.0
157.01089 84638.9
158.01357 27925.7
159.00627 16084.7
163.94281 24751.1
163.95915 32203.3
165.95605 44458.0
165.97186 11530.2
166.99500 26432.2
167.97302 9216.7
181.95230 13858.8
191.95448 66152.7
192.95538 8408.9
193.07185 9092.8
193.95313 660574.9
194.95674 23452.8
194.99008 143940.9
200.00568 19510.8
200.99942 23678.7
204.30894 9406.1
209.96466 21853.6
211.96245 65351.0
218.90355 9149.6
223.91072 11300.2
238.89684 12108.8
243.93825 10150.2
243.97040 10987.7
244.94121 8744.2
246.90314 11556.3
271.93225 29430.0
271.99219 51184.4
272.19150 31960.4
272.98602 35844.1
273.94431 11031.8
284.47998 8191.3
290.00125 66212.4
290.99539 54064.7
293.89490 10005.0
407.06372 10838.2
464.36697 9715.4
698.81390 9711.7
END IONS
BEGIN IONS
TITLE=02445a_BA7-TUM_HLA_7_01_01-DDA-1h-R1.11.11.2.0.dta
CHARGE=2+
RTINSECONDS=0.6455220
PEPMASS=287.427959
103.34669 5304.0
104.66884 5639.7
113.42419 6258.3
118.84039 5837.5
119.93203 13977.3
130.69589 6876.2
133.94824 43094.3
134.30524 7671.5
135.96359 9031.3
138.99994 8329.7
146.95573 31143.9
147.96323 12176.5
150.95151 65859.3
151.95818 24384.2
157.01105 19241.5
157.34985 7532.5
161.08838 7843.9
161.94234 20119.7
162.95146 60110.4
163.95877 183305.5
164.96657 13647.5
174.95139 150331.9
175.95258 21393.4
178.94460 11433.1
179.95316 13650.5
180.96204 15353.5
190.94572 30418.9
191.95422 61914.1
192.61461 8642.1
192.94395 12331.4
192.96207 132342.5
193.96318 19303.0
209.04164 25149.6
209.96368 154185.0
209.98361 12353.5
213.86244 11541.3
224.93071 12903.0
228.92879 8773.6
241.86043 135357.5
242.86113 20805.2
242.94327 26679.4
243.95219 29569.9
244.92361 12153.5
246.90300 16650.3
252.96521 73484.3
253.96646 11527.5
286.85858 10166.4
287.94186 18763.2
303.87665 39189.3
304.88116 11976.0
321.89087 97122.5
322.88867 28020.8
370.28696 9008.2
389.82578 13277.0
407.83545 12220.4
425.84872 13236.5
482.54852 10940.2
END IONS
""")
reader = ms2_reader_provider.get_reader('mgf')
reader.load(mgf)
scan_no = 8
masses, intens = reader.get_peaks(scan_no)
assert len(masses)==(
    reader.spectrum_df.loc[scan_no,'peak_end_idx']
    -reader.spectrum_df.loc[scan_no,'peak_start_idx']
)
assert (np.diff(masses)>=0).all() #sorted
reader.spectrum_df

Unnamed: 0,peak_start_idx,peak_end_idx,rt,mobility
0,-1,-1,,
1,-1,-1,,
2,-1,-1,,
3,-1,-1,,
4,-1,-1,,
5,-1,-1,,
6,-1,-1,,
7,-1,-1,,
8,0,60,0.009032,
9,-1,-1,,


In [9]:
#hide
import os
reader = ms2_reader_provider.get_reader('thermo')
reader.load(os.path.expanduser('~/Workspace/Data/Thermo_iRT/iRT.raw'))
scan_no = 3934
masses, intens = reader.get_peaks(scan_no)
assert len(masses)==(
    reader.spectrum_df.loc[scan_no,'peak_end_idx']
    -reader.spectrum_df.loc[scan_no,'peak_start_idx']
)
assert (np.diff(masses)>=0).all() #sorted
reader.spectrum_df

Unnamed: 0,peak_start_idx,peak_end_idx,rt,mobility
0,-1,-1,,
1,-1,-1,,
2,-1,-1,,
3,-1,-1,,
4,-1,-1,,
...,...,...,...,...
3932,-1,-1,,
3933,-1,-1,,
3934,95456,95472,5.997334,
3935,-1,-1,,


In [10]:
#hide
import os
reader = ms1_reader_provider.get_reader('thermo')
reader.load(os.path.expanduser('~/Workspace/Data/Thermo_iRT/iRT.raw'))
scan_no = 3931
masses, intens = reader.get_peaks(scan_no)
assert len(masses)==(
    reader.spectrum_df.loc[scan_no,'peak_end_idx']
    -reader.spectrum_df.loc[scan_no,'peak_start_idx']
)
assert (np.diff(masses)>=0).all() #sorted
reader.spectrum_df

Unnamed: 0,peak_start_idx,peak_end_idx,rt,mobility
0,-1,-1,,
1,0,254,0.178981,
2,254,665,0.383501,
3,665,1131,0.588464,
4,1131,1663,0.793464,
...,...,...,...,...
3933,1004815,1006056,359.699078,
3934,-1,-1,,
3935,1006056,1007286,359.930573,
3936,-1,-1,,
