In [1]:
import os
import numpy as np
from datetime import datetime
from matchms import Spectrum
from matchms.importing import load_from_mgf
from matchms.exporting import save_as_mgf
from collections import Counter

In [2]:
# File names (not full paths) of the four positive-mode MSn library MGF files
# that this notebook merges. They are joined with BASE_PATH in a later cell.
FILE_1 = "20231031_nihnp_library_pos_all_lib_MSn.mgf"
FILE_2 = "20231130_mcescaf_library_pos_all_lib_MSn.mgf"
FILE_3 = "20231130_otavapep_library_pos_all_lib_MSn.mgf"
FILE_4 = "20240411_mcebio_library_pos_all_lib_MSn.mgf"

In [3]:
# Directory holding the input MGF libraries and directory for the merged output.
# The defaults are the original author's locations (BASE_PATH is an absolute,
# machine-specific path), so both can be overridden through environment
# variables to run the notebook on another machine without editing the code.
BASE_PATH = os.environ.get("MSN_BASE_PATH", "/Users/macbook/CODE/MS/data/MSn/ALL")
SAVE_PATH = os.environ.get("MSN_SAVE_PATH", "../../../data/MSn")

In [4]:
# Full paths of the input libraries, in the order they will be merged.
file_paths = [
    os.path.join(BASE_PATH, library_name)
    for library_name in (FILE_1, FILE_2, FILE_3, FILE_4)
]

In [5]:
# Merge all input libraries into one list and tag every spectrum with an
# IDENTIFIER of the form "<root>_<fragment>" (both parts zero-padded to 7 digits):
#   - each MS2 spectrum starts a new root and gets fragment part 0000000;
#   - every following non-MS2 spectrum gets the current root and the next
#     fragment number.
all_spectra = []
spectra_counts = {}

# Counter for root MS2 spectra. It keeps running across files so that root IDs
# stay unique in the merged library.
root_id_counter = 0
# Counter for fragments under the current root.
fragment_id_counter = 0

for file_path in file_paths:
    spectra = list(load_from_mgf(file_path))
    spectra_counts[file_path] = len(spectra)
    idx_list = []  # IDENTIFIERs assigned in this file, validated below

    # Forget the previous file's root: a fragment must never be attached to a
    # root that came from a different file.
    root_id = None
    fragment_id_counter = 0

    for position, spectrum in enumerate(spectra):
        raw_ms_level = spectrum.get('ms_level')
        if raw_ms_level is None:
            raise ValueError(
                f"Spectrum #{position} in {file_path} has no 'ms_level' metadata."
            )
        ms_level = int(raw_ms_level)

        if ms_level == 2:
            # New root: advance the root counter and restart fragment numbering.
            root_id_counter += 1
            fragment_id_counter = 0
            root_id = f"{root_id_counter:07d}"
        else:
            if root_id is None:
                # Previously this was a NameError (first file) or a silent
                # assignment to the last root of the previous file.
                raise ValueError(
                    f"Spectrum #{position} in {file_path} has ms_level {ms_level} "
                    "but no preceding MS2 root spectrum in the same file."
                )
            fragment_id_counter += 1

        idx = f"{root_id}_{fragment_id_counter:07d}"
        spectrum.set('IDENTIFIER', idx)
        idx_list.append(idx)

    # Validation: every spectrum of this file must have received a distinct IDENTIFIER.
    duplicated = [identifier for identifier, n in Counter(idx_list).items() if n > 1]
    if duplicated:
        raise ValueError(
            f"Duplicate IDENTIFIERs assigned in {file_path}: {duplicated[:5]}"
        )

    all_spectra.extend(spectra)

for file, count in spectra_counts.items():
    print(f"{file}: {count} spectra")

combined_count = len(all_spectra)
print(f"Total combined spectra: {combined_count}")

/Users/macbook/CODE/MS/data/MSn/ALL/20231031_nihnp_library_pos_all_lib_MSn.mgf: 122837 spectra
/Users/macbook/CODE/MS/data/MSn/ALL/20231130_mcescaf_library_pos_all_lib_MSn.mgf: 243033 spectra
/Users/macbook/CODE/MS/data/MSn/ALL/20231130_otavapep_library_pos_all_lib_MSn.mgf: 63123 spectra
/Users/macbook/CODE/MS/data/MSn/ALL/20240411_mcebio_library_pos_all_lib_MSn.mgf: 374412 spectra
Total combined spectra: 803405


In [6]:
# Define the output file name with the current date
current_date = datetime.now().strftime("%Y%m%d")
output_file_name = f"{current_date}_msn_library_pos_all_lib_MSn.mgf"
output_file_path = os.path.join(SAVE_PATH, output_file_name)

# Write the merged library so the verification cell below has a file to read
# on a fresh "Restart & Run All". The write is skipped when today's file
# already exists: matchms' save_as_mgf appends to an existing file (per its
# documentation; confirm for the installed version), so re-running this cell
# would otherwise duplicate every spectrum.
if not os.path.exists(output_file_path):
    os.makedirs(SAVE_PATH, exist_ok=True)
    save_as_mgf(all_spectra, output_file_path)

dict_keys(['spectra'])


In [7]:
# Round-trip check: read the merged file back and make sure it contains
# exactly as many spectra as were combined in memory.
loaded_spectra = [spectrum for spectrum in load_from_mgf(output_file_path)]
loaded_count = len(loaded_spectra)

print(f"Loaded spectra from saved file: {loaded_count}")

# Report the mismatch case first; otherwise confirm the counts agree.
if combined_count != loaded_count:
    print("Validation failed: Counts do not match!")
    print(f"Combined count: {combined_count}, Loaded count: {loaded_count}")
else:
    print("Validation successful: Combined and loaded spectra counts match.")

Loaded spectra from saved file: 803405
Validation successful: Combined and loaded spectra counts match.


In [8]:
# Display the metadata of the first reloaded spectrum as a spot check of what
# was written to the merged file.
loaded_spectra[0].metadata_dict()

{'charge': 1,
 'description': 'NIH NPAC ACONN collection of NP',
 'formula': 'C19H22O6',
 'inchi': 'InChI=1S/C19H22O6/c1-22-13-10-15(21)19(18(11-13)25-4)14(20)7-5-12-6-8-16(23-2)17(9-12)24-3/h6,8-11,21H,5,7H2,1-4H3',
 'smiles': 'COc1cc(O)c(C(=O)CCc2cc(OC)c(OC)cc2)c(OC)c1',
 'feature_id': '-1',
 'adduct': '[M+H]+',
 'spectype': 'ALL_MSN_TO_PSEUDO_MS2',
 'fragmentation_method': 'HCD',
 'isolation_window': '1.2000000476840569',
 'acquisition': 'Crude',
 'instrument_type': 'Orbitrap',
 'ims_type': 'none',
 'ion_source': 'ESI',
 'ionmode': 'positive',
 'dataset_id': 'MSVPLACEHOLDERID',
 'usi': 'mzspec:MSVPLACEHOLDERID:20230404_pluskal_nih_01P_A3_id_positive.mzML:-1',
 'scans': '-1',
 'precursor_purity': '1.0',
 'quality_chimeric': 'PASSED',
 'quality_explained_intensity': '0.97712445',
 'quality_explained_signals': '0.8',
 'collision_energy': '60.0',
 'num_peaks': '135',
 'compound_name': 'MEGxp0_001769',
 'parent_mass': '346.141638',
 'inchi_aux': 'MQEOTHTYNCMSAN-UHFFFAOYSA-N',
 'ms_level'