Spectra that share the same inchikey are combined into a single spectrum that contains all of their peaks.

In [1]:
import sys
sys.path.append('../')

import pandas as pd
import numpy as np

from collections import Counter, defaultdict
from scripts.standarise_data import count_spectrums
from settings import MSrawdata

from matchms.importing import load_from_msp
from matchms import Spectrum

# Folder prefix for every file this notebook reads or writes below
# (input .msp file, merged .msp file and the two summary CSV files).
data_path = '../data/processed/MS2MassBank/'


In [2]:
# Show which dataset name (MSrawdata, imported from settings) this run works on.
print('MS dataset: ', MSrawdata)

MS dataset:  MassBank_NIST_Feb20


In [2]:
# Load all spectra from '<MSrawdata>_metadata_toxfilter_peak.msp'.
# The result is materialised into a list because the cells below
# traverse it more than once.
spectrums = list(load_from_msp(data_path+MSrawdata+"_metadata_toxfilter_peak.msp"))

In [3]:
# Summary of the loaded spectra before merging (helper from
# scripts.standarise_data; it reports the totals shown in the output below).
count_spectrums(spectrums)

Total number of spectra: 3358
Total number of unique chemicals: 594
Average number of spectrums per unique chemical: 5.653198653198653


In [4]:
# Frequency of spectrums by inchikey: count how many loaded spectra carry each
# inchikey (i.e. before merging) and store the table as a CSV file.
inchikeys = [s.get('inchikey') for s in spectrums]
frequency_counter = Counter(inchikeys)

# One row per distinct inchikey, with its count in the 'Frequency' column.
df = pd.DataFrame.from_dict(frequency_counter, orient='index', columns=['Frequency'])
df.index.name = 'inchikey'
df.to_csv(data_path+'frequency_inchikeys_before_merging.csv')
df

Unnamed: 0_level_0,Frequency
inchikey,Unnamed: 1_level_1
GOJCZVPJCKEBQV-UHFFFAOYSA-N,6
YHMYGUUIMTVXNW-UHFFFAOYSA-N,6
ZQEIXNIJLIKNTD-UHFFFAOYSA-N,6
RXLOZRCLQMJJLC-UHFFFAOYSA-N,6
GNOIPBMMFNIUFM-UHFFFAOYSA-N,6
...,...
MBMQEIFVQACCCH-QBODLPLBSA-N,1
QCHFTSOMWOSFHM-WPRPVWTQSA-N,5
ZYGHJZDHTFUPRJ-UHFFFAOYSA-N,3
SNICXCGAKADSCV-JTQLQIEISA-N,3


In [5]:
# Bucket the loaded spectra by inchikey: every key maps to the list of
# spectra that carry it, kept in the order they were loaded.
spectrums_by_inchikey = defaultdict(list)
for spectrum in spectrums:
    spectrums_by_inchikey[spectrum.get('inchikey')].append(spectrum)
spectrums_by_inchikey

defaultdict(list,
            {'GOJCZVPJCKEBQV-UHFFFAOYSA-N': [Spectrum(precursor m/z=337.16, 9 fragments between 51.0 and 181.1),
              Spectrum(precursor m/z=337.16, 8 fragments between 51.0 and 181.1),
              Spectrum(precursor m/z=337.16, 2 fragments between 57.1 and 149.0),
              Spectrum(precursor m/z=337.16, 3 fragments between 57.1 and 152.1),
              Spectrum(precursor m/z=337.16, 1 fragments between 149.0 and 149.0),
              Spectrum(precursor m/z=337.16, 4 fragments between 149.0 and 181.1)],
             'YHMYGUUIMTVXNW-UHFFFAOYSA-N': [Spectrum(precursor m/z=151.03, 1 fragments between 151.0 and 151.0),
              Spectrum(precursor m/z=151.03, 6 fragments between 65.0 and 151.0),
              Spectrum(precursor m/z=151.03, 3 fragments between 93.1 and 151.0),
              Spectrum(precursor m/z=151.03, 1 fragments between 151.0 and 151.0),
              Spectrum(precursor m/z=151.03, 6 fragments between 65.0 and 151.0),
             

In [6]:
# Merge each group of spectra sharing an inchikey into one combined spectrum
# that holds the peaks of every spectrum in the group.
combined_spectrums = []
for inchikey, spectra in spectrums_by_inchikey.items():
    # Pool the peaks (m/z and matching intensities) of the whole group.
    mz_data = np.concatenate([s.mz for s in spectra])
    intensity_data = np.concatenate([s.intensities for s in spectra])

    # Order the pooled peaks by ascending m/z. A stable sort keeps peaks with
    # identical m/z in their pooled (per-spectrum) order, which NumPy's
    # default sort kind does not guarantee, so the merged peak list is
    # reproducible from run to run.
    sort_indices = np.argsort(mz_data, kind='stable')
    mz_data_sorted = mz_data[sort_indices]
    intensity_data_sorted = intensity_data[sort_indices]

    # Only the inchikey is carried over as metadata; any other metadata of
    # the source spectra is not copied to the combined spectrum.
    combined_spectrum = Spectrum(mz=mz_data_sorted,
                                 intensities=intensity_data_sorted,
                                 metadata={'inchikey': inchikey})
    combined_spectrums.append(combined_spectrum)



In [7]:
import os

from matchms.exporting import save_as_msp

# Write the merged spectra to a single .msp file.
combined_msp_path = data_path + MSrawdata + '_metadata_toxfilter_peak_combined.msp'

# NOTE(review): recent matchms releases document append as the default write
# mode of save_as_msp, so re-running this cell would add every merged spectrum
# to the file a second time. Removing a stale output first keeps the cell
# idempotent whichever matchms version is installed.
if os.path.exists(combined_msp_path):
    os.remove(combined_msp_path)

save_as_msp(combined_spectrums, combined_msp_path)

In [8]:
# Number of peaks in each combined spectrum, indexed by inchikey,
# saved as CSV and displayed.
inchikey_combined = [spectrum.get('inchikey') for spectrum in combined_spectrums]
peaks = [len(spectrum.peaks.mz) for spectrum in combined_spectrums]
df_peaks = pd.DataFrame(
    {'num_peaks': peaks},
    index=pd.Index(inchikey_combined, name='inchikey'),
)
df_peaks.to_csv(data_path + 'len_peaks_after_merging.csv')
df_peaks

Unnamed: 0_level_0,num_peaks
inchikey,Unnamed: 1_level_1
GOJCZVPJCKEBQV-UHFFFAOYSA-N,27
YHMYGUUIMTVXNW-UHFFFAOYSA-N,19
ZQEIXNIJLIKNTD-UHFFFAOYSA-N,63
RXLOZRCLQMJJLC-UHFFFAOYSA-N,58
GNOIPBMMFNIUFM-UHFFFAOYSA-N,13
...,...
MBMQEIFVQACCCH-QBODLPLBSA-N,4
QCHFTSOMWOSFHM-WPRPVWTQSA-N,25
ZYGHJZDHTFUPRJ-UHFFFAOYSA-N,8
SNICXCGAKADSCV-JTQLQIEISA-N,19
