In [1]:
from jupyter_client.session import json_packer
from matchms.importing import load_from_mgf
import os
from collections import defaultdict
import numpy as np
from collections import Counter
import pandas as pd
import json


In [2]:
DATA_PATH = "../../../data/MSn"

In [3]:
spectra = list(load_from_mgf("/Users/macbook/CODE/Majer:MassSpecGym/data/MSn/20241211_msn_library_pos_all_lib_MSn.mgf"))

In [4]:
# Collect all metadata keys from the spectra
metadata_keys = set()
for spectrum in spectra:
    metadata_keys.update(spectrum.metadata.keys())

print(f"Total unique metadata fields: {len(metadata_keys)}")
print("Metadata fields:")
for key in metadata_keys:
    print(key)

Total unique metadata fields: 39
Metadata fields:
acquisition
quality_explained_signals
inchi
precursor_mz
feature_id
fragmentation_method
precursor_purity
description
inchi_aux
other_matched_compounds_names
quality_explained_intensity
compound_name
retention_time
scans
collision_energy
other_matched_compounds
msn_collision_energies
usi
dataset_id
smiles
data_collector
msn_precursor_mzs
instrument_type
msn_fragmentation_methods
ionmode
formula
num_peaks
isolation_window
msn_isolation_windows
comment
charge
adduct
ms_level
quality_chimeric
spectype
principal_investigator
ion_source
ims_type
parent_mass


In [6]:
# Initialize a dictionary to hold lists of values for each metadata field
metadata_values = defaultdict(list)

# Collect values for each metadata field
for spectrum in spectra:
    for key in metadata_keys:
        value = spectrum.metadata.get(key, None)
        metadata_values[key].append(value)

In [8]:
metadata_frequencies = {}

for key, values in metadata_values.items():
    frequencies = Counter(values)
    metadata_frequencies[key] = frequencies

In [9]:
top_ten_values = {}

for key, freq_counter in metadata_frequencies.items():
    # Get the ten most common values and their counts
    top_ten = freq_counter.most_common(10)
    top_ten_values[key] = top_ten

In [10]:
for key in metadata_keys:
    print(f"\nMetadata Field: {key}")
    print("-" * (15 + len(key)))
    top_values = top_ten_values.get(key, [])
    if top_values:
        df = pd.DataFrame(top_values, columns=['Value', 'Frequency'])
        print(df)
    else:
        print("No data available.")


Metadata Field: acquisition
--------------------------
   Value  Frequency
0  Crude     803405

Metadata Field: quality_explained_signals
----------------------------------------
        Value  Frequency
0         1.0     504023
1         0.8       8402
2        0.75       8325
3   0.8333333       8135
4  0.85714287       7946
5       0.875       7558
6   0.8888889       6971
7   0.6666667       6782
8         0.9       6156
9  0.90909094       5533

Metadata Field: inchi
--------------------
                                               Value  Frequency
0  InChI=1S/C21H24F3N7O/c1-12-28-19(26-2)31-20(29...        541
1  InChI=1S/C22H29N7O/c1-15-13-17(29-9-11-30-12-1...        537
2  InChI=1S/C22H31NO/c1-16(2)23(17(3)4)14-13-20(1...        464
3  InChI=1S/C24H24ClFN2O/c1-15(24(29)28-20-9-6-18...        461
4  InChI=1S/C18H18N2O4/c21-15(19-24)10-2-1-3-11-2...        442
5  InChI=1S/C12H8N2/c1-3-9-5-6-10-4-2-8-14-12(10)...        442
6  InChI=1S/C15H17NO2/c1-11(17)16-9-8-13-5-3-4-12... 

In [15]:
metadata_frequencies['quality_explained_signals']

Counter({'1.0': 504023,
         '0.8': 8402,
         '0.75': 8325,
         '0.8333333': 8135,
         '0.85714287': 7946,
         '0.875': 7558,
         '0.8888889': 6971,
         '0.6666667': 6782,
         '0.9': 6156,
         '0.90909094': 5533,
         '0.9166667': 5335,
         None: 4885,
         '0.5': 4741,
         '0.9230769': 4679,
         '0.9285714': 4392,
         '0.93333334': 3995,
         '0.9375': 3706,
         '0.9411765': 3402,
         '0.9444444': 3352,
         '0.7777778': 3021,
         '0.94736844': 2922,
         '0.71428573': 2811,
         '0.8181818': 2731,
         '0.95': 2700,
         '0.6': 2464,
         '0.95238096': 2440,
         '0.84615386': 2434,
         '0.95454544': 2384,
         '0.8666667': 2159,
         '0.95652175': 2116,
         '0.9583333': 1990,
         '0.96': 1947,
         '0.88235295': 1918,
         '0.96153843': 1787,
         '0.8947368': 1693,
         '0.962963': 1672,
         '0.72727275': 1589,
         '

In [18]:
# Serialize data into file:
# json.dump( metadata_frequencies, open( os.path.join(DATA_PATH, "20240916_metadata_frequencies_all_pos.json"), 'w' ) )

## Canonization Analyzis

In [6]:
import os
import json
from typing import List, Dict, Set, Tuple, Union
from massspecgym.tools.analyzers import analyze_canonical_smiles, compare_canonical_smiles
from rdkit import Chem
import time

In [7]:
# Use old one to map old SMILES before canonization to PubChem and RDKit
mgf_path = "/Users/macbook/CODE/Majer:MassSpecGym/data/MSn/20240929_msn_library_pos_all_lib_MSn.mgf"
json_path = "/Users/macbook/CODE/Majer:MassSpecGym/data/MSn/smiles_canonicalization_all.json"

In [8]:
with open(json_path, 'r') as f:
    smiles_dict = json.load(f)

In [9]:
spectra = list(load_from_mgf(mgf_path))

In [10]:
smiles_in_spectra = set()
for spectrum in spectra:
    smiles = spectrum.metadata.get('smiles') or spectrum.metadata.get('SMILES')
    if smiles:
        smiles_in_spectra.add(smiles)

In [11]:
print(f"Total unique SMILES in spectra: {len(smiles_in_spectra)}")
print(f"Total SMILES in canonicalization PubChem dict: {len(smiles_dict)}")


problematic_smiles = smiles_in_spectra - set(smiles_dict.keys())
print(f"Found {len(problematic_smiles)} problematic OLD SMILES not in the canonicalization dict.")    

Total unique SMILES in spectra: 14008
Total SMILES in canonicalization dict: 14008
Found 0 problematic OLD SMILES not in the canonicalization dict.


In [12]:
_, canonical_smiles_set_rdkit, invalid_smiles_set_rdkit = analyze_canonical_smiles(spectra)

=== SMILES Processing Statistics ===
Mode: SPECTRA
Total SMILES extracted: 14008
Unique original SMILES: 14008
Unique canonical SMILES: 14008
Number of invalid SMILES: 0
Number of SMILES unchanged after RDKit canonicalization: 6447



In [13]:
# Extract PubChem canonical SMILES from the JSON dictionary
canonical_smiles_set_pubchem = set(smiles_dict.values())

print(f"Total unique PubChem canonical SMILES: {len(canonical_smiles_set_pubchem)}")
print(f"Total unique RDKit canonical SMILES: {len(canonical_smiles_set_rdkit)}")

# Check if both sets are identical
if canonical_smiles_set_pubchem == canonical_smiles_set_rdkit:
    print("\nPubChem and RDKit canonical SMILES sets are identical.")
else:
    print("\nPubChem and RDKit canonical SMILES sets differ.")

    pubchem_not_rdkit = canonical_smiles_set_pubchem - canonical_smiles_set_rdkit
    print(f"\n SMILES in PubChem but not in RDKit: {len(pubchem_not_rdkit)}")

    rdkit_not_pubchem = canonical_smiles_set_rdkit - canonical_smiles_set_pubchem
    print(f"\n SMILES in RDKit but not in PubChem: {len(rdkit_not_pubchem)}")

    overlap = canonical_smiles_set_pubchem.intersection(canonical_smiles_set_rdkit)
    print(f"\n Number of overlapping canonical SMILES: {len(overlap)}")

Total unique PubChem canonical SMILES: 14008
Total unique RDKit canonical SMILES: 14008

PubChem and RDKit canonical SMILES sets differ.

 SMILES in PubChem but not in RDKit: 13944

 SMILES in RDKit but not in PubChem: 13944

 Number of overlapping canonical SMILES: 64


In [5]:
compare_canonical_smiles(mgf_path, json_path)

To make sense of analyzis is recommended to use version of dataset without canonizations, date before 29/09
Loading spectra from MGF file...
Loaded 803405 spectra.

Loading PubChem canonical SMILES from JSON...
Loaded canonical SMILES dictionary with 14008 entries.

Extracting unique SMILES from spectra...
Total unique SMILES in spectra: 14008
Total SMILES in canonicalization dict (PubChem): 14008

Found 0 problematic SMILES not in the canonicalization dict.

Analyzing canonical SMILES with RDKit...
=== SMILES Processing Statistics ===
Mode: SPECTRA
Total SMILES extracted: 14008
Unique original SMILES: 14008
Unique canonical SMILES: 14008
Number of invalid SMILES: 0
Number of SMILES unchanged after RDKit canonicalization: 6447

Total unique RDKit canonical SMILES: 14008
Total invalid SMILES by RDKit: 0

Total unique PubChem canonical SMILES: 14008

Comparing PubChem and RDKit canonical SMILES sets...

 PubChem and RDKit canonical SMILES sets differ.

 SMILES in PubChem but not in RDKit