In [7]:
import json
import pandas as pd
from rdkit import Chem
from rdkit.Chem import inchi

In [8]:
# Load the JSON data
file_path = "GNPS-LIBRARY.json"
with open(file_path, 'r') as f:
    data = json.load(f)

compound_data = {}

def is_valid_smiles(smiles):
    """Check if a SMILES string is valid."""
    if not smiles or smiles.strip() in {"", "N/A"}:  # Catch empty, space-only, or "N/A"
        return False
    return Chem.MolFromSmiles(smiles) is not None

def inchi_to_smiles(inchi_str):
    """Convert InChI to SMILES if possible."""
    if not inchi_str or inchi_str.strip() in {"", "N/A"}:  # Check for empty values
        return None
    try:
        mol = inchi.MolFromInchi(inchi_str)
        return Chem.MolToSmiles(mol) if mol else None
    except:
        return None  # Avoid crashes if conversion fails

# Process each compound
for compound in data:
    compound_name = compound.get("Compound_Name", "Unknown")
    smiles = compound.get("Smiles") or compound.get("SMILES")
    inchi_str = compound.get("INCHI")

    # If SMILES is invalid, try generating it from InChI
    if not is_valid_smiles(smiles):
        smiles = inchi_to_smiles(inchi_str)

    if not is_valid_smiles(smiles):  # If still invalid, skip
        with open("missing_smiles.log", "a") as log_file:
            log_file.write(f"Skipping {compound_name}: Missing or invalid SMILES/InChI\n")
        continue

    # Extract metadata
    metadata = {key: compound.get(key) for key in [
        "spectrum_id", "source_file", "task", "scan", "ms_level",
        "library_membership", "Precursor_MZ", "ExactMass", "Charge",
        "Compound_Source", "Instrument", "Ion_Source", "Ion_Mode",
    ]}
    
    # Convert SMILES to InChI/InChIKey 
    mol = Chem.MolFromSmiles(smiles)
    if mol:
        try:
            metadata.update({
                "smiles": smiles,
                "inchi": inchi.MolToInchi(mol),
                "inchikey": inchi.MolToInchiKey(mol)
            })
        except:
            metadata.update({"inchi": None, "inchikey": None})
    else:
        metadata.update({"inchi": None, "inchikey": None})

    # Parse peaks JSON 
    peaks_json = compound.get("peaks_json")
    try:
        spectra_df = pd.DataFrame(json.loads(peaks_json), columns=["m/z", "intensity"]) if isinstance(peaks_json, str) and peaks_json.strip().lower() != "n/a" else pd.DataFrame(columns=["m/z", "intensity"])
    except:
        spectra_df = pd.DataFrame(columns=["m/z", "intensity"])

    # Store the compound data
    compound_data[compound_name] = {"metadata": metadata, "spectra": spectra_df}
















































































[19:00:46] ERROR: 





































[19:00:47] Explicit valence for atom # 49 Na, 2, is greater than permitted













































[19:00:48] SMILES Parse Error: syntax error while parsing: InChI=1S/C16H21NO2/c1-2-3-4-5-6-11-14-16(19)15(18)12-9-7-8-10-13(12)17-14/h7-10,19H,2-6,11H2,1H3,(H,17,18)
[19:00:48] SMILES Parse Error: Failed parsing SMILES ' InChI=1S/C16H21NO2/c1-2-3-4-5-6-11-14-16(19)15(18)12-9-7-8-10-13(12)17-14/h7-10,19H,2-6,11H2,1H3,(H,17,18)' for input: ' InChI=1S/C16H21NO2/c1-2-3-4-5-6-11-14-16(19)15(18)12-9-7-8-10-13(12)17-14/h7-10,19H,2-6,11H2,1H3,(H,17,18)'
[19:00:48] ERROR: 



































[19:00:50] Can't kekulize mol.  Unkekulized atoms: 10 11 12 14 16
[19:00:50] Can't kekulize mol.  Unkekulized atoms: 10 11 12 14 16






























[19:00:50] SMILES Parse Error: unclosed ring for input: 'OC1=CC(C(OC)=O)=C(OC2=CC(C)=CC(O)=C2C(O)=O)C(OC)=C2'
[19:00:50] SMILES Parse Error: unclosed ring for input: 'O=C1C2=C(C=C(C)C=C2O)OC3=CC(O)=CC(C(OC)=O)=C32'
[19:00:50] SMILES Parse Error: unclosed ring for input: 'O=C([C@H](CC)C)O[C@H]1CCC=C2C1[C@@H](CC[C@@H](O)C[C@@H](O)CC(OC)=O)[C@@H](C)C=C3'
[19:00:50] SMILES Parse Error: unclosed ring for input: 'O=C(N[C@@H](CCCCCC(CC)=O)C(N[C@@H](CC1=CN(OC)C2=C1C=CC=C2)C3=O)=O)[C@@H]4N(C([C@H]([C@H](CC)C)N3)=O)CCCC5'

[19:00:50] SMILES Parse Error: unclosed ring for input: 'O=C(N(C(C=CC=C1)=C1C(N(C)[C@@]2([H])CC3=CC=CC=C3)=O)C2=N4)C5=C4C=CC=C6'




[19:00:50] SMILES Parse Error: unclosed ring for input: 'OC1=CC=C(CC(C(NC(C(CC)C)C(OC(C(CCCCCCCCCC)C)CC(NC(C(NC(C(NC(C(NC2CCC(N)=O)=O)C)=O)C)=O)C(O)C)=O)=O)=O)NC2=O)C=C2'

















[19:00:54] ERROR: 

[19:00:54] ERROR: 

[19:00:54] ERROR: 

[19:00:54] ERROR: 

[19:00:54] ERROR: 

















[19:00:54] Explicit valence for atom # 22 O, 3, is greater than permitted




[19:00:54] Explicit valence for atom # 31 O, 3, is greater than permitted

[19:00:54] Explicit valence for atom # 6 O, 3, is greater than permitted
[19:00:54] Explicit valence for atom # 2 O, 3, is greater than permitted



[19:00:54] Explicit valence for atom # 4 O, 3, is greater than permitted
[19:00:54] Explicit valence for atom # 7 O, 3, is greater than permitted

[19:00:54] Explicit valence for atom # 35 O, 3, is greater than permitted

[19:00:54] Explicit valence for atom # 35 O, 3, is greater than permitted


[19:00:54] Explicit valence for atom # 35 O, 3, is greater than permitted
[19:00:54] Explicit valence for atom # 35 O, 3, is greater than permitted










[19:00:55] ERROR: 















































































































[19:00:56] non-ring atom 88 marked aromatic

[19:00:56] non-ring atom 90 marked aromatic



































































































































































[19:00:57] SMILES Parse Error: syntax error while parsing: NA
[19:00:57] SMILES Parse Error: Failed parsing SMILES 'NA' for input: 'NA'
[19:00:57] ERROR: 









































































[19:00:58] ERROR: 

























[19:00:58] SMILES Parse Error: syntax error while parsing: (CC(=O)O3)O)C
[19:00:58] SMILES Parse Error: Failed parsing SMILES '(CC(=O)O3)O)C' for input: '(CC(=O)O3)O)C'


































































[19:00:59] Explicit valence for atom # 17 N, 4, is greater than permitted
[19:00:59] Explicit valence for atom # 19 N, 4, is greater than permitted



[19:00:59] SMILES Parse Error: syntax error while parsing: CC1(C)[C@H](OC(C)=O)[C@H](O)C[C@]([C@@]1([H])CC[C@H]2C)(C)[C@]32CC4=C(O)C=C(CO)C(C=O)=C4O3&gt;&gt;
[19:00:59] SMILES Parse Error: Failed parsing SMILES 'CC1(C)[C@H](OC(C)=O)[C@H](O)C[C@]([C@@]1([H])CC[C@H]2C)(C)[C@]32CC4=C(O)C=C(CO)C(C=O)=C4O3&gt;&gt;' for input: 'CC1(C)[C@H](OC(C)=O)[C@H](O)C[C@]([C@@]1([H])CC[C@H]2C)(C)[C@]32CC4=C(O)C=C(CO)C(C=O)=C4O3&gt;&gt;'






















































































































































































































































































[19:01:04] SMILES Parse Error: syntax error while parsing: CC1(C)[C@H](O)CC[C@]([C@@]1([H])CC[C@H]2C)(C)[C@]32CC4=C(O)C=C(C=O)C(C=O)=C4O3&gt;&gt;
[19:01:04] SMILES Parse Error: Failed parsing SMILES 'CC1(C)[C@H](O)CC[C@]([C@@]1([H])CC[C@H]2C)(C)[C@]32CC4=C(O)C=C(C=O)C(C=O)=C4O3&gt;&gt;' for input: 'CC1(C)[C@H](O)CC[C@]([C@@]1([H])CC[C@H]2C)(C)[C@]32CC4=C(O)C=C(C=O)C(C=O)=C4O3&gt;&gt;'
[19:01:04] SMILES Parse Error: syntax error while parsing: CC1(C)[C@H](OC(C)=O)CC[C@]([C@@]1([H])CC[C@H]2C)(C)[C@]32CC4=C(O)C=C(CO)C(C=O)=C4O3&gt;&gt;
[19:01:04] SMILES Parse Error: Failed parsing SMILES 'CC1(C)[C@H](OC(C)=O)CC[C@]([C@@]1([H])CC[C@H]2C)(C)[C@]32CC4=C(O)C=C(CO)C(C=O)=C4O3&gt;&gt;' for input: 'CC1(C)[C@H](OC(C)=O)CC[C@]([C@@]1([H])CC[C@H]2C)(C)[C@]32CC4=C(O)C=C(CO)C(C=O)=C4O3&gt;&gt;'
[19:01:04] SMILES Parse Error: syntax error while parsing: CC1(C)[C@H](OC(C)=O)[C@H](O)C[C@]([C@@]1([H])CC[C@H]2C)(C)[C@]32CC4=C(O)C=C(CO)C(C=O)=C4





































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































[19:01:13] SMILES Parse Error: extra open parentheses for input: 'COC(=O)C1=CCCC2C3(C)CC(OC(=O)C3CC(OC3OC(CO)C(O)C(O)C3O'
[19:01:13] ERROR: 










[19:01:14] Explicit valence for atom # 19 N, 4, is greater than permitted





[19:01:18] SMILES Parse Error: syntax error while parsing: 1S/C17H22O2/c1-3-5-6-7-8-9-10-14-17(19)15-12-11-13-16(18)4-2/h3-4,10,14,16-19H,1-2,5-9H2
[19:01:18] SMILES Parse Error: Failed parsing SMILES '1S/C17H22O2/c1-3-5-6-7-8-9-10-14-17(19)15-12-11-13-16(18)4-2/h3-4,10,14,16-19H,1-2,5-9H2' for input: '1S/C17H22O2/c1-3-5-6-7-8-9-10-14-17(19)15-12-11-13-16(18)4-2/h3-4,10,14,16-19H,1-2,5-9H2'
[19:01:18] ERROR: 




[19:01:26] ERROR: 












[19:01:26] SMILES Parse Error: syntax error while parsing: -O=C1O[C@@H](C2=CC=C(C(OC)=C2)O)[C@@H](C(O)=O)/C1=C\C3=CC=C(C(OC)=C3)O
[19:01:26] SMILES Parse Error: Failed parsing SMILES '-O=C1O[C@@H](C2=CC=C(C(OC)=C2)O)[C@@H](C(O)=O)/C1=C\C3=CC=C(C(OC)=C3)O' for input: '-O=C1O


















































[19:01:27] SMILES Parse Error: syntax error while parsing: O=C1C=2C=CC=CC2OC3=C(O)C(OC)=C(OC)C=C13;
[19:01:27] SMILES Parse Error: Failed parsing SMILES 'O=C1C=2C=CC=CC2OC3=C(O)C(OC)=C(OC)C=C13;' for input: 'O=C1C=2C=CC=CC2OC3=C(O)C(OC)=C(OC)C=C13;'













[19:01:27] Explicit valence for atom # 38 O, 3, is greater than permitted





[19:01:27] SMILES Parse Error: syntax error while parsing: ;O=C1C=2C(O)=CC(O)=CC2OC=3C(O)=CC=C(C13)CC=C(C)C
[19:01:27] SMILES Parse Error: Failed parsing SMILES ';O=C1C=2C(O)=CC(O)=CC2OC=3C(O)=CC=C(C13)CC=C(C)C' for input: ';O=C1C=2C(O)=CC(O)=CC2OC=3C(O)=CC=C(C13)CC=C(C)C'































































































































































[19:01:28] SMILES Parse Error: syntax error while parsing: p(MeOx6)H+
[19:01:28] SMILES Parse Error: Failed parsing SMILES 'p(MeOx6)H+' for input: 'p(MeOx6)H+'




























































































































































































































































































[19:01:29] Explicit valence for atom # 10 Na, 2, is greater than permitted











[19:01:29] Explicit valence for atom # 10 Na, 2, is greater than permitted









[19:01:29] Explicit valence for atom # 9 Na, 2, is greater than permitted











[19:01:29] Explicit valence for atom # 50 Na, 2, is greater than permitted


[19:01:29] Explicit valence for atom # 29 Na, 2, is greater than permitted



















[19:01:29] Explicit valence for atom # 35 Na, 2, is greater than permitted




























[19:01:29] Explicit valence for atom # 53 Na, 2, is greater than permitted
























































































































































[19:01:30] SMILES Parse Error: extra open parentheses for input: '[H][C@]1([C@H](CC2=CNC3=CC=CC=C32)'
[19:01:30] SMILES Parse Error: extra open parentheses for input: '[H][C@]1([C@H](CC2=CNC3=CC=CC=C32)'
[19:01:30] SMILES Parse Error: extra open parentheses for input: '[H][C@]1([C@H](CC2=CNC3=CC=CC=C32)N'
[19:01:30] SMILES Parse Error: syntax error while parsing: CC([C@H]1C)=C[C@@](/C=C/C[C@H](C)/C=
[19:01:30] SMILES Parse Error: Failed parsing SMILES 'CC([C@H]1C)=C[C@@](/C=C/C[C@H](C)/C=' for input: 'CC([C@H]1C)=C[C@@](/C=C/C[C@H](C)/C='




[19:01:31] SMILES Parse Error: syntax error while parsing: lydicamycin
[19:01:31] SMILES Parse Error: Failed parsing SMILES 'lydicamycin' for input: 'lydicamycin'
[19:01:31] ERROR: 

[19:01:31] SMILES Parse Error: syntax error while parsing: lydicamycin
[19:01:31] SMILES Parse Error: Failed parsing SMILES 'lydicamycin' for input: 'lydicamycin'
[19:01:31] ERROR: 









































































In [14]:
# Example of accessing data:
compound_name = "3-Des-Microcystein_LR"
metadata = compound_data[compound_name]["metadata"]
spectra_df = compound_data[compound_name]["spectra"]

print("Stored spectra:", compound_data[compound_name]["spectra"], "\n")
print(metadata["smiles"],"\n")  
print(metadata["inchi"])


Stored spectra:             m/z  intensity
0    289.286377     8068.0
1    295.545288    22507.0
2    298.489624     3925.0
3    317.324951    18742.0
4    319.655945     8604.0
..          ...        ...
213  954.491577   123937.0
214  963.686768   261578.0
215  964.524658   318164.0
216  965.192139   124405.0
217  982.221924    27147.0

[218 rows x 2 columns] 

CC(C)CC1NC(=O)C(C)NC(=O)C(=C)N(C)C(=O)CCC(NC(=O)C(C)C(NC(=O)C(CCCNC(N)=N)NC(=O)C(C)C(NC1=O)C(O)=O)\C=C\C(\C)=C\C(C)C(O)Cc1ccccc1)C(O)=O 

InChI=1S/C48H72N10O12/c1-25(2)22-36-45(66)57-39(47(69)70)29(6)41(62)54-34(16-13-21-51-48(49)50)44(65)53-33(18-17-26(3)23-27(4)37(59)24-32-14-11-10-12-15-32)28(5)40(61)55-35(46(67)68)19-20-38(60)58(9)31(8)43(64)52-30(7)42(63)56-36/h10-12,14-15,17-18,23,25,27-30,33-37,39,59H,8,13,16,19-22,24H2,1-7,9H3,(H,52,64)(H,53,65)(H,54,62)(H,55,61)(H,56,63)(H,57,66)(H,67,68)(H,69,70)(H4,49,50,51)/b18-17+,26-23+


In [10]:
# Print the first 10 compounds' metadata
for i, (compound_name, compound_info) in enumerate(compound_data.items()):
    if i >= 10:  # Limit to first 10 compounds
        break
    
    metadata = compound_info["metadata"]
    
    print(f"Compound {i+1}: {compound_name}")
    print("Metadata:")
    for key, value in metadata.items():
        print(f"  {key}: {value}")
    print("-" * 80)  # Separator for readability
    


Compound 1: 3-Des-Microcystein_LR
Metadata:
  spectrum_id: CCMSLIB00000001547
  source_file: 130618_Ger_Jenia_WT-3-Des-MCLR_MH981.4-qb.1.1..mgf
  task: 47daa4396adb426eaa5fa54b6ce7dd5f
  scan: 1
  ms_level: 2
  library_membership: GNPS-LIBRARY
  Precursor_MZ: 981.54
  ExactMass: 0.0
  Charge: 0
  Compound_Source: Isolated
  Instrument: qTof
  Ion_Source: LC-ESI
  Ion_Mode: Positive
  smiles: CC(C)CC1NC(=O)C(C)NC(=O)C(=C)N(C)C(=O)CCC(NC(=O)C(C)C(NC(=O)C(CCCNC(N)=N)NC(=O)C(C)C(NC1=O)C(O)=O)\C=C\C(\C)=C\C(C)C(O)Cc1ccccc1)C(O)=O
  inchi: InChI=1S/C48H72N10O12/c1-25(2)22-36-45(66)57-39(47(69)70)29(6)41(62)54-34(16-13-21-51-48(49)50)44(65)53-33(18-17-26(3)23-27(4)37(59)24-32-14-11-10-12-15-32)28(5)40(61)55-35(46(67)68)19-20-38(60)58(9)31(8)43(64)52-30(7)42(63)56-36/h10-12,14-15,17-18,23,25,27-30,33-37,39,59H,8,13,16,19-22,24H2,1-7,9H3,(H,52,64)(H,53,65)(H,54,62)(H,55,61)(H,56,63)(H,57,66)(H,67,68)(H,69,70)(H4,49,50,51)/b18-17+,26-23+
  inchikey: IYDKWWDUBYWQGF-NNAZGLEUSA-N
------------------

In [11]:
# Check for missing or invalid "smiles" values in the metadata dictionary
missing_smiles = []

for compound_name, compound_info in compound_data.items():
    smiles = compound_info["metadata"].get("smiles", "").strip()  # Get and clean the SMILES string
    
    if not smiles or smiles in ["N/A", " ", ""]:
        missing_smiles.append(compound_name)

# Print results
if missing_smiles:
    print(f"⚠️ {len(missing_smiles)} compounds are missing valid SMILES values:")
    for compound in missing_smiles[:10]:  # Show first 10 missing cases
        print(f"  - {compound}")
    print("...")
else:
    print("✅ All compounds have valid SMILES values.")


✅ All compounds have valid SMILES values.


In [12]:
spectra_count = sum(1 for compound in compound_data.values() if not compound["spectra"].empty)
print(f"Total number of spectra: {spectra_count}")


Total number of spectra: 7386


In [13]:
import json

# Load the JSON data
file_path = "GNPS-LIBRARY.json"
with open(file_path, 'r') as f:
    data = json.load(f)

# Count valid spectra
spectra_count = sum(1 for compound in data if compound.get("peaks_json") and compound["peaks_json"].strip().lower() != "n/a")

print(f"Total number of spectra in the original JSON: {spectra_count}")


Total number of spectra in the original JSON: 14674
