In [4]:
import json
from pathlib import Path

import numpy as np
import pandas as pd
from rdkit import Chem
from rdkit.Chem import rdMolDescriptors, AllChem, DataStructs

def to_canonical_smiles(smiles):
    """Return the RDKit canonical SMILES for ``smiles``, or ``None``.

    Parameters
    ----------
    smiles : str or missing value
        Input SMILES string. Anything ``pd.isna`` reports as missing
        (``None``, ``NaN``, ``pd.NA``) is treated as "no structure".

    Returns
    -------
    str or None
        ``Chem.MolToSmiles(mol, canonical=True)`` for a parsable input;
        ``None`` when the input is missing, when ``Chem.MolFromSmiles``
        yields a falsy result, or when either RDKit call raises.
    """
    if pd.isna(smiles):
        return None
    try:
        mol = Chem.MolFromSmiles(smiles)
        return Chem.MolToSmiles(mol, canonical=True) if mol else None
    except Exception:
        # Best-effort conversion: any ordinary failure maps to None.
        # Deliberately not a bare ``except`` so that KeyboardInterrupt and
        # SystemExit still propagate and a long ``.apply`` can be stopped.
        return None

def smiles_to_fingerprint_single(smiles, fp_size=1024, radius=2):
    """Convert one SMILES string into a Morgan fingerprint bit array.

    Parameters
    ----------
    smiles : str or missing value
        SMILES string to encode. Non-string inputs (``None``, ``NaN``, ...)
        and blank strings are treated as "no structure".
    fp_size : int, default 1024
        Length of the returned array (passed to RDKit as ``nBits``).
    radius : int, default 2
        Radius passed to ``GetMorganFingerprintAsBitVect``.

    Returns
    -------
    numpy.ndarray
        1-D integer array of length ``fp_size``. It is all zeros when the
        input is missing, blank, or rejected by ``Chem.MolFromSmiles``.
    """
    # Any non-string input cannot be parsed; treating it like None keeps
    # NaN-style missing values from being handed to RDKit.
    if not isinstance(smiles, str) or not smiles.strip():
        return np.zeros(fp_size, dtype=int)

    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return np.zeros(fp_size, dtype=int)

    fp = rdMolDescriptors.GetMorganFingerprintAsBitVect(mol, radius, nBits=fp_size)
    # Same integer dtype as the fallback above, so stacking fingerprints for
    # many molecules does not upcast the whole matrix to float.
    return np.array(fp, dtype=int)

json_file = "foodb_2020_04_07_json/Compound.json"
output_path = "saved_data/fooddb.csv"

# Columns of the loaded table, passed explicitly so the frame still has
# them (and the 'raw_SMILES' lookup below works) when no record qualifies.
compound_columns = ['id', 'public_id', 'name', 'raw_SMILES']

compounds = []
skipped_lines = 0  # non-empty lines that were not a usable JSON object

# The file is consumed line by line, one JSON document per non-empty line.
# The encoding is fixed to UTF-8 so decoding does not depend on the
# platform's locale default.
with open(json_file, 'r', encoding='utf-8') as f:
    for line in f:
        line = line.strip()
        if not line:
            continue
        try:
            compound = json.loads(line)
        except json.JSONDecodeError:
            skipped_lines += 1
            continue
        if not isinstance(compound, dict):
            # Valid JSON but not an object; .get() would fail on it.
            skipped_lines += 1
            continue
        smiles = compound.get('moldb_smiles')
        # Records without a SMILES string are dropped.
        if smiles:
            compounds.append({
                'id': compound.get('id'),
                'public_id': compound.get('public_id'),
                'name': compound.get('name'),
                'raw_SMILES': smiles
            })

# Report dropped lines instead of discarding them silently.
if skipped_lines:
    print(f"건너뛴 줄 수 (잘못된 JSON): {skipped_lines}")

foodb_df = pd.DataFrame(compounds, columns=compound_columns)
# Unparsable structures become None in this column (see to_canonical_smiles).
foodb_df['canonical_SMILES'] = foodb_df['raw_SMILES'].apply(to_canonical_smiles)
print(f"로드된 화합물 수: {len(foodb_df)}")

foodb_df

[10:12:47] Explicit valence for atom # 31 N, 4, is greater than permitted
[10:12:47] Explicit valence for atom # 21 N, 4, is greater than permitted
[10:12:48] Explicit valence for atom # 1 Cl, 4, is greater than permitted
[10:12:48] Explicit valence for atom # 13 B, 4, is greater than permitted
[10:12:50] Explicit valence for atom # 34 N, 4, is greater than permitted
[10:12:50] Explicit valence for atom # 0 P, 11, is greater than permitted


로드된 화합물 수: 70413


Unnamed: 0,id,public_id,name,raw_SMILES,canonical_SMILES
0,4,FDB000004,Cyanidin 3-(6''-acetyl-galactoside),[H][C@]1(COC(C)=O)O[C@@]([H])(OC2=CC3=C(O)C=C(...,CC(=O)OC[C@H]1O[C@@H](Oc2cc3c(O)cc(O)cc3[o+]c2...
1,13,FDB000013,Cyanidin 3-(6''-succinyl-glucoside),[H][C@]1(COC(=O)CCC(O)=O)O[C@@]([H])(OC2=CC3=C...,O=C(O)CCC(=O)OC[C@H]1O[C@@H](Oc2cc3c(O)cc(O)cc...
2,14,FDB000014,Pelargonidin 3-(6''-succinyl-glucoside),[H][C@]1(COC(=O)CCC(O)=O)O[C@@]([H])(OC2=CC3=C...,O=C(O)CCC(=O)OC[C@H]1O[C@@H](Oc2cc3c(O)cc(O)cc...
3,24,FDB000024,Petunidin 3-O-(6''-acetyl-galactoside),[H][C@]1(COC(C)=O)OC(OC2=C([O+]=C3C=C(O)C=C(O)...,COc1cc(-c2[o+]c3cc(O)cc(O)c3cc2OC2O[C@H](COC(C...
4,25,FDB000025,Peonidin 3-(6''-acetyl-galactoside),[H][C@]1(COC(C)=O)OC(OC2=C([O+]=C3C=C(O)C=C(O)...,COc1cc(-c2[o+]c3cc(O)cc(O)c3cc2OC2O[C@H](COC(C...
...,...,...,...,...,...
70408,139984,FDB112151,gamma-Glutamylthreonine,C[C@@H](O)[C@H](NC(=O)CC[C@H](N)C(O)=O)C(O)=O,C[C@@H](O)[C@H](NC(=O)CC[C@H](N)C(=O)O)C(=O)O
70409,139985,FDB112152,gamma-Glutamyltryptophan,N[C@@H](CCC(=O)N[C@@H](CC1=CNC2=CC=CC=C12)C(O)...,N[C@@H](CCC(=O)N[C@@H](Cc1c[nH]c2ccccc12)C(=O)...
70410,139986,FDB112153,TG(i-16:0/18:0/10:0),[H][C@@](COC(=O)CCCCCCCCC)(COC(=O)CCCCCCCCCCCC...,CCCCCCCCCCCCCCCCCC(=O)O[C@H](COC(=O)CCCCCCCCC)...
70411,139987,FDB112154,TG(a-17:0/10:0/8:0)[rac],[H][C@@](COC(=O)CCCCCCC)(COC(=O)CCCCCCCCCCCCC(...,CCCCCCCCCC(=O)O[C@H](COC(=O)CCCCCCC)COC(=O)CCC...


In [None]:
# Encode every canonical SMILES as a fixed-length bit array; rows whose
# SMILES is missing come back as all-zero arrays from the helper.
fingerprints_list = foodb_df['canonical_SMILES'].apply(smiles_to_fingerprint_single).tolist()

# Bits are only 0/1, so one byte per bit is lossless. It keeps the matrix
# small and makes the CSV columns plain integers.
fingerprints = np.vstack(fingerprints_list).astype(np.uint8)

# Reuse foodb_df's index: concat(axis=1) aligns on index labels, so a fresh
# RangeIndex would misalign rows if foodb_df had been filtered beforehand.
fp_df = pd.DataFrame(
    fingerprints,
    columns=[f'X{i+1}' for i in range(fingerprints.shape[1])],
    index=foodb_df.index,
)
final_df = pd.concat([foodb_df, fp_df], axis=1)
assert len(final_df) == len(foodb_df), "fingerprint rows misaligned with compounds"

# Create the target directory first; to_csv does not create missing parents.
Path(output_path).parent.mkdir(parents=True, exist_ok=True)
final_df.to_csv(output_path, index=False)