In [2]:
import json
import pandas as pd
import numpy as np
import multiprocessing
import pickle
import os
import gc
from rdkit import Chem
from rdkit.Chem import rdMolDescriptors, AllChem, DataStructs
from mordred import Calculator, descriptors
from tqdm import tqdm

In [3]:
def to_canonical_smiles(smiles):
    """Return the RDKit canonical SMILES for ``smiles``, or ``None`` if unusable.

    Args:
        smiles: A SMILES string. ``None`` / NaN are treated as missing.

    Returns:
        The canonical SMILES string produced by ``Chem.MolToSmiles``, or
        ``None`` when the input is missing, ``Chem.MolFromSmiles`` returns a
        falsy molecule, or RDKit raises while processing the input.
    """
    if pd.isna(smiles):
        return None
    try:
        mol = Chem.MolFromSmiles(smiles)
        return Chem.MolToSmiles(mol, canonical=True) if mol else None
    except Exception:
        # Best-effort conversion: any RDKit failure maps to "no structure".
        # Catch Exception rather than using a bare except so that
        # KeyboardInterrupt / SystemExit still stop a long DataFrame.apply().
        return None

def smiles_to_fingerprint(smiles, fp_size=1024, radius=2):
    """Encode a SMILES string as a Morgan bit-vector fingerprint.

    Args:
        smiles: SMILES string; ``None`` / NaN are treated as missing.
        fp_size: Number of bits in the fingerprint (length of the output).
        radius: Morgan radius passed to RDKit.

    Returns:
        A 1-D numpy array of length ``fp_size``. It is all zeros when the
        input is missing or RDKit cannot parse it; otherwise it holds the
        bits written by ``DataStructs.ConvertToNumpyArray``.
    """
    bits = np.zeros(fp_size)

    # Missing input never reaches RDKit; an unparsable one yields mol=None.
    is_missing = smiles is None or pd.isna(smiles)
    mol = None if is_missing else Chem.MolFromSmiles(smiles)
    if mol is None:
        return bits

    bit_vect = rdMolDescriptors.GetMorganFingerprintAsBitVect(
        mol, radius=radius, nBits=fp_size
    )
    DataStructs.ConvertToNumpyArray(bit_vect, bits)
    return bits

def load_foodb_data(json_file="foodb_2020_04_07_json/Compound.json"):
    """Load compounds that have a SMILES from a JSON-lines compound file.

    Each non-blank line of ``json_file`` is parsed as one JSON object. Lines
    that are not valid JSON are skipped. Only records with a truthy
    ``moldb_smiles`` value are kept.

    Args:
        json_file: Path to the JSON-lines file (one compound per line).

    Returns:
        A DataFrame with columns ``id`` (the record's ``public_id``),
        ``name``, ``raw_SMILES`` (the record's ``moldb_smiles``) and
        ``canonical_SMILES`` (``to_canonical_smiles`` applied to
        ``raw_SMILES``). The frame is empty, but still has these columns,
        when no record qualifies.

    Side effects:
        Prints the number of loaded compounds.
    """
    compounds = []
    # JSON text is UTF-8; do not depend on the platform's locale encoding.
    with open(json_file, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                compound = json.loads(line)
            except json.JSONDecodeError:
                # Deliberately tolerant: malformed lines are ignored.
                continue
            smiles = compound.get('moldb_smiles')
            if smiles:
                compounds.append({
                    'id': compound.get('public_id'),
                    'name': compound.get('name'),
                    'raw_SMILES': smiles
                })

    # Explicit columns keep the schema stable even when nothing was loaded
    # (otherwise the 'raw_SMILES' lookup below raises KeyError).
    foodb_df = pd.DataFrame(compounds, columns=['id', 'name', 'raw_SMILES'])
    foodb_df['canonical_SMILES'] = foodb_df['raw_SMILES'].apply(to_canonical_smiles)
    print(f"로드된 화합물 수: {len(foodb_df)}")
    return foodb_df

def process_molecules(smiles_list, pkl_file='processed_molecules.pkl'):
    """Build 3D-embedded RDKit molecules for ``smiles_list``, with a pickle cache.

    If ``pkl_file`` exists, its contents are returned as-is and
    ``smiles_list`` is not processed. Otherwise every usable SMILES is
    parsed, hydrogens are added, a conformer is embedded (``randomSeed=42``)
    and MMFF-optimized; the result is then written to ``pkl_file``.

    Args:
        smiles_list: Sequence of SMILES strings (``None``, ``''`` and NaN
            entries are skipped).
        pkl_file: Path of the cache file to read from / write to.

    Returns:
        ``(mols, valid_indices)`` where ``valid_indices[k]`` is the position
        in ``smiles_list`` of the SMILES that produced ``mols[k]``.

    Side effects:
        Prints progress/summary messages and may create ``pkl_file``.
    """
    if os.path.exists(pkl_file):
        # NOTE(security): pickle.load can execute arbitrary code. Only point
        # pkl_file at a cache this notebook produced itself.
        with open(pkl_file, 'rb') as f:
            data = pickle.load(f)
        print(f"불러온 분자 수: {len(data['mols'])}")
        cached_indices = data['valid_indices']
        # The cache is not keyed on the input. Indices past the end of
        # smiles_list prove it was built from a different list.
        if len(cached_indices) > 0 and max(cached_indices) >= len(smiles_list):
            print(
                f"WARNING: cache '{pkl_file}' references index {max(cached_indices)} "
                f"but smiles_list has only {len(smiles_list)} entries; "
                "the cache looks stale - delete it to recompute."
            )
        return data['mols'], cached_indices

    mols, valid_indices = [], []
    for i, smi in enumerate(tqdm(smiles_list, desc="Processing SMILES")):
        if smi is None or smi == '' or pd.isna(smi):
            continue
        try:
            mol = Chem.MolFromSmiles(smi)
            if mol is None:
                continue
            mol = Chem.AddHs(mol)
            result = AllChem.EmbedMolecule(mol, randomSeed=42)
            # EmbedMolecule result 0 is the only value treated as success.
            if result == 0:
                AllChem.MMFFOptimizeMolecule(mol, maxIters=1000)
                mols.append(mol)
                valid_indices.append(i)
        except Exception:
            # Best-effort: a molecule RDKit cannot handle is skipped.
            # Exception (not a bare except) keeps this long loop
            # interruptible via KeyboardInterrupt / kernel interrupt.
            continue

    with open(pkl_file, 'wb') as f:
        pickle.dump({'mols': mols, 'valid_indices': valid_indices}, f)
    print(f"Valid molecules: {len(mols)}/{len(smiles_list)}")
    return mols, valid_indices

def get_descriptor_list(ratio='5x'):
    """Resolve the selected descriptor names for ``ratio`` to mordred descriptors.

    Reads ``../descriptor_selection.csv`` and takes the non-null values of the
    column ``descriptors_filtered_FTO_training_{ratio}_ignore3D_False.csv`` as
    the requested descriptor names. Each name is resolved against the
    descriptors of ``Calculator(descriptors)``:

    1. an exact match on ``str(descriptor)`` is preferred;
    2. otherwise the first descriptor whose string contains the name is used
       (the original matching rule, kept as a fallback).

    Exact matching matters because a substring test alone picks the first
    descriptor that merely *contains* the requested name, so e.g. a request
    for ``MATS1p`` can resolve to ``MATS1pe`` if that one is listed earlier.

    Args:
        ratio: Ratio tag used to build the column name (e.g. ``'5x'``).

    Returns:
        List of resolved descriptor objects, in the order of the CSV column.
        Names that cannot be resolved are skipped with a printed warning.

    Side effects:
        Evaluates every resolved descriptor once on ethanol (``'CCO'`` with
        hydrogens added) and prints the value as a sanity check.
    """
    selected_descriptor = pd.read_csv('../descriptor_selection.csv')
    filename = f'descriptors_filtered_FTO_training_{ratio}_ignore3D_False.csv'

    mol = Chem.AddHs(Chem.MolFromSmiles('CCO'))
    all_descriptors = list(Calculator(descriptors).descriptors)

    # Name -> descriptor lookup for exact matches (first occurrence wins).
    by_name = {}
    for desc in all_descriptors:
        by_name.setdefault(str(desc), desc)

    descriptor_list = []
    for desc_name in selected_descriptor[filename].dropna().tolist():
        desc = by_name.get(desc_name)
        if desc is None:
            # Fallback: original "first descriptor containing the name" rule.
            desc = next((d for d in all_descriptors if desc_name in str(d)), None)
        if desc is None:
            print(f"WARNING: no mordred descriptor matches '{desc_name}'; skipped")
            continue

        calc = Calculator([desc])
        result = calc(mol)
        print(f"✓ {desc_name} ({desc}): {result[0]}")
        descriptor_list.append(desc)
    return descriptor_list

def check_col_list(ratio='5x'):
    """Return the fingerprint and descriptor column names used for ``ratio``.

    Reads ``../descriptor_selection.csv``, where each column is named after a
    descriptor file and lists the descriptor names selected for it.

    Args:
        ratio: Ratio tag used to build the column name
            ``descriptors_filtered_FTO_training_{ratio}_ignore3D_False.csv``.

    Returns:
        ``(fp_cols, md_cols)``: ``fp_cols`` is ``['X1', ..., 'X1024']`` and
        ``md_cols`` is the list of non-null entries of the column for ``ratio``.

    Raises:
        KeyError: if the CSV has no non-empty column for ``ratio``.
    """
    selection = pd.read_csv('../descriptor_selection.csv')

    # Map every non-empty CSV column (a file name) to its selected names.
    columns_by_file = {}
    for filename, values in selection.items():
        names = values.iloc[0:].dropna().tolist()
        if filename and names:
            columns_by_file[filename] = names

    target = f'descriptors_filtered_FTO_training_{ratio}_ignore3D_False.csv'
    md_cols = columns_by_file[target]
    fp_cols = [f'X{bit}' for bit in range(1, 1025)]
    return fp_cols, md_cols

def calculate_descriptors(mols, descriptor_list, valid_indices, smiles_list):
    """Compute the given descriptors for ``mols`` and label rows by SMILES.

    Args:
        mols: Molecules to evaluate (one output row per molecule).
        descriptor_list: Descriptors handed to ``Calculator`` (3D enabled).
        valid_indices: For each molecule, its position in ``smiles_list``.
        smiles_list: SMILES strings that ``valid_indices`` refers to.

    Returns:
        The frame returned by ``Calculator.pandas`` with a leading
        ``canonical_SMILES`` column holding ``smiles_list[i]`` for each
        ``i`` in ``valid_indices``.

    Side effects:
        Prints the number of worker processes and the number of rows computed.
    """
    calc = Calculator(descriptor_list, ignore_3D=False)

    # Leave four cores free, but always use at least one worker.
    total_cores = multiprocessing.cpu_count()
    nproc = total_cores - 4
    if nproc < 1:
        nproc = 1
    print(f"Using {nproc}/{total_cores} cores")

    desc_df = calc.pandas(mols, nproc=nproc)
    desc_df.insert(0, 'canonical_SMILES', [smiles_list[idx] for idx in valid_indices])
    print(f"완료: {len(desc_df)}개 분자의 descriptor 계산됨")
    return desc_df

# Load data: read the FooDB compound file with its default path and take the
# canonical_SMILES column as a plain Python list.
foodb_df = load_foodb_data()
smiles_list = foodb_df['canonical_SMILES'].tolist()

# Process molecules: build (or load from the pickle cache) the 3D-embedded
# molecules. valid_indices holds the positions in smiles_list of the entries
# that produced a molecule.
mols, valid_indices = process_molecules(smiles_list)

[16:01:51] Explicit valence for atom # 31 N, 4, is greater than permitted
[16:01:52] Explicit valence for atom # 21 N, 4, is greater than permitted
[16:01:52] Explicit valence for atom # 1 Cl, 4, is greater than permitted
[16:01:52] Explicit valence for atom # 13 B, 4, is greater than permitted
[16:01:53] Explicit valence for atom # 34 N, 4, is greater than permitted
[16:01:53] Explicit valence for atom # 0 P, 11, is greater than permitted


로드된 화합물 수: 70413
불러온 분자 수: 24208


In [None]:
for ratio in ['5x','10x']:
    # Compute the descriptors selected for this training ratio.
    descriptor_list = get_descriptor_list(ratio)
    desc_df = calculate_descriptors(mols, descriptor_list, valid_indices, smiles_list)

    # Merge compound metadata with descriptors.
    # canonical_SMILES can repeat on both sides, which would make this a
    # many-to-many join (k*k rows per repeated structure). Keep one
    # descriptor row per structure so every compound row appears once.
    desc_unique = desc_df.drop_duplicates(subset='canonical_SMILES')
    dataset = foodb_df.merge(desc_unique, on='canonical_SMILES', how='inner')
    # The leading columns of the merge result are foodb_df's own columns.
    n_info_cols = foodb_df.shape[1]
    compound_info = dataset.iloc[:, :n_info_cols]
    descriptor_data = dataset.iloc[:, n_info_cols:]

    # Add Morgan fingerprint bits as columns X1..Xn.
    fingerprints = compound_info['canonical_SMILES'].apply(smiles_to_fingerprint)
    fingerprints_array = np.vstack(fingerprints.values)
    fp_df = pd.DataFrame(fingerprints_array, columns=[f'X{i+1}' for i in range(fingerprints_array.shape[1])])

    # Assemble the final table.
    final_data = pd.concat([compound_info, descriptor_data, fp_df], axis=1)
    fp_cols, md_cols = check_col_list(ratio)

    # Legacy relabelling: 'MATS1pe'/'ATSC6pe' columns used to be renamed to
    # 'MATS1p'/'ATSC6p' unconditionally. Only do that when the requested
    # column is actually missing, so a correctly computed 'MATS1p'/'ATSC6p'
    # is never shadowed or duplicated, and say so loudly because the values
    # then come from the differently named column.
    legacy_names = {'MATS1pe': 'MATS1p', 'ATSC6pe': 'ATSC6p'}
    aliases = {
        old: new
        for old, new in legacy_names.items()
        if old in final_data.columns
        and new not in final_data.columns
        and new in md_cols
        and old not in md_cols
    }
    if aliases:
        print(f"WARNING: relabelling descriptor columns {aliases}; verify descriptor matching")
        final_data = final_data.rename(columns=aliases)

    # .copy() so later assignments modify this frame, not a view of another.
    final_data = final_data[['id', 'canonical_SMILES', ] + fp_cols + md_cols].copy()

    # Drop rows with error strings (canonical_SMILES is not inspected).
    # Fingerprint columns are numeric arrays built above and cannot contain
    # these strings, so only descriptor columns are scanned.
    data_cols = fp_cols + md_cols
    for col in md_cols:
        if col in final_data.columns:
            string_mask = final_data[col].astype(str).str.contains('missing|error|failed', case=False, na=False)
            if string_mask.any():
                print(f"컬럼 {col}에서 {string_mask.sum()}개의 에러 발견")
                final_data = final_data.loc[~string_mask].copy()

    # Convert to numeric (canonical_SMILES excluded); anything unparsable
    # becomes NaN and is removed by dropna() below.
    final_data[data_cols] = final_data[data_cols].apply(pd.to_numeric, errors='coerce')
    final_data = (
        final_data
        .drop_duplicates(subset=['id','canonical_SMILES'])
        .dropna()
        .reset_index(drop=True)
    )

    print(f"정제 후 데이터 수: {len(final_data)}")

    final_data.to_csv(f'filtered_foodb_{ratio}.csv', index=False)

In [4]:
def load_json_file(json_file):
    """Read a JSON-lines file into a list of parsed records.

    Args:
        json_file: Path to a file with one JSON document per line.

    Returns:
        A list with one parsed object per non-blank, valid line, in file
        order. Blank lines and lines that fail to parse are skipped.
    """
    data = []
    # JSON text is UTF-8; do not depend on the platform's locale encoding.
    with open(json_file, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                data.append(json.loads(line))
            except json.JSONDecodeError:
                # Deliberately tolerant: malformed lines are ignored.
                continue
    return data

# 1. Load each FooDB table (JSON lines -> list of dicts -> DataFrame)
print("Loading Food.json...")
foods = load_json_file("foodb_2020_04_07_json/Food.json")
foods_df = pd.DataFrame(foods)
print(f"Foods: {len(foods_df)}")

print("Loading Compound.json...")
compounds = load_json_file("foodb_2020_04_07_json/Compound.json")
compounds_df = pd.DataFrame(compounds)
print(f"Compounds: {len(compounds_df)}")

print("Loading Content.json...")
contents = load_json_file("foodb_2020_04_07_json/Content.json")
contents_df = pd.DataFrame(contents)
print(f"Contents: {len(contents_df)}")

# 2. Food table: keep only the wanted columns that exist, then give the
#    key/name columns food-specific names.
food_cols = ['id', 'name', 'food_group', 'food_subgroup', 'food_type', 'description']
food_df_clean = (
    foods_df.loc[:, [c for c in food_cols if c in foods_df.columns]]
    .copy()
    .rename(columns={'id': 'food_id', 'name': 'food_name'})
)

# 3. Compound table: same treatment with compound-specific names.
compound_cols = ['id', 'public_id', 'name', 'moldb_smiles', 'description']
compound_df_clean = (
    compounds_df.loc[:, [c for c in compound_cols if c in compounds_df.columns]]
    .copy()
    .rename(columns={'id': 'compound_id', 'name': 'compound_name', 'public_id': 'compound_public_id'})
)

# 4. Content table: it links food_id to source_id, which is used as the
#    compound id for rows whose source_type is 'Compound'.
content_cols = ['id', 'food_id', 'source_id', 'source_type', 'orig_content', 'orig_unit',
                'orig_min', 'orig_max', 'standard_content']
content_df_clean = contents_df.loc[:, [c for c in content_cols if c in contents_df.columns]].copy()

# Keep only rows that refer to a compound, then expose source_id as compound_id.
is_compound_row = content_df_clean['source_type'] == 'Compound'
content_df_clean = (
    content_df_clean.loc[is_compound_row]
    .copy()
    .rename(columns={'source_id': 'compound_id'})
)

print(f"\nContent (Compound only): {len(content_df_clean)}")

# 5. Join: Content + Compound + Food
# Step 1: attach compound details to every content row (left join keeps
#         content rows whose compound_id has no match).
merged_df = pd.merge(content_df_clean, compound_df_clean, on='compound_id', how='left')

# Step 2: attach food details.
final_df = pd.merge(merged_df, food_df_clean, on='food_id', how='left')

# 6. Put the columns in presentation order (only those that exist).
final_cols = ['compound_public_id', 'compound_name', 'food_name', 'food_group', 'food_subgroup',
              'food_type', 'orig_content', 'orig_unit', 'standard_content', 'orig_min', 'orig_max',
              'compound_id', 'food_id', 'moldb_smiles']
final_df = final_df.loc[:, [c for c in final_cols if c in final_df.columns]]
final_df.head()

Loading Food.json...
Foods: 992
Loading Compound.json...
Compounds: 70477
Loading Content.json...
Contents: 5691011

Content (Compound only): 5552979


Unnamed: 0,compound_public_id,compound_name,food_name,food_group,food_subgroup,food_type,orig_content,orig_unit,standard_content,orig_min,orig_max,compound_id,food_id,moldb_smiles
0,,,Alfalfa,Herbs and Spices,Herbs,Type 1,0.0,mg/100 g,0.0,,,21594,287,
1,,,Alfalfa,Herbs and Spices,Herbs,Type 1,0.0,mg/100 g,0.0,,,21595,287,
2,,,Alfalfa,Herbs and Spices,Herbs,Type 1,0.0,mg/100 g,0.0,,,21595,287,
3,FDB001131,Sucrose,Alfalfa,Herbs and Spices,Herbs,Type 1,0.0,mg/100 g,0.0,,,1131,287,OCC1OC(CO)(OC2OC(CO)C(O)C(O)C2O)C(O)C1O
4,FDB000753,Ethanol,Alfalfa,Herbs and Spices,Herbs,Type 1,0.0,mg/100 g,0.0,,,753,287,CCO


In [5]:
# Load the candidate compounds from sheet 'Chemical_2nd_45' of the prediction
# summary workbook and preview the first rows.
select = pd.read_excel('foodb_predictions_summary_0927.xlsx', sheet_name='Chemical_2nd_45')
select.head()

Unnamed: 0,연번,id,public_id,name,canonical_SMILES,probability_5x,probability_10x,probability_optnc,비고,MoleculeID,CATMoS_NT_pred,CATMoS_LD50_pred,CATMoS_LD50_predRange
0,1,633,FDB000633,Kaempferol,O=c1c(O)c(-c2ccc(O)cc2)oc2cc(O)cc(O)c12,0.8312,0.8781,0.804656,common_predictions,Molecule_3,1,2818,[1600-5000]
1,2,2417,FDB002417,Norartocarpetin,O=c1cc(-c2ccc(O)cc2O)oc2cc(O)cc(O)c12,0.7959,0.8133,0.707558,common_predictions,Molecule_4,1,2738,[1500-4900]
2,3,657,FDB000657,Galangin,O=c1c(O)c(-c2ccccc2)oc2cc(O)cc(O)c12,0.7706,0.8386,0.750609,common_predictions,Molecule_5,1,2272,[1300-4000]
3,4,2786,FDB002786,"3,8-Dihydroxy-1-methylanthraquinone-2-carboxyl...",Cc1c(C(=O)O)c(O)cc2c1C(=O)c1c(O)cccc1C2=O,0.7495,0.8302,0.701366,common_predictions,Molecule_6,1,3995,[2200-7100]
4,5,13865,FDB013862,Emodin,Cc1cc(O)c2c(c1)C(=O)c1cc(O)cc(O)c1C2=O,0.7287,0.9493,1.0,common_predictions,Molecule_7,1,2192,[1200-3900]


In [6]:
# Public ids of the candidate compounds (missing values and repeats removed).
candidate_ids = select['public_id'].dropna().unique().tolist()

columns_to_keep = [
    'compound_public_id', 'compound_name',
    'food_name', 'food_group', 'food_subgroup', 'food_type',
    'standard_content', 'orig_content', 'orig_unit', 'orig_min', 'orig_max'
]

# Food/content rows of the candidate compounds, restricted to the report
# columns that exist, de-duplicated, then ordered by compound and by
# descending standard_content within each compound.
is_candidate = final_df['compound_public_id'].isin(candidate_ids)
matched_foods = (
    final_df.loc[is_candidate, [c for c in columns_to_keep if c in final_df.columns]]
    .drop_duplicates()
    .reset_index(drop=True)
    .sort_values(by=['compound_public_id', 'standard_content'], ascending=[True, False])
)

matched_foods.to_excel('foodb_compound_foods_2nd_45.xlsx', index=False)