In [1]:
import pandas as pd
import numpy as np
from molecular_descriptor import *
import json


In [2]:
def to_canonical_smiles(smiles):
    if pd.isna(smiles):
        return None
    try:
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            return None
        return Chem.MolToSmiles(mol, canonical=True)
    except:
        return None

def smiles_to_MD_FP(raw_data, ignore3d):
    smiles_list = raw_data['canonical_SMILES'].tolist()
    print("총 분자 수:", len(smiles_list))

    # MD
    desc_df = calc_descriptors(smiles_list, leave_cores=4, ignore3D=ignore3d)
    if 'canonical_SMILES' not in desc_df.columns:
        desc_df.insert(0, 'canonical_SMILES', smiles_list)
    dataset = raw_data.merge(desc_df, on='canonical_SMILES', how='left')

    compound_info = dataset.iloc[:, :4]
    descriptor_data = dataset.iloc[:, 4:]
    print(f"정제 전 데이터 shape: {dataset.shape}")

    # MD 전처리
    ## 결측치 제거
    descriptor_data_cleaned, _ = remove_invalid_descriptors(descriptor_data)
    print(f"결측 제거 후 descriptor shape: {descriptor_data_cleaned.shape}")

    ## 분산이 낮은 descriptor 제거
    descriptor_data_after_var, _ = remove_low_variance_descriptors(descriptor_data_cleaned, threshold=1e-6)
    print(f"분산 제거 후 descriptor shape: {descriptor_data_after_var.shape}")

    ## 상관관계 높은 descriptor 제거
    descriptor_data_final, _ = remove_correlated_descriptors(descriptor_data_after_var, threshold=0.9)
    print(f"상관관계 제거 후 descriptor shape: {descriptor_data_final.shape}")

    # MD + Compound info
    descriptor_data = pd.concat([compound_info, descriptor_data_final], axis=1)
    print(f"Descriptor 생성 후 데이터 shape (compound info 포함): {descriptor_data.shape}")

    # MD + Compound info + FP
    fingerprints = descriptor_data['canonical_SMILES'].apply(lambda x: smiles_to_fingerprint(x))
    fingerprints_array = np.vstack(fingerprints.values)
    fp_df = pd.DataFrame(fingerprints_array, columns=[f'X{i+1}' for i in range(fingerprints_array.shape[1])])
    final_data = pd.concat([descriptor_data, fp_df], axis=1)
    print(f"Fingerprint 추가 후 최종 데이터 shape: {final_data.shape}")

    return final_data

def balance_dataset_by_source(df, scale, random_seed=42):
    np.random.seed(random_seed)
    
    active_data = df[df['source'] == 'active'].copy()
    n_active = len(active_data)
    
    assay_inactive_data = df[df['source'] == 'assay_inactive'].copy()
    decoy_data = df[df['source'] == 'decoy'].copy()
    
    target_inactive_count = n_active * scale
    selected_inactive = pd.DataFrame()
    
    if len(assay_inactive_data) >= target_inactive_count:
        selected_inactive = assay_inactive_data.sample(n=target_inactive_count, random_state=random_seed)
    else:
        selected_inactive = assay_inactive_data.copy()
        remaining = target_inactive_count - len(assay_inactive_data)
        if len(decoy_data) >= remaining:
            additional_decoy = decoy_data.sample(n=remaining, random_state=random_seed)
            selected_inactive = pd.concat([selected_inactive, additional_decoy], ignore_index=True)
        else:
            selected_inactive = pd.concat([selected_inactive, decoy_data], ignore_index=True)
            print(f"inactive 데이터 부족! assay_inactive {len(assay_inactive_data)}개 + decoy {len(decoy_data)}개 = 총 {len(selected_inactive)}개")

    balanced_data = pd.concat([active_data, selected_inactive], ignore_index=True)
    balanced_data = balanced_data.sample(frac=1, random_state=random_seed).reset_index(drop=True)  # 섞기
    
    print(f"\n데이터 균형 조정 (scale={scale}):")
    print(f"Active: {len(active_data)}개")
    print(f"Inactive: {len(selected_inactive)}개") 
    print(f"총: {len(balanced_data)}개")
    
    return balanced_data

In [None]:
# Training data
for data_name in ["FTO_training_total"]:
    print(f"\n{'='*50}Processing dataset: {data_name}{'='*50}")
    for ignore3d in [True, False]:
        # 원본 데이터 불러오기
        input_path = f"raw/FTO_training/{data_name}.csv"
        raw_data = pd.read_csv(input_path)

        # MD, FP 생성
        final_data = smiles_to_MD_FP(raw_data, ignore3d)

        # 최종 데이터 저장
        output_cleaned_path = f"preprocessed/filtered_{data_name}_ignore3D_{ignore3d}.csv"
        final_data.to_csv(output_cleaned_path, index=False)

        for scale in [1, 3, 5, 10, 20, 50]:
            balanced_df = balance_dataset_by_source(final_data, scale, random_seed=42)
            
            output_path = f"preprocessed/filtered_FTO_training_{scale}x_ignore3D_{ignore3d}.csv"
            balanced_df.to_csv(output_path, index=False)
            print(f"저장 완료: {output_path}")

In [5]:
# FooDB data
foodb_json_file = "foodb/foodb_2020_04_07_json/Compound.json"
compounds = []

with open(foodb_json_file, 'r') as f:
    for line in f:
        line = line.strip()
        if line:
            try:
                compound = json.loads(line)
                smiles = compound.get('moldb_smiles')
                name = compound.get('name')
                compound_id = compound.get('public_id')
                if smiles:
                    compounds.append({
                        'id': compound_id,
                        'name': name,
                        'raw_SMILES': smiles
                    })
            except json.JSONDecodeError:
                continue

foodb_df = pd.DataFrame(compounds)
print(f"로드된 화합물 수: {len(foodb_df)}")

foodb_df['canonical_SMILES'] = foodb_df['raw_SMILES'].apply(to_canonical_smiles)

smiles_list = foodb_df['canonical_SMILES'].tolist()
print("총 분자 수:", len(smiles_list))

# MD
desc_df = calc_descriptors(smiles_list, leave_cores=4, ignore3D=ignore3d)
if 'canonical_SMILES' not in desc_df.columns:
    desc_df.insert(0, 'canonical_SMILES', smiles_list)
dataset = foodb_df.merge(desc_df, on='canonical_SMILES', how='left')

compound_info = dataset.iloc[:, :4]
descriptor_data = dataset.iloc[:, 4:]
print(f"정제 전 데이터 shape: {dataset.shape}")

# MD 전처리
## 결측치 제거
descriptor_data_cleaned, _ = remove_invalid_descriptors(descriptor_data)
print(f"결측 제거 후 descriptor shape: {descriptor_data_cleaned.shape}")

## 분산이 낮은 descriptor 제거
descriptor_data_after_var, _ = remove_low_variance_descriptors(descriptor_data_cleaned, threshold=1e-6)
print(f"분산 제거 후 descriptor shape: {descriptor_data_after_var.shape}")

## 상관관계 높은 descriptor 제거
descriptor_data_final, _ = remove_correlated_descriptors(descriptor_data_after_var, threshold=0.9)
print(f"상관관계 제거 후 descriptor shape: {descriptor_data_final.shape}")

# MD + Compound info
descriptor_data = pd.concat([compound_info, descriptor_data_final], axis=1)
print(f"Descriptor 생성 후 데이터 shape (compound info 포함): {descriptor_data.shape}")

로드된 화합물 수: 70413


[08:37:01] Explicit valence for atom # 31 N, 4, is greater than permitted
[08:37:01] Explicit valence for atom # 21 N, 4, is greater than permitted
[08:37:02] Explicit valence for atom # 1 Cl, 4, is greater than permitted
[08:37:03] Explicit valence for atom # 13 B, 4, is greater than permitted
[08:37:05] Explicit valence for atom # 34 N, 4, is greater than permitted
[08:37:05] Explicit valence for atom # 0 P, 11, is greater than permitted


총 분자 수: 70413


Processing SMILES:   4%|▍         | 2829/70413 [00:00<00:15, 4331.23it/s]

Error processing SMILES 1983: None, Error: No registered converter was able to produce a C++ rvalue of type class std::basic_string<wchar_t,struct std::char_traits<wchar_t>,class std::allocator<wchar_t> > from this Python object of type NoneType


Processing SMILES:   8%|▊         | 5651/70413 [00:01<00:15, 4122.81it/s]

Error processing SMILES 5020: None, Error: No registered converter was able to produce a C++ rvalue of type class std::basic_string<wchar_t,struct std::char_traits<wchar_t>,class std::allocator<wchar_t> > from this Python object of type NoneType


Processing SMILES:  11%|█         | 7709/70413 [00:01<00:15, 3986.29it/s]

Error processing SMILES 7149: None, Error: No registered converter was able to produce a C++ rvalue of type class std::basic_string<wchar_t,struct std::char_traits<wchar_t>,class std::allocator<wchar_t> > from this Python object of type NoneType


Processing SMILES:  14%|█▎        | 9681/70413 [00:02<00:16, 3706.31it/s]

Error processing SMILES 8974: None, Error: No registered converter was able to produce a C++ rvalue of type class std::basic_string<wchar_t,struct std::char_traits<wchar_t>,class std::allocator<wchar_t> > from this Python object of type NoneType


Processing SMILES:  22%|██▏       | 15313/70413 [00:04<00:14, 3896.61it/s]

Error processing SMILES 14685: None, Error: No registered converter was able to produce a C++ rvalue of type class std::basic_string<wchar_t,struct std::char_traits<wchar_t>,class std::allocator<wchar_t> > from this Python object of type NoneType


Processing SMILES:  24%|██▍       | 17223/70413 [00:04<00:17, 3041.17it/s]

Error processing SMILES 16619: None, Error: No registered converter was able to produce a C++ rvalue of type class std::basic_string<wchar_t,struct std::char_traits<wchar_t>,class std::allocator<wchar_t> > from this Python object of type NoneType


Processing SMILES: 100%|██████████| 70413/70413 [00:31<00:00, 2268.82it/s]


Valid molecules for descriptor calculation: 70407/70413
Using 8/12 cores for descriptor calculation


  0%|          | 23/70407 [00:06<5:04:20,  3.85it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


  0%|          | 90/70407 [00:17<13:22:05,  1.46it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


  0%|          | 335/70407 [00:39<2:08:57,  9.06it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 31%|███       | 21529/70407 [46:11<3:26:41,  3.94it/s] 

  s += (eig.vec[i, eig.max] * eig.vec[j, eig.max]) ** -0.5


100%|██████████| 70407/70407 [3:12:45<00:00,  6.09it/s]   


정제 전 데이터 shape: (72977, 1617)
제거된 결측/비정상 값 포함 컬럼 수: 1033개
결측 제거 후 descriptor shape: (72977, 580)
제거된 낮은 분산 descriptor 수: 0개
분산 제거 후 descriptor shape: (72977, 580)
제거된 상관관계 높은 descriptor 수: 346개
상관관계 제거 후 descriptor shape: (72977, 234)
Descriptor 생성 후 데이터 shape (compound info 포함): (72977, 238)


In [12]:
descriptor_data = descriptor_data.dropna(subset=['canonical_SMILES']).reset_index(drop=True)

In [13]:
# MD + Compound info + FP
fingerprints = descriptor_data['canonical_SMILES'].apply(lambda x: smiles_to_fingerprint(x))
fingerprints_array = np.vstack(fingerprints.values)
fp_df = pd.DataFrame(fingerprints_array, columns=[f'X{i+1}' for i in range(fingerprints_array.shape[1])])
final_data = pd.concat([descriptor_data, fp_df], axis=1)
print(f"Fingerprint 추가 후 최종 데이터 shape: {final_data.shape}")

output_cleaned_path = "foodb/filtered_foodb.csv"
final_data.to_csv(output_cleaned_path, index=False)



Fingerprint 추가 후 최종 데이터 shape: (72971, 1262)
