In [1]:
import os
import pandas as pd
import pickle
import joblib

In [2]:
# Score FooDB compounds with the saved blended PyCaret model for each training
# ratio and write the predicted-positive compounds to a per-ratio summary CSV.

# Explicit names instead of `from pycaret.classification import *`, imported
# once rather than on every loop iteration.
from pycaret.classification import load_model, predict_model, setup

# 1. Descriptor selection table: each column header is a descriptor-file name
#    and the non-null cells below it are the descriptor columns selected for
#    that file. It does not depend on `ratio`, so it is read once.
selected_descriptor = pd.read_csv('../data/descriptor_selection.csv')
file_md_list = {}
for column in selected_descriptor.columns:
    selected_columns = selected_descriptor[column].dropna().tolist()
    if column and selected_columns:
        file_md_list[column] = selected_columns

# Fingerprint columns X1..X1024 (also independent of `ratio`).
fp_cols = [f'X{i+1}' for i in range(1024)]

for ratio in ['5x', '10x']:
    file_name = f'descriptors_filtered_FTO_training_{ratio}_ignore3D_False.csv'
    md_cols = file_md_list[file_name]

    # 2. Configure the PyCaret experiment on the training data.
    # NOTE(review): the pipeline returned by load_model() below presumably
    # already carries the fitted preprocessing steps; confirm whether this
    # setup() call is still required for prediction.
    train_data_path = f'../data/preprocessed/filtered_FTO_training_{ratio}_ignore3D_False.csv'
    train_df = pd.read_csv(train_data_path)
    filtered_train_df = train_df[['potency'] + fp_cols + md_cols]

    exp = setup(
        data=filtered_train_df,
        target='potency',
        session_id=42,
        train_size=0.9,
        fold=10,
        normalize=True,
        fix_imbalance=True,
        remove_outliers=True,
        verbose=False
    )

    # 3. Load the saved blended model.
    model = load_model(f'../result/FTO_Final/{ratio}_w3D/blend_models/{ratio}_blended_model2')

    # 4. FooDB feature matrix with the same columns as training; rows with any
    #    missing feature are excluded from prediction.
    foodb_df = pd.read_csv(f"../data/foodb/filtered_foodb_{ratio}.csv")
    foodb_prediction_data = foodb_df[fp_cols + md_cols].dropna()

    # 5. Predict with a 0.7 probability threshold.
    predictions = predict_model(model, data=foodb_prediction_data, verbose=False, probability_threshold=0.7)

    print(f"Class 0: {sum(predictions['prediction_label'] == 0)}개")
    print(f"Class 1: {sum(predictions['prediction_label'] == 1)}개")

    # 6. Attach the predictions to the compounds that were actually scored.
    # Assumes predict_model returns one row per input row, in input order
    # (TODO confirm); the length check guards the positional assignment.
    if len(predictions) != len(foodb_prediction_data):
        raise ValueError(
            f"predict_model returned {len(predictions)} rows for "
            f"{len(foodb_prediction_data)} input rows"
        )
    # Only scored rows take part in de-duplication, so an unscored duplicate
    # SMILES (dropped by dropna above) can no longer displace a scored one.
    compound_result = (
        foodb_df.loc[foodb_prediction_data.index, ['id', 'canonical_SMILES']]
        .assign(
            prediction=predictions['prediction_label'].to_numpy(),
            probability=predictions['prediction_score'].to_numpy(),
        )
        .drop_duplicates('canonical_SMILES')
    )
    true = compound_result[compound_result['prediction'] == 1].sort_values(by='probability', ascending=False)
    true.to_csv(f'../result/FTO_Final/{ratio}_w3D/blend_models/foodb_predictions_{ratio}_summary.csv', index=False)

Transformation Pipeline and Model Successfully Loaded
Class 0: 23612개
Class 1: 379개
Transformation Pipeline and Model Successfully Loaded
Class 0: 23467개
Class 1: 524개


### 결과 합치기

In [None]:
import json
import pandas as pd

# Collect the public id and name of every FooDB compound record that has a
# non-empty 'moldb_smiles' value. The file is parsed one JSON object per line.
json_file="../data/foodb/foodb_2020_04_07_json/Compound.json"
compounds = []
skipped_lines = 0
# UTF-8 is requested explicitly: without it open() falls back to the platform
# locale encoding, which can fail or mis-decode non-ASCII compound names.
with open(json_file, 'r', encoding='utf-8') as f:
    for line in f:
        line = line.strip()
        if not line:
            continue
        try:
            compound = json.loads(line)
        except json.JSONDecodeError:
            # Still best-effort, but counted so that data loss is visible.
            skipped_lines += 1
            continue
        if compound.get('moldb_smiles'):
            compounds.append({
                'id': compound.get('public_id'),
                'name': compound.get('name'),
            })

if skipped_lines:
    print(f"Skipped {skipped_lines} malformed line(s) in {json_file}")

# NOTE(review): this rebinds `foodb_df`, which the prediction cell also
# assigns — confirm the overwrite is intended.
foodb_df = pd.DataFrame(compounds)

In [14]:
import pandas as pd

# Workbook that holds the per-model prediction sheets; the overlap sheets
# computed below are written back into the same file.
file_path = '../result/FTO_Final/foodb_predictions_summary.xlsx'

# Load the candidate list of each model.
five_candidates = pd.read_excel(file_path, sheet_name='5x_predictions')
ten_candidates = pd.read_excel(file_path, sheet_name='10x_predictions')
optnc_candidates = pd.read_excel(file_path, sheet_name='optnc_predictions')

# Columns that identify a compound; two rows match only if all of them agree.
key_cols = ["id", "public_id", "name", "canonical_SMILES"]

# Pairwise intersections (inner joins on the key columns).
five_ten_overlap = pd.merge(five_candidates, ten_candidates, on=key_cols, how='inner', suffixes=('_5x', '_10x'))
five_optnc_overlap = pd.merge(five_candidates, optnc_candidates, on=key_cols, how='inner', suffixes=('_5x', '_optnc'))
ten_optnc_overlap = pd.merge(ten_candidates, optnc_candidates, on=key_cols, how='inner', suffixes=('_10x', '_optnc'))


def _tag_model_columns(frame, tag):
    """Return `frame` with `_<tag>` appended to every non-key column name."""
    return frame.rename(columns=lambda col: col if col in key_cols else f'{col}_{tag}')


# Compounds present in all three lists. merge() only applies suffixes to names
# that collide, so chaining two merges left the first pair as *_x / *_y and the
# third model's columns untagged. Tagging each frame up front keeps the origin
# of every column explicit and matches the _5x/_10x/_optnc naming used above.
three_way_overlap = (
    _tag_model_columns(five_candidates, '5x')
    .merge(_tag_model_columns(ten_candidates, '10x'), on=key_cols, how='inner')
    .merge(_tag_model_columns(optnc_candidates, 'optnc'), on=key_cols, how='inner')
)

# Append the new sheets to the existing workbook, replacing same-named sheets.
with pd.ExcelWriter(file_path, engine='openpyxl', mode='a', if_sheet_exists='replace') as writer:
    five_ten_overlap.to_excel(writer, sheet_name='5x_10x_overlap', index=False)
    five_optnc_overlap.to_excel(writer, sheet_name='5x_optnc_overlap', index=False)
    ten_optnc_overlap.to_excel(writer, sheet_name='10x_optnc_overlap', index=False)
    three_way_overlap.to_excel(writer, sheet_name='common_predictions', index=False)

### 식품 매칭

In [20]:
import json
import pandas as pd

# Load the FooDB content records into a DataFrame with one row per record,
# keeping the food identification, amount, unit and citation fields.
# The file is parsed one JSON object per line.
json_file="../data/foodb/foodb_2020_04_07_json/Content.json"
contents = []
skipped_lines = 0
# UTF-8 is requested explicitly: without it open() falls back to the platform
# locale encoding, which can fail or mis-decode non-ASCII food names.
with open(json_file, 'r', encoding='utf-8') as f:
    for line in f:
        line = line.strip()
        if not line:
            continue
        try:
            content = json.loads(line)
        except json.JSONDecodeError:
            # Still best-effort, but counted so that data loss is visible.
            skipped_lines += 1
            continue
        contents.append({
            'source_id': content.get('source_id'),
            'food_id': content.get('food_id'),
            'name': content.get('orig_food_common_name'),
            'scientific_name': content.get('orig_food_scientific_name'),
            'part': content.get('orig_food_part'),
            'content': content.get('standard_content'),
            'unit': content.get('orig_unit'),
            'citation': content.get('citation')
        })

if skipped_lines:
    print(f"Skipped {skipped_lines} malformed line(s) in {json_file}")

content_df = pd.DataFrame(contents)

In [31]:
# Attach FooDB food-content records to the optnc candidates and store the
# result as the 'optnc_foods' sheet of the summary workbook.
# NOTE(review): the join uses source_id alone; presumably every content row
# refers to a compound id — confirm no other source kinds share the id space.
food_column_names = {
    'name_x': 'name',
    'name_y': 'food_common_name',
    'content': 'standard_content',
    'unit': 'orig_unit',
}
food_sheet_columns = [
    'id', 'public_id', 'name', 'canonical_SMILES', 'probability',
    'food_id', 'food_common_name', 'scientific_name', 'part',
    'standard_content', 'orig_unit', 'citation',
]

optnc_food = (
    pd.merge(optnc_candidates, content_df, left_on='id', right_on='source_id')
    .rename(columns=food_column_names)
    .drop(columns=['source_id'])
    .loc[:, food_sheet_columns]
)

# Append to the existing workbook, replacing the sheet if it already exists.
with pd.ExcelWriter(file_path, engine='openpyxl', mode='a', if_sheet_exists='replace') as writer:
    optnc_food.to_excel(writer, sheet_name='optnc_foods', index=False)