In [None]:
import pandas as pd
import numpy as np
import os
import warnings
warnings.filterwarnings('ignore', category=UserWarning, module='lightgbm')

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler
from tabpfn import TabPFNClassifier

In [None]:
import torch

# Report whether PyTorch can see a CUDA device in this environment.
cuda_available = torch.cuda.is_available()
print(cuda_available)

# 세팅

## 사용할 descriptor

In [None]:
selected_descriptor = pd.read_csv('../data/descriptor_selection.csv')

# Build a mapping from each column header of the selection table (used as the
# file-name key) to the list of non-null entries found in that column.
# Columns whose header is empty or that contain no entries are left out.
file_md_list = {}
for header, entries in selected_descriptor.items():
    kept = entries.dropna().tolist()
    if header and kept:
        file_md_list[header] = kept

## 저장 함수

In [None]:
def safe_save_csv(df, path):
    """Write ``df`` to ``path`` as CSV (without the index), creating parent folders.

    This is a best-effort helper: any failure is reported on stdout and turned
    into a ``False`` return value instead of propagating the exception.

    Args:
        df: Object exposing ``to_csv`` (a pandas DataFrame in this notebook).
        path: Destination file path. May be a bare file name, in which case
            the file is written to the current working directory.

    Returns:
        True if the file was written, False if any exception occurred.
    """
    try:
        parent = os.path.dirname(path)
        # dirname() is '' for a bare file name and os.makedirs('') raises,
        # so only create the directory when there actually is one.
        if parent:
            os.makedirs(parent, exist_ok=True)
        df.to_csv(path, index=False)
        return True
    except Exception as e:
        print(f"CSV 저장 실패: {path}, 에러: {e}")
        return False

In [None]:
# Input and output locations, expressed relative to the working directory.
data_dir = os.path.join(os.pardir, "data", "preprocessed")
result_dir = os.path.join(os.pardir, "result")

In [None]:
# Load the dataset
ratio = '5x'

# NOTE(review): file_name and base_path are not used in the cells visible here;
# presumably they are consumed when results are saved later — verify.
file_name = f'descriptors_filtered_FTO_training_{ratio}_ignore3D_False.csv'
base_path = f'FTO_Final/{ratio}_w3D'

data_path = os.path.join(data_dir, f"filtered_FTO_training_{ratio}_ignore3D_False.csv")
df = pd.read_csv(data_path).drop(columns=['canonical_SMILES', 'raw_SMILES', 'source'])

# Features are the columns at positions 1..499 (column 0 is skipped —
# presumably the label or an id column; TODO confirm. The cap of 499 looks
# like a model feature limit — confirm).
# The target is filtered out by name so it can never leak into X if the
# column order of the CSV changes.
feature_names = [col for col in df.columns.tolist()[1:500] if col != 'potency']

# Series.unique() returns values in order of appearance, whereas scikit-learn
# reports classes in sorted label order; sort here so that names passed as
# `target_names` line up with the right rows.
target_names = sorted(df['potency'].unique().tolist())

X_df = df[feature_names].to_numpy()
y = df['potency'].tolist()

In [None]:
# Baseline run: plain (non-stratified) 80/20 split on the unscaled features.
X_train, X_test, y_train, y_test = train_test_split(
    X_df, y, test_size=0.20, random_state=42
)

# Train and evaluate the TabPFN classifier.
# Use the GPU when PyTorch can see one and fall back to CPU otherwise, so the
# cell does not fail on a machine without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
tabpfn_classifier = TabPFNClassifier(device=device)
tabpfn_classifier.fit(X_train, y_train)
y_pred_proba = tabpfn_classifier.predict_proba(X_test)

# Calculate the ROC AUC score from the second probability column.
# NOTE(review): this assumes a binary target whose positive class is the
# second class of the fitted model — confirm.
roc_auc = roc_auc_score(y_test, y_pred_proba[:, 1])
print(f"TabPFN ROC AUC Score: {roc_auc:.4f}")

In [None]:
# Second run: stratified 80/20 split with standardized features.
X_train, X_test, y_train, y_test = train_test_split(
    X_df, y, test_size=0.2, random_state=42, stratify=y
)

# Fit the scaler on the training split only, then apply it to both splits.
scaler = StandardScaler()
X_train_s = scaler.fit_transform(X_train)
X_test_s  = scaler.transform(X_test)

clf = TabPFNClassifier()

clf.fit(X_train_s, y_train)

y_pred = clf.predict(X_test_s) # predicted label for each sample
y_proba = clf.predict_proba(X_test_s) # per-class probability distribution for each sample

acc = accuracy_score(y_test, y_pred)
if y_proba.shape[1] == 2:
    # Binary target: roc_auc_score expects a 1-D score for the class with the
    # greater label; passing the full (n_samples, 2) matrix raises ValueError.
    auc_macro = roc_auc_score(y_test, y_proba[:, 1])
else:
    auc_macro = roc_auc_score(y_test, y_proba, multi_class="ovr", average="macro")
cm = confusion_matrix(y_test, y_pred)
# NOTE(review): classification_report pairs target_names with the labels in
# sorted order; target_names is defined in an earlier cell — confirm its order.
report_df = pd.DataFrame(
    classification_report(y_test, y_pred, target_names=target_names, output_dict=True)
).T.round(4)

print("Accuracy:", round(acc, 4))
print("ROC-AUC (macro, OVR):", round(auc_macro, 4))
print("\nClassification report:")
print(report_df.to_string())