In [None]:

import pandas as pd
import numpy as np
import optuna
import os
# Work around an encoding problem by pointing joblib's temp folder (via an
# environment variable) at a path that contains only ASCII characters.
# NOTE(review): 'C:/temp' is a Windows-style absolute path; on other platforms
# this presumably resolves to a relative directory -- confirm the target OS.
os.environ['JOBLIB_TEMP_FOLDER'] = 'C:/temp'  # use an ASCII-only (English) path
os.makedirs('C:/temp', exist_ok=True)
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.ensemble import RandomForestRegressor
from rdkit import Chem
from rdkit.Chem import Descriptors, Lipinski, MACCSkeys, AllChem
import warnings
warnings.filterwarnings('ignore')
import xgboost as xgb


In [34]:
# Read the three competition CSV files from the working directory.
print("데이터 로딩 중...")
train, test, submission = (
    pd.read_csv(csv_name)
    for csv_name in ("train.csv", "test.csv", "sample_submission.csv")
)

# Report the (rows, columns) shape of the training and test frames.
print(f"훈련 데이터 : {train.shape}")
print(f"테스트 데이터 : {test.shape}")



데이터 로딩 중...
훈련 데이터 : (1681, 3)
테스트 데이터 : (100, 2)


In [35]:
# Layout of the feature vector produced by get_molecule_descriptors.
N_BASIC_DESCRIPTORS = 18
N_MORGAN_BITS = 2048
N_MACCS_BITS = 167  # RDKit MACCS key fingerprints are 167 bits long
N_MOLECULE_FEATURES = N_BASIC_DESCRIPTORS + N_MORGAN_BITS + N_MACCS_BITS  # 2233


def get_molecule_descriptors(smiles):
    """Convert a SMILES string into a fixed-length numeric feature vector.

    The vector is the concatenation of 18 scalar RDKit descriptors, a
    2048-bit Morgan fingerprint (radius 2) and the MACCS key fingerprint.

    Parameters
    ----------
    smiles : str
        SMILES representation of the molecule.

    Returns
    -------
    list
        ``N_MOLECULE_FEATURES`` values. If the SMILES cannot be parsed, or
        any descriptor computation raises, an all-zero vector of the same
        length is returned so every row has the same width.
    """
    try:
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            # Unparseable SMILES: keep the row, but with an all-zero vector.
            return [0] * N_MOLECULE_FEATURES

        basic_descriptors = [
            Descriptors.MolWt(mol),
            Descriptors.MolLogP(mol),
            Descriptors.NumHAcceptors(mol),
            Descriptors.NumHDonors(mol),
            Descriptors.TPSA(mol),
            Descriptors.NumRotatableBonds(mol),
            Descriptors.NumAromaticRings(mol),
            Descriptors.NumHeteroatoms(mol),
            Descriptors.FractionCSP3(mol),
            Descriptors.NumAliphaticRings(mol),
            Lipinski.NumAromaticHeterocycles(mol),
            Lipinski.NumSaturatedHeterocycles(mol),
            Lipinski.NumAliphaticHeterocycles(mol),
            Descriptors.HeavyAtomCount(mol),
            Descriptors.RingCount(mol),
            Descriptors.NOCount(mol),
            Descriptors.NHOHCount(mol),
            Descriptors.NumRadicalElectrons(mol),
        ]

        morgan_fp = AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=N_MORGAN_BITS)
        morgan_features = [int(bit) for bit in morgan_fp.ToBitString()]

        maccs_fp = MACCSkeys.GenMACCSKeys(mol)
        maccs_features = [int(bit) for bit in maccs_fp.ToBitString()]

        return basic_descriptors + morgan_features + maccs_features
    except Exception:
        # Best-effort featurisation: a failing molecule must not abort the
        # whole run. `Exception` (rather than a bare except) lets
        # KeyboardInterrupt / SystemExit propagate.
        return [0] * N_MOLECULE_FEATURES

print("분자 특성 추출 중...")



분자 특성 추출 중...


In [36]:
def _pad_feature_rows(rows, target_length):
    """Right-pad every feature list in ``rows`` with zeros up to ``target_length``."""
    return [list(row) + [0] * (target_length - len(row)) for row in rows]


# 1. Extract per-molecule feature vectors for the training and test sets.
train['features'] = train['Canonical_Smiles'].apply(get_molecule_descriptors)
test['features'] = test['Canonical_Smiles'].apply(get_molecule_descriptors)

# 2. Bring every row (train and test alike) to one common width. Rows that are
#    shorter than the widest vector are zero-padded on the right.
X_train_list = train['features'].tolist()
X_test_list = test['features'].tolist()
n_features = max(len(row) for row in X_train_list + X_test_list)
X_train_list = _pad_feature_rows(X_train_list, n_features)
X_test_list = _pad_feature_rows(X_test_list, n_features)

# 3. Convert to NumPy arrays; the target is the 'Inhibition' column.
X_train = np.array(X_train_list)
X_test = np.array(X_test_list)
y_train = train['Inhibition'].values

print(f"X_train: {X_train.shape}, X_test: {X_test.shape}")
# The MACCS fingerprint contributes 167 columns, so the parts add up to the total.
print(f"특성 벡터 구성: 기본 특성(18) + Morgan(2048) + MACCS(167) = 총 {X_train.shape[1]}개")





X_train: (1681, 2233), X_test: (100, 2233)
특성 벡터 구성: 기본 특성(18) + Morgan(2048) + MACCS(166) = 총 2233개




In [37]:
# Standardise the features. The scaler is fitted on the training matrix only
# and then applied to both matrices. (Scaling is not required for a random
# forest; the author's note says it may still help.)
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Hold out 20% of the scaled training rows for validation (fixed seed).
holdout_parts = train_test_split(X_train_scaled, y_train, test_size=0.2, random_state=42)
X_train_final, X_val, y_train_final, y_val = holdout_parts

# Evaluation metrics
def normalized_rmse(y_true, y_pred):
    """Return the RMSE divided by the range (max - min) of ``y_true``.

    Parameters
    ----------
    y_true, y_pred : array-like
        Ground-truth and predicted target values.

    Returns
    -------
    float
        ``rmse / (max(y_true) - min(y_true))``. When all true values are
        identical the range is zero; in that case ``0.0`` is returned for a
        perfect prediction and ``inf`` otherwise, instead of a NaN produced
        by a 0/0 division.
    """
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    value_range = np.max(y_true) - np.min(y_true)
    if value_range == 0:
        # Degenerate (constant) targets: avoid dividing by zero.
        return 0.0 if rmse == 0 else float("inf")
    return rmse / value_range

def pearson_correlation(y_true, y_pred):
    """Return the Pearson correlation of the two inputs, clipped to [0, 1].

    Parameters
    ----------
    y_true, y_pred : array-like
        Ground-truth and predicted target values.

    Returns
    -------
    float
        The correlation coefficient with negative values clipped to 0.
        If the coefficient is undefined (for example when either input is
        constant, which makes ``np.corrcoef`` yield NaN), ``0.0`` is returned
        so that the NaN does not propagate into the combined score.
    """
    # A zero standard deviation triggers an invalid-value division inside
    # corrcoef; silence that warning and handle the NaN explicitly below.
    with np.errstate(divide="ignore", invalid="ignore"):
        corr = np.corrcoef(y_true, y_pred)[0, 1]
    if not np.isfinite(corr):
        return 0.0
    return np.clip(corr, 0, 1)

def competition_score(y_true, y_pred):
    """Combine the two metrics into one score where higher is better.

    The score is the equally weighted sum of ``1 - NRMSE`` (with the NRMSE
    capped at 1) and the clipped Pearson correlation.
    """
    raw_nrmse = normalized_rmse(y_true, y_pred)
    capped_nrmse = 1 if raw_nrmse > 1 else raw_nrmse
    correlation_term = pearson_correlation(y_true, y_pred)
    accuracy_term = 1 - capped_nrmse
    return 0.5 * accuracy_term + 0.5 * correlation_term

# make_scorer wraps the user-defined metric function as a scikit-learn
# compatible scorer; greater_is_better=True marks higher scores as better.
competition_scorer = make_scorer(competition_score, greater_is_better=True)



In [38]:
def objective(trial):
    """Optuna objective: mean 3-fold CV competition score of a random forest.

    Hyperparameters are sampled from ``trial``; the author's notes say the
    ranges follow a reference paper (not cited in this notebook). A sampled
    ``max_depth`` above 40 is treated as "unlimited" and passed to the model
    as ``None``.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyperparameters.

    Returns
    -------
    float
        Mean of the three fold scores computed with ``competition_scorer``.
    """
    # Sample the search space.
    n_estimators = trial.suggest_int('n_estimators', 100, 2000)
    sampled_depth = trial.suggest_int('max_depth', 5, 50)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 20)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 10)
    max_features = trial.suggest_categorical(
        'max_features', ['sqrt', 'log2', 0.1, 0.3, 0.5]
    )

    # Very deep settings are mapped to "no depth limit".
    effective_depth = sampled_depth if sampled_depth <= 40 else None

    model = RandomForestRegressor(
        n_estimators=n_estimators,
        max_depth=effective_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        max_features=max_features,
        bootstrap=True,    # fixed
        oob_score=True,    # also compute the out-of-bag score
        random_state=42,   # reproducibility
        n_jobs=-1,         # parallel tree building
    )

    # 3-fold cross-validation (kept small for speed) with the competition scorer.
    fold_scores = cross_val_score(
        model,
        X_train_scaled,
        y_train,
        cv=3,
        scoring=competition_scorer,
        n_jobs=-1,
    )
    return fold_scores.mean()

In [39]:
# Create the Optuna study (maximise the objective, seeded TPE sampler).
print("Optuna 하이퍼파라미터 최적화 시작...")
tpe_sampler = optuna.samplers.TPESampler(seed=42)
# NOTE(review): objective() contains no trial.report() call, so this pruner
# presumably never prunes anything -- confirm whether it is still wanted.
median_pruner = optuna.pruners.MedianPruner(n_startup_trials=10, n_warmup_steps=5)
study = optuna.create_study(
    direction="maximize",
    sampler=tpe_sampler,
    pruner=median_pruner,
)

# Run the search: up to n_trials trials, limited to 3600 seconds (one hour).
# The author's note cites 100-200 trials as the guideline from a paper.
n_trials = 100
study.optimize(objective, n_trials=n_trials, timeout=3600)

# Summarise the best trial.
print("최적화 완료!")
print(f"최고 점수: {study.best_value:.4f}")
print(f"최적 하이퍼파라미터:")
for param_name, param_value in study.best_params.items():
    print(f"  {param_name}: {param_value}")

[I 2025-07-06 13:36:36,603] A new study created in memory with name: no-name-2eccf4ad-32ed-4efb-9428-ad83b0f5c231


Optuna 하이퍼파라미터 최적화 시작...


[I 2025-07-06 13:36:47,528] Trial 0 finished with value: 0.5921311325416251 and parameters: {'n_estimators': 812, 'max_depth': 48, 'min_samples_split': 15, 'min_samples_leaf': 6, 'max_features': 0.3}. Best is trial 0 with value: 0.5921311325416251.
[I 2025-07-06 13:37:02,767] Trial 1 finished with value: 0.5866816705491664 and parameters: {'n_estimators': 1446, 'max_depth': 5, 'min_samples_split': 20, 'min_samples_leaf': 9, 'max_features': 0.5}. Best is trial 0 with value: 0.5921311325416251.
[I 2025-07-06 13:37:16,476] Trial 2 finished with value: 0.5931111859500033 and parameters: {'n_estimators': 921, 'max_depth': 18, 'min_samples_split': 13, 'min_samples_leaf': 2, 'max_features': 0.3}. Best is trial 2 with value: 0.5931111859500033.
[I 2025-07-06 13:37:29,260] Trial 3 finished with value: 0.593227072891214 and parameters: {'n_estimators': 1077, 'max_depth': 32, 'min_samples_split': 2, 'min_samples_leaf': 7, 'max_features': 0.3}. Best is trial 3 with value: 0.593227072891214.
[I 202

최적화 완료!
최고 점수: 0.5981
최적 하이퍼파라미터:
  n_estimators: 1892
  max_depth: 31
  min_samples_split: 5
  min_samples_leaf: 1
  max_features: 0.1


In [41]:
# Check the tuned configuration on a 20% hold-out split.
print("\n검증 세트로 성능 확인...")
X_train_split, X_val, y_train_split, y_val = train_test_split(
    X_train_scaled, y_train, test_size=0.2, random_state=42
)

# study.best_params holds the raw sampled values, but objective() trains with
# max_depth=None whenever the sampled depth exceeds 40. Apply the same mapping
# here so the validated model is the configuration that was actually scored.
best_rf_params = dict(study.best_params)
if best_rf_params.get('max_depth') is not None and best_rf_params['max_depth'] > 40:
    best_rf_params['max_depth'] = None

best_model = RandomForestRegressor(**best_rf_params, random_state=42, n_jobs=-1)
best_model.fit(X_train_split, y_train_split)

# Score the hold-out predictions with the competition metrics.
y_val_pred = best_model.predict(X_val)
val_score = competition_score(y_val, y_val_pred)
val_nrmse = normalized_rmse(y_val, y_val_pred)
val_pearson = pearson_correlation(y_val, y_val_pred)

print(f"검증 세트 성능:")
print(f"  Competition Score: {val_score:.4f}")
print(f"  NRMSE: {val_nrmse:.4f}")
print(f"  Pearson: {val_pearson:.4f}")



검증 세트로 성능 확인...
검증 세트 성능:
  Competition Score: 0.6205
  NRMSE: 0.2345
  Pearson: 0.4754


In [42]:
# Feature-importance analysis: indices of the 15 largest importances
# (argsort is ascending, so the last 15 entries are the top 15).
feature_importance = best_model.feature_importances_
top_features_idx = np.argsort(feature_importance)[-15:]

print(f"\n상위 15개 중요 특성:")
# Names follow the column order of the feature vector: 18 scalar descriptors,
# 2048 Morgan bits, then the 167 MACCS bits.
feature_names = (['MolWt', 'MolLogP', 'NumHAcceptors', 'NumHDonors', 'TPSA',
                 'NumRotatableBonds', 'NumAromaticRings', 'NumHeteroatoms',
                 'FractionCSP3', 'NumAliphaticRings', 'NumAromaticHeterocycles',
                 'NumSaturatedHeterocycles', 'NumAliphaticHeterocycles',
                 'HeavyAtomCount', 'RingCount', 'NOCount', 'NHOHCount',
                 'NumRadicalElectrons'] +
                [f'Morgan_{i}' for i in range(2048)] +
                [f'MACCS_{i}' for i in range(167)])
# Defensive: if the matrix has more columns than named features, give the
# extra columns generic names instead of failing with an IndexError below.
feature_names += [
    f'feature_{i}' for i in range(len(feature_names), len(feature_importance))
]

# Print from most to least important.
for idx in reversed(top_features_idx):
    print(f"  {feature_names[idx]}: {feature_importance[idx]:.4f}")


상위 15개 중요 특성:
  MolLogP: 0.0663
  MolWt: 0.0324
  HeavyAtomCount: 0.0258
  FractionCSP3: 0.0174
  TPSA: 0.0161
  NumAromaticRings: 0.0126
  NumRotatableBonds: 0.0102
  RingCount: 0.0097
  Morgan_1603: 0.0071
  MACCS_125: 0.0070
  NumHeteroatoms: 0.0064
  NumHAcceptors: 0.0060
  NOCount: 0.0057
  MACCS_136: 0.0057
  MACCS_98: 0.0046


In [43]:
# Final fit on the full (scaled) training set.
print("\n전체 데이터로 최종 모델 훈련...")
# study.best_params holds the raw sampled values, but objective() trains with
# max_depth=None whenever the sampled depth exceeds 40. Apply the same mapping
# so the final model matches the configuration that was actually tuned.
final_rf_params = dict(study.best_params)
if final_rf_params.get('max_depth') is not None and final_rf_params['max_depth'] > 40:
    final_rf_params['max_depth'] = None

final_model = RandomForestRegressor(**final_rf_params, random_state=42, n_jobs=-1)
final_model.fit(X_train_scaled, y_train)


전체 데이터로 최종 모델 훈련...


0,1,2
,n_estimators,1892
,criterion,'squared_error'
,max_depth,31
,min_samples_split,5
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,0.1
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [46]:
# Predict the test set with the final model.
print("테스트 데이터 예측 중...")
test_preds = final_model.predict(X_test_scaled)

# Write the predictions into the submission frame and save it as CSV.
submission['Inhibition'] = test_preds
submission.to_csv('random_forest_optuna_submission.csv', index=False)

print(f"\n=== 최종 결과 ===")
# Report the number of trials the study actually ran: the optimisation also
# has a time limit, so it can stop before the requested n_trials is reached.
print(f"시도 횟수: {len(study.trials)}")
print(f"최고 CV 점수: {study.best_value:.4f}")
print(f"검증 점수: {val_score:.4f}")
print("예측 결과 저장: random_forest_optuna_submission.csv")

테스트 데이터 예측 중...

=== 최종 결과 ===
시도 횟수: 100
최고 CV 점수: 0.5981
검증 점수: 0.6205
예측 결과 저장: random_forest_optuna_submission.csv
