In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from rdkit import Chem
from rdkit.Chem import Descriptors, Lipinski, MACCSkeys, AllChem
import warnings
warnings.filterwarnings('ignore')



In [2]:
# Load the training table, the test table and the submission template from
# the current working directory, then report the shapes of train and test.
print("데이터 로딩 중...")
train, test, submission = (
    pd.read_csv(csv_name)
    for csv_name in ("train.csv", "test.csv", "sample_submission.csv")
)

print(f"훈련 데이터 : {train.shape}")
print(f"테스트 데이터 : {test.shape}")



데이터 로딩 중...
훈련 데이터 : (1681, 3)
테스트 데이터 : (100, 2)


In [3]:
def get_molecule_descriptors(smiles):
    """Convert a SMILES string into a fixed-length numeric feature vector.

    The vector is the concatenation, in this order, of:
      * 18 scalar RDKit descriptors (weight, logP, H-bond counts, TPSA,
        ring counts, ...),
      * a 2048-bit Morgan fingerprint computed with radius argument 2,
      * the MACCS key fingerprint (167 bits).

    Args:
        smiles: SMILES string describing one molecule.

    Returns:
        list: 2233 numbers. If the SMILES cannot be parsed, or any RDKit
        call raises, an all-zero list of the *same* length is returned so
        every row has an identical width.
    """
    n_basic = 18        # number of scalar descriptors listed below
    n_morgan_bits = 2048
    n_maccs_bits = 167  # 18 + 2048 + 167 = 2233, the width of a real vector
    n_features = n_basic + n_morgan_bits + n_maccs_bits

    try:
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            # Unparseable SMILES: keep the row, but with neutral features.
            return [0] * n_features

        basic_descriptors = [
            Descriptors.MolWt(mol),
            Descriptors.MolLogP(mol),
            Descriptors.NumHAcceptors(mol),
            Descriptors.NumHDonors(mol),
            Descriptors.TPSA(mol),
            Descriptors.NumRotatableBonds(mol),
            Descriptors.NumAromaticRings(mol),
            Descriptors.NumHeteroatoms(mol),
            Descriptors.FractionCSP3(mol),
            Descriptors.NumAliphaticRings(mol),
            Lipinski.NumAromaticHeterocycles(mol),
            Lipinski.NumSaturatedHeterocycles(mol),
            Lipinski.NumAliphaticHeterocycles(mol),
            Descriptors.HeavyAtomCount(mol),
            Descriptors.RingCount(mol),
            Descriptors.NOCount(mol),
            Descriptors.NHOHCount(mol),
            Descriptors.NumRadicalElectrons(mol),
        ]

        morgan_fp = AllChem.GetMorganFingerprintAsBitVect(
            mol, 2, nBits=n_morgan_bits
        )
        morgan_features = [int(bit) for bit in morgan_fp.ToBitString()]

        maccs_fp = MACCSkeys.GenMACCSKeys(mol)
        maccs_features = [int(bit) for bit in maccs_fp.ToBitString()]

        return basic_descriptors + morgan_features + maccs_features
    except Exception:
        # Best-effort featurisation: a molecule RDKit chokes on becomes a zero
        # row. `Exception` (not a bare except) keeps Ctrl-C / SystemExit
        # working while the function is applied over a whole column.
        return [0] * n_features

print("분자 특성 추출 중...")

# Featurise every molecule; each cell of the new 'features' column holds the
# list returned by get_molecule_descriptors for that row's SMILES.
train['features'] = train['Canonical_Smiles'].apply(get_molecule_descriptors)
test['features'] = test['Canonical_Smiles'].apply(get_molecule_descriptors)

# Pull the per-row lists out of the frames.
X_train_list = train['features'].tolist()
X_test_list = test['features'].tolist()

# Right-pad shorter vectors with zeros so all rows share the longest length
# seen in either table.
max_length = max(max(map(len, X_train_list)), max(map(len, X_test_list)))
X_train_list = [row + [0] * (max_length - len(row)) for row in X_train_list]
X_test_list = [row + [0] * (max_length - len(row)) for row in X_test_list]

# Dense matrices for the models, plus the regression target.
X_train = np.array(X_train_list)
X_test = np.array(X_test_list)
y_train = train['Inhibition'].values

print(f"특성 차원: {X_train.shape[1]}")



분자 특성 추출 중...




특성 차원: 2233


In [4]:
# Scaling (used for the Random Forest inputs).
# `scaler` is fitted on the complete training matrix and is left in that
# state, so X_train_scaled / X_test_scaled always agree with `scaler`.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Hold out 20% of the training rows for validation.
X_train_split, X_val, y_train_split, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42
)

# The hold-out experiment gets its own scaler, fitted on the training part of
# the split only. Refitting the shared `scaler` here would silently change
# its statistics and make it inconsistent with the arrays computed above.
split_scaler = StandardScaler()
X_train_split_scaled = split_scaler.fit_transform(X_train_split)
X_val_scaled = split_scaler.transform(X_val)



In [5]:
# 평가 함수들
def normalized_rmse(y_true, y_pred):
    """Root-mean-squared error divided by the value range of ``y_true``.

    Args:
        y_true: array-like of observed target values.
        y_pred: array-like of predictions with the same number of elements.

    Returns:
        ``RMSE / (max(y_true) - min(y_true))``. When ``y_true`` has zero
        range the ratio is undefined; in that case 0.0 is returned for a
        perfect prediction and ``inf`` otherwise (instead of a NaN or a
        division-by-zero warning).

    Raises:
        ValueError: if the two inputs do not contain the same number of
            elements, or if they are empty.
    """
    y_true = np.asarray(y_true, dtype=float).ravel()
    y_pred = np.asarray(y_pred, dtype=float).ravel()
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"y_true and y_pred differ in size: {y_true.size} vs {y_pred.size}"
        )
    if y_true.size == 0:
        raise ValueError("y_true and y_pred must not be empty")

    rmse = np.sqrt(np.mean((y_true - y_pred) ** 2))
    value_range = np.max(y_true) - np.min(y_true)
    if value_range == 0:
        # Constant target: normalisation is undefined.
        return 0.0 if rmse == 0 else float("inf")
    return rmse / value_range

def pearson_correlation(y_true, y_pred):
    """Pearson correlation between targets and predictions, clipped to [0, 1].

    Args:
        y_true: array-like of observed target values.
        y_pred: array-like of predictions, same length as ``y_true``.

    Returns:
        The correlation coefficient with negative values raised to 0.
        If the coefficient is undefined (either input is constant, so
        ``np.corrcoef`` yields NaN) 0.0 is returned, so a degenerate
        prediction scores as "no correlation" instead of propagating NaN.
    """
    # A constant input triggers a 0/0 inside corrcoef; silence that warning
    # and handle the resulting NaN explicitly below.
    with np.errstate(divide="ignore", invalid="ignore"):
        corr = np.corrcoef(y_true, y_pred)[0, 1]
    if np.isnan(corr):
        return 0.0
    return np.clip(corr, 0, 1)

def competition_score(y_true, y_pred):
    """Combine error and correlation into a single score.

    The score is the equally weighted sum of two terms:
    ``1 - min(normalized_rmse, 1)`` and the value returned by
    ``pearson_correlation``.

    Args:
        y_true: observed target values.
        y_pred: predicted values.

    Returns:
        ``0.5 * (1 - capped NRMSE) + 0.5 * correlation term``.
    """
    capped_error = min(normalized_rmse(y_true, y_pred), 1)
    correlation_term = pearson_correlation(y_true, y_pred)
    return 0.5 * (1 - capped_error) + 0.5 * correlation_term



In [6]:
print("최적 파라미터로 개별 모델 훈련 중...")

# XGBoost with the hyper-parameters the author labels as optimal (the search
# that produced them is not part of this notebook). Fitted on unscaled data.
xgb_params = dict(
    n_estimators=1390,
    max_depth=3,
    learning_rate=0.010284177423073436,
    subsample=0.700137063241384,
    colsample_bytree=0.7311486894133992,
    min_child_weight=2,
    gamma=0.6298588544619622,
    reg_alpha=1.0051694518581773,
    reg_lambda=4.30863619181162,
    random_state=42,
    n_jobs=-1,
)

xgb_model = xgb.XGBRegressor(**xgb_params)
xgb_model.fit(X_train_split, y_train_split)

# Random Forest with its own tuned settings. Fitted on the scaled split.
# oob_score=True makes the out-of-bag estimate available after fitting.
rf_params = dict(
    n_estimators=1892,
    max_depth=31,
    min_samples_split=5,
    min_samples_leaf=1,
    max_features=0.1,
    bootstrap=True,
    oob_score=True,
    random_state=42,
    n_jobs=-1,
)

rf_model = RandomForestRegressor(**rf_params)
rf_model.fit(X_train_split_scaled, y_train_split)

# Score both models on the hold-out rows with the competition metric.
xgb_pred = xgb_model.predict(X_val)
xgb_score = competition_score(y_val, xgb_pred)

rf_pred = rf_model.predict(X_val_scaled)
rf_score = competition_score(y_val, rf_pred)

print("개별 모델 검증 성능:")
print(f"XGBoost Score: {xgb_score:.4f}")
print(f"Random Forest Score: {rf_score:.4f}")
# NOTE(review): oob_score_ is scikit-learn's own out-of-bag estimate, not the
# competition metric, so it is not directly comparable to the lines above.
print(f"Random Forest OOB Score: {rf_model.oob_score_:.4f}")



최적 파라미터로 개별 모델 훈련 중...
개별 모델 검증 성능:
XGBoost Score: 0.6467
Random Forest Score: 0.6204
Random Forest OOB Score: 0.1780


In [7]:
# Ensemble weight optimisation (simple grid search on the hold-out set).
print("\n앙상블 가중치 최적화 중...")
# Start below any reachable score so the first evaluated blend is always
# recorded, even if every score happens to be 0.
best_ensemble_score = -np.inf
best_weights = (0.5, 0.5)

# XGBoost weights 0.30, 0.35, ..., 0.75. np.linspace fixes the number of grid
# points explicitly; np.arange with a float step can gain or lose the last
# point through rounding. The pure single-model blends (weight 0 or 1) are
# not part of this grid, so the best blend may score below the best single
# model.
for xgb_weight in np.linspace(0.3, 0.75, 10):
    rf_weight = 1 - xgb_weight

    ensemble_pred = xgb_weight * xgb_pred + rf_weight * rf_pred
    ensemble_score = competition_score(y_val, ensemble_pred)

    if ensemble_score > best_ensemble_score:
        best_ensemble_score = ensemble_score
        best_weights = (xgb_weight, rf_weight)

print(f"최적 앙상블 가중치: XGBoost {best_weights[0]:.3f}, Random Forest {best_weights[1]:.3f}")
print(f"앙상블 검증 점수: {best_ensemble_score:.4f}")

# Performance comparison.
print(f"\n=== 성능 비교 ===")
print(f"XGBoost 단독:     {xgb_score:.4f}")
print(f"Random Forest 단독: {rf_score:.4f}")
print(f"앙상블:           {best_ensemble_score:.4f}")
# The '+' format flag prints the real sign; a hard-coded '+' prefix rendered
# a negative gain as "+-0.0019".
print(f"앙상블 향상:      {best_ensemble_score - max(xgb_score, rf_score):+.4f}")




앙상블 가중치 최적화 중...
최적 앙상블 가중치: XGBoost 0.750, Random Forest 0.250
앙상블 검증 점수: 0.6448

=== 성능 비교 ===
XGBoost 단독:     0.6467
Random Forest 단독: 0.6204
앙상블:           0.6448
앙상블 향상:      +-0.0019


In [8]:
# Refit both models on the complete training set before predicting on test.
print("\n전체 데이터로 최종 모델 훈련 중...")

# Re-estimate the scaling statistics from every training row, then apply the
# same statistics to the test matrix.
X_train_final_scaled = scaler.fit_transform(X_train)
X_test_final_scaled = scaler.transform(X_test)

# Final XGBoost model: same hyper-parameters, unscaled features.
final_xgb = xgb.XGBRegressor(**xgb_params)
final_xgb.fit(X_train, y_train)

# Final Random Forest model: same hyper-parameters, scaled features.
# The fit call is the cell's last expression, so its repr is displayed.
final_rf = RandomForestRegressor(**rf_params)
final_rf.fit(X_train_final_scaled, y_train)




전체 데이터로 최종 모델 훈련 중...


0,1,2
,n_estimators,1892
,criterion,'squared_error'
,max_depth,31
,min_samples_split,5
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,0.1
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [None]:
# Predict on the test set with both final models.
print("테스트 데이터 예측 중...")
xgb_test_pred = final_xgb.predict(X_test)
rf_test_pred = final_rf.predict(X_test_final_scaled)

# Blend the two predictions with the weights chosen on the hold-out set.
ensemble_test_pred = best_weights[0] * xgb_test_pred + best_weights[1] * rf_test_pred

# Write the blended prediction into the submission template and save it.
# NOTE(review): this assumes the template rows are in the same order as
# test.csv - confirm against the competition files.
submission['Inhibition'] = ensemble_test_pred
submission.to_csv('final_submission.csv', index=False)

# Summary. The reported score is the hold-out score of the chosen blend.
print(f"\n=== 최종 결과 ===")
print(f"최적 앙상블 가중치: XGBoost {best_weights[0]:.3f}, Random Forest {best_weights[1]:.3f}")
print(f"최종 앙상블 점수: {best_ensemble_score:.4f}")
print(f"예측 범위: {ensemble_test_pred.min():.2f} ~ {ensemble_test_pred.max():.2f}")
print(f"실제 데이터 범위: {y_train.min():.2f} ~ {y_train.max():.2f}")
print("✅ 최종 제출 파일 저장: final_submission.csv")

테스트 데이터 예측 중...

=== 최종 결과 ===
XGBoost 파라미터 개수: 11
Random Forest 파라미터 개수: 9
최적 앙상블 가중치: 0.750 : 0.250
최종 앙상별 점수: 0.6448
앙상블 결과 저장: ensemble_xgb_rf_submission.csv


In [10]:
# Prediction analysis: compare the spread of each model's test predictions
# with the spread of the training target.
print(f"\n=== 예측값 분석 ===")
print(f"XGBoost 예측 범위: {xgb_test_pred.min():.2f} ~ {xgb_test_pred.max():.2f}")
print(f"Random Forest 예측 범위: {rf_test_pred.min():.2f} ~ {rf_test_pred.max():.2f}")
print(f"앙상블 예측 범위: {ensemble_test_pred.min():.2f} ~ {ensemble_test_pred.max():.2f}")
print(f"실제 데이터 범위: {y_train.min():.2f} ~ {y_train.max():.2f}")

# Also save each single-model prediction (for comparison).
submission_xgb = submission.copy()
submission_xgb['Inhibition'] = xgb_test_pred
submission_xgb.to_csv('xgb_final_submission.csv', index=False)

submission_rf = submission.copy()
submission_rf['Inhibition'] = rf_test_pred
submission_rf.to_csv('rf_final_submission.csv', index=False)

print("\n개별 모델 결과도 저장:")
print("- xgb_final_submission.csv")
print("- rf_final_submission.csv")
# The ensemble is written to final_submission.csv by the prediction cell;
# the previous message named a file that the notebook never creates.
print("- final_submission.csv (메인)")


=== 예측값 분석 ===
XGBoost 예측 범위: 11.74 ~ 65.27
Random Forest 예측 범위: 18.05 ~ 55.59
앙상블 예측 범위: 13.76 ~ 62.16
실제 데이터 범위: 0.00 ~ 99.38

개별 모델 결과도 저장:
- xgb_final_submission.csv
- rf_final_submission.csv
- ensemble_xgb_rf_submission.csv (메인)
