In [1]:
!pip install /kaggle/input/rdkit-2025-3-3-cp311/rdkit-2025.3.3-cp311-cp311-manylinux_2_28_x86_64.whl
!pip install /kaggle/input/optuna/optuna_integration-4.2.1-py3-none-any.whl

Processing /kaggle/input/rdkit-2025-3-3-cp311/rdkit-2025.3.3-cp311-cp311-manylinux_2_28_x86_64.whl
Installing collected packages: rdkit
Successfully installed rdkit-2025.3.3
Processing /kaggle/input/optuna/optuna_integration-4.2.1-py3-none-any.whl
Installing collected packages: optuna-integration
Successfully installed optuna-integration-4.2.1


In [None]:
# ====================================================
# Cell 2: Libraries and basic setup
# ====================================================
print("라이브러리 임포트 및 기본 설정 중...")

# --- Data handling and numerics ---
import pandas as pd
import numpy as np
import joblib # model / object saving and loading

# --- System and utilities ---
from tqdm.auto import tqdm
import random
import os
import gc # garbage collection: memory management
import warnings

# --- RDKit: cheminformatics ---
from rdkit import Chem, rdBase
from rdkit.Chem import Descriptors, AllChem
from rdkit.Chem.rdMolDescriptors import GetMorganFingerprintAsBitVect

# --- Machine-learning models ---
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.metrics import mean_absolute_error
from sklearn.feature_selection import SelectKBest, mutual_info_regression
from catboost import CatBoostRegressor

# --- Hyperparameter optimisation ---
import optuna

# --- Silence warnings ---
# Turns off RDKit's 'rdApp.warning' log channel and all Python warnings.
rdBase.DisableLog('rdApp.warning') 
warnings.filterwarnings('ignore')

print("임포트 및 기본 설정 완료.")
print("-" * 50)


# ====================================================
# 셀 3: 환경 설정 (Configuration) 및 시드 고정
# ====================================================
class CFG:
    """Static configuration shared by every stage of the notebook.

    All values are class attributes and are read as ``CFG.<NAME>``; the class
    is never instantiated.
    """

    # --- Input data (Kaggle competition files) ---
    TRAIN_PATH = "/kaggle/input/neurips-open-polymer-prediction-2025/train.csv"
    TEST_PATH = "/kaggle/input/neurips-open-polymer-prediction-2025/test.csv"

    # --- Feature cache written by the preprocessing step ---
    CACHE_DIR = "/kaggle/working/cache/"
    TRAIN_FEAT_PATH = os.path.join(CACHE_DIR, "train_features_final.pkl")
    TEST_FEAT_PATH = os.path.join(CACHE_DIR, "test_features_final.pkl")

    # --- Compute device: CUDA when available, otherwise CPU ---
    DEVICE = (
        torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    )

    # --- Cross-validation / modelling ---
    RANDOM_STATE = 42
    N_FOLDS = 10
    N_FEATURES = 1000
    TARGETS = ["Tg", "FFV", "Tc", "Density", "Rg"]

    # --- Neural-network training ---
    NN_EPOCHS = 100
    NN_BATCH_SIZE = 64
    NN_LR = 0.0001

# Report which torch device CFG selected (CUDA if available, otherwise CPU).
print(f"디바이스 설정: {CFG.DEVICE}")

# Fix every random seed for reproducibility.
def seed_everything(seed):
    """Seed Python, NumPy and PyTorch random generators with ``seed``.

    Also exports ``PYTHONHASHSEED`` and, when CUDA is available, seeds the
    CUDA generator and switches cuDNN to deterministic, non-benchmark mode.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)

    # Nothing GPU-specific to configure on CPU-only machines.
    if not torch.cuda.is_available():
        return

    torch.cuda.manual_seed(seed)
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

# Seed all random generators with the configured seed before any model or
# data-split code runs.
seed_everything(CFG.RANDOM_STATE)
print(f"시드 값 {CFG.RANDOM_STATE}로 고정 완료.")
# Create the feature-cache directory; exist_ok avoids an error when it already exists.
os.makedirs(CFG.CACHE_DIR, exist_ok=True)
print(f"캐시 디렉토리 '{CFG.CACHE_DIR}' 생성/확인 완료.")
print("-" * 50)


# ====================================================
# 셀 4: 특성 공학 클래스 정의
# ====================================================
class AdvancedFeaturizer:
    """Turn SMILES strings into RDKit descriptor + Morgan fingerprint features.

    Each molecule is described by every descriptor in ``Descriptors.descList``
    followed by a Morgan bit fingerprint. On the first call that supplies
    targets, a ``SelectKBest`` selector (mutual information against target
    column 1) is fitted and then reused for every later call.

    Attributes:
        n_features: Upper bound on the number of features kept by selection.
        feature_selector: Fitted ``SelectKBest`` instance, or ``None`` until
            a selector has been fitted.
        base_feature_names: Column names of the unselected feature matrix.
    """

    # Morgan fingerprint settings, shared by extraction and column naming.
    MORGAN_RADIUS = 2
    MORGAN_BITS = 1024

    def __init__(self, n_features=1000):
        self.n_features = n_features
        self.feature_selector = None
        self.base_feature_names = []

    def _n_base_features(self):
        """Return the raw feature-vector length for the installed RDKit.

        The number of entries in ``Descriptors.descList`` differs between
        RDKit releases, so it is measured rather than hard-coded.
        """
        return len(Descriptors.descList) + self.MORGAN_BITS

    def _get_features(self, smiles):
        """Return the float32 feature vector for one SMILES string.

        An all-zero vector of the same length is returned when the SMILES
        cannot be parsed or when any descriptor computation raises.
        """
        n_base = self._n_base_features()
        try:
            mol = Chem.MolFromSmiles(smiles)
            if mol is None:
                return np.zeros(n_base, dtype=np.float32)

            descriptors = [func(mol) for _, func in Descriptors.descList]
            morgan_fp = list(
                GetMorganFingerprintAsBitVect(
                    mol, self.MORGAN_RADIUS, nBits=self.MORGAN_BITS
                )
            )
            return np.array(descriptors + morgan_fp, dtype=np.float32)
        except Exception:
            # Best effort: one bad molecule must not abort the whole run, but
            # KeyboardInterrupt/SystemExit are no longer swallowed.
            return np.zeros(n_base, dtype=np.float32)

    @staticmethod
    def _sanitize(features_df):
        """Coerce columns to numeric and replace NaN / +-inf with 0."""
        numeric = features_df.apply(pd.to_numeric, errors='coerce')
        return numeric.replace([np.inf, -np.inf], np.nan).fillna(0)

    def fit_transform(self, df, targets=None):
        """Extract features for ``df['SMILES']`` and apply feature selection.

        Args:
            df: DataFrame with a ``SMILES`` column.
            targets: Optional 2-D float array aligned with ``df`` rows. Column
                1 is used as the selection target; rows where it is NaN are
                ignored when fitting the selector.

        Returns:
            DataFrame of features; reduced to the selected columns once a
            selector has been fitted.
        """
        # Build the base column names once.
        if not self.base_feature_names:
            descriptor_names = [name for name, _ in Descriptors.descList]
            morgan_names = [f'morgan_{i}' for i in range(self.MORGAN_BITS)]
            self.base_feature_names = descriptor_names + morgan_names

        rows = [self._get_features(s) for s in tqdm(df['SMILES'], desc="특성 추출 중")]
        # The reshape keeps a well-formed (0, n_columns) matrix for empty input.
        features = np.asarray(rows, dtype=np.float32).reshape(
            -1, len(self.base_feature_names)
        )
        features_df = self._sanitize(
            pd.DataFrame(features, columns=self.base_feature_names)
        )

        # Fit the selector only once, and only when targets are supplied.
        if targets is not None and self.feature_selector is None:
            print(f"{self.n_features}개 특성 선택...")
            ffv_mask = ~np.isnan(targets[:, 1])
            # Selection is skipped unless more than 100 labelled rows exist.
            if ffv_mask.sum() > 100:
                self.feature_selector = SelectKBest(
                    mutual_info_regression,
                    k=min(self.n_features, features_df.shape[1]),
                )
                self.feature_selector.fit(features_df.loc[ffv_mask], targets[ffv_mask, 1])

        if self.feature_selector is not None:
            features_df = pd.DataFrame(
                self.feature_selector.transform(features_df),
                columns=self.feature_selector.get_feature_names_out(),
            )

        return features_df

# Progress message: the AdvancedFeaturizer class is now defined.
print("특성 공학 클래스 'AdvancedFeaturizer' 정의 완료.")
print("-" * 50)


# ====================================================
# 셀 5: PyTorch 모델 및 데이터셋 정의
# ====================================================
class PolymerDataset(Dataset):
    """Dataset pairing feature rows with target rows as float32 tensors."""

    def __init__(self, features, targets):
        """Copy ``features`` and ``targets`` into float32 tensors."""
        float32 = torch.float32
        self.features = torch.tensor(features, dtype=float32)
        self.targets = torch.tensor(targets, dtype=float32)

    def __len__(self):
        """Return the number of feature rows."""
        return len(self.features)

    def __getitem__(self, idx):
        """Return the ``(features, targets)`` pair stored at ``idx``."""
        sample = self.features[idx]
        label = self.targets[idx]
        return sample, label

class PolymerTransformer(nn.Module):
    """Transformer encoder over a single projected token, one linear head per target.

    The input vector is projected to ``embed_dim``, passed through the encoder
    as a length-1 sequence, and fed to one output head per entry of
    ``CFG.TARGETS``.
    """

    def __init__(self, input_dim, embed_dim=256, num_heads=8, num_layers=4, dropout=0.1):
        super().__init__()
        self.input_proj = nn.Linear(input_dim, embed_dim)
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=embed_dim,
            nhead=num_heads,
            dropout=dropout,
            batch_first=True,
        )
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        n_targets = len(CFG.TARGETS)
        self.output_heads = nn.ModuleList(
            [nn.Linear(embed_dim, 1) for _ in range(n_targets)]
        )

    def forward(self, x):
        """Return a dict mapping each target name to its prediction tensor."""
        # Insert a sequence axis of length 1 so the encoder sees one token per row.
        tokens = self.input_proj(x.unsqueeze(1))
        encoded = self.transformer(tokens).squeeze(1)

        predictions = {}
        for idx, head in enumerate(self.output_heads):
            predictions[CFG.TARGETS[idx]] = head(encoded).squeeze(-1)
        return predictions

# Progress message: the PyTorch dataset and model classes are now defined.
print("PyTorch 관련 클래스 및 함수 정의 완료.")
print("-" * 50)

# ====================================================
# 셀 6: 데이터 로딩 및 전처리 실행
# ====================================================
def get_processed_data():
    """Load the train/test CSVs and return them with their feature matrices.

    Features are loaded from the joblib cache files when both exist and their
    row counts match the CSVs. Otherwise they are recomputed with
    ``AdvancedFeaturizer`` over the concatenated train + test SMILES and
    written back to the cache.

    Returns:
        Tuple ``(train_df, test_df, train_features, test_features)``.
    """
    cache_ready = os.path.exists(CFG.TRAIN_FEAT_PATH) and os.path.exists(CFG.TEST_FEAT_PATH)
    print("캐시된 특성 파일 로딩..." if cache_ready else "데이터 로드 및 전처리 시작...")

    train_df = pd.read_csv(CFG.TRAIN_PATH)
    test_df = pd.read_csv(CFG.TEST_PATH)

    if cache_ready:
        # NOTE: joblib.load unpickles; only use it on cache files this notebook wrote itself.
        train_features = joblib.load(CFG.TRAIN_FEAT_PATH)
        test_features = joblib.load(CFG.TEST_FEAT_PATH)
        # A cache built from different CSVs (other row counts) must not be reused.
        if len(train_features) == len(train_df) and len(test_features) == len(test_df):
            return train_df, test_df, train_features, test_features
        print("캐시 크기가 데이터와 일치하지 않아 특성을 다시 계산합니다.")

    all_smiles = pd.concat([train_df[['SMILES']], test_df[['SMILES']]], ignore_index=True)
    featurizer = AdvancedFeaturizer(n_features=CFG.N_FEATURES)
    train_targets_for_selection = train_df[CFG.TARGETS].values.astype(np.float32)

    # Test rows get all-NaN targets so the featurizer's NaN mask excludes them
    # from selector fitting; the width follows the configured target list.
    test_placeholder = np.full((len(test_df), len(CFG.TARGETS)), np.nan)
    all_features = featurizer.fit_transform(
        all_smiles,
        np.vstack([train_targets_for_selection, test_placeholder]),
    )

    n_train = len(train_df)
    train_features = all_features.iloc[:n_train]
    # Re-base the test index at 0 so it lines up with test_df.
    test_features = all_features.iloc[n_train:].reset_index(drop=True)

    joblib.dump(train_features, CFG.TRAIN_FEAT_PATH)
    joblib.dump(test_features, CFG.TEST_FEAT_PATH)

    return train_df, test_df, train_features, test_features

# Load (or compute) the raw tables and feature matrices used by all later
# stages, then report the feature-matrix shapes.
train_df, test_df, train_features, test_features = get_processed_data()
print(f"훈련 특성 데이터 형태: {train_features.shape}")
print(f"테스트 특성 데이터 형태: {test_features.shape}")
print("-" * 50)


# ====================================================
# Cell 7: Stage 1 - train CatBoost models to generate pseudo-labels
# ====================================================
# For every target, K-fold CatBoost models are trained on the rows that have a
# label; the missing labels are then filled with the mean of the fold models'
# predictions. The result is stored in `pseudo_train_df`.
print("===== 1단계: 유사 레이블 생성 모델 훈련 시작 =====")
pseudo_train_df = train_df.copy()
catboost_oof_maes = {}

for target in CFG.TARGETS:
    print(f"\n--- 유사 레이블 생성을 위한 '{target}' 모델 훈련 ---")
    
    # Split rows into labelled (used for training) and unlabelled (to be predicted).
    not_na_idx = train_df[target].notna()
    X = train_features[not_na_idx].values
    y = train_df.loc[not_na_idx, target].values
    X_to_predict = train_features[~not_na_idx].values
    
    kf = KFold(n_splits=CFG.N_FOLDS, shuffle=True, random_state=CFG.RANDOM_STATE)
    oof_preds = np.zeros(len(X))
    pseudo_labels = []  # one prediction array per fold for the unlabelled rows
    
    for fold, (train_idx, val_idx) in enumerate(kf.split(X, y)):
        X_train, X_val = X[train_idx], X[val_idx]
        y_train, y_val = y[train_idx], y[val_idx]

        # The scaler is fitted on the training fold only and reused for
        # validation and unlabelled rows.
        scaler = StandardScaler()
        X_train_s = scaler.fit_transform(X_train)
        X_val_s = scaler.transform(X_val)
        
        # NOTE(review): the validation fold also drives early stopping, so the
        # OOF MAE reported below is likely optimistic.
        model = CatBoostRegressor(iterations=4000, learning_rate=0.02, depth=8, loss_function='MAE', verbose=0, task_type='GPU' if CFG.DEVICE.type=='cuda' else 'CPU')
        model.fit(X_train_s, y_train, eval_set=[(X_val_s, y_val)], early_stopping_rounds=100, verbose=0)
        
        oof_preds[val_idx] = model.predict(X_val_s)
        
        # Predict the unlabelled rows with this fold's model (skipped when there are none).
        if X_to_predict.shape[0] > 0:
            X_to_predict_s = scaler.transform(X_to_predict)
            pseudo_labels.append(model.predict(X_to_predict_s))

    catboost_oof_maes[target] = mean_absolute_error(y, oof_preds)
    print(f"'{target}' 모델 OOF MAE (유사 레이블 생성용): {catboost_oof_maes[target]:.5f}")
    
    # Fill the missing labels with the average prediction across folds.
    if len(pseudo_labels) > 0:
        pseudo_train_df.loc[~not_na_idx, target] = np.mean(pseudo_labels, axis=0)

print("\n유사 레이블 생성 완료. 훈련 데이터가 보강되었습니다.")
print(f"보강 후 훈련 데이터 NaN 개수:\n{pseudo_train_df[CFG.TARGETS].isnull().sum()}")
print("-" * 50)


# ====================================================
# Cell 8: Stage 2 - train the final ensemble models (CatBoost and Transformer)
# ====================================================
# Both model families are trained with K-fold cross-validation on the
# pseudo-label-augmented targets. Each produces out-of-fold (OOF) predictions
# for the training rows and fold-averaged predictions for the test rows.
print("===== 2단계: 최종 앙상블 모델 훈련 시작 =====")

# Containers for the final predictions.
oof_catboost_final = pd.DataFrame(index=train_df.index)
oof_transformer_final = pd.DataFrame(index=train_df.index)
submission_catboost = pd.DataFrame({'id': test_df['id']})
submission_transformer = pd.DataFrame({'id': test_df['id']})

# Augmented targets (measured labels plus stage-1 pseudo-labels).
final_train_targets = pseudo_train_df[CFG.TARGETS].values

# Loop-invariant values, computed once instead of inside every fold.
train_matrix = train_features.values
test_matrix = test_features.values
catboost_task_type = 'GPU' if CFG.DEVICE.type == 'cuda' else 'CPU'


def _predict_targets(model, X):
    """Run ``model`` on ``X`` without gradients; return an (n_rows, n_targets) float64 array."""
    model.eval()
    with torch.no_grad():
        outputs = model(torch.tensor(X, dtype=torch.float32).to(CFG.DEVICE))
    columns = [outputs[t].cpu().numpy() for t in CFG.TARGETS]
    return np.column_stack(columns).astype(np.float64)


# --- 2.1 CatBoost models (one per target, trained on the augmented data) ---
for target_idx, target in enumerate(CFG.TARGETS):
    print(f"\n--- 최종 CatBoost '{target}' 모델 훈련 ---")
    kf = KFold(n_splits=CFG.N_FOLDS, shuffle=True, random_state=CFG.RANDOM_STATE)
    y_target = final_train_targets[:, target_idx]
    oof_preds = np.zeros(len(train_matrix))
    test_preds_folds = []

    for fold, (train_idx, val_idx) in enumerate(kf.split(train_matrix, y_target)):
        X_train, X_val = train_matrix[train_idx], train_matrix[val_idx]
        y_train, y_val = y_target[train_idx], y_target[val_idx]

        # Scaler statistics come from the training fold only.
        scaler = StandardScaler()
        X_train_s = scaler.fit_transform(X_train)
        X_val_s = scaler.transform(X_val)

        model = CatBoostRegressor(iterations=4000, learning_rate=0.02, depth=8, loss_function='MAE', verbose=0, task_type=catboost_task_type)
        model.fit(X_train_s, y_train, eval_set=[(X_val_s, y_val)], early_stopping_rounds=100, verbose=0)

        oof_preds[val_idx] = model.predict(X_val_s)
        test_preds_folds.append(model.predict(scaler.transform(test_matrix)))

    oof_catboost_final[target] = oof_preds
    submission_catboost[target] = np.mean(test_preds_folds, axis=0)
    print(f"최종 CatBoost '{target}' 모델 OOF MAE: {mean_absolute_error(y_target, oof_preds):.5f}")

# --- 2.2 Transformer model (multi-target, trained on the augmented data) ---
print("\n--- 최종 Transformer 모델 훈련 ---")
kf = KFold(n_splits=CFG.N_FOLDS, shuffle=True, random_state=CFG.RANDOM_STATE)
oof_preds_nn = np.zeros_like(final_train_targets)
test_preds_folds_nn = []

for fold, (train_idx, val_idx) in enumerate(kf.split(train_matrix, final_train_targets)):
    print(f"  Fold {fold+1}/{CFG.N_FOLDS} 훈련 중...")
    X_train, X_val = train_matrix[train_idx], train_matrix[val_idx]
    y_train = final_train_targets[train_idx]

    scaler = StandardScaler()
    X_train_s = scaler.fit_transform(X_train)
    X_val_s = scaler.transform(X_val)

    train_loader = DataLoader(PolymerDataset(X_train_s, y_train), batch_size=CFG.NN_BATCH_SIZE, shuffle=True)

    # Size the input layer from the actual feature matrix: the selected
    # feature count can differ from CFG.N_FEATURES (e.g. when selection is
    # skipped or fewer columns exist), which would otherwise crash here.
    model = PolymerTransformer(input_dim=train_matrix.shape[1]).to(CFG.DEVICE)
    optimizer = torch.optim.AdamW(model.parameters(), lr=CFG.NN_LR)
    criterion = nn.L1Loss()  # MAE loss

    # Fixed-length training: no validation-based early stopping or checkpointing.
    # NOTE(review): targets are on very different scales and the per-target
    # losses are summed unscaled - confirm this weighting is intended.
    for epoch in range(CFG.NN_EPOCHS):
        model.train()
        for batch_x, batch_y in train_loader:
            batch_x, batch_y = batch_x.to(CFG.DEVICE), batch_y.to(CFG.DEVICE)
            optimizer.zero_grad()
            outputs = model(batch_x)
            loss = sum(criterion(outputs[t], batch_y[:, i]) for i, t in enumerate(CFG.TARGETS))
            loss.backward()
            optimizer.step()

    oof_preds_nn[val_idx] = _predict_targets(model, X_val_s)
    test_preds_folds_nn.append(_predict_targets(model, scaler.transform(test_matrix)))

oof_transformer_final[CFG.TARGETS] = oof_preds_nn
submission_transformer[CFG.TARGETS] = np.mean(test_preds_folds_nn, axis=0)

print("\n최종 모델 훈련 완료.")
print("-" * 50)


# ====================================================
# Cell 9: Stage 3 - adaptive ensemble and final submission
# ====================================================
# Blend the CatBoost and Transformer test predictions per target, weighting
# each model by the inverse of its out-of-fold (OOF) MAE.
print("===== 3단계: 적응형 앙상블 및 최종 제출 파일 생성 =====")

# OOF MAE per model and target, measured ONLY on rows with a real label.
# Pseudo-labelled rows carry CatBoost-generated targets, so scoring against
# them would bias the ensemble weights toward CatBoost.
catboost_final_scores = {}
transformer_final_scores = {}
for i, t in enumerate(CFG.TARGETS):
    measured = train_df[t].notna().values
    if not measured.any():
        # No real labels for this target: fall back to every row rather than
        # calling the metric on an empty array.
        measured = np.ones(len(train_df), dtype=bool)
    y_true = final_train_targets[measured, i]
    catboost_final_scores[t] = mean_absolute_error(y_true, oof_catboost_final[t].values[measured])
    transformer_final_scores[t] = mean_absolute_error(y_true, oof_transformer_final[t].values[measured])

print("\n--- 최종 OOF MAE 점수 ---")
print("CatBoost:", catboost_final_scores)
print("Transformer:", transformer_final_scores)

# Weighted average of the two models' test predictions.
final_predictions = pd.DataFrame({'id': test_df['id']})
print("\n--- 앙상블 가중치 ---")
for target in CFG.TARGETS:
    score1 = catboost_final_scores[target]
    score2 = transformer_final_scores[target]

    # Lower MAE -> larger weight; the epsilon guards against division by zero.
    w1 = 1 / (score1 + 1e-6)
    w2 = 1 / (score2 + 1e-6)
    w_sum = w1 + w2

    weight_catboost = w1 / w_sum
    weight_transformer = w2 / w_sum

    print(f"'{target}': CatBoost 가중치={weight_catboost:.3f}, Transformer 가중치={weight_transformer:.3f}")

    final_predictions[target] = (submission_catboost[target] * weight_catboost +
                                 submission_transformer[target] * weight_transformer)

# Write the submission file.
final_predictions.to_csv('submission.csv', index=False)
print("\n최종 제출 파일 'submission.csv'이 성공적으로 생성되었습니다.")
print("\n최종 제출 파일 샘플:")
print(final_predictions.head())



라이브러리 설치 중...
^C
[31mERROR: Operation cancelled by user[0m[31m
[0m설치 완료.
--------------------------------------------------
라이브러리 임포트 및 기본 설정 중...
임포트 및 기본 설정 완료.
--------------------------------------------------
디바이스 설정: cuda
시드 값 42로 고정 완료.
캐시 디렉토리 '/kaggle/working/cache/' 생성/확인 완료.
--------------------------------------------------
특성 공학 클래스 'AdvancedFeaturizer' 정의 완료.
--------------------------------------------------
PyTorch 관련 클래스 및 함수 정의 완료.
--------------------------------------------------
데이터 로드 및 전처리 시작...


특성 추출 중:   0%|          | 0/7976 [00:00<?, ?it/s]

1000개 특성 선택...
훈련 특성 데이터 형태: (7973, 1000)
테스트 특성 데이터 형태: (3, 1000)
--------------------------------------------------
===== 1단계: 유사 레이블 생성 모델 훈련 시작 =====

--- 유사 레이블 생성을 위한 'Tg' 모델 훈련 ---


Default metric period is 5 because MAE is/are not implemented for GPU
Default metric period is 5 because MAE is/are not implemented for GPU
Default metric period is 5 because MAE is/are not implemented for GPU
Default metric period is 5 because MAE is/are not implemented for GPU
Default metric period is 5 because MAE is/are not implemented for GPU
Default metric period is 5 because MAE is/are not implemented for GPU
Default metric period is 5 because MAE is/are not implemented for GPU
