<a href="https://colab.research.google.com/github/Hanbin-git/dacon_new_drug/blob/main/20250707_xgboost.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive (only works inside Google Colab).
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Unpack data.zip from Drive into /content/data_1; -o overwrites existing files without prompting.
!unzip -o "/content/drive/MyDrive/data.zip" -d "/content/data_1"


Archive:  /content/drive/MyDrive/data.zip
  inflating: /content/data_1/sample_submission.csv  
  inflating: /content/data_1/test.csv  
  inflating: /content/data_1/train.csv  


In [None]:
# ✅ Install the required libraries
!pip install -q torch==2.0.1+cu118 torchvision==0.15.2+cu118 torchaudio==2.0.2 --index-url https://download.pytorch.org/whl/cu118
!pip install -q torch-geometric rdkit catboost lightgbm xgboost

# ✅ Reinstall a compatible numpy version (required!)
!pip install numpy==1.23.5 --force-reinstall --no-cache-dir


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.3/2.3 GB[0m [31m564.7 kB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.1/6.1 MB[0m [31m64.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.4/4.4 MB[0m [31m67.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.3/63.3 MB[0m [31m10.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m132.3/132.3 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for lit (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.1/63.1 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m17.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
import os
os.kill(os.getpid(), 9)  # 🔁 Force-restart the runtime by sending signal 9 to the current process


In [None]:
import pandas as pd
import numpy as np
import os
import torch
from rdkit import Chem
from rdkit.Chem import Descriptors, Crippen, Lipinski, rdMolDescriptors, AllChem
from torch_geometric.data import Data
from torch_geometric.nn import global_mean_pool, GCNConv
from torch_geometric.loader import DataLoader
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge
from sklearn.model_selection import KFold
from catboost import CatBoostRegressor, Pool
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor

# ✅ Data path configuration
extract_path = "/content/data_1"
def get_path(filename):
    """Return the path of ``filename`` inside the ``extract_path`` directory."""
    full_path = os.path.join(extract_path, filename)
    return full_path

# ✅ Load the data (train, test and the sample submission, in that order)
train, test, submission = (
    pd.read_csv(get_path(csv_name))
    for csv_name in ("train.csv", "test.csv", "sample_submission.csv")
)
# Regression target.
y_train = train["Inhibition"]

# ✅ RDKit features
def extract_rdkit_features(df):
    """Compute a table of RDKit descriptors for the SMILES in ``df``.

    Reads the ``Canonical_Smiles`` column and returns a DataFrame with one row
    per input row and one column per descriptor. Rows whose parsed molecule is
    falsy (``Chem.MolFromSmiles`` returned nothing usable) hold ``np.nan`` in
    every column.
    """
    # Column name -> callable taking an RDKit molecule.
    # Dict insertion order is the column order of the result.
    descriptor_fns = {
        'MolWt': Descriptors.MolWt,
        'LogP': Crippen.MolLogP,
        'NumHDonors': Lipinski.NumHDonors,
        'NumHAcceptors': Lipinski.NumHAcceptors,
        'TPSA': rdMolDescriptors.CalcTPSA,
        'NumRotatableBonds': Descriptors.NumRotatableBonds,
        'RingCount': lambda m: m.GetRingInfo().NumRings(),
        'HeavyAtomCount': lambda m: m.GetNumHeavyAtoms(),
        'FractionCSP3': rdMolDescriptors.CalcFractionCSP3,
        'NumAliphaticRings': rdMolDescriptors.CalcNumAliphaticRings,
        'NumAromaticRings': rdMolDescriptors.CalcNumAromaticRings,
    }

    # Parse every SMILES once and reuse the molecules for all descriptors.
    parsed = [Chem.MolFromSmiles(smiles) for smiles in df['Canonical_Smiles']]

    table = {}
    for column, compute in descriptor_fns.items():
        table[column] = [compute(m) if m else np.nan for m in parsed]
    return pd.DataFrame(table)

# ✅ Morgan fingerprint
def get_morgan_fingerprint(smiles, radius=2, nBits=512):
    """Return the Morgan bit fingerprint of ``smiles`` as a NumPy array.

    Args:
        smiles: SMILES string to parse with RDKit.
        radius: Morgan radius forwarded to RDKit.
        nBits: Length of the bit vector.

    Returns:
        An array of length ``nBits``; all zeros when the SMILES does not
        yield a usable molecule.
    """
    parsed = Chem.MolFromSmiles(smiles)
    if not parsed:
        # Placeholder so callers still get a fixed-width row.
        return np.zeros(nBits)
    bit_vect = AllChem.GetMorganFingerprintAsBitVect(parsed, radius=radius, nBits=nBits)
    return np.array(bit_vect)

def extract_morgan_df(df, nBits=512):
    """Build a DataFrame of Morgan fingerprint bits, one row per SMILES in ``df``.

    Columns are named ``MFP_0`` ... ``MFP_{nBits-1}``; the result carries a
    fresh positional index regardless of the index of ``df``.
    """
    bit_columns = [f'MFP_{i}' for i in range(nBits)]
    rows = [
        get_morgan_fingerprint(smiles, nBits=nBits)
        for smiles in df['Canonical_Smiles']
    ]
    return pd.DataFrame(rows, columns=bit_columns)

# ✅ Graph conversion for the GNN
def mol_to_graph(smiles):
    """Convert a SMILES string into a ``torch_geometric`` ``Data`` graph.

    Every atom becomes a node with a single feature (its atomic number) and
    every bond contributes two directed edges, one per direction.

    Args:
        smiles: SMILES string to parse with RDKit.

    Returns:
        A ``Data`` object with ``x`` of shape ``(num_atoms, 1)`` and
        ``edge_index`` of shape ``(2, 2 * num_bonds)``, or ``None`` when RDKit
        cannot parse ``smiles`` or the parsed molecule has no atoms.
    """
    mol = Chem.MolFromSmiles(smiles)
    # A molecule without atoms (e.g. an empty SMILES) cannot be embedded, so it
    # is reported the same way as a parse failure.
    if mol is None or mol.GetNumAtoms() == 0:
        return None

    node_feats = [[atom.GetAtomicNum()] for atom in mol.GetAtoms()]

    edge_pairs = []
    for bond in mol.GetBonds():
        a1, a2 = bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()
        # Store both directions so the graph is effectively undirected.
        edge_pairs.extend([[a1, a2], [a2, a1]])

    if edge_pairs:
        edge_index = torch.tensor(edge_pairs, dtype=torch.long).t().contiguous()
    else:
        # Bond-free molecules (single atoms, ions): building the tensor from an
        # empty list would give shape (0,), not the (2, 0) layout that graph
        # convolutions index into.
        edge_index = torch.empty((2, 0), dtype=torch.long)

    return Data(
        x=torch.tensor(node_feats, dtype=torch.float),
        edge_index=edge_index,
    )

# ✅ GCN model definition
class SimpleGCN(torch.nn.Module):
    """Two stacked ``GCNConv`` layers followed by per-graph mean pooling."""

    def __init__(self, hidden_dim=64):
        """Create the layers; ``hidden_dim`` is the width of both layers."""
        super().__init__()
        # Input width is 1: each node carries a single scalar feature.
        self.conv1 = GCNConv(1, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, hidden_dim)
        self.pool = global_mean_pool

    def forward(self, data):
        """Return one pooled embedding row per graph contained in ``data``."""
        feats, edges, graph_ids = data.x, data.edge_index, data.batch
        hidden = self.conv1(feats, edges).relu()
        hidden = self.conv2(hidden, edges).relu()
        return self.pool(hidden, graph_ids)

# ✅ GNN embedding extraction
def extract_gnn_features(df, batch_size=128, seed=42):
    """Embed every SMILES in ``df`` with an untrained ``SimpleGCN``.

    The network is never trained here; it acts as a fixed random projection
    of the molecular graph. Its weights are therefore initialised from
    ``seed`` so that separate calls (e.g. one for train, one for test) use
    identical weights and yield comparable feature columns.

    Args:
        df: DataFrame with a ``Canonical_Smiles`` column.
        batch_size: Number of graphs per forward pass.
        seed: Seed for the weight initialisation of the encoder.

    Returns:
        A float32 array of shape ``(len(df), hidden_dim)``. Row ``i`` belongs
        to row ``i`` of ``df``; SMILES for which ``mol_to_graph`` returns
        ``None`` keep an all-zero row so the output stays aligned with ``df``.
    """
    smiles_list = list(df['Canonical_Smiles'])

    # Remember which input rows produced a graph so embeddings can be written
    # back to their original positions.
    graphs, valid_rows = [], []
    for row, smi in enumerate(smiles_list):
        graph = mol_to_graph(smi)
        if graph is not None:
            graphs.append(graph)
            valid_rows.append(row)

    # Seed inside a forked RNG scope: initialisation is reproducible, and the
    # caller's global torch RNG state is restored afterwards.
    with torch.random.fork_rng():
        torch.manual_seed(seed)
        model = SimpleGCN()
    model.eval()

    features = np.zeros((len(smiles_list), model.conv2.out_channels), dtype=np.float32)
    if not graphs:
        return features

    # The loader assigns the per-node ``batch`` vector while collating, so the
    # graphs do not need one set by hand.
    loader = DataLoader(graphs, batch_size=batch_size)
    emb_list = []
    with torch.no_grad():
        for batch in loader:
            emb_list.append(model(batch))

    features[valid_rows] = torch.cat(emb_list, dim=0).numpy()
    return features

# ✅ Feature extraction (for each feature family: train first, then test)
X_train_rdkit, X_test_rdkit = (extract_rdkit_features(split) for split in (train, test))
X_train_morgan, X_test_morgan = (extract_morgan_df(split) for split in (train, test))
X_train_gnn, X_test_gnn = (extract_gnn_features(split) for split in (train, test))

# ✅ Merge: concatenate the three feature families column-wise
gnn_train_cols = [f"GNN_{i}" for i in range(X_train_gnn.shape[1])]
X_train = pd.concat(
    [
        X_train_rdkit.reset_index(drop=True),
        X_train_morgan.reset_index(drop=True),
        pd.DataFrame(X_train_gnn, columns=gnn_train_cols),
    ],
    axis=1,
)

gnn_test_cols = [f"GNN_{i}" for i in range(X_test_gnn.shape[1])]
X_test = pd.concat(
    [
        X_test_rdkit.reset_index(drop=True),
        X_test_morgan.reset_index(drop=True),
        pd.DataFrame(X_test_gnn, columns=gnn_test_cols),
    ],
    axis=1,
)

# ✅ Standardisation: statistics are fitted on the training matrix only
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled, X_test_scaled = scaler.transform(X_train), scaler.transform(X_test)

# Wrap the scaled arrays back into DataFrames so the column names survive.
X_train_df = pd.DataFrame(X_train_scaled, columns=X_train.columns)
X_test_df = pd.DataFrame(X_test_scaled, columns=X_test.columns)

# ✅ Stacking
# Per-sample training weight: log(1 + target), so larger targets weigh more.
# NOTE(review): assumes every Inhibition value is > -1 and not NaN — confirm against the data.
sample_weight = np.log1p(y_train)
kf = KFold(n_splits=5, shuffle=True, random_state=42)
# oof_*: out-of-fold prediction per training row; test_*: fold-averaged test prediction.
oof_cat, oof_lgb, oof_xgb = np.zeros(len(X_train_df)), np.zeros(len(X_train_df)), np.zeros(len(X_train_df))
test_cat, test_lgb, test_xgb = np.zeros(len(X_test_df)), np.zeros(len(X_test_df)), np.zeros(len(X_test_df))

for tr_idx, val_idx in kf.split(X_train_df):
    X_tr, X_val = X_train_df.iloc[tr_idx], X_train_df.iloc[val_idx]
    y_tr, y_val = y_train.iloc[tr_idx], y_train.iloc[val_idx]
    # Weights are applied to the training rows only.
    w_tr = sample_weight.iloc[tr_idx]

    # CatBoost: early stopping is evaluated on the validation fold.
    # NOTE(review): the same fold then supplies the out-of-fold predictions,
    # so those are mildly optimistic.
    cat = CatBoostRegressor(iterations=1500, learning_rate=0.03, depth=6, verbose=0, early_stopping_rounds=100)
    cat.fit(Pool(X_tr, y_tr, weight=w_tr), eval_set=Pool(X_val, y_val))
    oof_cat[val_idx] = cat.predict(X_val)
    # Each fold contributes 1/n_splits, so test_cat ends up as the fold mean.
    test_cat += cat.predict(X_test_df) / kf.n_splits

    # LightGBM: no eval set is passed, so all 1500 trees are always built.
    lgb = LGBMRegressor(n_estimators=1500, learning_rate=0.03, max_depth=6)
    lgb.fit(X_tr, y_tr, sample_weight=w_tr)
    oof_lgb[val_idx] = lgb.predict(X_val)
    test_lgb += lgb.predict(X_test_df) / kf.n_splits

    # XGBoost: same setup as LightGBM (fixed 1500 rounds, weighted training rows).
    xgb = XGBRegressor(n_estimators=1500, learning_rate=0.03, max_depth=6)
    xgb.fit(X_tr, y_tr, sample_weight=w_tr)
    oof_xgb[val_idx] = xgb.predict(X_val)
    test_xgb += xgb.predict(X_test_df) / kf.n_splits

# ✅ Meta model: one column per base model (CatBoost, LightGBM, XGBoost)
stacked_train, stacked_test = (
    np.vstack(per_model).T
    for per_model in (
        [oof_cat, oof_lgb, oof_xgb],
        [test_cat, test_lgb, test_xgb],
    )
)

# Ridge is fitted on the out-of-fold predictions and applied to the
# fold-averaged test predictions.
meta = Ridge(alpha=1.0)
final_preds = meta.fit(stacked_train, y_train).predict(stacked_test)

# ✅ Submission
submission['Inhibition'] = final_preds
submission.to_csv('submission_stacking_with_gnn.csv', index=False)
print("✅ 최종 제출 파일 'submission_stacking_with_gnn.csv' 생성 완료!")


Disabling PyTorch because PyTorch >= 2.1 is required but found 2.0.1+cu118


[1;30;43m스트리밍 출력 내용이 길어서 마지막 5000줄이 삭제되었습니다.[0m
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.007347 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 10937
[LightGBM] [Info] Number of data points in the train set: 1345, number of used features: 554
[LightGBM] [Info] Start training from score 42.865850
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.007680 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 10930
[LightGBM] [Info] Number of data points in the train set: 1345, number of used features: 554
[LightGBM] [Info] Start training from score 43.249144
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.008646 seconds.
You can set `force_col_wise=tr

In [None]:
import pandas as pd

# Path of the submission file
file_path = "/content/submission_stacking_with_gnn.csv"
df = pd.read_csv(file_path)
predicted = df["Inhibition"]

# Summary statistics followed by the first ten predictions
print("✅ 제출 결과 통계 요약:")
print(predicted.describe())
print("\nTop 10 예측값:")
print(predicted.head(10))


✅ 제출 결과 통계 요약:
count    100.000000
mean      34.127825
std        6.958935
min       20.381480
25%       29.293191
50%       33.439125
75%       38.868247
max       54.089632
Name: Inhibition, dtype: float64

Top 10 예측값:
0    36.401657
1    38.367006
2    28.361161
3    32.036931
4    43.969104
5    29.603717
6    34.513428
7    26.346382
8    41.862748
9    22.293131
Name: Inhibition, dtype: float64


In [None]:
import numpy as np

# Inspect the GNN embeddings on their own: shape, first three rows, global mean/std
print("GNN feature shape:", X_train_gnn.shape)
print("GNN feature 예시 (첫 3개):")
first_rows = X_train_gnn[:3]
print(first_rows)
overall_mean = np.mean(X_train_gnn)
overall_std = np.std(X_train_gnn)
print("평균값:", overall_mean, "표준편차:", overall_std)


GNN feature shape: (1681, 64)
GNN feature 예시 (첫 3개):
[[0.         0.20440848 1.477163   0.9467569  0.0162536  0.
  0.         0.         0.         0.         0.2812866  0.5311665
  0.         0.         0.9533382  0.         0.         0.20728734
  0.         0.19683208 0.         0.         0.93246996 0.685789
  0.         0.36946237 1.5783175  0.         0.         0.1267869
  0.         0.06543677 0.         1.5273057  0.709877   0.5465393
  0.         0.         0.         0.42957377 0.         0.75455385
  1.0049845  1.031187   0.14854844 0.60571504 0.         0.
  0.         0.16037953 0.9834224  0.         0.         1.1502254
  0.62731826 0.7460754  0.         1.126849   0.23661374 1.6792494
  0.         0.         0.68320864 0.        ]
 [0.         0.19371274 1.3998702  0.89721733 0.01540312 0.
  0.         0.         0.         0.         0.2665682  0.50337297
  0.         0.         0.9034544  0.         0.         0.19644102
  0.         0.18653278 0.         0.         0

In [None]:
# Show the column names of the training table.
print(train.columns)


Index(['ID', 'Canonical_Smiles', 'Inhibition'], dtype='object')


In [None]:
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import AllChem
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import QuantileTransformer
from lightgbm import LGBMRegressor, early_stopping, log_evaluation

# ✅ Data location (trailing slash required: file names are appended directly)
path = "/content/data_1/"
train, test, submission = (
    pd.read_csv(path + csv_name)
    for csv_name in ("train.csv", "test.csv", "sample_submission.csv")
)

# ✅ Morgan fingerprint extraction
def smiles_to_morgan(smiles, radius=2, n_bits=2048):
    """Return the RDKit Morgan bit vector for ``smiles``.

    Returns ``None`` when the SMILES does not yield a usable molecule.
    """
    parsed = Chem.MolFromSmiles(smiles)
    if not parsed:
        return None
    return AllChem.GetMorganFingerprintAsBitVect(parsed, radius, nBits=n_bits)

def extract_morgan_df(df, n_bits=2048):
    """Build the Morgan fingerprint matrix for the SMILES in ``df``.

    Despite the name, the result is a NumPy array, not a DataFrame.

    Args:
        df: DataFrame with a ``Canonical_Smiles`` column.
        n_bits: Fingerprint length. It is forwarded to ``smiles_to_morgan``
            and also sizes the fallback row, so both always agree.

    Returns:
        A float array of shape ``(len(df), n_bits)``. SMILES that cannot be
        converted get an all-zero row. An empty ``df`` yields shape
        ``(0, n_bits)`` rather than a 1-D array.
    """
    rows = []
    for smi in df['Canonical_Smiles']:
        fp = smiles_to_morgan(smi, n_bits=n_bits)
        # Start from zeros: this is both the destination buffer for the bits
        # and the fallback row when no fingerprint is available.
        arr = np.zeros((n_bits,))
        if fp is not None:
            AllChem.DataStructs.ConvertToNumpyArray(fp, arr)
        rows.append(arr)

    if not rows:
        # Keep the result two-dimensional for downstream transformers.
        return np.empty((0, n_bits))
    return np.array(rows)

# ✅ Build the Morgan feature matrices and the target
X, X_test = extract_morgan_df(train), extract_morgan_df(test)
y = train["Inhibition"]

# ✅ Preprocessing: mean imputation, then quantile transform.
# Each step is fitted on the training matrix and only applied to the test matrix.
imputer = SimpleImputer(strategy="mean")
scaler = QuantileTransformer()

for step in (imputer, scaler):
    X = step.fit_transform(X)
    X_test = step.transform(X_test)

# ✅ KFold + LightGBM training
kf = KFold(n_splits=5, shuffle=True, random_state=42)
# Out-of-fold prediction per training row, and fold-averaged test prediction.
oof_preds = np.zeros(len(train))
test_preds = np.zeros(len(test))

for fold, (train_idx, val_idx) in enumerate(kf.split(X)):
    print(f"\n🟢 Fold {fold + 1}")
    # NOTE(review): X_train / y_train rebind notebook-level names that the
    # earlier stacking cell also defines; re-running that cell afterwards
    # would see these fold slices instead.
    X_train, y_train = X[train_idx], y.iloc[train_idx]
    X_val, y_val = X[val_idx], y.iloc[val_idx]

    # The fold number doubles as the random seed, so each fold's model differs.
    model = LGBMRegressor(n_estimators=500, learning_rate=0.05, random_state=fold)

    model.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],
        callbacks=[
            # Stop once the validation metric has not improved for 30 rounds.
            early_stopping(stopping_rounds=30),
            # Log the validation metric every 50 rounds.
            log_evaluation(period=50)
        ]
    )

    oof_preds[val_idx] = model.predict(X_val)
    # Each fold contributes 1/n_splits, so test_preds ends up as the fold mean.
    test_preds += model.predict(X_test) / kf.n_splits

# ✅ Report the result
# RMSE is computed as the square root of the MSE rather than by passing
# ``squared=False``: newer scikit-learn releases reject that keyword, and the
# run recorded in this notebook stopped here with
# "TypeError: got an unexpected keyword argument 'squared'".
rmse = np.sqrt(mean_squared_error(y, oof_preds))
print(f"\n✅ CV RMSE: {rmse:.4f}")

# ✅ Save the submission file
submission["Inhibition"] = test_preds
submission.to_csv("submission_morgan_lgbm.csv", index=False)
print("📁 제출 파일 저장 완료: submission_morgan_lgbm.csv")





🟢 Fold 1
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.012077 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1208
[LightGBM] [Info] Number of data points in the train set: 1344, number of used features: 604
[LightGBM] [Info] Start training from score 33.391242




Training until validation scores don't improve for 30 rounds
[50]	valid_0's l2: 606.377
[100]	valid_0's l2: 607.484
Early stopping, best iteration is:
[73]	valid_0's l2: 601.366

🟢 Fold 2




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.007332 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1218
[LightGBM] [Info] Number of data points in the train set: 1345, number of used features: 609
[LightGBM] [Info] Start training from score 33.637152
Training until validation scores don't improve for 30 rounds
[50]	valid_0's l2: 565.132
Early stopping, best iteration is:
[55]	valid_0's l2: 560.583

🟢 Fold 3




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.007389 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1200
[LightGBM] [Info] Number of data points in the train set: 1345, number of used features: 600
[LightGBM] [Info] Start training from score 33.054065
Training until validation scores don't improve for 30 rounds
[50]	valid_0's l2: 683.519
Early stopping, best iteration is:
[48]	valid_0's l2: 681.164

🟢 Fold 4




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.007270 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1194
[LightGBM] [Info] Number of data points in the train set: 1345, number of used features: 597
[LightGBM] [Info] Start training from score 33.229167
Training until validation scores don't improve for 30 rounds
[50]	valid_0's l2: 603.896
[100]	valid_0's l2: 599.744
Early stopping, best iteration is:
[78]	valid_0's l2: 595.669

🟢 Fold 5




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.008255 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1186
[LightGBM] [Info] Number of data points in the train set: 1345, number of used features: 593
[LightGBM] [Info] Start training from score 32.797655
Training until validation scores don't improve for 30 rounds
[50]	valid_0's l2: 648.589
Early stopping, best iteration is:
[65]	valid_0's l2: 644.39




TypeError: got an unexpected keyword argument 'squared'