<a href="https://colab.research.google.com/github/Hanbin-git/dacon_new_drug/blob/main/20250626(3).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# google Drive 마운트
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import zipfile

zip_path = '/content/drive/MyDrive/data.zip'
extract_path = '/content/project_data'  # 원하는 경로

with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_path)


In [3]:
import pandas as pd

path = '/content/project_data/'  # 압축 해제 경로
train = pd.read_csv(path + 'train.csv')
test = pd.read_csv(path + 'test.csv')
submission = pd.read_csv(path + 'sample_submission.csv')

print(train.shape, test.shape, submission.shape)


(1681, 3) (100, 2) (100, 2)


In [4]:
# # RDKit 설치 (Colab에서 가능)
# !pip uninstall -y rdkit-pypi
# !pip install rdkit-pypi==2022.9.5 optuna xgboost



In [5]:
# !pip install numpy==1.23.5
# import os
# os.kill(os.getpid(), 9)  # 런타임 재시작 (필수)


In [6]:
# !pip install rdkit-pypi catboost


In [7]:
# !pip uninstall xgboost -y
# !pip install xgboost==1.6.0

In [8]:
# # ✅ Kaggle 환경에서 패키지 호환 버전으로 재설치
# !pip install --quiet --upgrade scikit-learn==1.1.3
# !pip install --quiet xgboost==1.6.2 lightgbm==3.3.5 catboost==1.2.2


In [9]:
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import Descriptors, MACCSkeys, AllChem
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import mean_absolute_error
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.model_selection import GroupKFold

# 경로 설정
path = "/content/project_data/"
train = pd.read_csv(path + "train.csv")
test = pd.read_csv(path + "test.csv")
submission = pd.read_csv(path + "sample_submission.csv")

# SMILES feature 생성
desc_funcs = [f for _, f in Descriptors._descList]
def featurize(smiles):
    mol = Chem.MolFromSmiles(str(smiles))
    if mol is None:
        return [0] * (len(desc_funcs) + 2048 + 167)
    desc = [safe(f, mol) for f in desc_funcs]
    morgan = list(AllChem.GetMorganFingerprintAsBitVect(mol, 2, 2048))
    maccs = list(MACCSkeys.GenMACCSKeys(mol))
    return desc + morgan + maccs

def safe(f, mol):
    try:
        val = f(mol)
        return 0 if (np.isnan(val) or np.isinf(val)) else val
    except:
        return 0

X = np.array([featurize(s) for s in train["Canonical_Smiles"]])
X_test = np.array([featurize(s) for s in test["Canonical_Smiles"]])
y = train["Inhibition"].values
groups = train["Canonical_Smiles"].values

# 스케일링 + PCA
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_test_scaled = scaler.transform(X_test)

X_scaled = np.nan_to_num(X_scaled)
X_test_scaled = np.nan_to_num(X_test_scaled)

pca = PCA(n_components=300, random_state=42)
X_pca = pca.fit_transform(X_scaled)
X_test_pca = pca.transform(X_test_scaled)

# 모델 비교 (GroupKFold)
cv = GroupKFold(n_splits=5)
models = {
    "LGBM": LGBMRegressor(random_state=42),
    "XGB": XGBRegressor(random_state=42, tree_method="hist"),
    "CatBoost": CatBoostRegressor(random_state=42, verbose=0)
}

for name, model in models.items():
    scores = []
    for train_idx, val_idx in cv.split(X_pca, y, groups):
        model.fit(X_pca[train_idx], y[train_idx])
        preds = model.predict(X_pca[val_idx])
        score = mean_absolute_error(y[val_idx], preds)
        scores.append(score)
    print(f"{name} MAE: {np.mean(scores):.4f}")

# 최고 모델 선택 후 전체 학습 및 예측
best_model = LGBMRegressor(random_state=42)
best_model.fit(X_pca, y)
submission["Inhibition"] = best_model.predict(X_test_pca)
submission.to_csv("submission.csv", index=False)
print("✅ submission.csv 저장 완료!")


LGBM MAE: 20.8543



stdout:



stderr:

Traceback (most recent call last):
  File "<string>", line 4, in <module>
  File "/usr/local/lib/python3.11/dist-packages/numba_cuda/numba/cuda/cudadrv/driver.py", line 314, in __getattr__
    raise CudaSupportError("Error at driver init: \n%s:" %
numba.cuda.cudadrv.error.CudaSupportError: Error at driver init: 

CUDA driver library cannot be found.
If you are sure that a CUDA driver is installed,
try setting environment variable NUMBA_CUDA_DRIVER
with the file path of the CUDA driver shared library.
:


Not patching Numba


XGB MAE: 21.6727
CatBoost MAE: 20.2283
✅ submission.csv 저장 완료!
