# Import

In [None]:
# Mount Google Drive at /content/drive; the CSV paths used in this notebook
# live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install pandas scikit-learn catboost

Collecting catboost
  Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold, cross_validate
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.linear_model import Ridge
from sklearn.ensemble import StackingRegressor, VotingRegressor
from sklearn.neural_network import MLPRegressor
from google.colab import files
from scipy.stats import uniform, randint
from sklearn.metrics import mean_squared_error
from scipy.stats import pearsonr

# Read the CSV stored on the mounted Drive into the `train` DataFrame.
path = '/content/drive/MyDrive/dacon_1/input_10.csv'
train = pd.read_csv(path)


In [None]:
import pandas as pd
import numpy as np


# Collect the name of every column of `train` that holds at least one NaN,
# then print either that list or a notice that no such column exists.
nan_flags = train.isna().any()
features_with_nan = nan_flags.index[nan_flags.to_numpy()].tolist()

report = (
    f"\n[NaN 값을 포함하는 피처(열) 이름]: {features_with_nan}"
    if features_with_nan
    else "\nDataFrame에 NaN 값을 포함하는 피처(열)가 없습니다."
)
print(report)


[NaN 값을 포함하는 피처(열) 이름]: ['SpherocityIndex', 'SASA', 'MolVolume']


In [None]:
import pandas as pd
import numpy as np

# Columns to discard from `train`; only the ones actually present are dropped,
# so re-running this cell after the drop is a no-op.
columns_to_drop = ['SpherocityIndex', 'SASA', 'MolVolume']

train_column_names = set(train.columns)
existing_features_to_remove = [
    name for name in columns_to_drop if name in train_column_names
]

# The shape is reported only when something was really removed.
if len(existing_features_to_remove) > 0:
    train = train.drop(existing_features_to_remove, axis=1)
    print(f"train.shape: {train.shape}")

print("\ntrain head:")
print(train.head())

train.shape: (1681, 1318)

train head:
           ID         PMI1         PMI2          PMI3      NPR1      NPR2  \
0  TRAIN_0000   575.069242  2555.122302   2920.656196  0.196897  0.874845   
1  TRAIN_0001   456.829070  1469.302773   1912.906155  0.238814  0.768100   
2  TRAIN_0002   953.401111  2584.031148   3261.716948  0.292300  0.792230   
3  TRAIN_0003  1996.471896  9465.097548  10133.290487  0.197021  0.934060   
4  TRAIN_0004  1884.272386  2887.023659   4234.712130  0.444959  0.681752   

   SlogP_VSA1  SlogP_VSA2  SlogP_VSA3  SlogP_VSA4  ...  sulfone  thioether  \
0    5.309813   26.307476    6.372925    0.000000  ...        0          0   
1    4.736863   11.542964    6.372925    0.000000  ...        0          0   
2    0.000000   29.335040    4.837589   15.931539  ...        0          0   
3   10.046676   46.879761   11.295848    5.817221  ...        0          0   
4    5.428790   21.153071   10.799569    5.893958  ...        0          0   

   alkene  alkyne  halide  an

In [None]:
import re

# Predictors are every column except the target (`Inhibition`) and `ID`.
X = train.drop(columns=['Inhibition', 'ID'])
y = train['Inhibition']

# Hold out 10% of the rows; the fixed seed keeps the split reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.1, random_state=42)

# Replace every run of characters outside [0-9a-zA-Z_] in the feature names
# with "_" (presumably because some of the boosting libraries reject special
# characters in feature names -- TODO confirm). The same names are applied to
# BOTH splits: a model fitted on X_train must see identical column names when
# it later predicts on the hold-out X_val.
sanitized_columns = [re.sub('[^0-9a-zA-Z_]+', '_', str(col)) for col in X.columns]
X_train.columns = sanitized_columns
X_val.columns = sanitized_columns

# Model

In [None]:
# Candidate regressors keyed by display name; each is built with random_state=42.
models = {
    "RandomForest": RandomForestRegressor(n_estimators=100, random_state=42),
    # verbose=-1 silences LightGBM's per-fit [Info] log lines, matching the
    # already-silenced CatBoost model below.
    "LightGBM": LGBMRegressor(n_estimators=100, random_state=42, verbose=-1),
    "NeuralNet": MLPRegressor(hidden_layer_sizes=(100, 50), max_iter=1000, random_state=42),
    "AdaBoost": AdaBoostRegressor(n_estimators=100, random_state=42),
    "GradientBoost": GradientBoostingRegressor(n_estimators=100, random_state=42),
    # This is a regression task, so use a regression metric ("rmse") instead of
    # the classification metric "logloss". `use_label_encoder` is omitted because
    # XGBoost reports it as an unused parameter on every fit.
    "XGBoost": XGBRegressor(eval_metric="rmse", random_state=42),
    "CatBoost": CatBoostRegressor(verbose=0, random_state=42)
}

model_sample = { }

# Meta-learner that combines the base predictions inside the stacking ensemble.
meta_model = Ridge(alpha=1.0)

# Voting ensemble built from the RandomForest, AdaBoost and CatBoost models.
models["Voting"] = VotingRegressor(
    estimators=[
        ('rf', models["RandomForest"]),
        ('ada', models["AdaBoost"]),
        ('cat', models["CatBoost"])
    ],
    n_jobs=-1
)

# Stacking ensemble over the same three base models. The Ridge meta-model is
# trained with cv=5; passthrough=False means it receives only the base
# predictions, not the original features.
models["Stacking"] = StackingRegressor(
    estimators=[
        ('rf', models["RandomForest"]),
        ('ada', models["AdaBoost"]),
        ('cat', models["CatBoost"]),
    ],
    final_estimator=meta_model,
    cv=5,
    n_jobs=-1,
    passthrough=False
)

In [None]:
def scoring(y_true, y_pred):
    """Return ``(norm_rmse, pearson_corr, score)`` for a set of predictions.

    * ``norm_rmse``    -- RMSE divided by the range (max - min) of ``y_true``;
      0 when that range is zero.
    * ``pearson_corr`` -- Pearson correlation between ``y_true`` and ``y_pred``;
      0.0 when the correlation is undefined (fewer than two samples, or either
      input is constant), so the score never becomes NaN.
    * ``score``        -- ``0.5 * (1 - min(norm_rmse, 1)) + 0.5 * pearson_corr``.

    Parameters
    ----------
    y_true, y_pred : array-like of numbers
        Ground-truth and predicted values. Lists, NumPy arrays and pandas
        Series are all accepted; both are flattened to 1-D.

    Raises
    ------
    ValueError
        If the inputs are empty or differ in length.
    """
    # Coerce to plain float arrays so lists work and pandas index labels
    # cannot influence the arithmetic.
    y_true = np.asarray(y_true, dtype=float).ravel()
    y_pred = np.asarray(y_pred, dtype=float).ravel()
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"y_true and y_pred must have the same length, "
            f"got {y_true.size} and {y_pred.size}"
        )
    if y_true.size == 0:
        raise ValueError("y_true and y_pred must not be empty")

    rmse = float(np.sqrt(np.mean((y_true - y_pred) ** 2)))
    y_range = float(y_true.max() - y_true.min())
    # Guard against division by zero when every true value is identical.
    norm_rmse = rmse / y_range if y_range != 0 else 0.0

    pred_range = float(y_pred.max() - y_pred.min())
    if y_true.size < 2 or y_range == 0 or pred_range == 0:
        # The correlation is mathematically undefined here.
        pearson_corr = 0.0
    else:
        pearson_corr = float(pearsonr(y_true, y_pred)[0])
        if not np.isfinite(pearson_corr):
            pearson_corr = 0.0

    # The error term is capped at 1 so it contributes between 0 and 0.5.
    score = 0.5 * (1 - min(norm_rmse, 1)) + 0.5 * pearson_corr
    return norm_rmse, pearson_corr, score

# Evaluate every candidate model with shuffled 5-fold cross-validation on the
# training split and keep the per-model fold averages in `cv_results`.
cv = KFold(n_splits=5, shuffle=True, random_state=42)
cv_results = {}

for name, model in models.items():
    print(f"\n=== {name} ===")
    fold_results = []
    for fold, (train_idx, val_idx) in enumerate(cv.split(X_train), 1):
        # Fold-local names: the validation part must NOT be called X_val / y_val,
        # otherwise it overwrites the hold-out split made by train_test_split.
        X_tr, X_fold_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
        y_tr, y_fold_val = y_train.iloc[train_idx], y_train.iloc[val_idx]

        model.fit(X_tr, y_tr)
        y_pred = model.predict(X_fold_val)
        norm_rmse, pearson_corr, score = scoring(y_fold_val, y_pred)
        fold_results.append((norm_rmse, pearson_corr, score))
        print(f"Fold {fold} | A (norm_rmse): {norm_rmse:.4f} | B (pearson): {pearson_corr:.4f} | Score: {score:.4f}")

    # Column-wise mean over the folds: (norm_rmse, pearson, score).
    avg_A, avg_B, avg_score = np.mean(fold_results, axis=0)
    cv_results[name] = {'A_norm_rmse_mean': avg_A, 'B_pearson_mean': avg_B, 'Score_mean': avg_score}
    print(f">>> {name} 평균 | A: {avg_A:.4f} | B: {avg_B:.4f} | Score: {avg_score:.4f}")




=== RandomForest ===
Fold 1 | A (norm_rmse): 0.2370 | B (pearson): 0.4309 | Score: 0.5970
Fold 2 | A (norm_rmse): 0.2624 | B (pearson): 0.3937 | Score: 0.5657
Fold 3 | A (norm_rmse): 0.2438 | B (pearson): 0.4246 | Score: 0.5904
Fold 4 | A (norm_rmse): 0.2556 | B (pearson): 0.3681 | Score: 0.5563
Fold 5 | A (norm_rmse): 0.2431 | B (pearson): 0.4178 | Score: 0.5873
>>> RandomForest 평균 | A: 0.2484 | B: 0.4070 | Score: 0.5793

=== LightGBM ===
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.010955 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 11981
[LightGBM] [Info] Number of data points in the train set: 1209, number of used features: 906
[LightGBM] [Info] Start training from score 33.364207
Fold 1 | A (norm_rmse): 0.2428 | B (pearson): 0.4070 | Score: 0.5821
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of test

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Fold 1 | A (norm_rmse): 0.2552 | B (pearson): 0.3485 | Score: 0.5466


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Fold 2 | A (norm_rmse): 0.2745 | B (pearson): 0.3520 | Score: 0.5388


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Fold 3 | A (norm_rmse): 0.2532 | B (pearson): 0.4054 | Score: 0.5761


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Fold 4 | A (norm_rmse): 0.2648 | B (pearson): 0.3459 | Score: 0.5406


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Fold 5 | A (norm_rmse): 0.2622 | B (pearson): 0.3556 | Score: 0.5467
>>> XGBoost 평균 | A: 0.2620 | B: 0.3615 | Score: 0.5498

=== CatBoost ===
Fold 1 | A (norm_rmse): 0.2449 | B (pearson): 0.3855 | Score: 0.5703
Fold 2 | A (norm_rmse): 0.2585 | B (pearson): 0.4234 | Score: 0.5824
Fold 3 | A (norm_rmse): 0.2410 | B (pearson): 0.4468 | Score: 0.6029
Fold 4 | A (norm_rmse): 0.2533 | B (pearson): 0.3939 | Score: 0.5703
Fold 5 | A (norm_rmse): 0.2457 | B (pearson): 0.4165 | Score: 0.5854
>>> CatBoost 평균 | A: 0.2487 | B: 0.4132 | Score: 0.5823

=== Voting ===
Fold 1 | A (norm_rmse): 0.2378 | B (pearson): 0.4358 | Score: 0.5990


KeyboardInterrupt: 

In [None]:
import pandas as pd
import numpy as np
import shap
from sklearn.model_selection import train_test_split, ShuffleSplit
from sklearn.ensemble import RandomForestRegressor
import matplotlib.pyplot as plt
import seaborn as sns
import os

# Write the current `train` frame to a CSV file on the mounted Drive.
base_dir = '/content/drive/MyDrive/dacon_1/'
output_filename = 'candidates_03_1_new.csv'
output_path = os.path.join(base_dir, output_filename)

try:
    train.to_csv(output_path, index=False)
except OSError as exc:
    # Report which path failed, then re-raise so the cell still fails visibly
    # instead of silently continuing without the file.
    print(f"오류: 파일 저장 실패: {output_path} ({exc})")
    raise
else:
    print(f"\n'{output_filename}' 파일이 '{output_path}'에 성공적으로 저장되었습니다.")
    # Optional sanity check that the saved file really exists.
    if os.path.exists(output_path):
        print(f"파일 존재 확인: {output_path}")
    else:
        print(f"오류: 파일 저장 후 존재 여부 확인 실패: {output_path}")


'candidates_03_1_new.csv' 파일이 '/content/drive/MyDrive/dacon_1/candidates_03_1_new.csv'에 성공적으로 저장되었습니다.
파일 존재 확인: /content/drive/MyDrive/dacon_1/candidates_03_1_new.csv


In [None]:
# Read the CSV stored on the mounted Drive into the `test` DataFrame.
# Note: this rebinds `path`, which an earlier cell used for the training file.
path = '/content/drive/MyDrive/dacon_1/03_1_test.csv'
test = pd.read_csv(path)

In [None]:
import pandas as pd
import numpy as np

# Columns to discard from `test`; only the ones actually present are dropped,
# so re-running this cell after the drop is a no-op.
columns_to_drop = [
    'MaxPartialCharge', 'MinPartialCharge'
    ]

existing_features_to_remove = [col for col in columns_to_drop if col in test.columns]

if existing_features_to_remove:
    test = test.drop(columns=existing_features_to_remove)
    # Label the output as "test": this cell reports on the test frame, not train.
    print(f"test.shape: {test.shape}")

print("\ntest head:")
print(test.head())

train.shape: (100, 1058)

train head:
         ID  ExactMolWt  HeavyAtomCount  NumAtoms  NumValenceElectrons  \
0  TEST_000  415.247107            30.0      30.0                164.0   
1  TEST_001  346.042923            21.0      21.0                108.0   
2  TEST_002  417.115855            29.0      29.0                152.0   
3  TEST_003  390.157102            27.0      27.0                144.0   
4  TEST_004  459.068135            29.0      29.0                152.0   

      MolMR  FractionCSP3  RingCount  NumAromaticRings  NumAliphaticRings  \
0  113.2879      0.695652        4.0               1.0                3.0   
1   82.8520      0.266667        3.0               3.0                0.0   
2  106.8959      0.200000        3.0               3.0                0.0   
3  100.3857      0.500000        5.0               3.0                2.0   
4  111.3915      0.318182        3.0               2.0                1.0   

   ...  ecfp_fp_1014  ecfp_fp_1015  ecfp_fp_1016  ecfp

In [None]:
import pandas as pd
import numpy as np
import shap
from sklearn.model_selection import train_test_split, ShuffleSplit
from sklearn.ensemble import RandomForestRegressor
import matplotlib.pyplot as plt
import seaborn as sns
import os

# Write the current `test` frame to a CSV file on the mounted Drive.
base_dir = '/content/drive/MyDrive/dacon_1/'
output_filename = '03_1_test_new.csv'
output_path = os.path.join(base_dir, output_filename)

try:
    test.to_csv(output_path, index=False)
except OSError as exc:
    # Report which path failed, then re-raise so the cell still fails visibly
    # instead of silently continuing without the file.
    print(f"오류: 파일 저장 실패: {output_path} ({exc})")
    raise
else:
    print(f"\n'{output_filename}' 파일이 '{output_path}'에 성공적으로 저장되었습니다.")
    # Optional sanity check that the saved file really exists.
    if os.path.exists(output_path):
        print(f"파일 존재 확인: {output_path}")
    else:
        print(f"오류: 파일 저장 후 존재 여부 확인 실패: {output_path}")


'03_1_test_new.csv' 파일이 '/content/drive/MyDrive/dacon_1/03_1_test_new.csv'에 성공적으로 저장되었습니다.
파일 존재 확인: /content/drive/MyDrive/dacon_1/03_1_test_new.csv
