In [3]:
# 📦 설치 (최초 1회만)
# pip install torch torch-geometric rdkit pandas scikit-learn

import pandas as pd
import torch
import torch.nn.functional as F
from rdkit import Chem
from torch_geometric.data import Data, DataLoader, Batch
from torch_geometric.nn import GCNConv, global_mean_pool
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, f1_score

# 🧬 SMILES → Graph + fingerprint 등 feature 포함
def mol_to_graph_with_features(row, use_label=True):
    mol = Chem.MolFromSmiles(row['SMILES'])
    if mol is None:
        return None
    atom_features = [[atom.GetAtomicNum()] for atom in mol.GetAtoms()]
    edge_index = [[], []]
    for bond in mol.GetBonds():
        i, j = bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()
        edge_index[0] += [i, j]
        edge_index[1] += [j, i]
    x = torch.tensor(atom_features, dtype=torch.float)
    edge_index = torch.tensor(edge_index, dtype=torch.long)

    if use_label:
        y = torch.tensor([int(row['label'])], dtype=torch.long)
        graph_features = torch.tensor(row[1:-1].values.astype(float), dtype=torch.float).view(1, -1)
    else:
        y = torch.tensor([-1])
        graph_features = torch.tensor(row[1:].values.astype(float), dtype=torch.float).view(1, -1)

    return Data(x=x, edge_index=edge_index, y=y, graph_feat=graph_features)

# 🔧 Custom collate function
def custom_collate(batch_list):
    graph_feats = torch.cat([data.graph_feat.view(1, -1) for data in batch_list], dim=0)
    batch = Batch.from_data_list(batch_list)
    batch.graph_feat = graph_feats
    return batch

# 🧠 GCN 모델
class GCN(torch.nn.Module):
    def __init__(self, graph_feat_dim):
        super(GCN, self).__init__()
        self.conv1 = GCNConv(1, 64)
        self.conv2 = GCNConv(64, 64)
        self.lin1 = torch.nn.Linear(64 + graph_feat_dim, 64)
        self.lin2 = torch.nn.Linear(64, 2)

    def forward(self, data):
        x, edge_index, batch = data.x, data.edge_index, data.batch
        g_feat = data.graph_feat
        if g_feat.dim() == 1:
            g_feat = g_feat.unsqueeze(0)
        x = F.relu(self.conv1(x, edge_index))
        x = F.dropout(x, p=0.3, training=self.training)
        x = F.relu(self.conv2(x, edge_index))
        x = global_mean_pool(x, batch)
        x = torch.cat([x, g_feat], dim=1)
        x = F.relu(self.lin1(x))
        return F.log_softmax(self.lin2(x), dim=1)

# 📥 학습 데이터 로딩
df = pd.read_csv("/Users/ijaein/Desktop/train.csv")
feature_cols = df.columns[1:-1]  # SMILES 제외, label 제외
df = df[df['label'].notna()]

# 🔀 데이터 분할
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

# 🔄 그래프 변환
train_data = [mol_to_graph_with_features(row, use_label=True) for _, row in train_df.iterrows() if mol_to_graph_with_features(row, use_label=True)]
val_data = [mol_to_graph_with_features(row, use_label=True) for _, row in val_df.iterrows() if mol_to_graph_with_features(row, use_label=True)]

train_loader = DataLoader(train_data, batch_size=32, shuffle=True, collate_fn=custom_collate)
val_loader = DataLoader(val_data, batch_size=32, collate_fn=custom_collate)

# 🚀 모델 학습
model = GCN(graph_feat_dim=len(feature_cols))
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
loss_fn = torch.nn.NLLLoss()

for epoch in range(20):
    model.train()
    total_loss = 0
    for batch in train_loader:
        optimizer.zero_grad()
        out = model(batch)
        loss = loss_fn(out, batch.y)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"📚 Epoch {epoch+1} | Loss: {total_loss:.4f}")

# 📊 검증
model.eval()
y_true, y_pred = [], []
with torch.no_grad():
    for batch in val_loader:
        out = model(batch)
        pred = out.argmax(dim=1).tolist()
        y_pred.extend(pred)
        y_true.extend(batch.y.tolist())

print("\n📊 Classification Report:")
print(classification_report(y_true, y_pred))
print("🧩 Confusion Matrix:")
print(confusion_matrix(y_true, y_pred))
print("🔥 F1 Score (macro):", f1_score(y_true, y_pred, average='macro'))

# 🔮 예측 데이터 준비
predict_df = pd.read_csv("/Users/ijaein/Desktop/predict_input.csv")

# 🔧 feature_cols 기준으로 누락된 컬럼 보완
for col in feature_cols:
    if col not in predict_df.columns:
        predict_df[col] = 0
predict_df = predict_df[['SMILES'] + list(feature_cols)]  # 순서 정렬

predict_graphs = [
    mol_to_graph_with_features(row, use_label=False)
    for _, row in predict_df.iterrows()
    if mol_to_graph_with_features(row, use_label=False)
]
predict_loader = DataLoader(predict_graphs, batch_size=32, collate_fn=custom_collate)

# 🔮 예측 수행
predictions = []
with torch.no_grad():
    for batch in predict_loader:
        out = model(batch)
        pred = out.argmax(dim=1).tolist()
        predictions.extend(pred)

predict_df['gnn_predicted_label'] = predictions
predict_df.to_csv("/Users/ijaein/Desktop/gnn_prediction_output.csv", index=False)
print("✅ 예측 완료 → /Users/ijaein/Desktop/gnn_prediction_output.csv 저장됨")




📚 Epoch 1 | Loss: 155.9214
📚 Epoch 2 | Loss: 144.0348
📚 Epoch 3 | Loss: 144.0821
📚 Epoch 4 | Loss: 144.0196
📚 Epoch 5 | Loss: 144.0358
📚 Epoch 6 | Loss: 144.0416
📚 Epoch 7 | Loss: 144.0466
📚 Epoch 8 | Loss: 144.1875
📚 Epoch 9 | Loss: 144.0291
📚 Epoch 10 | Loss: 144.0335
📚 Epoch 11 | Loss: 144.0691
📚 Epoch 12 | Loss: 144.0086
📚 Epoch 13 | Loss: 143.9889
📚 Epoch 14 | Loss: 143.9695
📚 Epoch 15 | Loss: 144.0227
📚 Epoch 16 | Loss: 144.1388
📚 Epoch 17 | Loss: 144.0095
📚 Epoch 18 | Loss: 144.1418
📚 Epoch 19 | Loss: 144.0925
📚 Epoch 20 | Loss: 144.1731

📊 Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       778
           1       0.53      1.00      0.70       892

    accuracy                           0.53      1670
   macro avg       0.27      0.50      0.35      1670
weighted avg       0.29      0.53      0.37      1670

🧩 Confusion Matrix:
[[  0 778]
 [  0 892]]
🔥 F1 Score (macro): 0.3481654957064793


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


✅ 예측 완료 → /Users/ijaein/Desktop/gnn_prediction_output.csv 저장됨


In [1]:
!pip install -q torch-scatter -f https://data.pyg.org/whl/torch-2.0.0+cpu.html
!pip install -q torch-sparse -f https://data.pyg.org/whl/torch-2.0.0+cpu.html
!pip install -q torch-geometric
!pip install -q rdkit

In [4]:
df = pd.read_csv("/Users/ijaein/Desktop/train.csv")

print("✅ 라벨 분포:")
print(df['label'].value_counts())

print("\n✅ 라벨 유니크 값:")
print(sorted(df['label'].unique()))

print("\n✅ NaN 있는지 체크:")
print(df['label'].isna().sum())

✅ 라벨 분포:
label
1    4542
0    3807
Name: count, dtype: int64

✅ 라벨 유니크 값:
[0, 1]

✅ NaN 있는지 체크:
0


In [5]:
# 📦 최초 1회만 설치
# pip install torch torch-geometric rdkit pandas scikit-learn

import pandas as pd
import torch
import torch.nn.functional as F
from rdkit import Chem
from torch_geometric.data import Data, DataLoader, Batch
from torch_geometric.nn import GCNConv, global_mean_pool
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, f1_score

# ✅ SMILES → Graph + feature 포함
def mol_to_graph_with_features(row, use_label=True):
    mol = Chem.MolFromSmiles(row['SMILES'])
    if mol is None:
        return None
    atom_features = [[atom.GetAtomicNum()] for atom in mol.GetAtoms()]
    edge_index = [[], []]
    for bond in mol.GetBonds():
        i, j = bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()
        edge_index[0] += [i, j]
        edge_index[1] += [j, i]
    x = torch.tensor(atom_features, dtype=torch.float)
    edge_index = torch.tensor(edge_index, dtype=torch.long)

    if use_label:
        y = torch.tensor([int(row['label'])], dtype=torch.long)
        graph_features = torch.tensor(row[1:-1].values.astype(float), dtype=torch.float).view(1, -1)
    else:
        y = torch.tensor([-1])
        graph_features = torch.tensor(row[1:].values.astype(float), dtype=torch.float).view(1, -1)

    return Data(x=x, edge_index=edge_index, y=y, graph_feat=graph_features)

# 🧩 batch에서 graph_feat 처리
def custom_collate(batch_list):
    graph_feats = torch.cat([data.graph_feat.view(1, -1) for data in batch_list], dim=0)
    batch = Batch.from_data_list(batch_list)
    batch.graph_feat = graph_feats
    return batch

# 🧠 모델 정의
class GCN(torch.nn.Module):
    def __init__(self, graph_feat_dim):
        super(GCN, self).__init__()
        self.conv1 = GCNConv(1, 64)
        self.conv2 = GCNConv(64, 64)
        self.lin1 = torch.nn.Linear(64 + graph_feat_dim, 64)
        self.lin2 = torch.nn.Linear(64, 2)

    def forward(self, data):
        x, edge_index, batch = data.x, data.edge_index, data.batch
        g_feat = data.graph_feat
        if g_feat.dim() == 1:
            g_feat = g_feat.unsqueeze(0)
        x = F.relu(self.conv1(x, edge_index))
        x = F.dropout(x, p=0.3, training=self.training)
        x = F.relu(self.conv2(x, edge_index))
        x = global_mean_pool(x, batch)
        x = torch.cat([x, g_feat], dim=1)
        x = F.relu(self.lin1(x))
        return F.log_softmax(self.lin2(x), dim=1)

# 📥 학습 데이터
df = pd.read_csv("/Users/ijaein/Desktop/train.csv")
feature_cols = df.columns[1:-1]
df = df[df['label'].notna()]

train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

train_data = [mol_to_graph_with_features(row, use_label=True) for _, row in train_df.iterrows() if mol_to_graph_with_features(row, True)]
val_data = [mol_to_graph_with_features(row, use_label=True) for _, row in val_df.iterrows() if mol_to_graph_with_features(row, True)]

train_loader = DataLoader(train_data, batch_size=32, shuffle=True, collate_fn=custom_collate)
val_loader = DataLoader(val_data, batch_size=32, collate_fn=custom_collate)

# 🚀 학습
model = GCN(graph_feat_dim=len(feature_cols))
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
loss_fn = torch.nn.NLLLoss()

for epoch in range(20):
    model.train()
    total_loss = 0
    for batch in train_loader:
        optimizer.zero_grad()
        out = model(batch)
        loss = loss_fn(out, batch.y)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"📚 Epoch {epoch+1} | Loss: {total_loss:.4f}")

# 📊 검증
model.eval()
y_true, y_pred = [], []
with torch.no_grad():
    for batch in val_loader:
        out = model(batch)
        pred = out.argmax(dim=1).tolist()
        y_pred.extend(pred)
        y_true.extend(batch.y.tolist())

print("\n📊 Classification Report:")
print(classification_report(y_true, y_pred))
print("🧩 Confusion Matrix:")
print(confusion_matrix(y_true, y_pred))
print("🔥 F1 Score (macro):", f1_score(y_true, y_pred, average='macro'))

# 🔮 예측
predict_df = pd.read_csv("/Users/ijaein/Desktop/predict_input.csv")

# 🛡️ feature 누락 보완 + 정렬
for col in feature_cols:
    if col not in predict_df.columns:
        predict_df[col] = 0
predict_df = predict_df[['SMILES'] + list(feature_cols)]

predict_graphs = [
    mol_to_graph_with_features(row, use_label=False)
    for _, row in predict_df.iterrows()
    if mol_to_graph_with_features(row, use_label=False)
]

predict_loader = DataLoader(predict_graphs, batch_size=32, collate_fn=custom_collate)

predictions = []
with torch.no_grad():
    for batch in predict_loader:
        out = model(batch)
        pred = out.argmax(dim=1).tolist()
        predictions.extend(pred)

predict_df['gnn_predicted_label'] = predictions
predict_df.to_csv("/Users/ijaein/Desktop/gnn_prediction_output.csv", index=False)
print("✅ 예측 완료 → /Users/ijaein/Desktop/gnn_prediction_output.csv 저장됨")



📚 Epoch 1 | Loss: 151.6836
📚 Epoch 2 | Loss: 144.0381
📚 Epoch 3 | Loss: 144.0756
📚 Epoch 4 | Loss: 144.1718
📚 Epoch 5 | Loss: 144.0976
📚 Epoch 6 | Loss: 144.0322
📚 Epoch 7 | Loss: 144.0113
📚 Epoch 8 | Loss: 144.0727
📚 Epoch 9 | Loss: 144.1199
📚 Epoch 10 | Loss: 144.0692
📚 Epoch 11 | Loss: 144.0085
📚 Epoch 12 | Loss: 144.0832
📚 Epoch 13 | Loss: 144.0315
📚 Epoch 14 | Loss: 144.1151
📚 Epoch 15 | Loss: 144.0489
📚 Epoch 16 | Loss: 144.0828
📚 Epoch 17 | Loss: 144.0743
📚 Epoch 18 | Loss: 144.1036
📚 Epoch 19 | Loss: 144.0991
📚 Epoch 20 | Loss: 144.0463

📊 Classification Report:


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


              precision    recall  f1-score   support

           0       0.00      0.00      0.00       778
           1       0.53      1.00      0.70       892

    accuracy                           0.53      1670
   macro avg       0.27      0.50      0.35      1670
weighted avg       0.29      0.53      0.37      1670

🧩 Confusion Matrix:
[[  0 778]
 [  0 892]]
🔥 F1 Score (macro): 0.3481654957064793




✅ 예측 완료 → /Users/ijaein/Desktop/gnn_prediction_output.csv 저장됨


In [6]:
# 학습 샘플에서 라벨 확인
for i, g in enumerate(train_data[:10]):
    print(f"{i}번 sample y 값:", g.y)

0번 sample y 값: tensor([0])
1번 sample y 값: tensor([1])
2번 sample y 값: tensor([1])
3번 sample y 값: tensor([0])
4번 sample y 값: tensor([1])
5번 sample y 값: tensor([1])
6번 sample y 값: tensor([1])
7번 sample y 값: tensor([1])
8번 sample y 값: tensor([1])
9번 sample y 값: tensor([0])


In [7]:
model.eval()
for batch in train_loader:
    out = model(batch)
    print("🧪 logits:", out)
    print("🧪 y:", batch.y)
    break

🧪 logits: tensor([[-0.7978, -0.5984],
        [-0.7978, -0.5984],
        [-0.7978, -0.5984],
        [-0.7978, -0.5984],
        [-0.7978, -0.5984],
        [-0.7978, -0.5984],
        [-0.7978, -0.5984],
        [-0.7978, -0.5984],
        [-0.7978, -0.5984],
        [-0.7978, -0.5984],
        [-0.7978, -0.5984],
        [-0.7978, -0.5984],
        [-0.7978, -0.5984],
        [-0.7978, -0.5984],
        [-0.7978, -0.5984],
        [-0.7978, -0.5984],
        [-0.7978, -0.5984],
        [-0.7978, -0.5984],
        [-0.7978, -0.5984],
        [-0.7978, -0.5984],
        [-0.7978, -0.5984],
        [-0.7978, -0.5984],
        [-0.7978, -0.5984],
        [-0.7978, -0.5984],
        [-0.7978, -0.5984],
        [-0.7978, -0.5984],
        [-0.7978, -0.5984],
        [-0.7978, -0.5984],
        [-0.7978, -0.5984],
        [-0.7978, -0.5984],
        [-0.7978, -0.5984],
        [-0.7978, -0.5984]], grad_fn=<LogSoftmaxBackward0>)
🧪 y: tensor([1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1

In [8]:
# 📦 최초 1회 설치
# pip install torch torch-geometric rdkit pandas scikit-learn

import pandas as pd
import torch
import torch.nn.functional as F
from rdkit import Chem
from torch_geometric.data import Data, DataLoader, Batch
from torch_geometric.nn import GCNConv, global_mean_pool
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, f1_score

# 🧬 atom feature 추출 함수
def get_atom_features(atom):
    return [
        atom.GetAtomicNum(),
        atom.GetDegree(),
        atom.GetFormalCharge(),
        int(atom.GetIsAromatic()),
        int(atom.GetHybridization())
    ]

# 🔁 SMILES → Graph 변환
def mol_to_graph_with_features(row, use_label=True):
    mol = Chem.MolFromSmiles(row['SMILES'])
    if mol is None:
        return None

    atom_features = [get_atom_features(atom) for atom in mol.GetAtoms()]
    edge_index = [[], []]
    for bond in mol.GetBonds():
        i, j = bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()
        edge_index[0] += [i, j]
        edge_index[1] += [j, i]

    x = torch.tensor(atom_features, dtype=torch.float)
    edge_index = torch.tensor(edge_index, dtype=torch.long)

    if use_label:
        y = torch.tensor([int(row['label'])], dtype=torch.long)
        graph_features = torch.tensor(row[1:-1].values.astype(float), dtype=torch.float).view(1, -1)
    else:
        y = torch.tensor([-1])
        graph_features = torch.tensor(row[1:].values.astype(float), dtype=torch.float).view(1, -1)

    return Data(x=x, edge_index=edge_index, y=y, graph_feat=graph_features)

# 🧩 batch에서 graph_feat 처리
def custom_collate(batch_list):
    graph_feats = torch.cat([data.graph_feat.view(1, -1) for data in batch_list], dim=0)
    batch = Batch.from_data_list(batch_list)
    batch.graph_feat = graph_feats
    return batch

# 🧠 GCN 모델
class GCN(torch.nn.Module):
    def __init__(self, graph_feat_dim):
        super(GCN, self).__init__()
        self.conv1 = GCNConv(5, 64)
        self.conv2 = GCNConv(64, 64)
        self.lin1 = torch.nn.Linear(64 + graph_feat_dim, 64)
        self.lin2 = torch.nn.Linear(64, 2)

    def forward(self, data):
        x, edge_index, batch = data.x, data.edge_index, data.batch
        g_feat = data.graph_feat
        if g_feat.dim() == 1:
            g_feat = g_feat.unsqueeze(0)
        x = F.relu(self.conv1(x, edge_index))
        x = F.dropout(x, p=0.3, training=self.training)
        x = F.relu(self.conv2(x, edge_index))
        x = global_mean_pool(x, batch)
        x = torch.cat([x, g_feat], dim=1)
        x = F.relu(self.lin1(x))
        return F.log_softmax(self.lin2(x), dim=1)

# 📥 데이터 로딩
df = pd.read_csv("/Users/ijaein/Desktop/train.csv")
feature_cols = df.columns[1:-1]
df = df[df['label'].notna()]

train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

train_data = [mol_to_graph_with_features(row, use_label=True) for _, row in train_df.iterrows() if mol_to_graph_with_features(row, True)]
val_data = [mol_to_graph_with_features(row, use_label=True) for _, row in val_df.iterrows() if mol_to_graph_with_features(row, True)]

train_loader = DataLoader(train_data, batch_size=32, shuffle=True, collate_fn=custom_collate)
val_loader = DataLoader(val_data, batch_size=32, collate_fn=custom_collate)

# 🚀 학습
model = GCN(graph_feat_dim=len(feature_cols))
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
loss_fn = torch.nn.NLLLoss()

for epoch in range(20):
    model.train()
    total_loss = 0
    for batch in train_loader:
        optimizer.zero_grad()
        out = model(batch)
        loss = loss_fn(out, batch.y)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"📚 Epoch {epoch+1} | Loss: {total_loss:.4f}")

# 📊 검증
model.eval()
y_true, y_pred = [], []
with torch.no_grad():
    for batch in val_loader:
        out = model(batch)
        pred = out.argmax(dim=1).tolist()
        y_pred.extend(pred)
        y_true.extend(batch.y.tolist())

print("\n📊 Classification Report:")
print(classification_report(y_true, y_pred))
print("🧩 Confusion Matrix:")
print(confusion_matrix(y_true, y_pred))
print("🔥 F1 Score (macro):", f1_score(y_true, y_pred, average='macro'))

# 🔮 예측
test_df = pd.read_csv("/Users/ijaein/Desktop/predict_input.csv")
for col in feature_cols:
    if col not in test_df.columns:
        test_df[col] = 0
test_df = test_df[['SMILES'] + list(feature_cols)]

predict_graphs = [
    mol_to_graph_with_features(row, use_label=False)
    for _, row in test_df.iterrows()
    if mol_to_graph_with_features(row, use_label=False)
]
predict_loader = DataLoader(predict_graphs, batch_size=32, collate_fn=custom_collate)

predictions = []
with torch.no_grad():
    for batch in predict_loader:
        out = model(batch)
        pred = out.argmax(dim=1).tolist()
        predictions.extend(pred)

test_df['gnn_predicted_label'] = predictions
test_df.to_csv("/Users/ijaein/Desktop/gnn_prediction_output.csv", index=False)
print("✅ 예측 완료 → /Users/ijaein/Desktop/gnn_prediction_output.csv 저장됨")




📚 Epoch 1 | Loss: 171.5132
📚 Epoch 2 | Loss: 144.1028
📚 Epoch 3 | Loss: 144.1159
📚 Epoch 4 | Loss: 144.0314
📚 Epoch 5 | Loss: 144.0097
📚 Epoch 6 | Loss: 144.0866
📚 Epoch 7 | Loss: 143.9540
📚 Epoch 8 | Loss: 144.0987
📚 Epoch 9 | Loss: 144.1014
📚 Epoch 10 | Loss: 144.0265
📚 Epoch 11 | Loss: 144.0799
📚 Epoch 12 | Loss: 144.0665
📚 Epoch 13 | Loss: 144.0590
📚 Epoch 14 | Loss: 144.0979
📚 Epoch 15 | Loss: 144.0436
📚 Epoch 16 | Loss: 144.0704
📚 Epoch 17 | Loss: 144.1131
📚 Epoch 18 | Loss: 144.0552
📚 Epoch 19 | Loss: 144.0911
📚 Epoch 20 | Loss: 144.0146

📊 Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       778
           1       0.53      1.00      0.70       892

    accuracy                           0.53      1670
   macro avg       0.27      0.50      0.35      1670
weighted avg       0.29      0.53      0.37      1670

🧩 Confusion Matrix:
[[  0 778]
 [  0 892]]
🔥 F1 Score (macro): 0.3481654957064793


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


✅ 예측 완료 → /Users/ijaein/Desktop/gnn_prediction_output.csv 저장됨


In [12]:
# 📦 라이브러리
import pandas as pd
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, f1_score

# 1. 데이터 로딩
df = pd.read_csv("/Users/ijaein/Desktop/train.csv")

# 2. 컬럼 정의
ecfp_cols = [f'ecfp_{i}' for i in range(1024)]
fcfp_cols = [f'fcfp_{i}' for i in range(1024)]
ptfp_cols = [f'ptfp_{i}' for i in range(1024)]
extra_cols = ['MolWt', 'clogp', 'sa_score']
bit_cols = ecfp_cols + fcfp_cols + ptfp_cols

# 3. 비트 컬럼 정수로 변환
df[bit_cols] = df[bit_cols].astype(int)

# 4. 시그니처 점수 생성
safe_bits = ['ptfp_9', 'ptfp_37']
toxic_bits = ['ptfp_41']
blocked_safe_bits = ['fcfp_164']
blocked_toxic_bits = ['fcfp_619']

def create_signature_score(row):
    score = 0
    score += sum([int(row[bit]) if bit in row else 0 for bit in safe_bits])
    score += sum([int(row[bit]) if bit in row else 0 for bit in toxic_bits])
    score -= sum([int(row[bit]) if bit in row else 0 for bit in blocked_safe_bits])
    score -= sum([int(row[bit]) if bit in row else 0 for bit in blocked_toxic_bits])
    return score

df['signature_score'] = df.apply(create_signature_score, axis=1)
df = df.copy()  # 💡 DataFrame defragmentation (속도 향상)

# 5. 연속형 피처 스케일링
scaler = StandardScaler()
df[extra_cols] = scaler.fit_transform(df[extra_cols])

# 6. 학습/테스트 분리
X = df[bit_cols + extra_cols + ['signature_score']]
y = df['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 7. Pool 생성
train_pool = Pool(X_train, y_train)

# 8. CatBoost 모델 정의
cat_model = CatBoostClassifier(
    iterations=500,
    eval_metric='F1',
    random_seed=42,
    verbose=100
)

# 9. 튜닝 파라미터
from sklearn.model_selection import ParameterSampler
from itertools import product
import random

# 전체 그리드 → 샘플링으로 20~30개만 선택
param_grid = {
    'learning_rate': [0.01, 0.03, 0.05],
    'depth': [4, 6, 8, 10],
    'l2_leaf_reg': [1, 3, 5, 7, 9]
}

from sklearn.model_selection import ParameterSampler
params_list = list(ParameterSampler(param_grid, n_iter=25, random_state=42))

# 10. CatBoost 튜닝 실행
cat_model.grid_search(param_grid, train_pool, plot=False)

# 11. 최종 평가
y_pred = cat_model.predict(X_test)
print("📊 Classification Report (CatBoost Tuned):")
print(classification_report(y_test, y_pred))
print("🧩 Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("🔥 F1-score:", f1_score(y_test, y_pred))

  df['signature_score'] = df.apply(create_signature_score, axis=1)


0:	learn: 0.6834744	test: 0.6887791	best: 0.6887791 (0)	total: 125ms	remaining: 1m 2s
100:	learn: 0.7450857	test: 0.7378517	best: 0.7421222 (34)	total: 1.61s	remaining: 6.37s
200:	learn: 0.7574187	test: 0.7503201	best: 0.7506394 (195)	total: 2.98s	remaining: 4.43s
300:	learn: 0.7671233	test: 0.7579083	best: 0.7579083 (300)	total: 4.39s	remaining: 2.9s
400:	learn: 0.7760911	test: 0.7632600	best: 0.7640595 (391)	total: 5.82s	remaining: 1.44s
499:	learn: 0.7832724	test: 0.7662338	best: 0.7670344 (491)	total: 7.22s	remaining: 0us

bestTest = 0.7670343933
bestIteration = 491

0:	loss: 0.7670344	best: 0.7670344 (0)	total: 7.47s	remaining: 7m 20s
0:	learn: 0.6834744	test: 0.6887791	best: 0.6887791 (0)	total: 13.5ms	remaining: 6.74s
100:	learn: 0.7693275	test: 0.7512821	best: 0.7554417 (93)	total: 1.57s	remaining: 6.21s
200:	learn: 0.7870163	test: 0.7653194	best: 0.7690306 (191)	total: 2.96s	remaining: 4.4s
300:	learn: 0.8096549	test: 0.7675033	best: 0.7699407 (298)	total: 4.28s	remaining: 2.8

KeyboardInterrupt: 