https://chatgpt.com/g/g-cKXjWStaE-python

In [None]:
pip install rdkit-pypi torch torchvision torch-geometric scikit-learn

from google.colab import output
output.clear()

In [None]:
!sudo apt-get -qq install graphviz

In [18]:
import torch
from torch_geometric.data import Data, DataLoader
from torch_geometric.nn import MessagePassing, global_mean_pool
from rdkit import Chem
from rdkit.Chem import AllChem
from torch.nn import Linear
import torch.nn.functional as F
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# SMILES를 그래프로 변환하는 함수 (RDKit 사용)
def smiles_to_graph(smiles):
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None

    # 노드(원자) 특징
    atom_features = []
    for atom in mol.GetAtoms():
        atom_features.append(atom.GetAtomicNum())  # 원자 번호 사용

    # 엣지(결합) 정보
    edge_index = []
    bond_types = []
    for bond in mol.GetBonds():
        i = bond.GetBeginAtomIdx()
        j = bond.GetEndAtomIdx()
        edge_index.append((i, j))
        edge_index.append((j, i))  # 무방향 그래프

        bond_types.append(bond.GetBondTypeAsDouble())  # 결합 종류

    if len(edge_index) == 0:  # 결합이 없는 경우
        return None

    edge_index = torch.tensor(edge_index, dtype=torch.long).t().contiguous()
    atom_features = torch.tensor(atom_features, dtype=torch.float).view(-1, 1)
    bond_types = torch.tensor(bond_types, dtype=torch.float)

    return Data(x=atom_features, edge_index=edge_index)

# 데이터프레임에서 SMILES 및 레이블을 불러오는 함수
def load_dataset(df):
    graphs = []
    labels = []
    for _, row in df.iterrows():
        graph = smiles_to_graph(row['Smiles'])
        if graph:
            graphs.append(graph)
            labels.append([row['log_IC50_nM']])
    return graphs, torch.tensor(labels, dtype=torch.float)

# MPNN 모델 구현
class MPNN(MessagePassing):
    def __init__(self, in_channels, hidden_channels, out_channels):
        super(MPNN, self).__init__(aggr='add')  # 노드 간 정보 집계 방법
        self.lin1 = Linear(in_channels, hidden_channels)
        self.lin2 = Linear(hidden_channels, out_channels)

    def forward(self, x, edge_index):
        # x: 노드 피처, edge_index: 엣지 정보
        x = self.lin1(x)
        x = self.propagate(edge_index, x=x)
        return self.lin2(x)

    def message(self, x_j, edge_index, size):
        return x_j  # 메시지 전달 과정 (이웃 노드 피처 사용)

    def update(self, aggr_out):
        return aggr_out  # 업데이트 과정

class IC50Predictor(torch.nn.Module):
    def __init__(self, in_channels, hidden_channels):
        super(IC50Predictor, self).__init__()
        self.mpnn = MPNN(in_channels, hidden_channels, hidden_channels)
        self.lin = Linear(hidden_channels, 1)

    def forward(self, data):
        # MPNN 모델에 입력을 전달하고, 그래프 풀링 후 예측값 도출
        x, edge_index, batch = data.x, data.edge_index, data.batch
        x = self.mpnn(x, edge_index)
        x = global_mean_pool(x, batch)  # 노드의 평균 풀링
        return self.lin(x)

# 데이터 준비
def prepare_data(df):
    graphs, labels = load_dataset(df)
    for i, g in enumerate(graphs):
        g.y = labels[i]
    return graphs

# 모델 학습 및 평가
def train_model(model, loader, optimizer, criterion):
    model.train()
    for data in loader:
        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output, data.y)
        loss.backward()
        optimizer.step()

def evaluate_model(model, loader, criterion):
    model.eval()
    losses = []
    with torch.no_grad():
        for data in loader:
            output = model(data)
            loss = criterion(output, data.y)
            losses.append(loss.item())
    return np.mean(losses)

In [24]:
df = pd.read_csv('/content/train.csv')[['Smiles', 'IC50_nM']]
df['log_IC50_nM'] = df['IC50_nM'].apply(lambda x: np.log(x + 1e-9))

# 데이터 로드 및 전처리
train_graphs = prepare_data(df)
train_loader = DataLoader(train_graphs, batch_size=2, shuffle=True)

In [27]:
# 모델 초기화
model = IC50Predictor(in_channels=1, hidden_channels=64)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = torch.nn.MSELoss()

In [28]:
# 모델 학습
for epoch in range(100):
    train_model(model, train_loader, optimizer, criterion)
    loss = evaluate_model(model, train_loader, criterion)
    print(f'Epoch {epoch+1}, Loss: {loss:.4f}')

Epoch 1, Loss: 6969415.8497
Epoch 2, Loss: 6984569.1103
Epoch 3, Loss: 6976707.5938
Epoch 4, Loss: 6972331.7680
Epoch 5, Loss: 6980619.5402
Epoch 6, Loss: 6966951.6960
Epoch 7, Loss: 6966894.1622
Epoch 8, Loss: 6968293.8852
Epoch 9, Loss: 6965670.3140
Epoch 10, Loss: 6975955.3293
Epoch 11, Loss: 6968589.5300
Epoch 12, Loss: 6969806.4925
Epoch 13, Loss: 6975940.9543
Epoch 14, Loss: 6970678.7345
Epoch 15, Loss: 6973538.5601
Epoch 16, Loss: 6968722.8690
Epoch 17, Loss: 6969460.6377
Epoch 18, Loss: 6967798.3979
Epoch 19, Loss: 6968537.3950
Epoch 20, Loss: 7011178.7928
Epoch 21, Loss: 6970426.4579
Epoch 22, Loss: 6974639.1401
Epoch 23, Loss: 6970461.1937
Epoch 24, Loss: 6967116.6310
Epoch 25, Loss: 6965752.0281
Epoch 26, Loss: 6966311.0928
Epoch 27, Loss: 6968703.0073
Epoch 28, Loss: 6977767.0115
Epoch 29, Loss: 6968499.4863
Epoch 30, Loss: 6964922.0860
Epoch 31, Loss: 6967622.3269
Epoch 32, Loss: 6966554.6873
Epoch 33, Loss: 6978704.9686
Epoch 34, Loss: 6967431.6557
Epoch 35, Loss: 6970296

KeyboardInterrupt: 