In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
import deepchem as dc
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import torch
from torch import nn
import torch.nn.functional as F
from torch_geometric.nn import GCNConv, global_max_pool
from torch_geometric.data import Batch, Data
from torch_geometric.loader import DataLoader
from sklearn import metrics


Instructions for updating:
experimental_relax_shapes is deprecated, use reduce_retracing instead


Skipped loading modules with pytorch-geometric dependency, missing a dependency. No module named 'dgl'
Skipped loading some Jax models, missing a dependency. No module named 'haiku'


In [2]:
class Net(nn.Module):
    """Graph-level regressor: four GCN layers, max pooling, three-layer MLP head.

    The first convolution takes 30 input features per node; every hidden
    representation is 256 wide until the head narrows it to 128 and then to a
    single output value per graph.
    """

    def __init__(self):
        super().__init__()
        in_features, width = 30, 256
        # Layers are created in a fixed order (conv1..conv4, fc1..fc3) so that
        # seeded initialisation and checkpoint keys stay stable.
        self.conv1 = GCNConv(in_features, width)
        self.conv2 = GCNConv(width, width)
        self.conv3 = GCNConv(width, width)
        self.conv4 = GCNConv(width, width)
        self.fc1 = nn.Linear(width, width)
        self.fc2 = nn.Linear(width, 128)
        self.fc3 = nn.Linear(128, 1)
        self.dropout = nn.Dropout(p=0.4)

    def forward(self, data):
        """Return the prediction for each graph in ``data``.

        Args:
            data: batch object exposing ``x``, ``edge_index`` and ``batch``.

        Returns:
            The output of ``fc3`` applied to the pooled graph embeddings.
        """
        edge_index = data.edge_index

        # Dropout is applied only once, right after the first convolution.
        h = self.dropout(F.relu(self.conv1(data.x, edge_index)))
        for conv in (self.conv2, self.conv3, self.conv4):
            h = F.relu(conv(h, edge_index))

        # Collapse node embeddings into one embedding per graph.
        pooled = global_max_pool(h, data.batch)

        hidden = F.relu(self.fc1(pooled))
        hidden = F.relu(self.fc2(hidden))
        return self.fc3(hidden)

In [3]:
def custom_collate(batch):
    """Collate ``(graph, target)`` pairs into one graph batch and one target tensor.

    Args:
        batch: sequence of 2-tuples ``(graph, target_tensor)``.

    Returns:
        Tuple ``(Batch, Tensor)``: the graphs merged with
        ``Batch.from_data_list`` and the targets stacked along a new first
        dimension.
    """
    graphs, targets = zip(*batch)
    return Batch.from_data_list(graphs), torch.stack(targets)

In [4]:
def calculate_statistics(group):
    """Summarise the train and test R2 scores of one group of runs.

    Args:
        group: table-like object providing ``'r2_train'`` and ``'r2_test'``
            columns.

    Returns:
        pd.Series with, in order, ``r2_train_mean``, ``r2_train_std``,
        ``r2_test_mean`` and ``r2_test_std``. The standard deviations are
        population values (``ddof=0``).
    """
    summary = {}
    for split in ('train', 'test'):
        scores = group[f'r2_{split}']
        summary[f'r2_{split}_mean'] = np.mean(scores)
        summary[f'r2_{split}_std'] = np.std(scores, ddof=0)
    return pd.Series(summary)

In [5]:
# Fine-tune each pretrained GCN (one checkpoint per source descriptor) on the
# real yield data, over 10 different 50/50 train/test splits, and summarise
# the resulting R2 scores per source.
torch.manual_seed(0)

# Hyperparameters shared by every fine-tuning run.
epochs = 100
lr = 1e-2
wd = 5e-4

# Source descriptors; each has a checkpoint at data_Random/model_<name>_sc.pth.
source_names = ["kappa2", "peoevsa6", "bertzct", "ggi10", "atsc4i", "bcutp1l",
                "kappa3", "estatevsa3", "kier3", "aats8p", "kier2"]
# Columns of the real data set used as regression targets.
target_columns = ["Yield_CO"]

device = torch.device('cpu')
scaler = StandardScaler()
criterion = nn.MSELoss()


def _to_pyg_graphs(graphs):
    """Convert featurizer graph objects into torch_geometric ``Data`` objects."""
    return [
        Data(
            x=torch.tensor(graph.node_features, dtype=torch.float32),
            edge_index=torch.tensor(graph.edge_index, dtype=torch.long),
            edge_attr=torch.tensor(graph.edge_features, dtype=torch.float32),
        )
        for graph in graphs
    ]


def _predict(model, loader, device):
    """Run ``model`` in eval mode over ``loader``; return stacked outputs as numpy."""
    model.eval()
    outputs = []
    with torch.no_grad():
        for data, _ in loader:
            outputs.append(model(data.to(device)).cpu().numpy())
    return np.concatenate(outputs)


# Loading and featurising depend on neither the split seed nor the source
# model, so they are done once instead of once per run.
df = pd.read_csv('data_Real/data_real.csv')
smiles = df["SMILES"]
featurizer = dc.feat.MolGraphConvFeaturizer(use_edges=True)
X = featurizer.featurize(smiles)
graphs_all = _to_pyg_graphs(X)
sample_indices = np.arange(len(graphs_all))

results = []
for random_state in range(10):
    print(f'Random State: {random_state}')

    # Splitting row positions yields the same partition as splitting X and y
    # directly, because train_test_split applies one shuffled index set to
    # every array it receives. The split only depends on random_state.
    train_idx, test_idx = train_test_split(sample_indices, test_size=0.5, random_state=random_state)
    data_train_list = [graphs_all[i] for i in train_idx]
    data_test_list = [graphs_all[i] for i in test_idx]

    for source_name in source_names:
        print('Source : ', source_name)

        for target_col in target_columns:
            # Reseed per run so head initialisation and dropout masks are
            # identical across sources and splits.
            torch.manual_seed(0)

            # Standardise targets using statistics of the training half only.
            y = df[target_col].values
            y_train_scaled = scaler.fit_transform(y[train_idx].reshape(-1, 1)).flatten()
            y_test_scaled = scaler.transform(y[test_idx].reshape(-1, 1)).flatten()
            target_train = torch.tensor(y_train_scaled, dtype=torch.float32)
            target_test = torch.tensor(y_test_scaled, dtype=torch.float32)

            # Full-batch loaders: one batch holds the whole split.
            # NOTE(review): torch_geometric's DataLoader appears to replace a
            # user-supplied collate_fn with its own collater - confirm whether
            # custom_collate is actually used here.
            train_loader = DataLoader(list(zip(data_train_list, target_train)), batch_size=len(data_train_list), collate_fn=custom_collate)
            test_loader = DataLoader(list(zip(data_test_list, target_test)), batch_size=len(data_test_list), collate_fn=custom_collate)

            # Load the pretrained weights, then replace the output layer with a
            # freshly initialised one for the new target.
            model = Net()
            # map_location lets checkpoints saved on another device load on `device`.
            model.load_state_dict(torch.load(f'data_Random/model_{source_name}_sc.pth', map_location=device))
            model.fc3 = nn.Linear(128, 1)

            # Freeze the graph convolutions; only the fully connected head is trained.
            for conv in (model.conv1, model.conv2, model.conv3, model.conv4):
                for param in conv.parameters():
                    param.requires_grad = False

            model.to(device)
            model.train()
            # Built after freezing so the optimizer only tracks trainable parameters.
            optimizer = torch.optim.Adam(
                (param for param in model.parameters() if param.requires_grad),
                lr=lr,
                weight_decay=wd,
            )

            for epoch in range(epochs):
                for data, target in train_loader:
                    data = data.to(device)
                    target = target.to(device)
                    optimizer.zero_grad()
                    out = model(data)
                    loss = criterion(out, target.view(-1, 1))
                    loss.backward()
                    optimizer.step()

            # Score in the original target units.
            pred_train = scaler.inverse_transform(_predict(model, train_loader, device))
            pred_test = scaler.inverse_transform(_predict(model, test_loader, device))
            y_train_true = scaler.inverse_transform(target_train.numpy().reshape(-1, 1)).flatten()
            y_test_true = scaler.inverse_transform(target_test.numpy().reshape(-1, 1)).flatten()

            r2_train_score = metrics.r2_score(y_train_true, pred_train)
            r2_test_score = metrics.r2_score(y_test_true, pred_test)

            results.append({'source': source_name, 'target': target_col, 'r2_train': r2_train_score, 'r2_test': r2_test_score})
            print(f'R2 test for {target_col} with random state {random_state}: {r2_test_score}')

results_df = pd.DataFrame(results)
# Select the score columns explicitly so apply() does not operate on the
# grouping columns (deprecated behaviour in recent pandas).
gen_results = (
    results_df
    .groupby(['source', 'target'])[['r2_train', 'r2_test']]
    .apply(calculate_statistics)
    .reset_index()
)
gen_results

Random State: 0
Source :  kappa2
R2 test for Yield_CO with random state 0: 0.7950414419174194
Source :  peoevsa6
R2 test for Yield_CO with random state 0: 0.5717265605926514
Source :  bertzct
R2 test for Yield_CO with random state 0: 0.813359797000885
Source :  ggi10
R2 test for Yield_CO with random state 0: 0.7373697757720947
Source :  atsc4i
R2 test for Yield_CO with random state 0: 0.6234185695648193
Source :  bcutp1l
R2 test for Yield_CO with random state 0: 0.6625875234603882
Source :  kappa3
R2 test for Yield_CO with random state 0: 0.7321821451187134
Source :  estatevsa3
R2 test for Yield_CO with random state 0: 0.7755917310714722
Source :  kier3
R2 test for Yield_CO with random state 0: 0.7259083390235901
Source :  aats8p
R2 test for Yield_CO with random state 0: 0.7739678621292114
Source :  kier2
R2 test for Yield_CO with random state 0: 0.7972241044044495
Random State: 1
Source :  kappa2
R2 test for Yield_CO with random state 1: 0.5945765972137451
Source :  peoevsa6
R2 test f

R2 test for Yield_CO with random state 9: 0.555678129196167
Source :  bcutp1l
R2 test for Yield_CO with random state 9: 0.41443347930908203
Source :  kappa3
R2 test for Yield_CO with random state 9: 0.7381504774093628
Source :  estatevsa3
R2 test for Yield_CO with random state 9: 0.6592820882797241
Source :  kier3
R2 test for Yield_CO with random state 9: 0.7132982015609741
Source :  aats8p
R2 test for Yield_CO with random state 9: 0.6464546322822571
Source :  kier2
R2 test for Yield_CO with random state 9: 0.6124082207679749


  gen_results = results_df.groupby(['source', 'target']).apply(calculate_statistics).reset_index()


Unnamed: 0,source,target,r2_train_mean,r2_train_std,r2_test_mean,r2_test_std
0,aats8p,Yield_CO,0.987846,0.007256,0.696633,0.094312
1,atsc4i,Yield_CO,0.972276,0.028024,0.611706,0.11103
2,bcutp1l,Yield_CO,0.786914,0.040367,0.529741,0.124105
3,bertzct,Yield_CO,0.980244,0.007908,0.779156,0.073695
4,estatevsa3,Yield_CO,0.989531,0.007015,0.715032,0.093409
5,ggi10,Yield_CO,0.982496,0.011572,0.687885,0.109656
6,kappa2,Yield_CO,0.968777,0.014461,0.721768,0.086016
7,kappa3,Yield_CO,0.980739,0.009047,0.746165,0.105895
8,kier2,Yield_CO,0.975263,0.015362,0.745656,0.072847
9,kier3,Yield_CO,0.977959,0.010254,0.735949,0.084024


In [6]:
# Save the per-source summary; the epoch count is encoded in the file name.
# Create the output directory first so a missing folder does not discard the
# results of the training loop.
output_path = Path('result/Yield') / f'onestep_{epochs}.csv'
output_path.parent.mkdir(parents=True, exist_ok=True)
gen_results.to_csv(output_path)