In [1]:
import os
import warnings
import tensorflow as tf
warnings.filterwarnings("ignore")
tf.get_logger().setLevel('ERROR')

import numpy as np
import pandas as pd
import deepchem as dc
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import torch
from torch import nn
import torch.nn.functional as F
from torch_geometric.nn import GCNConv, global_max_pool
from torch_geometric.data import Batch, Data
from torch_geometric.loader import DataLoader
from sklearn import metrics




Skipped loading modules with pytorch-geometric dependency, missing a dependency. No module named 'dgl'
Skipped loading some Jax models, missing a dependency. No module named 'jax'


In [2]:
class Net(nn.Module):
    """Graph-level regressor: four GCN layers, global max pooling, then a three-layer MLP.

    The network consumes node features of width 30 and emits one value per graph.
    Sub-module attribute names (``conv1`` .. ``conv4``, ``fc1`` .. ``fc3``) are also
    the keys of the state dict, so they must stay as they are.
    """

    def __init__(self):
        super().__init__()
        # Sub-modules are created in the order conv -> fc -> dropout; each GCNConv /
        # Linear draws its initial weights from the torch RNG as it is constructed.
        conv_shapes = [(30, 256), (256, 256), (256, 256), (256, 256)]
        for position, (width_in, width_out) in enumerate(conv_shapes, start=1):
            setattr(self, f"conv{position}", GCNConv(width_in, width_out))

        self.fc1 = nn.Linear(256, 256)
        self.fc2 = nn.Linear(256, 128)
        self.fc3 = nn.Linear(128, 1)

        # dropout1 / dropout2 follow conv1 / conv2, dropout3 follows fc1.
        for position in (1, 2, 3):
            setattr(self, f"dropout{position}", nn.Dropout(p=0.2))

    def forward(self, data):
        """Return one prediction row per graph in ``data``.

        ``data`` must expose ``x``, ``edge_index`` and ``batch``.
        """
        edges = data.edge_index

        # Message passing: dropout is applied only after the first two convolutions.
        hidden = self.dropout1(F.relu(self.conv1(data.x, edges)))
        hidden = self.dropout2(F.relu(self.conv2(hidden, edges)))
        hidden = F.relu(self.conv3(hidden, edges))
        hidden = F.relu(self.conv4(hidden, edges))

        # Collapse node embeddings into one vector per graph.
        pooled = global_max_pool(hidden, data.batch)

        # Regression head.
        pooled = self.dropout3(F.relu(self.fc1(pooled)))
        pooled = F.relu(self.fc2(pooled))
        return self.fc3(pooled)

In [3]:
def custom_collate(batch):
    """Merge ``(graph, target)`` pairs into a batched graph and a stacked target tensor.

    Returns a 2-tuple: the ``Batch`` built from all graphs, and the targets
    stacked along a new leading dimension.
    """
    # Transpose the list of pairs into one sequence of graphs and one of targets.
    graphs, targets = (list(column) for column in zip(*batch))
    return Batch.from_data_list(graphs), torch.stack(targets)

In [4]:
def _summarize_runs(values, prefix):
    """Build a Series holding every run's value followed by summary statistics.

    Parameters
    ----------
    values : pandas.Series
        One metric value per run, in run order.
    prefix : str
        Prefix used for the summary labels, e.g. ``'r2_test'``.

    Returns
    -------
    pandas.Series
        Entries ``run0 .. run{n-1}`` followed by ``{prefix}_mean``,
        ``{prefix}_max``, ``{prefix}_min`` and ``{prefix}_std``
        (standard deviation computed with ``ddof=0``).
    """
    per_run = {f'run{i}': value for i, value in enumerate(values)}
    return pd.Series({
        **per_run,
        f'{prefix}_mean': np.mean(values),
        f'{prefix}_max': np.max(values),
        f'{prefix}_min': np.min(values),
        f'{prefix}_std': np.std(values, ddof=0),
    })


def calculate_statistics(group):
    """Summarize the ``'r2_test'`` column of one group.

    Returns a Series with ``run0 .. run{n-1}`` plus ``r2_test_mean``,
    ``r2_test_max``, ``r2_test_min`` and ``r2_test_std``.
    """
    return _summarize_runs(group['r2_test'], 'r2_test')


def calculate_statistics2(group):
    """Summarize the ``'rmse_test'`` column of one group.

    Returns a Series with ``run0 .. run{n-1}`` plus ``rmse_test_mean``,
    ``rmse_test_max``, ``rmse_test_min`` and ``rmse_test_std``.
    """
    return _summarize_runs(group['rmse_test'], 'rmse_test')

In [5]:
# Fine-tune every pretrained source model on the real-data target column and collect
# test-set R2 / RMSE over ten different 50/50 train/test splits.
torch.manual_seed(0)

scaler = StandardScaler()

first_epochs = 50
second_epochs = 200
second_lr = 9e-3
second_wd = 3e-5

device = torch.device('cpu')
criterion = nn.MSELoss()


def _to_pyg_graphs(graphs):
    """Convert featurized graphs into torch_geometric ``Data`` objects.

    Each input object must expose ``node_features``, ``edge_index`` and
    ``edge_features``; they become ``x``, ``edge_index`` and ``edge_attr``.
    """
    return [
        Data(
            x=torch.tensor(graph.node_features, dtype=torch.float32),
            edge_index=torch.tensor(graph.edge_index, dtype=torch.long),
            edge_attr=torch.tensor(graph.edge_features, dtype=torch.float32),
        )
        for graph in graphs
    ]


def _predict(model, loader, device):
    """Run ``model`` over every batch of ``loader`` without gradients.

    Returns the per-batch outputs concatenated into a single numpy array.
    The caller is responsible for putting the model into eval mode.
    """
    outputs = []
    for graphs, _ in loader:
        graphs = graphs.to(device)
        with torch.no_grad():
            outputs.append(model(graphs).cpu().numpy())
    return np.concatenate(outputs)


# The CSV and the featurized molecules do not depend on any loop variable, so they are
# built once here instead of once per (split, source model) combination.
df = pd.read_csv('data_Real/data_real.csv')
smiles = df["SMILES"]
featurizer = dc.feat.MolGraphConvFeaturizer(use_edges=True)
X = featurizer.featurize(smiles)

results_r2 = []
results_rmse = []
for random_state in range(10):
    for dataset in ["abcgg", "aatsc3d", "atsc3d", "kappa2", "peoevsa6", "bertzct", "ggi10", "vsaestate3",
                    "atsc4i", "bcutp1l", "kappa3", "estatevsa3", "kier3", "aats8p", "kier2", "frnh0"]:
        for t in ["Yield_CO_l"]:
            # Reset the torch RNG before every run so that weight initialisation and
            # dropout masks start from the same state; only the data split
            # (random_state) differs between runs.
            torch.manual_seed(0)

            y = df[t]
            data_train, data_test, target_train, target_test = train_test_split(
                X, y, test_size=0.5, random_state=random_state)

            # Standardize targets using statistics of the training half only.
            target_train = scaler.fit_transform(target_train.values.reshape(-1, 1)).flatten()
            target_test = scaler.transform(target_test.values.reshape(-1, 1)).flatten()

            target_train = torch.tensor(target_train, dtype=torch.float32)
            target_test = torch.tensor(target_test, dtype=torch.float32)

            data_train_list = _to_pyg_graphs(data_train)
            data_test_list = _to_pyg_graphs(data_test)

            # batch_size equals the dataset size, i.e. every epoch is one full-batch step.
            # NOTE(review): torch_geometric's DataLoader appears to install its own
            # collater and may discard a user-supplied collate_fn - confirm whether
            # custom_collate is actually applied here.
            train_loader = DataLoader(list(zip(data_train_list, target_train)), batch_size=len(data_train_list), collate_fn=custom_collate)
            test_loader = DataLoader(list(zip(data_test_list, target_test)), batch_size=len(data_test_list), collate_fn=custom_collate)

            model = Net()
            # NOTE(review): torch.load unpickles the file; only load checkpoints that
            # come from a trusted source.
            model.load_state_dict(torch.load(f'data_AI+Human/model_{dataset}_sc.pth', map_location=device))
            # Replace the pretrained output layer with a freshly initialised one.
            model.fc3 = nn.Linear(128, 1)

            for param in model.parameters():
                param.requires_grad = False

            model.train()
            model.to(device)

            # NOTE(review): every parameter is frozen at this point and the loop runs
            # under torch.no_grad() with no backward()/optimizer.step(), so this phase
            # does not update any weight. It is kept unchanged because the forward
            # passes run in train mode, where dropout draws from the torch RNG;
            # removing them would shift the random stream seen by the second phase
            # and change the reported numbers. Confirm whether a head-only warm-up
            # was intended here.
            for epoch in range(first_epochs):
                for data, target in train_loader:
                    data = data.to(device)
                    target = target.to(device)
                    with torch.no_grad():
                        out = model(data)
                        loss = criterion(out, target.view(-1, 1))

            # Second phase: train only the fully connected head; the GCN layers stay frozen.
            for layer in (model.fc1, model.fc2, model.fc3):
                for param in layer.parameters():
                    param.requires_grad = True

            optimizer = torch.optim.Adam(model.parameters(), lr=second_lr, weight_decay=second_wd)

            for epoch in range(second_epochs):
                for data, target in train_loader:
                    data = data.to(device)
                    target = target.to(device)
                    optimizer.zero_grad()
                    out = model(data)
                    loss = criterion(out, target.view(-1, 1))
                    loss.backward()
                    optimizer.step()

            model.eval()
            pred_train = _predict(model, train_loader, device)
            pred_test = _predict(model, test_loader, device)

            # Map predictions and targets back to the original target scale before scoring.
            pred_train = scaler.inverse_transform(pred_train)
            pred_test = scaler.inverse_transform(pred_test)
            target_train = scaler.inverse_transform(target_train.numpy().reshape(-1, 1)).flatten()
            target_test = scaler.inverse_transform(target_test.numpy().reshape(-1, 1)).flatten()

            r2_test_score = metrics.r2_score(target_test, pred_test)
            rmse_test_score = metrics.root_mean_squared_error(target_test, pred_test)
            results_r2.append({'source': dataset, 'target': t, 'r2_test': r2_test_score})
            results_rmse.append({'source': dataset, 'target': t, 'rmse_test': rmse_test_score})

# One row per (source, target) pair: per-run values plus mean / max / min / std.
results_df = pd.DataFrame(results_r2)
gen_results = results_df.groupby(['source', 'target']).apply(calculate_statistics).reset_index()
results_df2 = pd.DataFrame(results_rmse)
gen_results2 = results_df2.groupby(['source', 'target']).apply(calculate_statistics2).reset_index()

In [6]:
# Save the R2 summary transposed (one column per source model) and display it.
# to_csv does not create missing parent directories, so make sure 'result/' exists.
os.makedirs('result', exist_ok=True)
gen_results.T.to_csv('result/result_yield_l_r2.csv', header=False)
gen_results.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
source,aats8p,aatsc3d,abcgg,atsc3d,atsc4i,bcutp1l,bertzct,estatevsa3,frnh0,ggi10,kappa2,kappa3,kier2,kier3,peoevsa6,vsaestate3
target,Yield_CO_l,Yield_CO_l,Yield_CO_l,Yield_CO_l,Yield_CO_l,Yield_CO_l,Yield_CO_l,Yield_CO_l,Yield_CO_l,Yield_CO_l,Yield_CO_l,Yield_CO_l,Yield_CO_l,Yield_CO_l,Yield_CO_l,Yield_CO_l
run0,0.649392,0.61097,0.645439,0.513372,0.592144,0.269883,0.812801,0.551847,0.782722,0.656078,0.743976,0.77899,0.771879,0.623508,0.74919,0.750224
run1,0.637718,0.582033,0.632658,0.492685,0.5191,0.38466,0.730531,0.567091,0.621344,0.627303,0.706477,0.709109,0.696214,0.612447,0.434815,0.540272
run2,0.673924,0.786879,0.752974,0.610729,0.626806,0.541209,0.86877,0.714396,0.761118,0.750833,0.867585,0.787675,0.812739,0.731423,0.779552,0.68947
run3,0.783766,0.757735,0.813356,0.829347,0.809863,0.393738,0.843678,0.72649,0.780898,0.767473,0.828764,0.798319,0.794394,0.769773,0.810217,0.626807
run4,0.834621,0.704318,0.787701,0.683662,0.777027,0.495636,0.805093,0.816411,0.79574,0.716682,0.682293,0.736768,0.810371,0.734656,0.729102,0.708957
run5,0.787557,0.670463,0.706456,0.677229,0.593005,0.412733,0.78845,0.778011,0.745597,0.688521,0.732981,0.634234,0.667826,0.666916,0.617242,0.765446
run6,0.747547,0.644217,0.660416,0.650437,0.589431,0.374955,0.759998,0.760721,0.737953,0.682937,0.813488,0.804161,0.783546,0.749075,0.786008,0.668113
run7,0.777936,0.716725,0.798196,0.741974,0.747941,0.449164,0.843004,0.691703,0.768386,0.775483,0.778147,0.791514,0.788254,0.779334,0.754569,0.733546


In [7]:
# Save the RMSE summary transposed (one column per source model) and display it.
# to_csv does not create missing parent directories, so make sure 'result/' exists.
os.makedirs('result', exist_ok=True)
gen_results2.T.to_csv('result/result_yield_l_rmse.csv', header=False)
gen_results2.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
source,aats8p,aatsc3d,abcgg,atsc3d,atsc4i,bcutp1l,bertzct,estatevsa3,frnh0,ggi10,kappa2,kappa3,kier2,kier3,peoevsa6,vsaestate3
target,Yield_CO_l,Yield_CO_l,Yield_CO_l,Yield_CO_l,Yield_CO_l,Yield_CO_l,Yield_CO_l,Yield_CO_l,Yield_CO_l,Yield_CO_l,Yield_CO_l,Yield_CO_l,Yield_CO_l,Yield_CO_l,Yield_CO_l,Yield_CO_l
run0,21.446987,22.591572,21.567553,25.266989,23.131758,30.949337,15.671379,24.247585,16.883516,21.241491,18.327175,17.027899,17.299662,22.224545,18.139587,18.102167
run1,19.81073,21.278841,19.948587,23.443138,22.824665,25.818695,17.085653,21.655844,20.253468,20.093466,17.831919,17.751776,18.140982,20.49003,24.744133,22.316555
run2,22.091127,17.859591,19.227833,24.137094,23.633383,26.203911,14.014421,20.674797,18.908207,19.310978,14.077548,17.826239,16.741018,20.049025,18.164011,21.558117
run3,16.404549,17.36393,15.240873,14.573371,15.38282,27.468365,13.948013,18.449701,16.513006,17.01136,14.598239,15.842909,15.996337,16.927021,15.368506,21.551094
run4,15.096298,20.185658,17.104244,20.878805,17.528982,26.363461,16.388678,15.905722,16.777294,19.759115,20.923954,19.045797,16.165239,19.122061,19.32115,20.026657
run5,15.977338,19.899172,18.781021,19.693815,22.114506,26.564438,15.943709,16.332365,17.484119,19.346264,17.912416,20.964499,19.978642,20.005976,21.445929,16.788197
run6,17.406504,20.663973,20.188068,20.482563,22.198021,27.389048,16.971825,16.946264,17.734186,19.507154,14.961494,15.331022,16.117725,17.353764,16.025824,19.957977
run7,17.388235,19.639084,16.576084,18.743402,18.525421,27.385963,14.620472,20.488092,17.758202,17.484035,17.379995,16.848251,16.979469,17.333443,18.280243,19.04707
