# Load dependencies

In [1]:
import sys
import os
import requests
import subprocess
import shutil
from logging import getLogger, StreamHandler, INFO


# Module-level logger used by install() to report its progress.
logger = getLogger(__name__)
# Attach the stream handler only once: re-running this notebook cell would
# otherwise stack another handler and emit every message multiple times.
if not logger.handlers:
    logger.addHandler(StreamHandler())
logger.setLevel(INFO)


def install(
        chunk_size=4096,
        file_name="Miniconda3-latest-Linux-x86_64.sh",
        url_base="https://repo.continuum.io/miniconda/",
        conda_path=os.path.expanduser(os.path.join("~", "miniconda")),
        rdkit_version=None,
        add_python_path=True,
        force=False,
        timeout=60):
    """Install rdkit through a freshly downloaded Miniconda.

    ```
    import rdkit_installer
    rdkit_installer.install()
    ```

    Args:
        chunk_size: number of bytes written per chunk while downloading.
        file_name: installer file name; appended to ``url_base`` and also
            used as the local download path (relative to the working dir).
        url_base: base URL the installer is fetched from.
        conda_path: Miniconda install prefix. Anything already present at
            this path is deleted before installing.
        rdkit_version: exact rdkit version to pin, or None for no pin.
        add_python_path: append the conda site-packages dir to ``sys.path``.
        force: re-install even when an rdkit package directory is found.
        timeout: seconds passed to ``requests.get`` so a stalled connection
            fails instead of hanging the cell.

    Raises:
        requests.HTTPError: the installer download returned an error status.
        subprocess.CalledProcessError: the Miniconda installer or
            ``conda install`` exited with a non-zero status.
    """
    # Local import: only needed for the cache invalidation at the end.
    import importlib

    # site-packages of the conda environment for the running Python X.Y.
    python_path = os.path.join(
        conda_path,
        "lib",
        "python{0}.{1}".format(*sys.version_info),
        "site-packages",
    )

    if add_python_path and python_path not in sys.path:
        logger.info("add {} to PYTHONPATH".format(python_path))
        sys.path.append(python_path)

    if os.path.isdir(os.path.join(python_path, "rdkit")):
        logger.info("rdkit is already installed")
        if not force:
            return

        logger.info("force re-install")

    url = url_base + file_name
    python_version = "{0}.{1}.{2}".format(*sys.version_info)

    logger.info("python version: {}".format(python_version))

    # Start from a clean prefix: drop whatever currently lives at conda_path.
    if os.path.isdir(conda_path):
        logger.warning("remove current miniconda")
        shutil.rmtree(conda_path)
    elif os.path.isfile(conda_path):
        logger.warning("remove {}".format(conda_path))
        os.remove(conda_path)

    logger.info('fetching installer from {}'.format(url))
    # The context manager releases the streamed connection even when the
    # download or the status check fails.
    with requests.get(url, stream=True, timeout=timeout) as res:
        res.raise_for_status()
        with open(file_name, 'wb') as f:
            for chunk in res.iter_content(chunk_size):
                f.write(chunk)
    logger.info('done')

    logger.info('installing miniconda to {}'.format(conda_path))
    subprocess.check_call(["bash", file_name, "-b", "-p", conda_path])
    logger.info('done')

    logger.info("installing rdkit")
    rdkit_spec = "rdkit" if rdkit_version is None else "rdkit=={}".format(rdkit_version)
    subprocess.check_call([
        os.path.join(conda_path, "bin", "conda"),
        "install",
        "--yes",
        "-c", "rdkit",
        # Pin conda's python to the running interpreter's version.
        "python=={}".format(python_version),
        rdkit_spec])
    logger.info("done")

    # site-packages was created after the interpreter started (and possibly
    # after it was put on sys.path), so refresh the import system's caches
    # before importing the freshly installed package.
    importlib.invalidate_caches()
    import rdkit
    logger.info("rdkit-{} installation finished!".format(rdkit.__version__))


# Run the installation when this code is executed directly; in this notebook
# the cell runs as "__main__", so install() is called on cell execution.
if __name__ == "__main__":
    install()

add /root/miniconda/lib/python3.6/site-packages to PYTHONPATH
python version: 3.6.9
fetching installer from https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh
done
installing miniconda to /root/miniconda
done
installing rdkit
done
rdkit-2020.09.1 installation finished!


In [2]:
# Mount Google Drive at /content/drive/ (Colab only); the next cell copies the
# project files from it.
from google.colab import drive
drive.mount("/content/drive/")

Mounted at /content/drive/


In [3]:
# Copy the project packages, the model module and the data folder from the
# mounted Drive into /content.
# NOTE(review): the source paths are relative ("drive/My Drive/..."), so this
# assumes the working directory is /content, where Drive was mounted.
# The data folder comes from "deepsiba_tf2", everything else from "deepSIBA_pytorch".
!cp -r "drive/My Drive/deepSIBA_pytorch/NGF" /content
!cp -r "drive/My Drive/deepSIBA_pytorch/NGF_layers" /content
!cp -r "drive/My Drive/deepSIBA_pytorch/utility" /content
!cp -r "drive/My Drive/deepSIBA_pytorch/utils" /content
!cp -r "drive/My Drive/deepsiba_tf2/data" /content
!cp "drive/My Drive/deepSIBA_pytorch/deepSIBA_model.py" /content

In [4]:
# Optional cleanup (disabled): uncomment to delete the copied files again.
#!rm -r NGF_layers
#!rm -r utility
#!rm -r utils
#!rm -r data
#!rm deepSIBA_model.py

In [5]:
import torch
import torch.nn.functional as F
import torch.nn as nn
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import numpy as np
from numpy import inf, ndarray
import pandas as pd
import os
import random
import sklearn
import re
#from NGF.utils import filter_func_args, mol_shapes_to_dims
#import NGF.utils
import NGF_layers.features
import NGF_layers.graph_layers
from NGF_layers.features import one_of_k_encoding, one_of_k_encoding_unk, atom_features, bond_features, num_atom_features, num_bond_features, padaxis, tensorise_smiles #, concat_mol_tensors
from NGF_layers.graph_layers import temporal_padding, neighbour_lookup, NeuralGraphHidden
from math import ceil
from sklearn.metrics import mean_squared_error
from utility.gaussian import GaussianLayer, custom_loss, ConGaussianLayer
from utility.evaluator import r_square, get_cindex, pearson_r,custom_mse, mse_sliced, model_evaluate
from utility.Generator import train_generator,preds_generator
from deepSIBA_model import enc_graph, siamese_model
from pathlib import Path
from matplotlib import pyplot as plt
from tqdm.auto import tqdm
import seaborn as sns
sns.set()

# Load train and model parameters

In [6]:
# Model hyper-parameters; this dict is what siamese_model() receives.
model_params = {
    "max_atoms": 60,
    "num_atom_features": 62,
    "max_degree": 5,
    "num_bond_features": 6,
    "graph_conv_width": [128, 128, 128],
    "conv1d_in": 60,
    "conv1d_out": 32,
    "kernel_size": 1,
    "dropout_encoder": 0.25,
    "conv1d_dist_in": [32, 16],
    "conv1d_dist_out": [16, 16],
    "conv1d_dist_kernels": [1, 1],
    "dropout_dist": 0.25,
    "pool_size": 4,
    "dense_size": [256, 128, 128],
    "l2reg": 0.01,
    "dist_thresh": 0.2,
    "lr": 0.001,
    "ConGauss": False,
}

In [7]:
# Run configuration: data selection, output location, training schedule,
# ensemble/pre-training options and prediction settings.
train_params = {
    "cell_line": "a375",
    "split": "train_test_split",
    "number_folds": [0],
    "output_dir": "results",
    "batch_size": 128,
    "epochs": 20,
    "N_ensemble": 1,
    "nmodel_start": 0,
    "prec_threshold": 0.2,
    "Pre_training": False,
    "Pre_trained_cell_dir": "",
    "pattern_to_load": "siam_no_augment_",
    "model_id_to_load": "20",
    "test_value_norm": True,
    "predict_batch_size": 2048,
}

# Load data

In [8]:
get_all = []

# Every supported split strategy iterates over the same configured fold list;
# fail fast on anything else instead of leaving outer_loop undefined.
supported_splits = ("train_test_split", "5_fold_cv_split", "alldata")
if train_params["split"] not in supported_splits:
  raise ValueError("unsupported split {!r}; expected one of {}".format(
      train_params["split"], supported_splits))
outer_loop = train_params["number_folds"]

#Load unique smiles and tensorize them
smiles_path = "data/{0}/{0}q1smiles.csv".format(train_params["cell_line"])
# Keep the table under its own name so `smiles` is only ever the plain list
# of SMILES strings that later cells index into.
smiles_df = pd.read_csv(smiles_path, index_col=0)
X_atoms, X_bonds, X_edges = tensorise_smiles(smiles_df.x, model_params["max_degree"], model_params["max_atoms"])
smiles = list(smiles_df['x'])

In [9]:
# Training pairs and held-out ("cold") test pairs of the train/test split.
split_dir = "data/" + train_params["cell_line"] + "/" + "train_test_split/"
df = pd.read_csv(split_dir + "train.csv",index_col=0).reset_index(drop=True)
df_cold = pd.read_csv(split_dir + "test.csv",index_col=0).reset_index(drop=True)

# Unique SMILES appearing on either side of a test pair. dict.fromkeys
# de-duplicates while keeping first-appearance order, so the tensor row order
# is the same on every run (a set would give a run-dependent order).
smiles_cold = list(dict.fromkeys(list(df_cold['rdkit.x'])+list(df_cold['rdkit.y'])))
X_atoms_cold, X_bonds_cold, X_edges_cold = tensorise_smiles(smiles_cold,  model_params["max_degree"], model_params["max_atoms"])

# Test targets: used as stored when test_value_norm is set, halved otherwise.
Y_cold = df_cold.value
if not train_params["test_value_norm"]:
  Y_cold = Y_cold/2

In [10]:
# Fold index used to name this run's output directory.
i = 0
models_dir = Path("{}/fold_{}/models".format(train_params["output_dir"], i))
models_dir.mkdir(parents=True, exist_ok=True)

# Empty accumulators; judging by their names they are meant to collect the
# predicted means and sigmas for the cold test set — TODO confirm where filled.
cold_preds_mus, cold_preds_sigmas = [], []

# Index of the first model, taken from the run configuration.
n = train_params["nmodel_start"]

# Define, compile, and train the model

In [11]:
# Build the siamese model from the hyper-parameters in model_params.
deepsiba = siamese_model(model_params)

In [12]:
def get_default_device():
  """Pick the compute device: CUDA when available, otherwise the CPU.

  Prints which mode was chosen ('cuda mode' / 'cpu mode') and returns the
  matching torch.device.
  """
  use_cuda = torch.cuda.is_available()
  print('cuda mode' if use_cuda else 'cpu mode')
  return torch.device('cuda' if use_cuda else 'cpu')

# Device used for the model and for every batch in the cells below.
device = get_default_device()

cuda mode


In [13]:
def to_device(data,device):
  """Move `data` to `device`.

  A list or tuple is processed element by element (recursively) and always
  comes back as a list; anything else is moved with its own `.to(device)`.
  """
  if not isinstance(data,(list,tuple)):
    return data.to(device)
  moved = []
  for item in data:
    moved.append(to_device(item,device))
  return moved

In [14]:
class train_generator(Dataset):
  """Dataset of compound pairs with their (halved) target value.

  Each item is looked up by position in `data`: the two SMILES of the pair
  are mapped to their row in the tensorised feature arrays and returned as
  (atom_1, bond_1, edge_1, atom_2, bond_2, edge_2, target).

  NOTE: this class shadows the `train_generator` imported from
  utility.Generator in the imports cell.

  Args:
    data: DataFrame with columns 'rdkit.x', 'rdkit.y' and 'value'.
    smiles: sequence of SMILES strings; position i corresponds to row i of
      X_atoms / X_bonds / X_edges.
    X_atoms, X_bonds, X_edges: per-molecule feature arrays indexed by the
      position of the molecule in `smiles`.
  """

  def __init__(self, data,smiles,X_atoms, X_bonds, X_edges):
    self.df=data
    self.smiles=smiles
    self.X_atoms=X_atoms
    self.X_bonds=X_bonds
    self.X_edges=X_edges
    # SMILES -> row position, built once so each sample is a constant-time
    # lookup. setdefault keeps the first occurrence, like list.index would.
    self._smiles_index={}
    for position,smi in enumerate(smiles):
      self._smiles_index.setdefault(smi,position)

  def __len__(self):
    """Number of pairs in the dataset."""
    return len(self.df)

  def _row_of(self, smi):
    """Return the feature-array row of `smi`; ValueError if it is unknown."""
    try:
      return self._smiles_index[smi]
    except KeyError:
      raise ValueError("{!r} is not in the smiles list".format(smi)) from None

  def __getitem__(self, index):
    """Return the feature tensors of both compounds and the halved target."""
    # Positional access: a Dataset is indexed 0..len-1 regardless of the
    # DataFrame's own index labels.
    smi1=self.df['rdkit.x'].iloc[index]
    smi2=self.df['rdkit.y'].iloc[index]
    d=self.df['value'].iloc[index]/2
    ind1=self._row_of(smi1)
    ind2=self._row_of(smi2)
    # Read the arrays handed to the constructor rather than same-named
    # notebook globals, so the dataset works with any feature set.
    atom_1=torch.tensor(self.X_atoms[ind1]).type(torch.FloatTensor)
    bond_1=torch.tensor(self.X_bonds[ind1]).type(torch.FloatTensor)
    edge_1=torch.tensor(self.X_edges[ind1]).type(torch.IntTensor)
    atom_2=torch.tensor(self.X_atoms[ind2]).type(torch.FloatTensor)
    bond_2=torch.tensor(self.X_bonds[ind2]).type(torch.FloatTensor)
    edge_2=torch.tensor(self.X_edges[ind2]).type(torch.IntTensor)
    return atom_1,bond_1,edge_1,atom_2,bond_2,edge_2,torch.tensor(d).type(torch.FloatTensor)

In [15]:
# Schedule settings taken from the run configuration.
NUM_EPOCHS = train_params["epochs"]
bs = train_params["batch_size"]

# Shuffle the training pairs once (unseeded) and renumber the rows from 0.
df = df.sample(frac=1).reset_index(drop=True)

# Number of training pairs and of batches per epoch (last batch may be short).
NUM_TRAIN = len(df.index)
NUM_STEPS = ceil(NUM_TRAIN / bs)

# Dataset over the shuffled pairs and the tensorised molecule features.
trainGen = train_generator(df, smiles, X_atoms, X_bonds, X_edges)

In [16]:
# Batch the training dataset, reshuffling it every epoch.
# num_workers=12 could also be passed to the DataLoader here.
train_loader = DataLoader(trainGen,
                          batch_size=bs,
                          shuffle=True)

In [17]:
class DeviceDataLoader():
  """Wrap a data loader so that every batch it yields is moved to `device`."""

  def __init__(self,dl,device):
    # dl: the wrapped loader; device: target passed to to_device per batch.
    self.dl,self.device=dl,device

  def __len__(self):
    """Number of batches"""
    return len(self.dl)

  def __iter__(self):
    """Yield the wrapped loader's batches, each passed through to_device."""
    yield from (to_device(batch,self.device) for batch in self.dl)

In [18]:
# Wrap the loader so each batch is moved to the selected device when yielded.
train_loader=DeviceDataLoader(train_loader,device)

In [19]:
# Move the model to the selected device.
deepsiba=deepsiba.to(device)

In [20]:
# Adam optimiser over all model parameters, with the configured learning rate.
adam = torch.optim.Adam(deepsiba.parameters(),lr=model_params["lr"])

In [25]:
def train(epoch, data_iterator=None):
  """Run one training epoch and report per-batch metrics on the progress bar.

  Uses the module-level `deepsiba` model and `adam` optimiser.

  Args:
    epoch: zero-based epoch index; only used for the progress-bar postfix,
      where it is shown as epoch + 1.
    data_iterator: tqdm-wrapped batch iterator to consume; must provide
      `set_postfix`. When omitted, the module-level `train_data_iterator`
      created by the epoch loop is used, so `train(epoch)` keeps working.
  """
  if data_iterator is None:
    data_iterator = train_data_iterator
  deepsiba.train()
  for atom1,bond1,edge1,atom2,bond2,edge2,y_true in data_iterator:
    adam.zero_grad()

    y_pred = deepsiba(atom1,bond1,edge1,atom2,bond2,edge2)
    loss = custom_loss(y_true,y_pred)

    # These metrics are only displayed, never back-propagated, so compute
    # them without building an autograd graph.
    with torch.no_grad():
      r=r_square(y_true,y_pred)
      pear=pearson_r(y_true,y_pred)
      cindex=get_cindex(y_true,y_pred)

    data_iterator.set_postfix(
        Epoch=epoch + 1,
        r2='%.4f' % float(r),
        pearson='%.4f' % float(pear),
        Cindex='%.4f' % float(cindex),
        Loss='%.4f' % float(loss.item()))
    # NOTE(review): retain_graph=True keeps each batch's graph alive after
    # backward; it looks unnecessary because the graph is rebuilt every
    # iteration — confirm against the model before removing it.
    loss.backward(retain_graph=True)
    #torch.nn.utils.clip_grad_norm_(model.encoder.emb_layer.parameters(), 0.05)
    adam.step()

In [None]:
# Placeholder text shown for every metric until the first batch has finished.
nan_text = '%.4f' % float("NaN")

for epoch in range(NUM_EPOCHS):
  # Fresh progress bar per epoch; train() reads it as `train_data_iterator`.
  initial_postfix = {
      'Epoch': epoch + 1,
      'r2': nan_text,
      'pearson': nan_text,
      'Cindex': nan_text,
      'Loss': nan_text,
  }
  train_data_iterator = tqdm(
      train_loader,
      leave=True,
      unit='batch',
      postfix=initial_postfix)
  train(epoch)

HBox(children=(FloatProgress(value=0.0, max=1367.0), HTML(value='')))