In [205]:
import pickle as pkl
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import Draw
import torch
import tensorflow as tf

In [206]:
with open ('Baseline Model/keys/train_keys.pkl', 'rb') as fp:
    train_keys = pkl.load(fp)
print(train_keys.shape)
with open ('Baseline Model/keys/test_keys.pkl', 'rb') as fp:
    test_keys = pkl.load(fp)
print(test_keys.shape)

(1944,)
(216,)


In [207]:
dataset = []
for train_key in train_keys:
    # print(train_keys[i])
    mol_w = Chem.MolFromPDBFile('Baseline Model/wild_pdb/' + train_key + '_wild.pdb')
    mol_m = Chem.MolFromPDBFile('Baseline Model/mutation_pdb/' + train_key + '_mutation.pdb')
    with open('Baseline Model/ddg/' + train_key, 'rb') as f:
        ddg = pkl.load(f)
    # Getting details about the mutation molecule
    num_atoms_m = mol_m.GetNumAtoms()
    positions_m = np.array(mol_m.GetConformers()[0].GetPositions())
    adjacency_matrix_m = Chem.rdmolops.GetAdjacencyMatrix(mol_m) + np.eye(num_atoms_m)
    # Getting details about the wild molecule
    num_atoms_w = mol_w.GetNumAtoms()
    positions_w = np.array(mol_w.GetConformers()[0].GetPositions())
    adjacency_matrix_w = Chem.rdmolops.GetAdjacencyMatrix(mol_w) + np.eye(num_atoms_w)
    # Getting one hot encoding for atoms in mutation molecule
    possible_atoms = ['C', 'N', 'O', 'S', 'F', 'Cl', 'Br', 'I', 'P', 'B', 'Si', 'H']
    possible_degrees = [0, 1, 2, 3, 4, 5, 6]
    possible_hydrogens = [0, 1, 2, 3, 4]
    possible_implicit_valence = [0, 1, 2, 3, 4, 5, 6]
    one_hot_encoding_m = []
    one_hot_encoding_w = []
    for atom in mol_m.GetAtoms():
        one_hot_encoding_atom = list(map(lambda s: atom.GetSymbol() == s, possible_atoms))
        one_hot_encoding_degree = list(map(lambda s: atom.GetDegree() == s, possible_degrees))
        one_hot_encoding_hydrogens = list(map(lambda s: atom.GetTotalNumHs() == s, possible_hydrogens))
        one_hot_encoding_implicit_valence = list(map(lambda s: atom.GetImplicitValence() == s, possible_implicit_valence))
        one_hot_encoding_aromatic = [atom.GetIsAromatic()]
        one_hot_encoding = one_hot_encoding_atom + one_hot_encoding_degree + one_hot_encoding_hydrogens + one_hot_encoding_implicit_valence + one_hot_encoding_aromatic
        one_hot_encoding = np.array(one_hot_encoding)
        one_hot_encoding_m.append(one_hot_encoding)
    one_hot_encoding_m = np.array(one_hot_encoding_m)
    for atom in mol_w.GetAtoms():
        one_hot_encoding_atom = list(map(lambda s: atom.GetSymbol() == s, possible_atoms))
        one_hot_encoding_degree = list(map(lambda s: atom.GetDegree() == s, possible_degrees))
        one_hot_encoding_hydrogens = list(map(lambda s: atom.GetTotalNumHs() == s, possible_hydrogens))
        one_hot_encoding_implicit_valence = list(map(lambda s: atom.GetImplicitValence() == s, possible_implicit_valence))
        one_hot_encoding_aromatic = [atom.GetIsAromatic()]
        one_hot_encoding = one_hot_encoding_atom + one_hot_encoding_degree + one_hot_encoding_hydrogens + one_hot_encoding_implicit_valence + one_hot_encoding_aromatic
        one_hot_encoding = np.array(one_hot_encoding)
        one_hot_encoding_w.append(one_hot_encoding)
    one_hot_encoding_w = np.array(one_hot_encoding_w)
    datapoint = {
        'num_atoms_m': num_atoms_m,
        'positions_m': positions_m,
        'adjacency_matrix_m': adjacency_matrix_m,
        'one_hot_encoding_m': one_hot_encoding_m,
        'num_atoms_w': num_atoms_w,
        'positions_w': positions_w,
        'adjacency_matrix_w': adjacency_matrix_w,
        'one_hot_encoding_w': one_hot_encoding_w,
        'ddg': ddg,
        'key': train_key
    }
    dataset.append(datapoint)
for test_key in test_keys:
    mol_w = Chem.MolFromPDBFile('Baseline Model/wild_pdb/' + test_key + '_wild.pdb')
    mol_m = Chem.MolFromPDBFile('Baseline Model/mutation_pdb/' + test_key + '_mutation.pdb')
    with open('Baseline Model/ddg/' + test_key, 'rb') as f:
        ddg = pkl.load(f)
    # Getting details about the mutation molecule
    num_atoms_m = mol_m.GetNumAtoms()
    positions_m = np.array(mol_m.GetConformers()[0].GetPositions())
    adjacency_matrix_m = Chem.rdmolops.GetAdjacencyMatrix(mol_m) + np.eye(num_atoms_m)
    # Getting details about the wild molecule
    num_atoms_w = mol_w.GetNumAtoms()
    positions_w = np.array(mol_w.GetConformers()[0].GetPositions())
    adjacency_matrix_w = Chem.rdmolops.GetAdjacencyMatrix(mol_w) + np.eye(num_atoms_w)
    # Getting one hot encoding for atoms in mutation molecule
    possible_atoms = ['C', 'N', 'O', 'S', 'F', 'Cl', 'Br', 'I', 'P', 'B', 'Si', 'H']
    possible_degrees = [0, 1, 2, 3, 4, 5, 6]
    possible_hydrogens = [0, 1, 2, 3, 4]
    possible_implicit_valence = [0, 1, 2, 3, 4, 5, 6]
    one_hot_encoding_m = []
    one_hot_encoding_w = []
    for atom in mol_m.GetAtoms():
        one_hot_encoding_atom = list(map(lambda s: atom.GetSymbol() == s, possible_atoms))
        one_hot_encoding_degree = list(map(lambda s: atom.GetDegree() == s, possible_degrees))
        one_hot_encoding_hydrogens = list(map(lambda s: atom.GetTotalNumHs() == s, possible_hydrogens))
        one_hot_encoding_implicit_valence = list(map(lambda s: atom.GetImplicitValence() == s, possible_implicit_valence))
        one_hot_encoding_aromatic = [atom.GetIsAromatic()]
        one_hot_encoding = one_hot_encoding_atom + one_hot_encoding_degree + one_hot_encoding_hydrogens + one_hot_encoding_implicit_valence + one_hot_encoding_aromatic
        one_hot_encoding = np.array(one_hot_encoding)
        one_hot_encoding_m.append(one_hot_encoding)
    one_hot_encoding_m = np.array(one_hot_encoding_m)
    for atom in mol_w.GetAtoms():
        one_hot_encoding_atom = list(map(lambda s: atom.GetSymbol() == s, possible_atoms))
        one_hot_encoding_degree = list(map(lambda s: atom.GetDegree() == s, possible_degrees))
        one_hot_encoding_hydrogens = list(map(lambda s: atom.GetTotalNumHs() == s, possible_hydrogens))
        one_hot_encoding_implicit_valence = list(map(lambda s: atom.GetImplicitValence() == s, possible_implicit_valence))
        one_hot_encoding_aromatic = [atom.GetIsAromatic()]
        one_hot_encoding = one_hot_encoding_atom + one_hot_encoding_degree + one_hot_encoding_hydrogens + one_hot_encoding_implicit_valence + one_hot_encoding_aromatic
        one_hot_encoding = np.array(one_hot_encoding)
        one_hot_encoding_w.append(one_hot_encoding)
    one_hot_encoding_w = np.array(one_hot_encoding_w)
    datapoint = {
        'num_atoms_m': num_atoms_m,
        'positions_m': positions_m,
        'adjacency_matrix_m': adjacency_matrix_m,
        'one_hot_encoding_m': one_hot_encoding_m,
        'num_atoms_w': num_atoms_w,
        'positions_w': positions_w,
        'adjacency_matrix_w': adjacency_matrix_w,
        'one_hot_encoding_w': one_hot_encoding_w,
        'ddg': ddg,
        'key': test_key
    }
    dataset.append(datapoint)
print(dataset[0])

{'num_atoms_m': 21, 'positions_m': array([[19.64 , 30.905,  6.993],
       [18.418, 30.207,  7.375],
       [18.624, 28.68 ,  7.307],
       [17.344, 27.855,  7.539],
       [17.585, 26.355,  7.571],
       [18.725, 25.887,  7.498],
       [16.505, 25.592,  7.678],
       [18.061, 30.623,  8.799],
       [18.933, 30.661,  9.676],
       [16.792, 30.95 ,  9.096],
       [16.387, 31.423, 10.397],
       [14.984, 32.074, 10.368],
       [16.437, 30.281, 11.379],
       [16.526, 29.125, 10.964],
       [16.427, 30.56 , 12.674],
       [16.537, 29.562, 13.745],
       [16.176, 31.903, 13.233],
       [16.392, 30.408, 15.01 ],
       [15.547, 31.573, 14.555],
       [15.471, 28.466, 13.657],
       [15.746, 27.296, 13.944]]), 'adjacency_matrix_m': array([[1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0.],
       [1., 1., 1., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0.],
       [0., 1., 1., 1., 0., 0., 0., 0., 0., 0.

In [208]:
# make 0 instead of false and 1 instead of true
for i in range(len(dataset)):
    dataset[i]['one_hot_encoding_m'] = dataset[i]['one_hot_encoding_m'].astype(int)
    dataset[i]['one_hot_encoding_w'] = dataset[i]['one_hot_encoding_w'].astype(int)

print(dataset[0])

{'num_atoms_m': 21, 'positions_m': array([[19.64 , 30.905,  6.993],
       [18.418, 30.207,  7.375],
       [18.624, 28.68 ,  7.307],
       [17.344, 27.855,  7.539],
       [17.585, 26.355,  7.571],
       [18.725, 25.887,  7.498],
       [16.505, 25.592,  7.678],
       [18.061, 30.623,  8.799],
       [18.933, 30.661,  9.676],
       [16.792, 30.95 ,  9.096],
       [16.387, 31.423, 10.397],
       [14.984, 32.074, 10.368],
       [16.437, 30.281, 11.379],
       [16.526, 29.125, 10.964],
       [16.427, 30.56 , 12.674],
       [16.537, 29.562, 13.745],
       [16.176, 31.903, 13.233],
       [16.392, 30.408, 15.01 ],
       [15.547, 31.573, 14.555],
       [15.471, 28.466, 13.657],
       [15.746, 27.296, 13.944]]), 'adjacency_matrix_m': array([[1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0.],
       [1., 1., 1., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0.],
       [0., 1., 1., 1., 0., 0., 0., 0., 0., 0.

In [209]:
print(type(dataset))

# get the train dataset
train_dataset = []
for i in range(len(train_keys)):
    train_dataset.append(dataset[i])
print(train_dataset[0])

# get the test dataset
test_dataset = []
for i in range(len(train_keys), len(train_keys) + len(test_keys)):
    test_dataset.append(dataset[i])
print(test_dataset[0])

# loss function for tensorflow
def loss_function(y_true, y_pred):
    return tf.keras.losses.mean_squared_error(y_true, y_pred)


# preprocess the data for the model
def preprocess_data(dataset):
    num_atoms_m = []
    positions_m = []
    adjacency_matrix_m = []
    one_hot_encoding_m = []
    num_atoms_w = []
    positions_w = []
    adjacency_matrix_w = []
    one_hot_encoding_w = []
    ddg = []
    for i in range(len(dataset)):
        num_atoms_m.append(dataset[i]['num_atoms_m'])
        positions_m.append(dataset[i]['positions_m'])
        adjacency_matrix_m.append(dataset[i]['adjacency_matrix_m'])
        one_hot_encoding_m.append(dataset[i]['one_hot_encoding_m'])
        num_atoms_w.append(dataset[i]['num_atoms_w'])
        positions_w.append(dataset[i]['positions_w'])
        adjacency_matrix_w.append(dataset[i]['adjacency_matrix_w'])
        one_hot_encoding_w.append(dataset[i]['one_hot_encoding_w'])
        ddg.append(dataset[i]['ddg'])
    num_atoms_m = np.array(num_atoms_m)
    positions_m = np.array(positions_m)
    adjacency_matrix_m = np.array(adjacency_matrix_m)
    one_hot_encoding_m = np.array(one_hot_encoding_m)
    num_atoms_w = np.array(num_atoms_w)
    positions_w = np.array(positions_w)
    adjacency_matrix_w = np.array(adjacency_matrix_w)
    one_hot_encoding_w = np.array(one_hot_encoding_w)
    ddg = np.array(ddg)
    return num_atoms_m, positions_m, adjacency_matrix_m, one_hot_encoding_m, num_atoms_w, positions_w, adjacency_matrix_w, one_hot_encoding_w, ddg

n_m,p_m,a_m,o_m,n_w,p_w,a_w,o_w,ddg = preprocess_data(dataset)

<class 'list'>
{'num_atoms_m': 21, 'positions_m': array([[19.64 , 30.905,  6.993],
       [18.418, 30.207,  7.375],
       [18.624, 28.68 ,  7.307],
       [17.344, 27.855,  7.539],
       [17.585, 26.355,  7.571],
       [18.725, 25.887,  7.498],
       [16.505, 25.592,  7.678],
       [18.061, 30.623,  8.799],
       [18.933, 30.661,  9.676],
       [16.792, 30.95 ,  9.096],
       [16.387, 31.423, 10.397],
       [14.984, 32.074, 10.368],
       [16.437, 30.281, 11.379],
       [16.526, 29.125, 10.964],
       [16.427, 30.56 , 12.674],
       [16.537, 29.562, 13.745],
       [16.176, 31.903, 13.233],
       [16.392, 30.408, 15.01 ],
       [15.547, 31.573, 14.555],
       [15.471, 28.466, 13.657],
       [15.746, 27.296, 13.944]]), 'adjacency_matrix_m': array([[1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0.],
       [1., 1., 1., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0.],
       [0., 1., 1., 1., 0., 0.,

  positions_m = np.array(positions_m)
  adjacency_matrix_m = np.array(adjacency_matrix_m)
  one_hot_encoding_m = np.array(one_hot_encoding_m)
  positions_w = np.array(positions_w)
  adjacency_matrix_w = np.array(adjacency_matrix_w)
  one_hot_encoding_w = np.array(one_hot_encoding_w)


In [210]:
print(n_m.shape)

(2160,)


In [211]:
# create a input_data1 in which for each datapoint it should have the number of atoms, positions, adjacency matrix, one hot encoding for the ligand and the receptor
input_data1 = []
for i in range(len(dataset)):
    input_data1.append([p_m[i], a_m[i], o_m[i]])
    input_data1.append([p_w[i], a_w[i], o_w[i]])
input_data1 = np.array(input_data1)
print(input_data1[2].shape)
print(type(input_data1[2]))
print(len(input_data1))
print(input_data1)

(3,)
<class 'numpy.ndarray'>
4320
[[array([[19.64 , 30.905,  6.993],
         [18.418, 30.207,  7.375],
         [18.624, 28.68 ,  7.307],
         [17.344, 27.855,  7.539],
         [17.585, 26.355,  7.571],
         [18.725, 25.887,  7.498],
         [16.505, 25.592,  7.678],
         [18.061, 30.623,  8.799],
         [18.933, 30.661,  9.676],
         [16.792, 30.95 ,  9.096],
         [16.387, 31.423, 10.397],
         [14.984, 32.074, 10.368],
         [16.437, 30.281, 11.379],
         [16.526, 29.125, 10.964],
         [16.427, 30.56 , 12.674],
         [16.537, 29.562, 13.745],
         [16.176, 31.903, 13.233],
         [16.392, 30.408, 15.01 ],
         [15.547, 31.573, 14.555],
         [15.471, 28.466, 13.657],
         [15.746, 27.296, 13.944]])
  array([[1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0.],
         [1., 1., 1., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0.],
         [0., 1., 1.

  input_data1 = np.array(input_data1)


In [212]:
# import numpy as np
from keras.models import Model
from keras.layers import Input, Conv2D, MaxPooling2D, UpSampling2D
# import tensorflow as tf


In [213]:
print(input_data1[0][2].shape)

(21, 32)


In [214]:

# write code to convert each of the 3*n arrays into a 3*100 array by padding with zeros
# original_array = np.random.rand(20, 3)
# # print(original_array)
# target_shape = (100, 3)
# padded_array = np.zeros(target_shape)
# padded_array[:original_array.shape[0], :original_array.shape[1]] = original_array
# print(padded_array)
for i in input_data1:
    # print(i[0].shape)
    target_shape = (300, 3)
    padded_array = np.zeros(target_shape)
    # make the shape of i[0] to 100*3
    padded_array[:i[0].shape[0], :i[0].shape[1]] = i[0]
    i[0] = padded_array
    # print(i[0].shape)
    target_shape2 = (300, 300)
    padded_array2 = np.zeros(target_shape2)
    # make the shape of i[1] to 100*100
    padded_array2[:i[1].shape[0], :i[1].shape[1]] = i[1]
    i[1] = padded_array2
    # print(i[1].shape)
    target_shape3 = (300, 32)
    padded_array3 = np.zeros(target_shape3)
    # make the shape of i[2] to 100*32
    padded_array3[:i[2].shape[0], :i[2].shape[1]] = i[2]
    i[2] = padded_array3
    # print(i[2].shape)


In [219]:

print(input_data1[0][2].shape)

(300, 32)


In [220]:
# convert numpy array to tensor
input_data1 = tf.convert_to_tensor(input_data1, dtype=tf.float32)

input_shape = (None, None, 3)

# Define the input layer
input_layer = Input(shape=input_shape)

# Define the encoder layers
x = Conv2D(16, (3, 3), activation='relu', padding='same')(input_layer)
x = MaxPooling2D((2, 2), padding='same')(x)
x = Conv2D(8, (3, 3), activation='relu', padding='same')(x)
encoded = MaxPooling2D((2, 2), padding='same')(x)

# Define the decoder layers
x = Conv2D(8, (3, 3), activation='relu', padding='same')(encoded)
x = UpSampling2D((2, 2))(x)
x = Conv2D(16, (3, 3), activation='relu', padding='same')(x)
x = UpSampling2D((2, 2))(x)
decoded = Conv2D(3, (3, 3), activation='linear', padding='same')(x)

# Define the model
autoencoder = Model(input_layer, decoded)

# Compile the model
autoencoder.compile(optimizer='adam', loss='mean_squared_error')

# Train the model
autoencoder.fit(input_data1, input_data1,
                epochs=50,
                batch_size=32,
                shuffle=True)

ValueError: setting an array element with a sequence.