In [1]:
import pickle as pkl
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import Draw

In [2]:
# Training split: unpickle the collection of sample keys and report its shape.
with open('keys/train_keys.pkl', 'rb') as train_fp:
    train_keys = pkl.load(train_fp)
print(train_keys.shape)

# Test split: same procedure for the second key file.
with open('keys/test_keys.pkl', 'rb') as test_fp:
    test_keys = pkl.load(test_fp)
print(test_keys.shape)

(1944,)
(216,)


In [4]:
# Vocabularies for the per-atom one-hot features. They are loop-invariant, so
# they are defined once here instead of on every iteration. The concatenated
# per-atom feature vector has 12 + 7 + 5 + 7 + 1 = 32 entries.
possible_atoms = ['C', 'N', 'O', 'S', 'F', 'Cl', 'Br', 'I', 'P', 'B', 'Si', 'H']
possible_degrees = [0, 1, 2, 3, 4, 5, 6]
possible_hydrogens = [0, 1, 2, 3, 4]
possible_implicit_valence = [0, 1, 2, 3, 4, 5, 6]


def _one_hot(value, choices):
    """Return a list of booleans marking which entries of `choices` equal `value`.

    A value that is not in `choices` produces an all-False list.
    """
    return [value == choice for choice in choices]


def _featurize_atom(atom):
    """Build the boolean feature vector for a single RDKit atom.

    The vector concatenates one-hot encodings of the element symbol, degree,
    total hydrogen count and implicit valence, followed by the aromaticity flag.
    """
    return np.array(
        _one_hot(atom.GetSymbol(), possible_atoms)
        + _one_hot(atom.GetDegree(), possible_degrees)
        + _one_hot(atom.GetTotalNumHs(), possible_hydrogens)
        + _one_hot(atom.GetImplicitValence(), possible_implicit_valence)
        + [atom.GetIsAromatic()]
    )


def _load_pdb(pdb_path):
    """Parse a PDB file into an RDKit molecule.

    Raises:
        ValueError: if RDKit returns None for `pdb_path`. Without this check
            the failure only shows up later as an AttributeError on None, with
            no hint about which file was responsible.
    """
    mol = Chem.MolFromPDBFile(pdb_path)
    if mol is None:
        raise ValueError(f'RDKit could not load a molecule from {pdb_path!r}')
    return mol


def _describe_molecule(mol):
    """Extract the per-molecule arrays stored in each datapoint.

    Returns:
        A tuple (num_atoms, positions, adjacency_matrix, one_hot_encoding):
        the atom count, the coordinates of the first conformer, the adjacency
        matrix with the identity added (so every atom is also connected to
        itself), and the stacked per-atom feature vectors.
    """
    num_atoms = mol.GetNumAtoms()
    positions = np.array(mol.GetConformers()[0].GetPositions())
    adjacency_matrix = Chem.rdmolops.GetAdjacencyMatrix(mol) + np.eye(num_atoms)
    one_hot_encoding = np.array([_featurize_atom(atom) for atom in mol.GetAtoms()])
    return num_atoms, positions, adjacency_matrix, one_hot_encoding


def _build_datapoint(key):
    """Assemble the feature dict for one key.

    Reads './wild_pdb/<key>_wild.pdb', './mutation_pdb/<key>_mutation.pdb' and
    the pickled value in './ddg/<key>', and returns a dict holding the features
    of both molecules, the ddg value and the key itself.
    """
    mol_w = _load_pdb('./wild_pdb/' + key + '_wild.pdb')
    mol_m = _load_pdb('./mutation_pdb/' + key + '_mutation.pdb')
    # NOTE(security): pickle.load can execute arbitrary code; only run this on
    # ddg files from a trusted source.
    with open('./ddg/' + key, 'rb') as f:
        ddg = pkl.load(f)

    num_atoms_m, positions_m, adjacency_matrix_m, one_hot_encoding_m = _describe_molecule(mol_m)
    num_atoms_w, positions_w, adjacency_matrix_w, one_hot_encoding_w = _describe_molecule(mol_w)

    return {
        'num_atoms_m': num_atoms_m,
        'positions_m': positions_m,
        'adjacency_matrix_m': adjacency_matrix_m,
        'one_hot_encoding_m': one_hot_encoding_m,
        'num_atoms_w': num_atoms_w,
        'positions_w': positions_w,
        'adjacency_matrix_w': adjacency_matrix_w,
        'one_hot_encoding_w': one_hot_encoding_w,
        'ddg': ddg,
        'key': key
    }


# Train and test keys go through exactly the same featurisation and end up in
# one combined list: all train datapoints first, then all test datapoints.
dataset = []
for keys in (train_keys, test_keys):
    for key in keys:
        dataset.append(_build_datapoint(key))
print(dataset[0])

{'num_atoms_m': 21, 'positions_m': array([[19.64 , 30.905,  6.993],
       [18.418, 30.207,  7.375],
       [18.624, 28.68 ,  7.307],
       [17.344, 27.855,  7.539],
       [17.585, 26.355,  7.571],
       [18.725, 25.887,  7.498],
       [16.505, 25.592,  7.678],
       [18.061, 30.623,  8.799],
       [18.933, 30.661,  9.676],
       [16.792, 30.95 ,  9.096],
       [16.387, 31.423, 10.397],
       [14.984, 32.074, 10.368],
       [16.437, 30.281, 11.379],
       [16.526, 29.125, 10.964],
       [16.427, 30.56 , 12.674],
       [16.537, 29.562, 13.745],
       [16.176, 31.903, 13.233],
       [16.392, 30.408, 15.01 ],
       [15.547, 31.573, 14.555],
       [15.471, 28.466, 13.657],
       [15.746, 27.296, 13.944]]), 'adjacency_matrix_m': array([[1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0.],
       [1., 1., 1., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0.],
       [0., 1., 1., 1., 0., 0., 0., 0., 0., 0.