In [23]:
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit import DataStructs
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
from typing import List

from torch import nn#, optimize
from torch import optim
import torch.nn.functional as F

In [24]:
### model.py ######

class MolecularEnc(nn.Module):
    def __init__(self, dimensions):
        super(MolecularEnc, self).__init__()

        self.conv1d1 = nn.Conv1d(120, 9, kernel_size=9)
        self.conv1d2 = nn.Conv1d(9, 9, kernel_size=9)
        self.conv1d3 = nn.Conv1d(9, 10, kernel_size=11)
        self.fc0 = nn.Linear(90, 435)
        self.fc11 = nn.Linear(435, dimensions)
        self.fc12 = nn.Linear(435, dimensions)

        self.fc2 = nn.Linear(dimensions, dimensions)
        self.fc3 = nn.Linear(501, 35)

    def encode(self, x):
        h = F.relu(self.conv1d1(x))
        h = F.relu(self.conv1d2(h))
        h = F.relu(self.conv1d3(h))
        h = h.view(h.size(0), -1)
        h = F.selu(self.fc0(h))
        return self.fc11(h), self.fc12(h)

    def reparametrize(self, mu, logvar):
        if self.training:
            std = torch.exp(0.5 * logvar)
            eps = 1e-2 * torch.randn_like(std)
            w = eps.mul(std).add_(mu)
            return w
        else:
            return mu

    def decode(self, z):
        z = F.selu(self.fc2(z))
        z = z.view(z.size(0), 1, z.size(-1)).repeat(1, 120, 1)
        out, h = self.gru(z)
        out_reshape = out.contiguous().view(-1, out.size(-1))
        y0 = F.softmax(self.fc3(out_reshape), dim=1)
        y = y0.contiguous().view(out.size(0), -1, y0.size(-1))
        return y

    def forward(self, x):
        mu, logvar = self.encode(x)
        z = self.reparametrize(mu, logvar)
        return self.decode(z), mu, logvar

In [25]:
#### featurizer.py #####

# CHARSET = ['@', 'B', 'C', 'H', 'N', ' ']
CHARSET = [' ', '#', '(', ')', '+', '-', '/', '1', '2', '3', '4', '5', '6', '7',
        '8', '=', '@', 'B', 'C', 'F', 'H', 'I', 'N', 'O', 'P', 'S', '[', '\\', ']',
        'c', 'l', 'n', 'o', 'r', 's']


class OneHotFeaturizer(object):
    def __init__(self, charset=CHARSET, padlength=120):
        self.charset = CHARSET
        self.pad_length = padlength

    def featurize(self, smiles):
        return np.array([self.one_hot_encode(smi) for smi in smiles])

    def one_hot_array(self, i):
        return [int(x) for x in [ix == i for ix in range(len(self.charset))]]

    def one_hot_index(self, c):
        return self.charset.index(c)

    def pad_smi(self, smi):
        return smi.ljust(self.pad_length)

    def one_hot_encode(self, smi):
        return np.array([self.one_hot_array(self.one_hot_index(x)) for x in self.pad_smi(smi)])

    def one_hot_decode(self, z):
        z1 = []
        for i in range(len(z)):
            s = ''
            for j in range(len(z[i])):
                oh = np.argmax(z[i][j])
                s += self.charset[oh]
            z1.append([s.strip()])
        return z1

    def decode_smiles_from_index(vec):
        return ''.join(map(lambda x: CHARSET[x], vec)).strip()

In [26]:
def load_dataset(file_dg, file_smiles):
    dataset = {}
    with open(file_dg) as f_dg:
        for line in f_dg:
            w = line.split()
            name = w[0][w[0].index('_') + 1:].replace('_', ' ')
            dg = float(w[1])
            dataset[name] = {'dg': dg, 'smiles': None}

    smiles = []
    dg = []
    with open(file_smiles) as f_smiles:
        for line in f_smiles:
            w = line.split('\t')
            if len(w) == 3 and w[0] in dataset.keys():
                name = w[0]
                dataset[name]['smiles'] = w[-1].replace('\n', '')
                smiles.append(dataset[name]['smiles'])
                dg.append(dataset[name]['dg'])

    return dataset, smiles, dg

dataset, smiles, dg = load_dataset('./DatabaseOMSDrugs_scores.dat', './DatabaseOMSDrugs.dat')
#print(smiles)

In [60]:
#def sampler(epochs=100):
def loss_function(recon_x, x, mu1, logvar1):
    BCE = F.binary_cross_entropy(recon_x, x, size_average=False)
    KLD = -0.5 * torch.sum(1 + logvar1 - mu1.pow(2) - logvar1.exp())
    return BCE + KLD

    #smiles = ['C[C@@H]1CN(C(=O)c2cc(Br)cn2C)CC[C@H]1[NH3+]', 'OC(=O)[C@@H](N)C', 'CC[C@H](O1)CC[C@@]12CCCO2']

    
smiles2 = []
for i in smiles:
    s = i.ljust(120)
    if len(s) <=120:
        smiles2.append(s)
        
oh = OneHotFeaturizer()
temp = []
for i in range(len(smiles2)):
    #t = torch.from_numpy(oh.featurize([smiles[i]]))[0].float()
    t = torch.from_numpy(oh.featurize([smiles2[i]]))[0].float()
    #print(t)
    temp.append(t)

train = torch.stack(temp)
train = torch.utils.data.TensorDataset(train)
train_loader = DataLoader(train, batch_size=250, shuffle=True)

   
epochs = 100

model = MolecularEnc(3)
#optimizer = optimize.Adam(model.parameters())
optimizer = torch.optim.Adam(model.parameters())
model.train()

for epoch in range(1, epochs + 1):
    train_loss = 0
     #losses = []
    #print('in')
    data = model(batch)[0]
    optimizer.zero_grad()
    recon_batch, mu, logvar = model(data)

    loss = loss_function(recon_batch, data, mu, logvar)
    loss.backward()
    train_loss += loss
    optimizer.step()
    #losses.append(loss.detach().item())
    if batch_idx % 100 == 0:
        print(f'epoch :{epoch} \t loss: {loss:.4f}')

TypeError: conv1d() received an invalid combination of arguments - got (list, Parameter, Parameter, tuple, tuple, tuple, int), but expected one of:
 * (Tensor input, Tensor weight, Tensor bias, tuple of ints stride, tuple of ints padding, tuple of ints dilation, int groups)
      didn't match because some of the arguments have invalid types: (!list!, !Parameter!, !Parameter!, !tuple!, !tuple!, !tuple!, int)
 * (Tensor input, Tensor weight, Tensor bias, tuple of ints stride, str padding, tuple of ints dilation, int groups)
      didn't match because some of the arguments have invalid types: (!list!, !Parameter!, !Parameter!, !tuple!, !tuple!, !tuple!, int)
