In [1]:
import moses

train = moses.get_dataset('train')
test = moses.get_dataset('test')
test_scaffolds = moses.get_dataset('test_scaffolds')

In [2]:
train

array(['CCCS(=O)c1ccc2[nH]c(=NC(=O)OC)[nH]c2c1',
       'CC(C)(C)C(=O)C(Oc1ccc(Cl)cc1)n1ccnc1',
       'Cc1c(Cl)cccc1Nc1ncccc1C(=O)OCC(O)CO', ...,
       'NC(=O)c1ccc2ccccc2c1Br',
       'CC(=O)Nc1cccc(-c2nc3cc(C)ccc3[nH]c2=O)c1',
       'CC(NC(=O)OC(C)(C)C)c1nc(CO)nn1Cc1ccccc1'], dtype=object)

In [4]:
from pysmiles import read_smiles
import networkx as nx

smiles = 'C1CC[13CH2]CC1C1CCCCC1'
mol = read_smiles(smiles)

# atom vector (C only)
print(mol.nodes(data='element'))
# adjacency matrix
print(nx.to_numpy_matrix(mol, weight='order'))
print(nx.adjacency_matrix(mol, weight='order').todense())

[(0, 'C'), (1, 'C'), (2, 'C'), (3, 'C'), (4, 'C'), (5, 'C'), (6, 'C'), (7, 'C'), (8, 'C'), (9, 'C'), (10, 'C'), (11, 'C')]
[[0. 1. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
 [1. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 1. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 1. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 1. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 1. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 0. 1. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 1. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 1.]
 [0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 0.]]
[[0 1 0 0 0 1 0 0 0 0 0 0]
 [1 0 1 0 0 0 0 0 0 0 0 0]
 [0 1 0 1 0 0 0 0 0 0 0 0]
 [0 0 1 0 1 0 0 0 0 0 0 0]
 [0 0 0 1 0 1 0 0 0 0 0 0]
 [1 0 0 0 1 0 1 0 0 0 0 0]
 [0 0 0 0 0 1 0 1 0 0 0 1]
 [0 0 0 0 0 0 1 0 1 0 0 0]
 [0 0 0 0 0 0 0 1 0 1 0 0]
 [0 0 0 0 0 0 0 0 1 0 1 0]
 [0 0 0 0 0 0 0 0 0 1 0 1]
 [0 0 0 0 0 0 1 0 0 0 1 0]]


In [5]:
from rdkit import Chem

In [6]:
m = Chem.MolFromSmiles('c1ccncc1')

In [11]:
len(m.ToBinary())

123

In [12]:
n = Chem.MolFromSmiles('C1CCCC2C1CCCC2')
len(n.ToBinary())

166

In [3]:
# GENTRL

import torch
import re


_atoms = ['He', 'Li', 'Be', 'Ne', 'Na', 'Mg', 'Al', 'Si', 'Cl', 'Ar',
          'Ca', 'Ti', 'Cr', 'Fe', 'Ni', 'Cu', 'Ga', 'Ge', 'As', 'Se',
          'Br', 'Kr', 'Rb', 'Sr', 'Zr', 'Nb', 'Mo', 'Tc', 'Ru', 'Rh',
          'Pd', 'Ag', 'Cd', 'Sb', 'Te', 'Xe', 'Ba', 'La', 'Ce', 'Pr',
          'Nd', 'Pm', 'Sm', 'Eu', 'Gd', 'Tb', 'Dy', 'Er', 'Tm', 'Yb',
          'Lu', 'Hf', 'Ta', 'Re', 'Ir', 'Pt', 'Au', 'Hg', 'Tl', 'Pb',
          'Bi', 'At', 'Fr', 'Ra', 'Ac', 'Th', 'Pa', 'Pu', 'Am', 'Cm',
          'Bk', 'Cf', 'Es', 'Fm', 'Md', 'Lr', 'Rf', 'Db', 'Sg', 'Mt',
          'Ds', 'Rg', 'Fl', 'Mc', 'Lv', 'Ts', 'Og']


def get_tokenizer_re(atoms):
    return re.compile('('+'|'.join(atoms)+r'|\%\d\d|.)')


_atoms_re = get_tokenizer_re(_atoms)


__i2t = {
    0: 'unused', 1: '>', 2: '<', 3: '2', 4: 'F', 5: 'Cl', 6: 'N',
    7: '[', 8: '6', 9: 'O', 10: 'c', 11: ']', 12: '#',
    13: '=', 14: '3', 15: ')', 16: '4', 17: '-', 18: 'n',
    19: 'o', 20: '5', 21: 'H', 22: '(', 23: 'C',
    24: '1', 25: 'S', 26: 's', 27: 'Br'
}


__t2i = {
    '>': 1, '<': 2, '2': 3, 'F': 4, 'Cl': 5, 'N': 6, '[': 7, '6': 8,
    'O': 9, 'c': 10, ']': 11, '#': 12, '=': 13, '3': 14, ')': 15,
    '4': 16, '-': 17, 'n': 18, 'o': 19, '5': 20, 'H': 21, '(': 22,
    'C': 23, '1': 24, 'S': 25, 's': 26, 'Br': 27
}


def smiles_tokenizer(line, atoms=None):
    """
    Tokenizes SMILES string atom-wise using regular expressions. While this
    method is fast, it may lead to some mistakes: Sn may be considered as Tin
    or as Sulfur with Nitrogen in aromatic cycle. Because of this, you should
    specify a set of two-letter atoms explicitly.
    Parameters:
         atoms: set of two-letter atoms for tokenization
    """
    if atoms is not None:
        reg = get_tokenizer_re(atoms)
    else:
        reg = _atoms_re
    return reg.split(line)[1::2]


def encode(sm_list, pad_size=50):
    """
    Encoder list of smiles to tensor of tokens
    """
    res = []
    lens = []
    for s in sm_list:
        tokens = ([1] + [__t2i[tok]
                  for tok in smiles_tokenizer(s)])[:pad_size - 1]
        lens.append(len(tokens))
        tokens += (pad_size - len(tokens)) * [2]
        res.append(tokens)

    return torch.tensor(res).long(), lens


def decode(tokens_tensor):
    """
    Decodes from tensor of tokens to list of smiles
    """

    smiles_res = []

    for i in range(tokens_tensor.shape[0]):
        cur_sm = ''
        for t in tokens_tensor[i].detach().cpu().numpy():
            if t == 2:
                break
            elif t > 2:
                cur_sm += __i2t[t]

        smiles_res.append(cur_sm)

    return smiles_res


def get_vocab_size():
    return len(__i2t)

In [7]:
a = encode(["C1CCCC2C1CCCC2", "c1ccncc1", "CC(=O)NCCC1=CNc2c1cc(OC)cc2CC(=O)NCCc1c[nH]c2ccc(OC)cc12"])

In [11]:
len(a[0][0])

50

In [37]:
from chembl_webresource_client.new_client import new_client
molecule = new_client.molecule
res = molecule.search('viagra')

In [33]:
res



In [41]:
from chembl_webresource_client.new_client import new_client
target = new_client.target
gene_name = 'CCR4'
res = target.search(gene_name)

In [48]:
from chembl_webresource_client.new_client import new_client
activities = new_client.activity
activities.filter(target_chembl_id="CHEMBL2414",
                  pchembl_value__isnull=False, standard_type="IC50")

[{'activity_comment': None, 'activity_id': 72026, 'activity_properties': [], 'assay_chembl_id': 'CHEMBL656318', 'assay_description': 'Inhibition of [125I]MDC binding to recombinant human C-C chemokine receptor type 4 (CCR4) expressed in murine pre-B cells', 'assay_type': 'B', 'bao_endpoint': 'BAO_0000190', 'bao_format': 'BAO_0000219', 'bao_label': 'cell-based format', 'canonical_smiles': 'O=C(C[C@H]1S[C@H](c2ccc(Cl)cc2Cl)N(CCC(=O)N2CCNCC2)C1=O)NCc1cccc2ccccc12', 'data_validity_comment': None, 'data_validity_description': None, 'document_chembl_id': 'CHEMBL1148225', 'document_journal': 'Bioorg. Med. Chem. Lett.', 'document_year': 2004, 'ligand_efficiency': {'bei': '9.32', 'le': '0.19', 'lle': '0.84', 'sei': '6.67'}, 'molecule_chembl_id': 'CHEMBL297145', 'molecule_pref_name': None, 'parent_molecule_chembl_id': 'CHEMBL297145', 'pchembl_value': '5.46', 'potential_duplicate': False, 'qudt_units': 'http://www.openphacts.org/units/Nanomolar', 'record_id': 68872, 'relation': '=', 'src_id': 1, 

In [50]:
activities[0].keys()

dict_keys(['activity_comment', 'activity_id', 'activity_properties', 'assay_chembl_id', 'assay_description', 'assay_type', 'bao_endpoint', 'bao_format', 'bao_label', 'canonical_smiles', 'data_validity_comment', 'data_validity_description', 'document_chembl_id', 'document_journal', 'document_year', 'ligand_efficiency', 'molecule_chembl_id', 'molecule_pref_name', 'parent_molecule_chembl_id', 'pchembl_value', 'potential_duplicate', 'qudt_units', 'record_id', 'relation', 'src_id', 'standard_flag', 'standard_relation', 'standard_text_value', 'standard_type', 'standard_units', 'standard_upper_value', 'standard_value', 'target_chembl_id', 'target_organism', 'target_pref_name', 'target_tax_id', 'text_value', 'toid', 'type', 'units', 'uo_units', 'upper_value', 'value'])

In [53]:
from pathlib import Path
import csv

DATA_DIR = Path("../data")

activities = new_client.activity
activities.filter(
    target_chembl_id="CHEMBL2414", pchembl_value__isnull=False, standard_type="IC50"
)

columns = activities[0].keys()
csv_file = DATA_DIR / "ccr4_ic50_meta.csv"
try:
    with open(csv_file, "w") as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=columns)
        writer.writeheader()
        for data in activities:
            writer.writerow(data)
except IOError:
    print("I/O error")


I/O error
