In [1]:
import nfp
# Display the installed nfp version so the recorded outputs can be tied to it.
nfp.__version__

'0.3.6'

In [2]:
# Inputs for the preprocessor call further down: a JSON file path and a SMILES
# string. Nothing is read from disk in this cell; the path is only stored.
# NOTE(review): the name `json` would shadow the stdlib `json` module if that
# were ever imported in this notebook -- consider renaming (e.g. `json_path`).
json = 'json.json'
# NOTE(review): presumably the molecule described by the JSON file -- confirm.
smiles = 'NCCCC(=O)O'

In [3]:
from nfp.preprocessing.xtb_preprocessor import xTBSmilesPreprocessor

In [4]:
def get_ring_size(obj, max_size=12):
    """Return a ring-size class for ``obj``.

    Args:
        obj: Any object exposing ``IsInRing()`` and ``IsInRingSize(n)``
            (in this notebook, the atom handed to ``atom_featurizer``).
        max_size: Exclusive upper bound on the ring sizes that are probed;
            sizes ``0 .. max_size - 1`` are tried in ascending order.

    Returns:
        ``0`` when ``obj`` is not in a ring, otherwise the first probed size
        for which ``obj.IsInRingSize(size)`` is truthy, or the string
        ``'max'`` when none of the probed sizes match.
    """
    # Guard clause: objects outside any ring are never probed for a size.
    if not obj.IsInRing():
        return 0

    # Lazily scan the candidate sizes and stop at the first match.
    matching_sizes = (
        size for size in range(max_size) if obj.IsInRingSize(size)
    )
    return next(matching_sizes, 'max')

def atom_featurizer(atom):
    """Build the string label that identifies an atom's type.

    The label is ``str()`` applied to a 5-tuple of, in order: the element
    symbol, the aromatic flag, the ring-size class from ``get_ring_size``
    (probed with ``max_size=6``), the degree, and the total hydrogen count
    (queried with ``includeNeighbors=True``). An example label is
    ``"('N', False, 0, 3, 2)"``.
    """
    symbol = atom.GetSymbol()
    aromatic = atom.GetIsAromatic()
    ring_class = get_ring_size(atom, max_size=6)
    degree = atom.GetDegree()
    hydrogen_count = atom.GetTotalNumHs(includeNeighbors=True)

    descriptor = (symbol, aromatic, ring_class, degree, hydrogen_count)
    return str(descriptor)


In [5]:
# Build the xTB SMILES preprocessor, passing atom_featurizer as the function
# that produces the atom-type label for each atom.
preprocessor = xTBSmilesPreprocessor(atom_features=atom_featurizer)

In [6]:
# Initially the atom tokenizer only knows the 'unk' class, so the preprocessor
# is called once with train=True to register the atom types it encounters.
# Only the single example molecule is processed here.
# NOTE(review): for a real training set this call would presumably be repeated
# for every molecule -- confirm against the training pipeline.
print("before pre-allocating")
print(preprocessor.atom_tokenizer._data)


input_dict = preprocessor(smiles, jsonfile=json, train=True)
    
print()
print("after pre-allocating")
print(preprocessor.atom_tokenizer._data)

before pre-allocating
{'unk': 1}

after pre-allocating
{'unk': 1, "('N', False, 0, 3, 2)": 2, "('C', False, 0, 4, 2)": 3, "('C', False, 0, 3, 0)": 4, "('O', False, 0, 1, 0)": 5, "('O', False, 0, 2, 1)": 6, "('H', False, 0, 1, 0)": 7}


In [7]:
# Atom types, as integer classes: one entry per atom (hydrogens included),
# using the class numbers from the atom tokenizer mapping printed above
input_dict['atom']

array([2, 3, 3, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7], dtype=int32)

In [8]:
# Bond types, as integer classes; in this example there is one entry per row
# of input_dict['connectivity']
input_dict['bond']

array([ 2,  3,  3,  4,  5,  6,  6,  5,  5,  6,  6,  5,  5,  6,  6,  5,  7,
        7,  8,  8,  9, 10, 10, 11, 11, 11, 11, 11, 11, 12], dtype=int32)

In [9]:
# Connectivity, as pairs of atom indices (not classes); each bond is listed in
# both directions, e.g. [0, 1] and [1, 0]
input_dict['connectivity']

array([[ 0,  1],
       [ 0,  7],
       [ 0,  8],
       [ 1,  0],
       [ 1,  2],
       [ 1,  9],
       [ 1, 10],
       [ 2,  1],
       [ 2,  3],
       [ 2, 11],
       [ 2, 12],
       [ 3,  2],
       [ 3,  4],
       [ 3, 13],
       [ 3, 14],
       [ 4,  3],
       [ 4,  5],
       [ 4,  6],
       [ 5,  4],
       [ 6,  4],
       [ 6, 15],
       [ 7,  0],
       [ 8,  0],
       [ 9,  1],
       [10,  1],
       [11,  2],
       [12,  2],
       [13,  3],
       [14,  3],
       [15,  6]])

In [10]:
# Per-atom xTB feature rows (9 float values per atom in the rows shown).
# NOTE(review): the meaning of each column is not visible here -- confirm
# against the xTB preprocessor.
input_dict['atomxtbfeatures']

array([[-0.47556, -0.96664,  1.461  ,  4.015  ,  0.     ,  0.     ,
         0.     ,  0.     ,  0.     ],
       [ 0.08509,  0.02811,  1.009  ,  2.906  ,  0.     ,  0.     ,
         0.     ,  0.     ,  0.     ],
       [-0.05684, -0.16241,  1.034  ,  3.023  ,  0.     ,  0.     ,
         0.     ,  0.     ,  0.     ],
       [-0.10887, -0.209  ,  1.049  ,  3.06   ,  0.     ,  0.     ,
         0.     ,  0.     ,  0.     ],
       [ 0.58755,  0.63844,  0.973  ,  2.439  ,  0.     ,  0.     ,
         0.     ,  0.     ,  0.     ],
       [-0.52043, -0.56624,  1.716  ,  4.805  ,  0.     ,  0.     ,
         0.     ,  0.     ,  0.     ],
       [-0.51615, -0.7004 ,  1.692  ,  4.825  ,  0.     ,  0.     ,
         0.     ,  0.     ,  0.     ],
       [ 0.21071,  0.39842,  0.789  ,  0.     ,  0.     ,  0.     ,
         0.     ,  0.     ,  0.     ],
       [ 0.21433,  0.40157,  0.786  ,  0.     ,  0.     ,  0.     ,
         0.     ,  0.     ,  0.     ],
       [ 0.02842,  0.09344,  0.972  ,

In [11]:
# Per-bond xTB feature values (float32). In this example they line up with the
# connectivity rows, and both directions of a bond carry the same value.
# NOTE(review): the physical meaning of the value is not visible here -- confirm.
input_dict['bondxtbfeatures']

array([1.0119054 , 0.9630482 , 0.9622639 , 1.0119054 , 1.0052398 ,
       0.9749128 , 0.97257215, 1.0052398 , 0.9995087 , 0.97485995,
       0.97448593, 0.9995087 , 0.95687616, 0.95483685, 0.9748943 ,
       0.95687616, 1.7624316 , 1.1797163 , 1.7624316 , 1.1797163 ,
       0.76018524, 0.9630482 , 0.9622639 , 0.9749128 , 0.97257215,
       0.97485995, 0.97448593, 0.95483685, 0.9748943 , 0.76018524],
      dtype=float32)