In [25]:
from SmilesPE.pretokenizer import atomwise_tokenizer

# Atom-level tokenization of a SMILES string: bracket atoms such as '[N+]'
# and two-letter elements such as 'Br' come out as single tokens, while ring
# digits and parentheses are tokens of their own (see the printed list).
smi = 'CC[N+](C)(C)Cc1ccccc1Br'
toks = atomwise_tokenizer(smi)
print(toks)

['C', 'C', '[N+]', '(', 'C', ')', '(', 'C', ')', 'C', 'c', '1', 'c', 'c', 'c', 'c', 'c', '1', 'Br']


In [26]:
from SmilesPE.pretokenizer import kmer_tokenizer

# k-mer tokenization of the same SMILES: with ngram=4 each output token is a
# run of four consecutive atom-level tokens, advancing one token at a time
# (the 19 atom-level tokens of this molecule yield 16 overlapping 4-mers).
smi = 'CC[N+](C)(C)Cc1ccccc1Br'
toks = kmer_tokenizer(smi, ngram=4)
print(toks)

['CC[N+](', 'C[N+](C', '[N+](C)', '(C)(', 'C)(C', ')(C)', '(C)C', 'C)Cc', ')Cc1', 'Cc1c', 'c1cc', '1ccc', 'cccc', 'cccc', 'ccc1', 'cc1Br']


In [27]:
import selfies
# Convert the SMILES string to its SELFIES representation.
smi = 'CC[N+](C)(C)Cc1ccccc1Br'
sel = selfies.encoder(smi)
print(f'SELFIES string: {sel}')

# Every SELFIES symbol is bracketed, so the atom-level tokenizer (imported in
# an earlier cell) returns exactly one token per symbol.
toks = atomwise_tokenizer(sel)
print(toks)


SELFIES string: [C][C][N+1][Branch1][C][C][Branch1][C][C][C][C][=C][C][=C][C][=C][Ring1][=Branch1][Br]
['[C]', '[C]', '[N+1]', '[Branch1]', '[C]', '[C]', '[Branch1]', '[C]', '[C]', '[C]', '[C]', '[=C]', '[C]', '[=C]', '[C]', '[=C]', '[Ring1]', '[=Branch1]', '[Br]']


In [28]:
# 4-mer tokenization of the SELFIES string `sel` produced in the previous
# cell: each token is four consecutive SELFIES symbols.
toks = kmer_tokenizer(sel, ngram=4)
print(toks)

['[C][C][N+1][Branch1]', '[C][N+1][Branch1][C]', '[N+1][Branch1][C][C]', '[Branch1][C][C][Branch1]', '[C][C][Branch1][C]', '[C][Branch1][C][C]', '[Branch1][C][C][C]', '[C][C][C][C]', '[C][C][C][=C]', '[C][C][=C][C]', '[C][=C][C][=C]', '[=C][C][=C][C]', '[C][=C][C][=C]', '[=C][C][=C][Ring1]', '[C][=C][Ring1][=Branch1]', '[=C][Ring1][=Branch1][Br]']


In [29]:
import deepsmiles
# Build a DeepSMILES converter and encode the same molecule.
# NOTE(review): rings=True / branches=True presumably enable the DeepSMILES
# notation for ring closures and for branches respectively — confirm against
# the deepsmiles documentation. In the printed result the paired parentheses
# and paired ring digits of the SMILES are gone: only ')' and a single '6'
# remain.
converter = deepsmiles.Converter(rings=True, branches=True)
smi = 'CC[N+](C)(C)Cc1ccccc1Br'
deepsmi = converter.encode(smi)
print(f'DeepSMILES string: {deepsmi}')

DeepSMILES string: CC[N+]C)C)Ccccccc6Br


In [30]:
# Atom-level tokenization of the DeepSMILES string `deepsmi` from the
# previous cell; the same tokenizer used for plain SMILES is applied as-is.
toks = atomwise_tokenizer(deepsmi)
print(toks)

['C', 'C', '[N+]', 'C', ')', 'C', ')', 'C', 'c', 'c', 'c', 'c', 'c', 'c', '6', 'Br']


In [31]:
# 4-mer tokenization of the DeepSMILES string `deepsmi`: overlapping runs of
# four consecutive atom-level tokens.
toks = kmer_tokenizer(deepsmi, ngram=4)
print(toks)

['CC[N+]C', 'C[N+]C)', '[N+]C)C', 'C)C)', ')C)C', 'C)Cc', ')Ccc', 'Cccc', 'cccc', 'cccc', 'cccc', 'ccc6', 'cc6Br']


In [32]:
import codecs
from SmilesPE.tokenizer import *

# Load the pre-trained SPE vocabulary from 'SPE_ChEMBL.txt' and build the
# tokenizer. The file handle is only needed while the tokenizer is being
# constructed, so it is opened in a `with` block and closed right after,
# instead of staying open for the lifetime of the kernel.
# NOTE(review): assumes SPE_Tokenizer reads the whole vocabulary inside its
# constructor and does not keep reading from the handle later — confirm
# against the SmilesPE source.
with codecs.open('SPE_ChEMBL.txt') as spe_vob:
    spe = SPE_Tokenizer(spe_vob)

smi = 'CC[N+](C)(C)Cc1ccccc1Br'
# Bare last expression so the notebook displays the space-separated
# SPE tokens of the molecule.
spe.tokenize(smi)

'CC [N+](C) (C)C c1ccccc1 Br'