Import some Python modules

# Creation of molecular embeddings for classifying compounds using Transformers

## Obtaining Data

In [1]:
import pandas as pd
import numpy as np
import requests
import io

Get the latest version of the dataset from the repository

In [2]:
url = 'https://github.com/GLambard/Molecules_Dataset_Collection/raw/master/originals/HIV.csv'

data = requests.get(url).content
df = pd.read_csv(io.StringIO(data.decode('utf-8')), index_col = 0)
df.reset_index(inplace=True)
df

Unnamed: 0,smiles,activity,HIV_active
0,CCC1=[O+][Cu-3]2([O+]=C(CC)C1)[O+]=C(CC)CC(CC)...,CI,0
1,C(=Cc1ccccc1)C1=[O+][Cu-3]2([O+]=C(C=Cc3ccccc3...,CI,0
2,CC(=O)N1c2ccccc2Sc2c1ccc1ccccc21,CI,0
3,Nc1ccc(C=Cc2ccc(N)cc2S(=O)(=O)O)c(S(=O)(=O)O)c1,CI,0
4,O=S(=O)(O)CCS(=O)(=O)O,CI,0
...,...,...,...
41122,CCC1CCC2c3c([nH]c4ccc(C)cc34)C3C(=O)N(N(C)C)C(...,CI,0
41123,Cc1ccc2[nH]c3c(c2c1)C1CCC(C(C)(C)C)CC1C1C(=O)N...,CI,0
41124,Cc1ccc(N2C(=O)C3c4[nH]c5ccccc5c4C4CCC(C(C)(C)C...,CI,0
41125,Cc1cccc(N2C(=O)C3c4[nH]c5ccccc5c4C4CCC(C(C)(C)...,CI,0


## Pre-Processing Data

Create alphabet

http://opensmiles.org/opensmiles.html

In [3]:
elements = 'H,He,Li,Be,B,C,N,O,F,Ne,Na,Mg,Al,Si,P,S,Cl,Ar,K,Ca,Sc,Ti,V,Cr,Mn,Fe,Co,Ni,Cu,Zn,Ga,Ge,As,Se,Br,Kr,Rb,Sr,Y,Zr,Nb,Mo,Tc,Ru,Rh,Pd,Ag,Cd,In,Sn,Sb,Te,I,Xe,Cs,Ba,La,Ce,Pr,Nd,Pm,Sm,Eu,Gd,Tb,Dy,Ho,Er,Tm,Yb,Lu,Hf,Ta,W,Re,Os,Ir,Pt,Au,Hg,Tl,Pb,Bi,Po,At,Rn,Fr,Ra,Ac,Th,Pa,U,Np,Pu,Am,Cm,Bk,Cf,Es,Fm,Md,No,Lr,Rf,Db,Sg,Bh,Hs,Mt,Ds,Rg,Cn,Uut,Fl,Uup,Lv,Uus,Uuo'
aromatic_atoms = 'b,c,n,o,p,s,se,as'
symbols = '[,],(,),=,+,-,#,:,@,.,%'
isotopes = '0,1,2,3,4,5,6,7,8,9'
other = 'te'

elements = str(elements).split(',')
aromatic_atoms = str(aromatic_atoms).split(',')
symbols = str(symbols).split(',')
isotopes = str(isotopes).split(',')
other = str(other).split(',')

alphabet = elements + aromatic_atoms + symbols + isotopes + other

Process smile

In [4]:
def process_smile(smile):
  units = []
  i = 0;
  found = False;
  while i < len(smile):
    if len(smile[i:]) >= 3:
      if smile[i:i+3] in alphabet:
        units.append(smile[i:i+3])
        i += 3
        found = True
    if len(smile[i:]) >= 2 and not found:
      if smile[i:i+2] in alphabet:
        units.append(smile[i:i+2])
        i += 2
        found = True
    if len(smile[i:]) >= 1 and not found:
      if smile[i] in alphabet:
        units.append(smile[i])
        i += 1
        found = True
    if not found:
      print('Error in value', smile[i])
      print(smile)
      break
    found = False
  result = ' '.join(units)
  return result

Process smiles

In [5]:
def process_smiles(smiles):
  processed_smiles = list()
  for i in range(len(smiles)):
      processed_smiles.append(process_smile(smiles[i]))
  return processed_smiles

In [6]:
df['processed_smiles'] = process_smiles(df['smiles'].values)

In [7]:
df

Unnamed: 0,smiles,activity,HIV_active,processed_smiles
0,CCC1=[O+][Cu-3]2([O+]=C(CC)C1)[O+]=C(CC)CC(CC)...,CI,0,C C C 1 = [ O + ] [ Cu - 3 ] 2 ( [ O + ] = C (...
1,C(=Cc1ccccc1)C1=[O+][Cu-3]2([O+]=C(C=Cc3ccccc3...,CI,0,C ( = C c 1 c c c c c 1 ) C 1 = [ O + ] [ Cu -...
2,CC(=O)N1c2ccccc2Sc2c1ccc1ccccc21,CI,0,C C ( = O ) N 1 c 2 c c c c c 2 Sc 2 c 1 c c c...
3,Nc1ccc(C=Cc2ccc(N)cc2S(=O)(=O)O)c(S(=O)(=O)O)c1,CI,0,N c 1 c c c ( C = C c 2 c c c ( N ) c c 2 S ( ...
4,O=S(=O)(O)CCS(=O)(=O)O,CI,0,O = S ( = O ) ( O ) C C S ( = O ) ( = O ) O
...,...,...,...,...
41122,CCC1CCC2c3c([nH]c4ccc(C)cc34)C3C(=O)N(N(C)C)C(...,CI,0,C C C 1 C C C 2 c 3 c ( [ n H ] c 4 c c c ( C ...
41123,Cc1ccc2[nH]c3c(c2c1)C1CCC(C(C)(C)C)CC1C1C(=O)N...,CI,0,C c 1 c c c 2 [ n H ] c 3 c ( c 2 c 1 ) C 1 C ...
41124,Cc1ccc(N2C(=O)C3c4[nH]c5ccccc5c4C4CCC(C(C)(C)C...,CI,0,C c 1 c c c ( N 2 C ( = O ) C 3 c 4 [ n H ] c ...
41125,Cc1cccc(N2C(=O)C3c4[nH]c5ccccc5c4C4CCC(C(C)(C)...,CI,0,C c 1 c c c c ( N 2 C ( = O ) C 3 c 4 [ n H ] ...


## Generate Fingerprints

In [12]:
def split_smile(smile):
  units = []
  i = 0;
  found = False;
  while i < len(smile):
    if len(smile[i:]) >= 3:
      if smile[i:i+3] in alphabet:
        units.append(smile[i:i+3])
        i += 3
        found = True
    if len(smile[i:]) >= 2 and not found:
      if smile[i:i+2] in alphabet:
        units.append(smile[i:i+2])
        i += 2
        found = True
    if len(smile[i:]) >= 1 and not found:
      if smile[i] in alphabet:
        units.append(smile[i])
        i += 1
        found = True
    if not found:
      print('Error in value', smile[i])
      print(smile)
      break
    found = False
  return units

In [13]:
smiles = df['smiles'].values

def longest_sequence(smiles):
  max = 0
  for smile in smiles:
    units = split_smile(smile)
    if len(units) > max:
      max = len(units)
  return max

sequence_length = longest_sequence(smiles)
sequence_length

575

In [14]:
! pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/d8/b2/57495b5309f09fa501866e225c84532d1fd89536ea62406b2181933fb418/transformers-4.5.1-py3-none-any.whl (2.1MB)
[K     |████████████████████████████████| 2.1MB 8.3MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/75/ee/67241dc87f266093c533a2d4d3d69438e57d7a90abb216fa076e7d475d4a/sacremoses-0.0.45-py3-none-any.whl (895kB)
[K     |████████████████████████████████| 901kB 34.2MB/s 
Collecting tokenizers<0.11,>=0.10.1
[?25l  Downloading https://files.pythonhosted.org/packages/ae/04/5b870f26a858552025a62f1649c20d29d2672c02ff3c3fb4c688ca46467a/tokenizers-0.10.2-cp37-cp37m-manylinux2010_x86_64.whl (3.3MB)
[K     |████████████████████████████████| 3.3MB 43.0MB/s 
Installing collected packages: sacremoses, tokenizers, transformers
Successfully installed sacremoses-0.0.45 tokenizers-0.10.2 transformers-4.5.1


In [15]:
from transformers import TFAutoModel, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')

tokenizer.add_tokens(alphabet)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=570.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=213450.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=435797.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=29.0, style=ProgressStyle(description_w…




72

In [17]:
def tokenize(sequence):
    tokens = tokenizer.encode_plus(sequence,                    # sequence to tokenize
                                   max_length=sequence_length,  # maximum size of the sequence
                                   truncation=True,             # truncate any sequence longer than the maximum size
                                   padding='max_length',        # allow any sequence shorter than the maximum size to be padded
                                   return_token_type_ids=False, # output token_type_ids not needed 
                                   return_tensors='tf')         # working in tensorflow
    return tokens['input_ids']

In [18]:
ids = np.zeros((len(smiles), sequence_length))

print(ids.shape)

(41127, 575)


In [19]:
for i in range(len(smiles)):
    ids[i, :] = tokenize(smiles[i])

In [20]:
df['fingerprint'] = ids.tolist()

In [21]:
#df = df.drop(columns=['fingerprints'])
df

Unnamed: 0,smiles,activity,HIV_active,processed_smiles,fingerprint
0,CCC1=[O+][Cu-3]2([O+]=C(CC)C1)[O+]=C(CC)CC(CC)...,CI,0,C C C 1 = [ O + ] [ Cu - 3 ] 2 ( [ O + ] = C (...,"[101.0, 21362.0, 1658.0, 1475.0, 134.0, 164.0,..."
1,C(=Cc1ccccc1)C1=[O+][Cu-3]2([O+]=C(C=Cc3ccccc3...,CI,0,C ( = C c 1 c c c c c 1 ) C 1 = [ O + ] [ Cu -...,"[101.0, 140.0, 113.0, 134.0, 140.0, 1665.0, 14..."
2,CC(=O)N1c2ccccc2Sc2c1ccc1ccccc21,CI,0,C C ( = O ) N 1 c 2 c c c c c 2 Sc 2 c 1 c c c...,"[101.0, 21362.0, 113.0, 134.0, 152.0, 114.0, 1..."
3,Nc1ccc(C=Cc2ccc(N)cc2S(=O)(=O)O)c(S(=O)(=O)O)c1,CI,0,N c 1 c c c ( C = C c 2 c c c ( N ) c c 2 S ( ...,"[101.0, 151.0, 1665.0, 1475.0, 19515.0, 1665.0..."
4,O=S(=O)(O)CCS(=O)(=O)O,CI,0,O = S ( = O ) ( O ) C C S ( = O ) ( = O ) O,"[101.0, 152.0, 134.0, 156.0, 113.0, 134.0, 152..."
...,...,...,...,...,...
41122,CCC1CCC2c3c([nH]c4ccc(C)cc34)C3C(=O)N(N(C)C)C(...,CI,0,C C C 1 C C C 2 c 3 c ( [ n H ] c 4 c c c ( C ...,"[101.0, 21362.0, 1658.0, 1475.0, 12096.0, 1658..."
41123,Cc1ccc2[nH]c3c(c2c1)C1CCC(C(C)(C)C)CC1C1C(=O)N...,CI,0,C c 1 c c c 2 [ n H ] c 3 c ( c 2 c 1 ) C 1 C ...,"[101.0, 140.0, 1665.0, 1475.0, 19515.0, 1665.0..."
41124,Cc1ccc(N2C(=O)C3c4[nH]c5ccccc5c4C4CCC(C(C)(C)C...,CI,0,C c 1 c c c ( N 2 C ( = O ) C 3 c 4 [ n H ] c ...,"[101.0, 140.0, 1665.0, 1475.0, 19515.0, 1665.0..."
41125,Cc1cccc(N2C(=O)C3c4[nH]c5ccccc5c4C4CCC(C(C)(C)...,CI,0,C c 1 c c c c ( N 2 C ( = O ) C 3 c 4 [ n H ] ...,"[101.0, 140.0, 1665.0, 1475.0, 19515.0, 19515...."


In [26]:
decoded = tokenizer.decode(ids[0]['input_ids']) 

IndexError: ignored

In [28]:
!pip install sentence-transformers

Collecting sentence-transformers
[?25l  Downloading https://files.pythonhosted.org/packages/c4/87/49dc49e13ac107ce912c2f3f3fd92252c6d4221e88d1e6c16747044a11d8/sentence-transformers-1.1.0.tar.gz (78kB)
[K     |████▏                           | 10kB 13.8MB/s eta 0:00:01[K     |████████▎                       | 20kB 19.1MB/s eta 0:00:01[K     |████████████▌                   | 30kB 11.4MB/s eta 0:00:01[K     |████████████████▋               | 40kB 9.6MB/s eta 0:00:01[K     |████████████████████▊           | 51kB 7.9MB/s eta 0:00:01[K     |█████████████████████████       | 61kB 8.0MB/s eta 0:00:01[K     |█████████████████████████████   | 71kB 8.6MB/s eta 0:00:01[K     |████████████████████████████████| 81kB 4.7MB/s 
Collecting sentencepiece
[?25l  Downloading https://files.pythonhosted.org/packages/f5/99/e0808cb947ba10f575839c43e8fafc9cc44e4a7a2c8f79c60db48220a577/sentencepiece-0.1.95-cp37-cp37m-manylinux2014_x86_64.whl (1.2MB)
[K     |████████████████████████████████| 1

In [246]:
from sentence_transformers import SentenceTransformer
sbert_model = SentenceTransformer('bert-base-nli-mean-tokens')

In [248]:
sentence_embeddings = model.encode(split_smile(smiles[0]))

In [251]:
sentence_embeddings

array([[-0.28823736, -0.35624063,  2.3871417 , ...,  0.45486125,
        -0.02593684,  0.3003537 ],
       [-0.28823736, -0.35624063,  2.3871417 , ...,  0.45486125,
        -0.02593684,  0.3003537 ],
       [-0.28823736, -0.35624063,  2.3871417 , ...,  0.45486125,
        -0.02593684,  0.3003537 ],
       ...,
       [ 0.01723187, -0.28072873,  2.0650592 , ..., -0.02811317,
         0.06294928,  0.3878306 ],
       [ 0.19048661, -0.13500808,  2.5141323 , ...,  0.31000856,
         0.16385144, -0.090996  ],
       [-0.16357912, -0.23144011,  1.8824396 , ...,  0.39567515,
         0.522398  ,  0.00749508]], dtype=float32)

In [252]:
model.encode(split_smile(smiles[1]))

array([[-2.88237363e-01, -3.56240630e-01,  2.38714170e+00, ...,
         4.54861253e-01, -2.59368420e-02,  3.00353706e-01],
       [ 5.89574166e-02, -2.46308133e-01,  2.68255138e+00, ...,
         3.95458311e-01,  1.74260318e-01,  1.33215576e-01],
       [-1.61529407e-01, -3.11214566e-01,  2.32731605e+00, ...,
         2.64565158e-03, -2.14386433e-02,  3.19118351e-01],
       ...,
       [ 1.11085035e-01, -8.80328789e-02,  2.53355312e+00, ...,
         3.96026582e-01,  2.15556964e-01,  4.12720675e-03],
       [-2.88237363e-01, -3.56240630e-01,  2.38714170e+00, ...,
         4.54861253e-01, -2.59368420e-02,  3.00353706e-01],
       [-1.77325666e-01,  1.48379309e-02,  1.62181509e+00, ...,
         1.46162227e-01,  4.67429757e-01,  4.91010666e-01]], dtype=float32)

## SmilesPE

https://towardsdatascience.com/tensorflow-and-transformers-df6fceaf57cc

In [281]:
!pip install SmilesPE

from SmilesPE.pretokenizer import atomwise_tokenizer

Collecting SmilesPE
  Downloading https://files.pythonhosted.org/packages/6d/f9/273f54d9d4b42779926291c82a5b3ffea30cff2492ebbe4ce08dccdcc949/SmilesPE-0.0.3-py3-none-any.whl
Installing collected packages: SmilesPE
Successfully installed SmilesPE-0.0.3


In [282]:
my_tokenized_data = []
vocabulary = set()

for smiles in df['smiles'].values:
    current_tokens = atomwise_tokenizer(smiles)
    vocabulary.update(current_tokens)
    my_tokenized_data.append(current_tokens)

In [283]:
vocabulary

{'#',
 '%10',
 '%11',
 '%12',
 '%13',
 '%14',
 '(',
 ')',
 '-',
 '.',
 '1',
 '2',
 '3',
 '4',
 '5',
 '6',
 '7',
 '8',
 '9',
 '=',
 'B',
 'Br',
 'C',
 'Cl',
 'F',
 'I',
 'N',
 'O',
 'P',
 'S',
 '[Ac]',
 '[Ag-]',
 '[Ag]',
 '[AlH3-3]',
 '[AlH3-]',
 '[Al]',
 '[As+]',
 '[AsH]',
 '[As]',
 '[Au-3]',
 '[Au-]',
 '[Au]',
 '[B+2]',
 '[B+]',
 '[B-2]',
 '[B-]',
 '[BH2-]',
 '[BH3-]',
 '[Bi+]',
 '[Bi]',
 '[Br-]',
 '[BrH+]',
 '[BrH2+]',
 '[C+]',
 '[C-]',
 '[CH+]',
 '[CH-]',
 '[CH2-]',
 '[Ca-2]',
 '[Ca-4]',
 '[CaH2]',
 '[Cl+3]',
 '[Cl-]',
 '[ClH+]',
 '[ClH2+]',
 '[Co+2]',
 '[Co-2]',
 '[Co-3]',
 '[Co-4]',
 '[Co]',
 '[Cr]',
 '[Cs+]',
 '[Cu+2]',
 '[Cu-2]',
 '[Cu-3]',
 '[Cu-4]',
 '[Cu-5]',
 '[Cu-]',
 '[Cu]',
 '[FH+]',
 '[Fe+2]',
 '[Fe+3]',
 '[Fe+]',
 '[Fe-2]',
 '[Fe-3]',
 '[Fe-4]',
 '[Fe-]',
 '[Fe]',
 '[Ga-3]',
 '[Ga-]',
 '[GaH3]',
 '[Ga]',
 '[Gd+3]',
 '[GeH2+]',
 '[Ge]',
 '[H+]',
 '[H-]',
 '[H]',
 '[Hg-2]',
 '[Hg-]',
 '[Hg]',
 '[Ho]',
 '[I+]',
 '[I-]',
 '[IH2+]',
 '[IH2]',
 '[Ir+3]',
 '[Ir+]',
 '[Ir-3]',


In [284]:
my_tokenized_data

[['C',
  'C',
  'C',
  '1',
  '=',
  '[O+]',
  '[Cu-3]',
  '2',
  '(',
  '[O+]',
  '=',
  'C',
  '(',
  'C',
  'C',
  ')',
  'C',
  '1',
  ')',
  '[O+]',
  '=',
  'C',
  '(',
  'C',
  'C',
  ')',
  'C',
  'C',
  '(',
  'C',
  'C',
  ')',
  '=',
  '[O+]',
  '2'],
 ['C',
  '(',
  '=',
  'C',
  'c',
  '1',
  'c',
  'c',
  'c',
  'c',
  'c',
  '1',
  ')',
  'C',
  '1',
  '=',
  '[O+]',
  '[Cu-3]',
  '2',
  '(',
  '[O+]',
  '=',
  'C',
  '(',
  'C',
  '=',
  'C',
  'c',
  '3',
  'c',
  'c',
  'c',
  'c',
  'c',
  '3',
  ')',
  'C',
  'C',
  '(',
  'c',
  '3',
  'c',
  'c',
  'c',
  'c',
  'c',
  '3',
  ')',
  '=',
  '[O+]',
  '2',
  ')',
  '[O+]',
  '=',
  'C',
  '(',
  'c',
  '2',
  'c',
  'c',
  'c',
  'c',
  'c',
  '2',
  ')',
  'C',
  '1'],
 ['C',
  'C',
  '(',
  '=',
  'O',
  ')',
  'N',
  '1',
  'c',
  '2',
  'c',
  'c',
  'c',
  'c',
  'c',
  '2',
  'S',
  'c',
  '2',
  'c',
  '1',
  'c',
  'c',
  'c',
  '1',
  'c',
  'c',
  'c',
  'c',
  'c',
  '2',
  '1'],
 ['N',
  'c',
  '1',
  'c

## Transformer

In [9]:
! pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/d8/b2/57495b5309f09fa501866e225c84532d1fd89536ea62406b2181933fb418/transformers-4.5.1-py3-none-any.whl (2.1MB)
[K     |████████████████████████████████| 2.1MB 8.0MB/s 
[?25hCollecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/75/ee/67241dc87f266093c533a2d4d3d69438e57d7a90abb216fa076e7d475d4a/sacremoses-0.0.45-py3-none-any.whl (895kB)
[K     |████████████████████████████████| 901kB 41.4MB/s 
Collecting tokenizers<0.11,>=0.10.1
[?25l  Downloading https://files.pythonhosted.org/packages/ae/04/5b870f26a858552025a62f1649c20d29d2672c02ff3c3fb4c688ca46467a/tokenizers-0.10.2-cp37-cp37m-manylinux2010_x86_64.whl (3.3MB)
[K     |████████████████████████████████| 3.3MB 36.0MB/s 
Installing collected packages: sacremoses, tokenizers, transformers
Successfully installed sacremoses-0.0.45 tokenizers-0.10.2 transformers-4.5.1


In [17]:
from transformers import TFAutoModel, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')
bert = TFAutoModel.from_pretrained('bert-base-cased', output_attentions=True)

Some layers from the model checkpoint at bert-base-cased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [18]:
def split_smile(smile):
  units = []
  i = 0;
  found = False;
  while i < len(smile):
    if len(smile[i:]) >= 3:
      if smile[i:i+3] in alphabet:
        units.append(smile[i:i+3])
        i += 3
        found = True
    if len(smile[i:]) >= 2 and not found:
      if smile[i:i+2] in alphabet:
        units.append(smile[i:i+2])
        i += 2
        found = True
    if len(smile[i:]) >= 1 and not found:
      if smile[i] in alphabet:
        units.append(smile[i])
        i += 1
        found = True
    if not found:
      print('Error in value', smile[i])
      print(smile)
      break
    found = False
  return units

In [19]:
smiles = df['smiles'].values

def longest_sequence(smiles):
  max = 0
  for smile in smiles:
    units = split_smile(smile)
    if len(units) > max:
      max = len(units)
  return max

sequence_length = longest_sequence(smiles)
sequence_length

575

In [20]:
def tokenize(sequence):
    tokens = tokenizer.encode_plus(sequence,                    # sequence to tokenize
                                   max_length=sequence_length,  # maximum size of the sequence
                                   truncation=True,             # truncate any sequence longer than the maximum size
                                   padding='max_length',        # allow any sequence shorter than the maximum size to be padded
                                   add_special_tokens=True,     # allow special tokens (important for BERT)
                                   return_attention_mask=True,  # output attention_mask needed
                                   return_token_type_ids=False, # output token_type_ids not needed 
                                   return_tensors='tf')         # working in tensorflow
    return tokens['input_ids'], tokens['attention_mask']

In [21]:
ids = np.zeros((len(smiles), sequence_length))
masks = np.zeros((len(smiles), sequence_length))

print(ids.shape)
print(masks.shape)

(41127, 575)
(41127, 575)


In [22]:
for i in range(len(smiles)):
    ids[i, :], masks[i, :] = tokenize(smiles[i])

In [23]:
ids = ids.astype('int32')
masks = masks.astype('int32')

In [28]:
outputs = bert.predict([[ids[0]],[masks[0]]])

In [29]:
last_layer_embeddings = outputs[0]

In [31]:
print(last_layer_embeddings.shape)
last_layer_embeddings

(575, 1, 768)


array([[[ 0.37119812,  0.36962244,  0.2562517 , ...,  0.24602276,
          0.4760258 ,  0.05093331]],

       [[ 0.36878288,  0.37182748,  0.25462052, ...,  0.24217473,
          0.47841638,  0.0531881 ]],

       [[-0.06826647, -0.29844305,  0.08319429, ...,  0.15617867,
          0.9670284 ,  0.3046805 ]],

       ...,

       [[ 0.36834744,  0.38850296,  0.26901472, ...,  0.23826051,
          0.47614712,  0.05144326]],

       [[ 0.36834744,  0.38850296,  0.26901472, ...,  0.23826051,
          0.47614712,  0.05144326]],

       [[ 0.36834744,  0.38850296,  0.26901472, ...,  0.23826051,
          0.47614712,  0.05144326]]], dtype=float32)

In [None]:
# concat last last layer values

In [None]:
#outputs = bert(ids, attention_mask=masks)

## Tests

In [59]:
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
outputs = bert(input_ids)
# embeddings of last layer
last_hidden_states = outputs[0]  
last_hidden_states[0]

<tf.Tensor: shape=(8, 768), dtype=float32, numpy=
array([[ 0.5132387 ,  0.50970536,  0.19912973, ..., -0.389992  ,
         0.40526882, -0.23153394],
       [ 0.5394626 , -0.36580864,  0.6667345 , ..., -0.39200157,
         0.25045052,  0.02019714],
       [ 0.77666277,  0.68226093,  0.71096045, ..., -0.0420047 ,
        -0.37177953,  0.37482277],
       ...,
       [ 0.35550106,  0.448573  ,  0.6175445 , ..., -0.03877984,
        -0.26307523,  0.35140657],
       [ 0.7927249 , -0.1281678 ,  0.27373925, ..., -0.5219563 ,
         0.4836444 ,  0.09373077],
       [ 1.2903223 ,  1.0355558 ,  0.5053784 , ..., -0.434378  ,
         1.197262  , -0.42358434]], dtype=float32)>