In [3]:
from tensorflow.keras.preprocessing.text import one_hot
import tensorflow_hub as hub
import tensorflow as tf


In [5]:
# TF Hub handles for the uncased English BERT text preprocessor and its matching encoder.
BERT_PREPROCESS_HANDLE = "https://kaggle.com/models/tensorflow/bert/frameworks/TensorFlow2/variations/en-uncased-preprocess/versions/3"
BERT_ENCODER_HANDLE = "https://kaggle.com/models/tensorflow/bert/frameworks/TensorFlow2/variations/en-uncased-l-12-h-768-a-12/versions/3"

# NOTE(review): the RuntimeError recorded in this cell's output ('CaseFoldUTF8' op
# not registered) presumably comes from loading the preprocessing model. Per the
# TF Hub model page its ops are registered by `import tensorflow_text`, which this
# notebook never imports - confirm and add that import to the environment.

# Scalar string input: one raw sentence per batch element.
text_input = tf.keras.layers.Input(shape=(), dtype=tf.string)

# Turn the raw strings into the dict of tensors the encoder consumes.
preprocessor = hub.KerasLayer(BERT_PREPROCESS_HANDLE)
encoder_inputs = preprocessor(text_input)

# trainable=True lets the BERT weights be fine-tuned along with the rest of the model.
encoder = hub.KerasLayer(BERT_ENCODER_HANDLE, trainable=True)
outputs = encoder(encoder_inputs)

pooled_output = outputs["pooled_output"]      # [batch_size, 768].
sequence_output = outputs["sequence_output"]  # presumably one 768-d vector per token - confirm against the model page

RuntimeError: Op type not registered 'CaseFoldUTF8' in binary running on DELL. Make sure the Op and Kernel are registered in the binary running in this process. Note that if you are loading a saved graph which used ops from tf.contrib (e.g. `tf.contrib.resampler`), accessing should be done before importing the graph, as contrib ops are lazily registered when the module is first accessed.

<h3>Making our own Word Embeddings</h3>

In [15]:
# Toy corpus: seven short sentences used to demonstrate word embeddings.
sent = [
    'the glass of milk',
    'the glass of juice',
    'the cup of tea',
    'I am a good boy',
    'I am a good developer',
    'understand the meaning of words',
    'your videos are good',
]

In [16]:
# Count the whitespace-separated words of every sentence; the largest count
# is the length of the longest sentence in the corpus.
words_per_sentence = [len(sentence.split()) for sentence in sent]
max_length = max(words_per_sentence)

print("Length of the longest statement:", max_length)

Length of the longest statement: 5


In [17]:
# Vocabulary size: upper bound handed to one_hot() for the word indices and
# the first (input dimension) argument of the Embedding layer.
voc_size = 500

In [18]:
# Hash every word of each sentence to an integer index bounded by voc_size,
# producing one list of word indices per sentence.
# NOTE: per the Keras docs, one_hot() hashes with Python's built-in hash, so two
# words may share an index and the indices are not stable across interpreter runs.
onehot_repr = [one_hot(sentence, voc_size) for sentence in sent]
print(onehot_repr)

[[331, 227, 115, 334], [331, 227, 115, 371], [331, 223, 115, 149], [462, 250, 239, 127, 392], [462, 250, 239, 127, 6], [281, 331, 470, 115, 234], [157, 56, 261, 127]]


In [19]:
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential

In [20]:
import numpy as np

Here we pad every sentence to 8 columns: zeros are added in front of (pre-padding) those statements whose length is less than 8.

In [21]:
# Pre-padding: zeros are inserted at the front of each index sequence so that
# every row ends up with exactly sent_length entries.
sent_length = 8
embedded_docs = pad_sequences(onehot_repr, maxlen=sent_length, padding='pre')

print(embedded_docs)

[[  0   0   0   0 331 227 115 334]
 [  0   0   0   0 331 227 115 371]
 [  0   0   0   0 331 223 115 149]
 [  0   0   0 462 250 239 127 392]
 [  0   0   0 462 250 239 127   6]
 [  0   0   0 281 331 470 115 234]
 [  0   0   0   0 157  56 261 127]]


In [22]:
# Number of feature dimensions for the embedding vectors (10).
dim = 10

In [23]:
# Single-layer model: looks up a dense `dim`-feature vector for each of the
# sent_length word indices in a padded sentence.
# `dim` (defined in the cell above) replaces the repeated literal 10, so the
# embedding width is configured in exactly one place.
model = Sequential()
model.add(Embedding(voc_size, dim, input_length=sent_length))
# Adam optimizer with mean-squared-error loss.
model.compile(optimizer='adam', loss='mse')

In [24]:
# Print the model's layers with their output shapes and parameter counts.
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 8, 10)             5000      
                                                                 
Total params: 5000 (19.53 KB)
Trainable params: 5000 (19.53 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [25]:
# predict() works on batches. Slicing with [:1] keeps the batch axis, giving
# shape (1, sent_length); passing the bare 1-D row embedded_docs[0] would make
# Keras treat its sent_length token ids as separate one-token samples.
# Indexing the result with [0] yields the (sent_length, dim) embedding matrix
# of the first sentence.
model.predict(embedded_docs[:1])[0]



array([[-0.00329955, -0.01746704,  0.04325211,  0.04216815, -0.01387668,
        -0.00745512,  0.0273121 ,  0.02661324, -0.00896993, -0.01448248],
       [-0.00329955, -0.01746704,  0.04325211,  0.04216815, -0.01387668,
        -0.00745512,  0.0273121 ,  0.02661324, -0.00896993, -0.01448248],
       [-0.00329955, -0.01746704,  0.04325211,  0.04216815, -0.01387668,
        -0.00745512,  0.0273121 ,  0.02661324, -0.00896993, -0.01448248],
       [-0.00329955, -0.01746704,  0.04325211,  0.04216815, -0.01387668,
        -0.00745512,  0.0273121 ,  0.02661324, -0.00896993, -0.01448248],
       [-0.00650359,  0.01464744, -0.01151357,  0.03610105,  0.00290599,
        -0.01979535,  0.00420054,  0.03637863,  0.0370076 ,  0.03646285],
       [-0.04832684, -0.01159501,  0.04414817,  0.01672225,  0.00789487,
        -0.04383813, -0.04101064, -0.04961465, -0.02359298,  0.0209255 ],
       [-0.01906289,  0.02909409, -0.02876822, -0.00975039, -0.03107313,
        -0.03975691, -0.03243393, -0.00101523

In [26]:
# Embed all padded sentences in one batch, then show the matrix of the first sentence.
doc_embeddings = model.predict(embedded_docs)
print(doc_embeddings[0])

[[-0.00329955 -0.01746704  0.04325211  0.04216815 -0.01387668 -0.00745512
   0.0273121   0.02661324 -0.00896993 -0.01448248]
 [-0.00329955 -0.01746704  0.04325211  0.04216815 -0.01387668 -0.00745512
   0.0273121   0.02661324 -0.00896993 -0.01448248]
 [-0.00329955 -0.01746704  0.04325211  0.04216815 -0.01387668 -0.00745512
   0.0273121   0.02661324 -0.00896993 -0.01448248]
 [-0.00329955 -0.01746704  0.04325211  0.04216815 -0.01387668 -0.00745512
   0.0273121   0.02661324 -0.00896993 -0.01448248]
 [-0.00650359  0.01464744 -0.01151357  0.03610105  0.00290599 -0.01979535
   0.00420054  0.03637863  0.0370076   0.03646285]
 [-0.04832684 -0.01159501  0.04414817  0.01672225  0.00789487 -0.04383813
  -0.04101064 -0.04961465 -0.02359298  0.0209255 ]
 [-0.01906289  0.02909409 -0.02876822 -0.00975039 -0.03107313 -0.03975691
  -0.03243393 -0.00101523 -0.03353322  0.00046428]
 [-0.03402957 -0.02362052 -0.04053323  0.03261372 -0.01740384 -0.03324772
   0.03970278  0.00373808  0.0326172  -0.00182427]]