In [1]:
import tensorflow

In [2]:
from tensorflow.keras.preprocessing.text import Tokenizer

In [3]:
# Toy corpus: three short sentences used to demonstrate the Keras Tokenizer.
sample = [
    'Hell there buddy',
    'I am the great Saurabh Vishwakarma',
    'Also known as  Sam or Simon Riley Ghost',
]

In [4]:
# Word-level tokenizer with a 1000-word budget (num_words).
# NOTE(review): presumably this budget also fixes the width of the matrix
# produced by sequences_to_matrix later on - confirm against the Keras docs.
tokeniser = Tokenizer(num_words=1000)

In [5]:
# Learn the vocabulary (word -> integer index) from the sample sentences.
tokeniser.fit_on_texts(sample)

In [6]:
# Replace every word by its integer index; the result has one list per sentence.
sequence = tokeniser.texts_to_sequences(sample)

In [7]:
# Three lists of word indices, in the same order as the input sentences.
print(sequence)

[[1, 2, 3], [4, 5, 6, 7, 8, 9], [10, 11, 12, 13, 14, 15, 16, 17]]


In [8]:
# One row per sentence, marking which word indices occur in it. Despite the
# variable name this is a multi-hot (bag-of-words) vector per sentence, not a
# one-hot vector per word: the printed row 0 shows ones in columns 1 and 2,
# matching the first sentence's indices.
one_hot_encoding = tokeniser.sequences_to_matrix(sequence)

In [9]:
# NumPy elides the middle columns with "...", so most of the set bits are
# hidden in this printout.
print(one_hot_encoding)

[[0. 1. 1. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [10]:
# Dictionary from (lower-cased) word to its integer index; indices start at 1.
word_index = tokeniser.word_index

In [11]:
# 17 distinct words were found across the three sentences.
print(word_index)

{'hell': 1, 'there': 2, 'buddy': 3, 'i': 4, 'am': 5, 'the': 6, 'great': 7, 'saurabh': 8, 'vishwakarma': 9, 'also': 10, 'known': 11, 'as': 12, 'sam': 13, 'or': 14, 'simon': 15, 'riley': 16, 'ghost': 17}


In [12]:
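# Hashing trick
# Word-level one-hot encoding where each word's column is derived from a hash of the word instead of a vocabulary lookup.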

In [13]:
# Settings for the hashing-trick demo.
dimensionality = 1000  # number of hash buckets (length of each one-hot vector)
max_length = 10        # at most this many words are encoded per sentence

# Sentences to encode (this rebinds the earlier `sample` list).
sample = [
    'The cat sat on the mat',
    'The dog ate my home work',
]

In [14]:
import numpy as np

In [16]:
# All-zero tensor indexed as (sentence, word position, hash bucket).
results = np.zeros(shape=(len(sample), max_length, dimensionality))


In [17]:
import zlib

# Hashing trick: set results[i, j, k] = 1 where k is the hash bucket of the
# j-th word of sentence i. Only the first max_length words of each sentence
# are encoded. Different words may land in the same bucket (hash collision).
for i, samples in enumerate(sample):
  for j, word in enumerate(samples.split()[:max_length]):
    # The built-in hash() of a str is salted per interpreter process, so the
    # bucket would change on every kernel restart. CRC32 of the UTF-8 bytes is
    # a non-negative integer that is stable across runs.
    index = zlib.crc32(word.encode('utf-8')) % dimensionality
    results[i,j,index] = 1


In [18]:
# The array is (2 sentences, 10 positions, 1000 buckets) with at most a single
# 1 per word position, so the elided printout shows only zeros.
print(results)

[[[0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  ...
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]]

 [[0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  ...
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]]]


In [19]:
from tensorflow.keras.datasets import imdb
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding,Dense,Flatten

In [21]:
# Settings for the IMDB sentiment example.
maxlen = 20          # number of word ids kept per review when padding
max_features = 1000  # vocabulary size handed to the loader as num_words

# Reviews arrive as lists of integer word ids, labels as 0/1 sentiment.
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz


In [22]:
# Bring every review in both splits to exactly `maxlen` word ids so the
# splits become rectangular integer arrays.
x_train, x_test = (
    pad_sequences(split, maxlen=maxlen) for split in (x_train, x_test)
)

In [23]:
# Each row is now one review as a fixed-length (maxlen) vector of word ids.
print(x_train)

[[ 65  16  38 ...  19 178  32]
 [ 23   4   2 ...  16 145  95]
 [  2  13 191 ...   7 129 113]
 ...
 [ 11   2   2 ...   4   2   2]
 [ 92 401 728 ...  12   9  23]
 [764  40   4 ... 204 131   9]]


In [25]:
from tensorflow.keras.models import Sequential

In [27]:
# Binary sentiment classifier:
#   Embedding - maps each of the `maxlen` word ids to an 8-dimensional vector
#   Flatten   - concatenates those vectors into one feature vector per review
#   Dense     - single sigmoid unit producing the positive-review probability
# The embedding's vocabulary size is tied to `max_features` (the same value
# given to imdb.load_data) instead of a second hard-coded 1000, so changing
# the vocabulary cap cannot produce word ids outside the embedding table.
network = Sequential([
    Embedding(max_features, 8, input_length=maxlen),
    Flatten(),
    Dense(units=1, activation='sigmoid'),
])
network.compile(
    loss='binary_crossentropy',
    optimizer='rmsprop',
    metrics=['accuracy'],
)

In [31]:
# Train for 10 epochs in mini-batches of 32, with 20% of the training data
# held out for validation; the returned History object is kept for inspection.
history = network.fit(
    x_train,
    y_train,
    epochs=10,
    batch_size=32,
    validation_split=0.2,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
