### Setup

In [44]:
import tensorflow as tf
import tensorflow_datasets as tfds
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os

from tensorflow.keras.preprocessing.sequence import pad_sequences
keras = tf.keras

### Get the dataset

In [33]:
# Location of the reviews CSV. When the REVIEWS_CSV environment variable is
# set it overrides the default, so the notebook can run on another machine
# without editing this cell; otherwise the original path is used unchanged.
data_path = os.environ.get(
    'REVIEWS_CSV',
    '/home/login/Documents/Machine_learning/Datasets/reviews/reviews.csv',
)
# Use the "Unnamed: 0" column of the file as the DataFrame index.
dataset = pd.read_csv(data_path, index_col = "Unnamed: 0")

# Values of the 'text' and 'sentiment' columns as plain Python lists.
sentences = dataset['text'].tolist()
labels = dataset['sentiment'].tolist()
len(sentences)

1992

### Create a subword tokenizer

In [34]:
# Requested size of the subword vocabulary. The encoder that gets built may
# end up with a different size, so the actual value is printed below.
vocab_size = 1000

# Learn a subword vocabulary from the review texts, limiting each subword to
# at most 5 characters.
tokenizer = tfds.deprecated.text.SubwordTextEncoder.build_from_corpus(
    corpus_generator=sentences,
    target_vocab_size=vocab_size,
    max_subword_length=5,
)
print("Vocab size is ", tokenizer.vocab_size)

Vocab size is  999


In [35]:
# Sanity-check the tokenizer: encode one sentence, then decode every id on
# its own to see which subword it stands for.
num = 5
print(sentences[num])

encoded = tokenizer.encode(sentences[num])
print(encoded)

# The loop variable is read inside the loop, so give it a real name: `_`
# conventionally marks an unused value, and in IPython assigning to it
# overrides the "last output" shortcut.
for token_id in encoded:
    print(tokenizer.decode([token_id]))

I have to jiggle the plug to get it to line up right to get decent volume.
[4, 31, 6, 849, 162, 450, 12, 1, 600, 438, 775, 6, 175, 14, 6, 55, 213, 159, 474, 775, 6, 175, 614, 380, 295, 148, 72, 789]
I 
have 
to 
j
ig
gl
e 
the 
pl
ug
 
to 
get 
it 
to 
li
ne 
up 
right
 
to 
get 
dec
ent 
vo
lu
me
.


### Replace sentence data with encoded subwords

In [36]:
# Replace each review text with its list of subword ids, updating the
# existing list object in place.
# Entries that are already lists were encoded by an earlier run of this cell
# and are kept as they are, so re-running the cell does not try to encode
# integer ids a second time.
sentences[:] = [
    sentence if isinstance(sentence, list) else tokenizer.encode(sentence)
    for sentence in sentences
]

In [37]:
# Spot-check one entry: after the encoding step it is a list of subword ids
# rather than the original text.
print(sentences[3])

[827, 187, 11, 6, 143, 63, 17, 79, 227, 594, 331, 3, 660, 26, 313, 182, 795, 415, 374, 60, 789, 820, 808, 817, 822, 825, 775, 823, 825, 822, 809, 819, 812, 820, 826, 263]


### Final Preprocessing

In [38]:
# Every sequence is forced to exactly `max_length` ids; both padding and
# truncation are applied at the end ("post") of the sequence.
max_length, padding_type, trunc_type = 50, "post", "post"

sequences_padded = pad_sequences(
    sentences,
    maxlen=max_length,
    padding=padding_type,
    truncating=trunc_type,
)

# Positional split: the first 80% of the rows are used for training and the
# remainder for testing (no shuffling is done here).
train_size = int(0.8 * len(labels))

train_sequences, test_sequences = (
    sequences_padded[:train_size],
    sequences_padded[train_size:],
)
train_labels, test_labels = labels[:train_size], labels[train_size:]

# Convert the label lists to NumPy arrays for model.fit.
train_labels_finals, test_labels_finals = (
    np.array(split) for split in (train_labels, test_labels)
)

### Create a model using Embedding

In [39]:
embedding_dim = 16

# The size requested when building the tokenizer is only a target, so the
# vocabulary that was actually built can differ from `vocab_size`. Size the
# embedding table to cover whichever is larger so that no token id can fall
# outside the table.
embedding_input_dim = max(vocab_size, tokenizer.vocab_size)

# Embed each subword id, average the embeddings over the sequence axis, then
# classify with a small dense layer and a single sigmoid output unit.
model = keras.models.Sequential([
    keras.layers.Embedding(embedding_input_dim, embedding_dim, input_length = max_length),
    keras.layers.GlobalAveragePooling1D(),
    keras.layers.Dense(6, activation = 'relu'),
    keras.layers.Dense(1, activation = 'sigmoid')
])
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 50, 16)            16000     
                                                                 
 global_average_pooling1d (  (None, 16)                0         
 GlobalAveragePooling1D)                                         
                                                                 
 dense (Dense)               (None, 6)                 102       
                                                                 
 dense_1 (Dense)             (None, 1)                 7         
                                                                 
Total params: 16109 (62.93 KB)
Trainable params: 16109 (62.93 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


### Train the model

In [43]:
# Number of full passes over the training data.
EPOCHS = 30

# Binary cross-entropy pairs with the single sigmoid output of the model;
# Adam is used with its default settings and accuracy is tracked per epoch.
model.compile(
    optimizer=keras.optimizers.Adam(),
    loss=keras.losses.BinaryCrossentropy(),
    metrics=['accuracy'],
)

# Train on the training split and evaluate on the held-out split after each
# epoch; the returned History object is kept for later plotting.
history = model.fit(
    x=train_sequences,
    y=train_labels_finals,
    validation_data=(test_sequences, test_labels_finals),
    epochs=EPOCHS,
)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


### Plot the accuracy and loss

In [None]:
def plot_graphs