### Setup

In [2]:
import numpy as np
import pandas as pd
import os

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
keras = tf.keras

### Get the dataset

In [11]:
# Location of the reviews CSV. The REVIEWS_CSV environment variable overrides
# the default below, so the notebook can run on a machine with a different
# directory layout without editing this cell.
reviews_csv_path = os.environ.get(
    'REVIEWS_CSV',
    '/home/login/Documents/Machine_learning/Datasets/reviews/reviews.csv',
)

# The CSV column named "Unnamed: 0" is used as the row index; the remaining
# columns read by later cells are 'text' and 'sentiment'.
dataset = pd.read_csv(reviews_csv_path, index_col = "Unnamed: 0")

# Show one randomly chosen row as a quick look at the data (the row differs
# from run to run because no random_state is given).
dataset.sample()

Unnamed: 0,text,sentiment
752,Blue Ant is easy to use.,1


In [19]:
# Extract the review text and the sentiment labels as plain Python lists.
sentences, labels = (dataset[column].tolist() for column in ('text', 'sentiment'))

# Sanity check: prints whether both lists have the same number of entries.
print(len(sentences) == len(labels))

# Number of leading rows used for training (80% of the rows, truncated to int).
train_size = int(len(labels) * 0.8)

# Sequential split in file order (no shuffling): the first train_size rows
# are the training set, everything after them is the test set.
train_sentences, test_sentences = sentences[:train_size], sentences[train_size:]
train_labels, test_labels = labels[:train_size], labels[train_size:]

# Labels as NumPy arrays; these are what the training cell passes to Keras.
train_labels_final = np.array(train_labels)
test_labels_final = np.array(test_labels)
print(len(test_labels_final))

True
399


### Tokenize the data (with tweaks)

In [47]:
# Hyperparameters for tokenization, padding and the embedding layer.
vocab_size, embedding_dim, max_length = 500, 16, 50
trunc_type, padding_type = 'post', 'post'
oov_tok = "<OOV>"

# Fit the tokenizer on the training sentences only; oov_tok is the
# placeholder token given to the tokenizer for out-of-vocabulary words.
tokenizer = Tokenizer(oov_token = oov_tok, num_words = vocab_size)
tokenizer.fit_on_texts(train_sentences)

# word_index maps word -> integer id; build the inverse (id -> word) as well.
word_index = tokenizer.word_index
reverse_word_index = {index: word for word, index in word_index.items()}
print(len(reverse_word_index))

# Both splits are padded/truncated with exactly the same settings.
pad_options = dict(maxlen = max_length, truncating = trunc_type, padding = padding_type)

train_sequences = tokenizer.texts_to_sequences(train_sentences)
train_padded = pad_sequences(train_sequences, **pad_options)

test_sequences = tokenizer.texts_to_sequences(test_sentences)
test_padded = pad_sequences(test_sequences, **pad_options)

# Display the padded test matrix as the cell output.
test_padded

2834


array([[ 41, 118,   1, ...,   0,   0,   0],
       [ 43,  66,  23, ...,   0,   0,   0],
       [ 20, 364,  19, ...,   0,   0,   0],
       ...,
       [255,   4,  10, ...,   0,   0,   0],
       [  2,   1, 230, ...,   0,   0,   0],
       [125,  32,  46, ...,   0,   0,   0]], dtype=int32)

### Train a Sentiment Model (with tweaks)

In [70]:
# Small sentiment classifier: embed each token id, average the embeddings
# over the sequence, then a tiny dense head with a sigmoid output.
#
# The input shape is declared with an explicit Input layer rather than the
# Embedding `input_length` argument, which is deprecated in Keras 3. The
# model is still built at construction time, so model.summary() works.
model = keras.models.Sequential([
    # One row of max_length integer token ids per example.
    keras.layers.Input(shape = (max_length,), dtype = 'int32'),
    keras.layers.Embedding(vocab_size, embedding_dim),
    # Collapse the sequence axis by averaging the token embeddings.
    keras.layers.GlobalAveragePooling1D(),
    keras.layers.Dense(6, activation = 'relu'),
    # Single sigmoid unit to pair with the binary cross-entropy loss below.
    keras.layers.Dense(1, activation = 'sigmoid')
])

model.compile(loss = "binary_crossentropy",
              optimizer = 'adam',
              metrics = ['accuracy'])

# Print the layer-by-layer overview (output shapes and parameter counts).
model.summary()

Model: "sequential_8"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_8 (Embedding)     (None, 50, 16)            8000      
                                                                 
 global_average_pooling1d_6  (None, 16)                0         
  (GlobalAveragePooling1D)                                       
                                                                 
 dense_12 (Dense)            (None, 6)                 102       
                                                                 
 dense_13 (Dense)            (None, 1)                 7         
                                                                 
Total params: 8109 (31.68 KB)
Trainable params: 8109 (31.68 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [72]:
# Number of training epochs.
EPOCHS = 30

# Train on the padded training sequences. The test split is passed as
# validation data, so it is evaluated during training; the returned object
# is kept in `history`.
history = model.fit(
    x = train_padded,
    y = train_labels_final,
    epochs = EPOCHS,
    validation_data = (test_padded, test_labels_final),
)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
