### Set up

In [33]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow_datasets as tfds

import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
# Convenience alias: `keras` refers to the Keras API bundled with TensorFlow (tf.keras).
keras = tf.keras

### Get the dataset

In [34]:
# Location of the reviews CSV. The original machine-specific path stays as the
# default so existing runs are unaffected; set the REVIEWS_CSV environment
# variable to point the notebook at a copy stored elsewhere.
data_path = os.environ.get(
    "REVIEWS_CSV",
    "/home/login/Documents/Machine_learning/Datasets/reviews/reviews.csv",
)

# The file's 'Unnamed: 0' column is used as the DataFrame index.
dataset = pd.read_csv(data_path, index_col = 'Unnamed: 0')

# Display one randomly chosen row as a quick look at the columns.
dataset.sample()

Unnamed: 0,text,sentiment
1511,Point your finger at any item on the menu orde...,1


In [35]:
# Pull the two columns out of the DataFrame as plain Python lists:
# the review text and its sentiment label, in matching row order.
sentences = dataset.loc[:, 'text'].tolist()
labels = dataset.loc[:, 'sentiment'].tolist()

### Create a subwords dataset

In [36]:
# Target size of the subword vocabulary.
vocab_size = 1000

# Build a subword encoder from the review texts; individual subwords are
# capped at 5 characters.
tokenizer = tfds.deprecated.text.SubwordTextEncoder.build_from_corpus(
    corpus_generator = sentences,
    target_vocab_size = vocab_size,
    max_subword_length = 5,
)

In [37]:
# Check tokenizer works properly: show one sentence, its encoded ids, and the
# subword each id decodes back to.
num = 5
# Index with `num` (not a literal) so the printed sentence is always the one
# that gets encoded below, whatever `num` is set to.
print(sentences[num])

encoded = tokenizer.encode(sentences[num])
print(encoded)

# Decode one id at a time to see the individual subword pieces.
for i in encoded:
    print(tokenizer.decode([i]))

I have to jiggle the plug to get it to line up right to get decent volume.
[4, 31, 6, 849, 162, 450, 12, 1, 600, 438, 775, 6, 175, 14, 6, 55, 213, 159, 474, 775, 6, 175, 614, 380, 295, 148, 72, 789]
I 
have 
to 
j
ig
gl
e 
the 
pl
ug
 
to 
get 
it 
to 
li
ne 
up 
right
 
to 
get 
dec
ent 
vo
lu
me
.


### Replace sentence data with encoded subwords

In [38]:
# Replace every raw sentence in `sentences` with its list of subword ids, in place.
# Only entries that are still strings are encoded, so re-running this cell after
# the list has already been converted is a no-op instead of passing lists of ids
# back into tokenizer.encode.
for i, sentence in enumerate(sentences):
    if isinstance(sentence, str):
        sentences[i] = tokenizer.encode(sentence)

[736, 168, 775, 7, 193, 417, 17, 94, 6, 600, 438, 775, 14, 19, 141, 19, 1, 828, 826, 775, 99, 445, 4, 174, 361, 5, 79, 227, 139, 789]


In [39]:
# Print the entry at index 1 to inspect what `sentences` currently holds.
print(sentences[1])

[625, 677, 626, 274, 380, 633, 148, 844, 789]


### Final preprocessing

In [40]:
# Every sequence is brought to exactly `max_length` ids; with 'post', both
# truncation and zero-padding are applied at the end of the sequence.
max_length = 50
trunc_type = 'post'
padding_type = 'post'

sentences_padded = pad_sequences(sentences, maxlen = max_length,
                                truncating = trunc_type, padding = padding_type)

# Positional 80/20 split: the first 80% of rows are used for training and the
# remainder for testing (rows are not shuffled here).
train_size = int(len(sentences_padded) * .8)

# Slice the padded array rather than the ragged `sentences` list, so both
# splits are fixed-width arrays of shape (n, max_length).
train_sentences = sentences_padded[:train_size]
test_sentences = sentences_padded[train_size:]
train_labels = labels[:train_size]
test_labels = labels[train_size:]

# Labels as NumPy arrays, aligned row-for-row with the sentence splits.
train_labels_final = np.array(train_labels)
test_labels_final = np.array(test_labels)