### Setup

In [27]:
import numpy as np 
import pandas as pd
import os

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Use one shared seed for both NumPy's and TensorFlow's global random generators.
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)

### Get the data

In [28]:
# Location of the reviews CSV. The original absolute path is kept as the
# default; set the REVIEWS_CSV environment variable to point the notebook at
# a copy of the file on another machine without editing this cell.
data_path = os.environ.get(
    "REVIEWS_CSV",
    '/home/login/Documents/Machine_learning/Datasets/reviews/reviews.csv',
)
data = pd.read_csv(data_path)

# Peek at two random rows to sanity-check the load.
data.sample(2)

Unnamed: 0.1,Unnamed: 0,text,sentiment
887,887,Best headset ever!!!.,1
1671,1671,Service was slow and not attentive.,0


In [29]:
# Remove the leftover "Unnamed: 0" column that came in with the CSV.
# Reassigning (rather than inplace=True) together with errors="ignore" keeps
# this cell safe to re-run: once the column is gone the drop is a no-op
# instead of raising KeyError.
data = data.drop(columns=["Unnamed: 0"], errors="ignore")
data.sample(2)

Unnamed: 0,text,sentiment
330,VERY comfortable.,1
1255,Good service very clean and inexpensive to boot!,1


#### Randomize the data

In [30]:
# Preview a shuffled copy of the row index. Nothing is assigned here, but the
# call still draws from NumPy's global RNG, so the permutation computed in the
# next cell is a different one from the array displayed below.
np.random.permutation(data.index)

array([1195, 1645, 1858, ...,  793, 1265,  685])

In [31]:
# Draw a random ordering of the row labels, then rearrange the frame to match it.
shuffled_index = np.random.permutation(data.index)
data = data.reindex(shuffled_index)
data.head()

Unnamed: 0,text,sentiment
757,The nano stated it.My son was dissapointed.,0
706,So anyone near you will hear part of your conv...,0
637,However BT headsets are currently not good for...,0
1983,Spend your money and time some place else.,0
279,Comfort for our whole family.,1


In [32]:
# Pull the review text and its sentiment label out of the frame as plain Python lists.
sentences, labels = data['text'].tolist(), data['sentiment'].tolist()

#### Split the data into train and test

In [36]:
# The first 80% of the (already shuffled) rows form the training split;
# the remainder is held out for testing.
train_size = int(len(labels) * 0.8)
print(f"Train size: {train_size}")

train_sentences, test_sentences = sentences[:train_size], sentences[train_size:]
train_labels, test_labels = labels[:train_size], labels[train_size:]

# The network is fed NumPy arrays later on, so convert the label lists now.
train_labels_final = np.asarray(train_labels)
test_labels_final = np.asarray(test_labels)

Train size: 1593


### Tokenize the data

In [69]:
# Tokenizer, padding and embedding settings used by this and later cells.
vocab_size, embedding_dim, max_length = 1000, 2, 100
trunc_type = padding_type = 'post'
oov_tok = "<OOV>"

# Fit the vocabulary on the training sentences only, so the test split
# plays no part in building the word index.
tokenizer = Tokenizer(oov_token=oov_tok, num_words=vocab_size)
tokenizer.fit_on_texts(train_sentences)
word_index = tokenizer.word_index

# Both splits are padded/truncated with exactly the same options.
pad_options = dict(maxlen=max_length, padding=padding_type, truncating=trunc_type)

train_sequences = tokenizer.texts_to_sequences(train_sentences)
train_padded = pad_sequences(train_sequences, **pad_options)

test_sequences = tokenizer.texts_to_sequences(test_sentences)
test_padded = pad_sequences(test_sequences, **pad_options)
print(test_padded)

[[ 35  64 157 ...   0   0   0]
 [  4 136  32 ...   0   0   0]
 [  4  63 175 ...   0   0   0]
 ...
 [ 35  23   1 ...   0   0   0]
 [188   0   0 ...   0   0   0]
 [  1 202  13 ...   0   0   0]]


### Review a sequence

In [70]:
# Invert the tokenizer's word -> id mapping into id -> word,
# e.g. the entry '<OOV>': 1 becomes 1: '<OOV>'.
reverse_word_index = {token_id: word for word, token_id in word_index.items()}

def decode_review(text):
    """Turn a sequence of token ids back into a space-separated string.

    Every id is looked up in the module-level ``reverse_word_index``. Ids
    that have no entry there (for example the 0 used as padding) are shown
    as '?'.
    """
    words = []
    for token_id in text:
        words.append(reverse_word_index.get(token_id, '?'))
    return " ".join(words)

# Show one training example twice: decoded from its padded token ids,
# then as the raw sentence it came from.
sample_idx = 2
print(decode_review(train_padded[sample_idx]))
print(train_sentences[sample_idx])

however bt headsets are <OOV> not good for real time games like first person <OOV> since the audio <OOV> <OOV> me up ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?
However BT headsets are currently not good for real time games like first-person shooters since the audio delay messes me up.


### Train a Basic Sentiment Model with Embeddings

In [78]:
# Reset Keras' global state before (re)building the model.
tf.keras.backend.clear_session()

# Embedding -> Flatten -> small ReLU hidden layer -> single sigmoid output.
model = tf.keras.Sequential()
model.add(
    tf.keras.layers.Embedding(
        input_dim=vocab_size,
        output_dim=embedding_dim,
        input_length=max_length,
    )
)
model.add(tf.keras.layers.Flatten())
model.add(tf.keras.layers.Dense(units=6, activation='relu'))
model.add(tf.keras.layers.Dense(units=1, activation='sigmoid'))

model.compile(
    optimizer='adam',
    loss=tf.keras.losses.BinaryCrossentropy(),
    metrics=['accuracy'],
)
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 100, 2)            2000      
                                                                 
 flatten (Flatten)           (None, 200)               0         
                                                                 
 dense (Dense)               (None, 6)                 1206      
                                                                 
 dense_1 (Dense)             (None, 1)                 7         
                                                                 
Total params: 3213 (12.55 KB)
Trainable params: 3213 (12.55 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [79]:
EPOCHS = 30

# Keep the History object that fit() returns, so the per-epoch metrics can be
# inspected or plotted afterwards. Assigning it also stops the cell from
# ending on a bare object repr.
# Note that the held-out test split doubles as the validation data here.
history = model.fit(
    train_padded,
    train_labels_final,
    epochs=EPOCHS,
    validation_data=(test_padded, test_labels_final),
)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.src.callbacks.History at 0x7fdca0d13850>