In [None]:
import numpy as np
import pandas as pd

import tensorflow_datasets as tfds
import tensorflow as tf

In [None]:
import matplotlib.pyplot as plt


def plot_graphs(history, metric):
  """Plot a training metric per epoch and, when recorded, its validation curve.

  Args:
    history: Object exposing a `history` dict that maps metric names to
      per-epoch value sequences (e.g. the value returned by `model.fit`).
    metric: Key of the training series in `history.history`. The validation
      series is looked up under `'val_' + metric`.

  Raises:
    KeyError: If `metric` itself is missing from `history.history`.
  """
  series = history.history
  val_metric = 'val_' + metric

  plt.plot(series[metric])
  legend_labels = [metric]

  # The validation series only exists when fit() was given validation data;
  # skip it instead of failing so training-only runs can still be plotted.
  if val_metric in series:
    plt.plot(series[val_metric])
    legend_labels.append(val_metric)

  plt.xlabel("Epochs")
  plt.ylabel(metric)
  plt.legend(legend_labels)

In [None]:
# Load the pre-cleaned reviews; the `cleanText` column is consumed further down.
data = pd.read_csv("clean_Text.csv")

# Peek at seven random rows (unseeded, so the selection differs between runs).
data.sample(n=7)

Unnamed: 0.1,Unnamed: 0,review,cleanText
6407,6407,"To put it simply, this was a pompous piece of ...",put simply pompous piece canine poopie overly ...
43726,43726,"I really tried to like ""Saw."" The story was go...",really tried like saw story good admire breakn...
37682,37682,"I watch a lot of movies - DVD, features, and c...",watch lot movies dvd features classics name ni...
15008,15008,Due to this show getting cut early I never rea...,due show getting cut early never realized rece...
5078,5078,"After a humiliating experience on an airplane,...",humiliating experience airplane nashawn wade k...
35836,35836,"If this movie proves only one thing, it's that...",movie proves one thing keaton always comic hea...
23875,23875,Although I was born in the year that this movi...,although born year movie came never heard juni...


In [None]:
# Load the labelled IMDB reviews; later cells read its `review` and
# `sentiment` columns.
indexed_data = pd.read_csv("IMDB Dataset.csv")

In [None]:
# Encode the sentiment strings as integer labels. The dtype guard keeps this
# cell idempotent: mapping an already-encoded column a second time would turn
# every label into NaN, because 0/1 are not keys of the mapping.
if not pd.api.types.is_numeric_dtype(indexed_data["sentiment"]):
    indexed_data["sentiment"] = indexed_data["sentiment"].map(
        {"positive": 1, "negative": 0}
    )

# Replace the raw review text with the pre-cleaned text from `data`.
# Column assignment aligns on the index, so a mismatch between the two frames
# would silently pair reviews with the wrong labels (or insert NaN).
assert indexed_data.index.equals(data.index), (
    "clean_Text.csv and IMDB Dataset.csv must contain the same rows in the same order"
)
indexed_data["review"] = data["cleanText"]

indexed_data.sample(7)

Unnamed: 0,review,sentiment
26197,propaganda pro american war effort film came e...,0
40713,french germans sure long history liking intere...,1
19805,worst movie history cinema know trying funny s...,0
43417,movie gained much recognition appraise spinoff...,0
132,danish bladerunner boldly stated box kidding f...,0
11087,cheers wonderful fans film seen appreciate bas...,1
44500,reasonably good looking girl starring film zoe...,0


In [None]:
# One dataset element per row: (review text, sentiment label).
dataset = tf.data.Dataset.from_tensor_slices(
    tuple(indexed_data[column].values for column in ('review', 'sentiment'))
)

In [None]:
# Total number of (review, sentiment) examples.
dataset_size = len(dataset)

# The first 80% of the examples go to training; the remainder is held out.
train_size = int(dataset_size * 0.8)

In [None]:
# Display the total number of examples in the dataset.
dataset_size

50000

In [None]:
# Sequential split with no shuffling beforehand: the leading `train_size`
# examples train the model, everything after them is the test set.
train_dataset, test_dataset = dataset.take(train_size), dataset.skip(train_size)

In [None]:
# Print the first held-out element: the review text, then its label.
for i, j in test_dataset.take(1):
    print(i.numpy(), j.numpy(), sep="\n")

b'first want say lean liberal political scale found movie offensive managed watch whole doggone disgrace film movie brings low original ideas yes original thus stars instead film writers uncreative come acting horrible characters unlikeable part lead lady story good qualities made sort bad guy see maybe missed something know earth relevant character movie shell money garbage almost wish peta would come rescue awful offensive movie form protest disgusting thats say anymore'
0


In [None]:
# NOTE: both names are rebound to their batched versions, so running this
# cell a second time would batch the already-batched datasets again.

# Training pipeline: shuffle through a 10000-element buffer, group into
# batches of 64, and prefetch so input preparation overlaps with training.
train_dataset = (
    train_dataset
    .shuffle(10000)
    .batch(64)
    .prefetch(tf.data.AUTOTUNE)
)

# Evaluation pipeline: same batching and prefetching, but no shuffling.
test_dataset = (
    test_dataset
    .batch(64)
    .prefetch(tf.data.AUTOTUNE)
)

In [None]:
# Upper bound on the vocabulary size kept by the vectorization layer.
VOCAB_SIZE = 1000
encoder = tf.keras.layers.TextVectorization(max_tokens=VOCAB_SIZE)

# Learn the vocabulary from the training text only; the labels are dropped.
encoder.adapt(train_dataset.map(lambda review, sentiment: review))

In [None]:
# Vectorize the first three reviews of one explicit test batch. Pulling the
# batch here avoids depending on a loop variable left behind by an earlier
# cell, whose value changes with the order in which cells were executed.
example_texts, example_labels = next(iter(test_dataset))
encoder(example_texts)[:3].numpy()

array([[ 20,  79,  49, ...,   0,   0,   0],
       [  1,  12,   1, ...,   0,   0,   0],
       [ 66, 829, 219, ...,   0,   0,   0]], dtype=int64)

In [None]:
# Sentiment classifier: raw strings -> token ids -> embeddings -> LSTM -> dense head.
model = tf.keras.Sequential([
    encoder,
    tf.keras.layers.Embedding(
        input_dim=len(encoder.get_vocabulary()),
        output_dim=64,
        # Use masking to handle the variable sequence lengths
        mask_zero=True),
    tf.keras.layers.LSTM(64),
    tf.keras.layers.Dense(64, activation='relu'),
    # Single unit without activation: one raw logit per review. The labels are
    # a single 0/1 value and the loss is configured with from_logits=True, so
    # the network must not end on the 64-wide LSTM state.
    tf.keras.layers.Dense(1),
])

In [None]:
# Binary cross-entropy computed on raw logits, optimized with Adam at 1e-4.
# NOTE(review): the 'accuracy' string is resolved by Keras; with logit outputs,
# confirm which decision threshold the resolved metric applies.
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4),
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
    metrics=['accuracy'],
)

In [None]:
# Train for two epochs. `validation_steps=30` caps validation at 30 batches of
# `test_dataset` per epoch, so val_* figures cover only part of the test set.
history = model.fit(
    train_dataset,
    validation_data=test_dataset,
    validation_steps=30,
    epochs=2,
)

Epoch 1/2
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m137s[0m 214ms/step - accuracy: 0.5583 - loss: 0.6524 - val_accuracy: 0.8292 - val_loss: 0.4191
Epoch 2/2
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m172s[0m 275ms/step - accuracy: 0.8427 - loss: 0.3739 - val_accuracy: 0.8391 - val_loss: 0.3493
