In [1]:
import numpy as np
import io
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras 
import tensorflow_datasets as tfds
from tensorflow.keras import layers
import pickle
import tensorflow_hub as hub


In [7]:
from numpy.lib.shape_base import split
def filter_train(line):
  split_line = tf.strings.split(line , ',' , maxsplit=4)
  dataset_belonging = split_line[1]
  sentiiment_catagory = split_line[2]
  return (
      True 
      if dataset_belonging == 'train' and sentiiment_catagory != 'unsup'
      else False )

In [10]:
from numpy.lib.shape_base import split
def filter_test(line):
  split_line = tf.strings.split(line , ',' , maxsplit=4)
  dataset_belonging = split_line[1]
  sentiiment_catagory = split_line[2]
  return (
      True 
      if dataset_belonging == 'test' and sentiiment_catagory != 'unsup'
      else False )

In [11]:
ds_train = tf.data.TextLineDataset("/content/drive/MyDrive/idbm/imdb.csv").filter(filter_train)
ds_test = tf.data.TextLineDataset("/content/drive/MyDrive/idbm/imdb.csv").filter(filter_test)

In [9]:
for line in ds_train.skip(1).take(5):
  print(tf.strings.split(line , ',' , maxsplit=4))

tf.Tensor(
[b'25001' b'train' b'neg' b'10000_4.txt'
 b'"Airport \'77 starts as a brand new luxury 747 plane is loaded up with valuable paintings & such belonging to rich businessman Philip Stevens (James Stewart) who is flying them & a bunch of VIP\'s to his estate in preparation of it being opened to the public as a museum, also on board is Stevens daughter Julie (Kathleen Quinlan) & her son. The luxury jetliner takes off as planned but mid-air the plane is hi-jacked by the co-pilot Chambers (Robert Foxworth) & his two accomplice\'s Banker (Monte Markham) & Wilson (Michael Pataki) who knock the passengers & crew out with sleeping gas, they plan to steal the valuable cargo & land on a disused plane strip on an isolated island but while making his descent Chambers almost hits an oil rig in the Ocean & loses control of the plane sending it crashing into the sea where it sinks to the bottom right bang in the middle of the Bermuda Triangle. With air in short supply, water leaking in & havi

In [13]:
tokenizer = tfds.deprecated.text.Tokenizer()


In [16]:
def build_vocab(ds_train , threshold = 200):
  frequencies = {}
  vocabloary = set()
  vocabloary.update(["sostoken"])
  vocabloary.update(["eostoken"])
  for line in ds_train.skip(1):
    split_line = tf.strings.split(line , ',' , maxsplit=4)
    review = split_line[4]
    tokenized_text= tokenizer.tokenize(review.numpy().lower())
    for word in tokenized_text:
      if word not in frequencies:
        frequencies[word] =1
      else :
        frequencies[word] +=1

      if frequencies[word] == threshold:
        vocabloary.update(tokenized_text)

  return vocabloary

In [17]:
vocabloary = build_vocab(ds_train)
vocab_file = open('vocabloary.obj' , 'wb')
pickle.dump(vocabloary , vocab_file)

In [None]:
vocab_file = open('vocabloary.obj' , 'rb')
vocabloary = pickle.load(vocab_file)

In [19]:
encoder = tfds.deprecated.text.TokenTextEncoder(
    list(vocabloary) , oov_token="<UNK>" , lowercase = True , tokenizer=tokenizer,
)
def my_encoder(text_tensor , label):
  encoded_text = encoder.encode(text_tensor.numpy())
  return encoded_text , label

In [22]:
def encode_map_fn(line):
  split_line = tf.strings.split(line , ',' , maxsplit=4)
  label_str = split_line[2]
  review = "sostoken" + split_line[4] + "eostoken"
  label = 1 if label_str == "pos" else 0
  (encoded_text , label) = tf.py_function(
      my_encoder, inp=[review , label] , Tout =(tf.int64 ,tf.int32)
  )
  encoded_text.set_shape([None])
  label.set_shape([])
  return encoded_text ,label

In [23]:
AUTOTUNE = tf.data.experimental.AUTOTUNE
ds_train = ds_train.map(encode_map_fn, num_parallel_calls=AUTOTUNE).cache()
ds_train = ds_train.shuffle(25000)
ds_train = ds_train.padded_batch(32, padded_shapes=([None], ()))

ds_test = ds_test.map(encode_map_fn)
ds_test = ds_test.padded_batch(32, padded_shapes=([None], ()))

In [25]:
model = keras.Sequential(
    [
        layers.Masking(mask_value=0),
        layers.Embedding(input_dim=len(vocabloary) + 2, output_dim=32,),
        layers.GlobalAveragePooling1D(),
        layers.Dense(64, activation="relu"),
        layers.Dense(1),
    ]
)

In [26]:
model.compile(
    loss=keras.losses.BinaryCrossentropy(from_logits=True),
    optimizer=keras.optimizers.Adam(3e-4, clipnorm=1),
    metrics=["accuracy"],
)

In [27]:
model.fit(ds_train, epochs=15, verbose=2)

Epoch 1/15
782/782 - 47s - loss: 0.6697 - accuracy: 0.5114 - 47s/epoch - 60ms/step
Epoch 2/15
782/782 - 13s - loss: 0.4822 - accuracy: 0.7364 - 13s/epoch - 16ms/step
Epoch 3/15
782/782 - 13s - loss: 0.3365 - accuracy: 0.8592 - 13s/epoch - 17ms/step
Epoch 4/15
782/782 - 13s - loss: 0.2739 - accuracy: 0.8899 - 13s/epoch - 17ms/step
Epoch 5/15
782/782 - 13s - loss: 0.2389 - accuracy: 0.9073 - 13s/epoch - 17ms/step
Epoch 6/15
782/782 - 13s - loss: 0.2116 - accuracy: 0.9198 - 13s/epoch - 17ms/step
Epoch 7/15
782/782 - 14s - loss: 0.1890 - accuracy: 0.9295 - 14s/epoch - 17ms/step
Epoch 8/15
782/782 - 14s - loss: 0.1709 - accuracy: 0.9374 - 14s/epoch - 18ms/step
Epoch 9/15
782/782 - 14s - loss: 0.1533 - accuracy: 0.9452 - 14s/epoch - 18ms/step
Epoch 10/15
782/782 - 14s - loss: 0.1407 - accuracy: 0.9510 - 14s/epoch - 18ms/step
Epoch 11/15
782/782 - 14s - loss: 0.1272 - accuracy: 0.9563 - 14s/epoch - 18ms/step
Epoch 12/15
782/782 - 14s - loss: 0.1169 - accuracy: 0.9606 - 14s/epoch - 18ms/step
E

<keras.callbacks.History at 0x7f516aa62ca0>

In [28]:
model.evaluate(ds_test)



[0.3401123583316803, 0.8829200267791748]