<a href="https://colab.research.google.com/github/MaryDongsn/DL_RL/blob/master/CSI_5138_Assignment_3_IMDB_text_vectorization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import matplotlib.pyplot as plt
import os
import re
import shutil
import string
import tensorflow as tf
import numpy as np
import seaborn as sns

from tensorflow.keras import layers
from tensorflow.keras import losses
from tensorflow.keras import preprocessing
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

# Hyper-parameters shared by the cells below.
batch_size = 1_000       # batch size handed to text_dataset_from_directory
seed = 20                # seed for the shuffled train/validation split
max_features = 10_000    # vocabulary cap (TextVectorization max_tokens)
sequence_length = 500    # TextVectorization output_sequence_length
embedding_dim = 32       # not referenced in the visible cells; presumably the embedding width -- confirm


def getRawData():
  """Download the IMDB review archive and build the raw text datasets.

  The archive is fetched and extracted next to the notebook
  (``cache_dir='.'``, ``cache_subdir=''``). The ``train/unsup`` folder is
  removed before loading because ``text_dataset_from_directory`` treats every
  sub-directory of the given path as a class.

  Reads the module-level ``batch_size`` and ``seed``.

  Returns:
    A tuple ``(raw_train_ds, raw_val_ds, raw_test_ds)``. The first two are the
    80% / 20% ``training`` / ``validation`` subsets of ``aclImdb/train`` (same
    seed, so the subsets do not overlap); the third is built from
    ``aclImdb/test``.
  """
  # Download IMDB data
  url = "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"

  dataset = tf.keras.utils.get_file("aclImdb_v1.tar.gz", url,
                                    untar=True, cache_dir='.',
                                    cache_subdir='')

  # Build the dataset paths from the download location so that the folder we
  # clean up and the folders we load from are guaranteed to be the same ones.
  dataset_dir = os.path.join(os.path.dirname(dataset), 'aclImdb')
  train_dir = os.path.join(dataset_dir, 'train')
  test_dir = os.path.join(dataset_dir, 'test')

  # Only delete `unsup` when it is actually there: an unguarded rmtree raises
  # FileNotFoundError if the folder was already removed by a previous run.
  remove_dir = os.path.join(train_dir, 'unsup')
  if os.path.isdir(remove_dir):
    shutil.rmtree(remove_dir)

  raw_train_ds = tf.keras.preprocessing.text_dataset_from_directory(
      train_dir,
      batch_size=batch_size,
      validation_split=0.2,
      subset='training',
      seed=seed)

  # Same split fraction and seed as above, so this is the complementary 20%.
  raw_val_ds = tf.keras.preprocessing.text_dataset_from_directory(
      train_dir,
      batch_size=batch_size,
      validation_split=0.2,
      subset='validation',
      seed=seed)

  raw_test_ds = tf.keras.preprocessing.text_dataset_from_directory(
      test_dir,
      batch_size=batch_size)

  return raw_train_ds, raw_val_ds, raw_test_ds

def custom_standardization(input_data):
  """Normalise raw review text before it is tokenised.

  Lower-cases ``input_data``, replaces every literal ``<br />`` with a single
  space, then deletes every character listed in ``string.punctuation``.

  Args:
    input_data: string tensor to clean.

  Returns:
    The cleaned string tensor.
  """
  # Character class matching any single punctuation character.
  punctuation_class = '[%s]' % re.escape(string.punctuation)

  text = tf.strings.lower(input_data)
  text = tf.strings.regex_replace(text, '<br />', ' ')
  return tf.strings.regex_replace(text, punctuation_class, '')
  
# Build the raw (text, label) datasets once at module level; the functions
# defined below read raw_train_ds / raw_val_ds / raw_test_ds as globals.
raw_train_ds, raw_val_ds, raw_test_ds = getRawData()

def getVectorizeLayer():
  """Create a ``TextVectorization`` layer adapted to the training text.

  Reads the module-level ``raw_train_ds``, ``max_features`` and
  ``sequence_length``, and uses ``custom_standardization`` as the layer's
  standardisation step.

  Returns:
    The adapted ``TextVectorization`` layer (``output_mode='int'``).
  """
  # Text-only view of the training set: drop the label from each element.
  text_only_ds = raw_train_ds.map(lambda text, _label: text)

  layer = TextVectorization(
      standardize=custom_standardization,
      max_tokens=max_features,
      output_mode='int',
      output_sequence_length=sequence_length)

  # adapt() is fed the training text only; validation/test text is not used.
  layer.adapt(text_only_ds)
  return layer
# Module-level layer instance; getVectorizeText looks this name up as a global.
vectorize_layer = getVectorizeLayer()

  


Downloading data from https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
Found 25000 files belonging to 2 classes.
Using 20000 files for training.
Found 25000 files belonging to 2 classes.
Using 5000 files for validation.
Found 25000 files belonging to 2 classes.


In [3]:
def getVectorizeText(text, label):
  """Vectorize ``text`` with the module-level ``vectorize_layer``.

  A trailing axis of size 1 is appended to ``text`` before it is passed to the
  layer. ``label`` is returned untouched so the function can be used directly
  with ``Dataset.map`` on ``(text, label)`` elements.

  Args:
    text: string tensor (the visible callers pass both a single review and
      whole batches).
    label: label tensor paired with ``text``.

  Returns:
    A ``(vectorized_text, label)`` tuple.
  """
  return vectorize_layer(tf.expand_dims(text, -1)), label

In [4]:
# Pull a single batch from the training split and keep its first example
# around for inspection.
text_batch, label_batch = next(iter(raw_train_ds))
first_review = text_batch[0]
first_label = label_batch[0]

# Show the same review three ways: raw bytes, class name, integer encoding.
print("Review:", first_review.numpy())
print("Label:", raw_train_ds.class_names[first_label])
print("Vectorized review", getVectorizeText(first_review, first_label))



Review: b'Clint Eastwood would star again as the battle-weary Detective Harry Callahan, but would also direct the fourth entry in the \'Dirty Harry\' series. \'Sudden Impact\' again like the other additions, brings its own distinguishable style and tone, but if anything it\'s probably the most similar to the original in it\'s darker and seedy moments (and bestowing a classic line "Go ahead. Make my day")\xc2\x85 but some of its humor has to been seen to believe. A bulldog\xc2\x85 named meathead that pisses and farts. Oh yeah. However an interesting fact this entry was only one in series to not have it set entirely in San Francisco.<br /><br />The story follows that of detective Callahan trying to put the pieces together of a murder where the victim was shot in the groin and then between the eyes. After getting in some trouble with office superiors and causing a stir which has some crime lord thugs after his blood. He\'s ordered to take leave, but it falls into a working one where he he

In [7]:
def getVectorizedData():
  """Apply ``getVectorizeText`` to each of the three raw datasets.

  Reads the module-level ``raw_train_ds``, ``raw_val_ds`` and ``raw_test_ds``.

  Returns:
    A tuple ``(train_ds, val_ds, test_ds)`` of mapped datasets, in that order.
  """
  raw_splits = (raw_train_ds, raw_val_ds, raw_test_ds)
  return tuple(split.map(getVectorizeText) for split in raw_splits)
# Vectorize all three raw splits and bind the results at module level.
train_ds, val_ds, test_ds = getVectorizedData()



