# Preprocessing

In [None]:
from tensorflow.keras.layers import TextVectorization, Normalization, CenterCrop, Rescaling

In [None]:
import numpy as np
training_data = np.array([["This is the 1st sample."], ["And here's the 2nd sample."]])

In [None]:
vector = TextVectorization()
vector.adapt(training_data)

In [None]:
vector(training_data)

<tf.Tensor: shape=(2, 5), dtype=int64, numpy=
array([[4, 5, 2, 9, 3],
       [7, 6, 2, 8, 3]])>

In [None]:
binary_vectorizer = TextVectorization(output_mode = 'binary',ngrams = 2)
binary_vectorizer.adapt(training_data)
binary_vectorizer(training_data)

<tf.Tensor: shape=(2, 17), dtype=float32, numpy=
array([[0., 1., 1., 1., 1., 0., 1., 1., 1., 0., 0., 0., 0., 0., 0., 1.,
        1.],
       [0., 1., 1., 0., 0., 1., 0., 0., 0., 1., 1., 1., 1., 1., 1., 0.,
        0.]], dtype=float32)>

In [None]:
image_array = np.random.randint(0,256,size = (64,200,200,3),).astype('float')

In [None]:
normalization = Normalization()
normalization.adapt(image_array)
normalized_data = normalization(image_array)
print("Mean %.2f" % np.mean(normalized_data))
print("Var %.2f" % np.var(normalized_data))

Mean -0.00
Var 1.00


In [None]:
# Example image data, with values in the [0, 255] range
training_data = np.random.randint(0, 256, size=(64, 200, 200, 3)).astype("float32")

cropper = CenterCrop(height=150, width=150)
scaler = Rescaling(scale=1.0 / 255)

output_data = scaler(cropper(training_data))
print("shape:", output_data.shape)
print("min:", np.min(output_data))
print("max:", np.max(output_data))

shape: (64, 150, 150, 3)
min: 0.0
max: 1.0


# MOdeling

In [None]:
from tensorflow import keras

In [None]:
dense = keras.layers.Dense(units = 16)

In [None]:
dense

<keras.layers.core.dense.Dense at 0x7dee88d9f0a0>

In [None]:
input = keras.Input((None,None,3))

In [None]:
input.shape

TensorShape([None, None, None, 3])

In [None]:
# remember to crop and then scale
cropped = CenterCrop(height = 150,width = 150)(input)
rescaled = Rescaling(1.0/255)(cropped)

In [None]:
# now that our data is preprocessed build the model
x = keras.layers.Conv2D(filters=32, kernel_size=(3, 3), activation="relu")(rescaled)
x = keras.layers.MaxPooling2D(pool_size=(3, 3))(x)
x = keras.layers.Conv2D(filters=32, kernel_size=(3, 3), activation="relu")(rescaled)
x = keras.layers.MaxPooling2D(pool_size=(3, 3))(x)
x = keras.layers.Conv2D(filters=32, kernel_size=(3, 3), activation="relu")(rescaled)

# Apply global average pooling to get flat feature vectors
x = keras.layers.GlobalAveragePooling2D()(x)

In [None]:
output = keras.layers.Dense(10,activation = 'softmax')(x)
model = keras.Model(inputs = input,outputs = output)

In [None]:
model.summary()

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, None, None, 3)]   0         
                                                                 
 center_crop_6 (CenterCrop)  (None, 150, 150, 3)       0         
                                                                 
 rescaling_6 (Rescaling)     (None, 150, 150, 3)       0         
                                                                 
 conv2d_2 (Conv2D)           (None, 148, 148, 32)      896       
                                                                 
 global_average_pooling2d (G  (None, 32)               0         
 lobalAveragePooling2D)                                          
                                                                 
 dense_2 (Dense)             (None, 10)                330       
                                                           

In [None]:
ip = np.random.randint(0,256,size = (64,200,200,3)).astype('float')

In [None]:
finn = model(ip)
print(finn.shape)

(64, 10)


In [None]:
model.summary()

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, None, None, 3)]   0         
                                                                 
 center_crop_6 (CenterCrop)  (None, 150, 150, 3)       0         
                                                                 
 rescaling_6 (Rescaling)     (None, 150, 150, 3)       0         
                                                                 
 conv2d_2 (Conv2D)           (None, 148, 148, 32)      896       
                                                                 
 global_average_pooling2d (G  (None, 32)               0         
 lobalAveragePooling2D)                                          
                                                                 
 dense_2 (Dense)             (None, 10)                330       
                                                           

In [None]:
import tensorflow as tf
class CustomModel(keras.Model):
  def train_step(self,data):
    """
    data can be an array(x,y) or tf.data.Dataset(dataset)
    """
    x,y = data
    with tf.GradientTape() as tape:
      y_pred = self(x,training = True)
      # compute loss
      loss = self.compute_loss(y = y, y_pred = y_pred)

    # compute gradients
    trainable_vars = self.trainable_variables  # Model variables that can be trained
    gradients = tape.gradient(loss,trainable_vars)

    # update weights by applying gradients on trainable wts
    self.optimizer.apply_gradients(zip(gradients,trainable_vars))

    # update metrics
    for metric in self.metrics:
      if metric.name == 'loss':
        metric.update_state(loss)
      else:
        metric.update_state(y,y_pred)

    return {m.name: m.result() for m in self.metrics}


In [None]:
inputs = keras.Input(shape = (32,))
outputs = keras.layers.Dense(1)(inputs)
model = CustomModel(inputs,outputs)
model.compile(optimizer = 'adam',loss = 'mse',metrics = ['mse'])

x = np.random.random((1000,32))
y = np.random.random((1000,1))

model.fit(x,y, epochs = 3)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7dee827a4e20>

In [None]:
# suppose we want to add loss function in train_step instead of compile
#

32

## TExt Classification

In [None]:
## lib
import tensorflow as tf
import numpy as np

In [None]:
# data
!curl -O https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 80.2M  100 80.2M    0     0  14.9M      0  0:00:05  0:00:05 --:--:-- 16.7M


In [None]:
!tar -xf aclImdb_v1.tar.gz


In [None]:
!ls aclImdb

imdbEr.txt  imdb.vocab	README	test  train


In [None]:
!ls aclImdb/train

labeledBow.feat  pos	unsupBow.feat  urls_pos.txt
neg		 unsup	urls_neg.txt   urls_unsup.txt


In [None]:
!ls aclImdb/test

labeledBow.feat  neg  pos  urls_neg.txt  urls_pos.txt


In [None]:
!cat aclImdb/train/pos/11818_10.txt

Most successful comic book movies usually depend on having villains that are bigger than life, ready to jump off the screen and strangle you alive with a smile or a demented line or two of dialog. The Tim Burton Batmans had it, as did (in an even more grotesque manner) Sin City. With Dick Tracy producer/director/star Warren Beatty piles on the villains until it becomes part of the framework. Like a boisterous homage to 1930s gangster pictures- only this time meant for kids as opposed to the darker Bonnie and Clyde- Dick Tracy is filled, joyfully, with archetypes and bright, primary colors, where the criminals carry tommy guns and are formed on their faces to shape their personalities. Villains like The Stooge, Shoulders, Lips, The Brow, Mumbles, the Blank, Pruneface, Spud. Chester Gould gave the names to his characters that fit their profiles, and gave his hero a jaw that could cut glass. The film is a continuation of sight gags that are perfectly taken seriously.<br /><br />If, at the

In [None]:
!rm -r aclImdb/train/unsup
# removes unsup folder since we don't need it

In [None]:
# divide training data into training and validation set
BATCH = 32
raw_train_ds = tf.keras.utils.text_dataset_from_directory(
    'aclImdb/train',
    batch_size = BATCH,
    validation_split = 0.2,
    subset = 'training',
    seed = 199
)

raw_val_ds = tf.keras.utils.text_dataset_from_directory(
    'aclImdb/train',
    batch_size = BATCH,
    validation_split = 0.2,
    subset = 'validation',
    seed = 199
)

raw_test_ds = tf.keras.utils.text_dataset_from_directory(
    'aclImdb/test',
    batch_size = BATCH
)

print(f"Number of batches in raw_train_ds: {raw_train_ds.cardinality()}")
print(f"Number of batches in raw_val_ds: {raw_val_ds.cardinality()}")
print(f"Number of batches in raw_test_ds: {raw_test_ds.cardinality()}")

Found 25000 files belonging to 2 classes.
Using 20000 files for training.
Found 25000 files belonging to 2 classes.
Using 5000 files for validation.
Found 25000 files belonging to 2 classes.
Number of batches in raw_train_ds: 625
Number of batches in raw_val_ds: 157
Number of batches in raw_test_ds: 782


In [None]:
for text_batch,label_batch in raw_train_ds.take(1): #first batch
  for i in range(5): #iterate through 32 samples
    print(text_batch.numpy()[i])
    print(label_batch.numpy()[i])

b"I do agree with everything Calamine has said! And I don't always agree with people, but what Calamine has said is very true, it is time for the girls to move on to better roles. I would like to see them succeed very much as they were a very inspirational pair growing up and I would like to see them grow as people, actresses and in their career as well as their personal life. So producers, please give the girls a chance to develop something that goes off the tangent a bit, move them into a new direction that recognises them individually and their talents in many facets. This movie that is being commented is not too bad, but as I have seen further on in their movies, their movies stay the same of typical plot and typography. When In Rome is good for audiences of younger generation but the adults who were kids when the twins were babies want to follow the twins in their successes and so hence I think we adults would like to see them make movies of different kinds, maybe some that are li

```their thoughts.<br /><br />actually```
there are break tags present in the text and other things present in the data. so we need processing

default standardizer does not remove this tags

solution is to build a custom standardizer

In [None]:
from tensorflow.keras.layers import TextVectorization
import string
import re

In [None]:
def custom_standardization(input_data):
  lowercase = tf.strings.lower(input_data)
  stripped_html = tf.strings.regex_replace(lowercase,'<br />',' ')
  return tf.strings.regex_replace(stripped_html,f'[{re.escape(string.punctuation)}]','')

# model constants
max_features = 20000
embedding_dim = 128 #each word will be represented in 128 dimensional vector
sequence_length = 500

# we will use standardization layer to clean the data
# next comes the vectorization, in which wll map text to numbers, so op mode -> int


vectorize_layer = TextVectorization(
    max_tokens = max_features,
    standardize = custom_standardization,
    split = 'whitespace',
    output_mode = 'int',
    output_sequence_length = sequence_length,

)

# now we also have vectorization layer which will vectorize the text and we are ready to feed in the model
# consider only the text portion in train, and forget about the label for vectorization

text_ds = raw_train_ds.map(lambda text,label: text)
# let vectorization layer adapt it
vectorize_layer.adapt(text_ds)

In [None]:
# there are two options to deal with the txt data
# case 1
# first tokenize them and then feed in the model, what we did above
# case 2
# or add the tokenization layer in the model iteself and pass the text data to model itself

# text_input = tf.keras.Input(shape=(1,), dtype=tf.string, name='text')
# x = vectorize_layer(text_input)
# x = layers.Embedding(max_features + 1, embedding_dim)(x)


In [None]:
def vectorize_text(text,label):
  text = tf.expand_dims(text,-1)
  return vectorize_layer(text),label

train_ds = raw_train_ds.map(vectorize_text)
val_ds = raw_val_ds.map(vectorize_text)
test_ds = raw_test_ds.map(vectorize_text)

# Do async prefetching / buffering of the data for best performance on GPU.
train_ds = train_ds.cache().prefetch(buffer_size=10)
val_ds = val_ds.cache().prefetch(buffer_size=10)
test_ds = test_ds.cache().prefetch(buffer_size=10)

# .cache(): This method caches the data in memory or on disk,
#  depending on the available resources and the backend being used.
#  Caching is beneficial because it allows for faster data retrieval during training,

'''.prefetch(buffer_size=10):
This method prefetches data from the next batch, which means it loads the next set of data into memory while the model is training on the current batch.
This prefetching process overlaps data loading and training, reducing the idle time of the GPU or CPU during training and maximizing hardware utilization.
 The buffer_size parameter determines how many batches to prefetch ahead of time. In this case, it is set to 10, meaning that the next 10 batches will be preloaded while the current batch is being processed.'''




'.prefetch(buffer_size=10): \nThis method prefetches data from the next batch, which means it loads the next set of data into memory while the model is training on the current batch. \nThis prefetching process overlaps data loading and training, reducing the idle time of the GPU or CPU during training and maximizing hardware utilization.\n The buffer_size parameter determines how many batches to prefetch ahead of time. In this case, it is set to 10, meaning that the next 10 batches will be preloaded while the current batch is being processed.'

In [None]:
# build the model
# we have already vectorized the text, so we have ints to input
inputs = tf.keras.Input(shape = (None,),dtype = 'int64')

# EMBEDDING
# in word embeddings - words or the tokens that we pass are represented as dense vectors in continuous vector space
# this dense vector helps capture semantic relationships between words
x = tf.keras.layers.Embedding(max_features,embedding_dim)(inputs)

# DROPOUT
x = tf.keras.layers.Dropout(0.5)(x)

# convolution layer
x = tf.keras.layers.Conv1D(filters = 128,kernel_size = 7,strides = 3, padding = 'valid',activation = 'relu')(x)
# Max pooling
x = tf.keras.layers.GlobalMaxPooling1D()(x)

# Hidden layer
x = tf.keras.layers.Dense(128,activation = 'relu')(x)
x = tf.keras.layers.Dropout(0.5)(x)

# output layer
predictions = tf.keras.layers.Dense(1,activation = 'softmax',name = 'predictions')(x)

# define a model
model = tf.keras.Model(inputs,predictions)

model.compile(loss = 'binary_crossentropy',optimizer = 'adam',metrics = ['accuracy'])

model.summary()





Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_3 (InputLayer)        [(None, None)]            0         
                                                                 
 embedding_11 (Embedding)    (None, None, 50)          1000000   
                                                                 
 dropout_3 (Dropout)         (None, None, 50)          0         
                                                                 
 conv1d_2 (Conv1D)           (None, None, 128)         44928     
                                                                 
 global_max_pooling1d_2 (Glo  (None, 128)              0         
 balMaxPooling1D)                                                
                                                                 
 dense_1 (Dense)             (None, 128)               16512     
                                                             

In [None]:
model.fit(train_ds,validation_data = val_ds,epochs = 5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f5347ca7400>

In [None]:
model.evaluate(test_ds)



[0.3939269781112671, 0.5]

In [None]:
# end_to_end meaning directly input text data

inputs = tf.keras.Input(shape = (1,),dtype = 'string')
indices = vectorize_layer(inputs)
outputs = model(indices)

end_to_end_model = tf.keras.Model(inputs,outputs)
end_to_end_model.compile(loss = 'binary_crossentropy',
                         optimizer = 'adam',
                         metrics = ['accuracy'])
end_to_end_model.evaluate(raw_test_ds)



[0.3939265310764313, 0.5]