<a href="https://colab.research.google.com/github/KeerthiVasan-ai/deep-learning-tools-lab/blob/main/4_Sentence_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import re
import shutil
import string

import tensorflow as tf
from tensorflow.keras import layers

In [None]:
# Source archive for the IMDB review dataset used throughout this notebook.
url = "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"

# Download into the current working directory and unpack the archive.
dataset = tf.keras.utils.get_file(
    "aclImdb_v1", url,
    untar=True, cache_dir=".", cache_subdir=""
)

dataset_dir = os.path.join(os.path.dirname(dataset), 'aclImdb')
train_dir = os.path.join(dataset_dir, "train")

# Drop the `unsup` folder from the training directory. The existence check
# keeps this cell idempotent: on a re-run the folder is already gone and an
# unconditional rmtree would raise FileNotFoundError.
unsup_dir = os.path.join(train_dir, "unsup")
if os.path.isdir(unsup_dir):
    shutil.rmtree(unsup_dir)

In [None]:
batch_size = 32
seed = 42

# Read from the directories computed in the download cell instead of
# repeating hard-coded relative paths, so the data location is defined once.
test_dir = os.path.join(dataset_dir, "test")

# The training folder is split 80/20 into training and validation subsets.
# Both calls share the same seed and validation_split so the two subsets are
# drawn from the same partition of the files.
raw_train_ds = tf.keras.utils.text_dataset_from_directory(
    train_dir,
    batch_size=batch_size,
    validation_split=0.2,
    subset='training',
    seed=seed
)
raw_val_ds = tf.keras.utils.text_dataset_from_directory(
    train_dir,
    batch_size=batch_size,
    validation_split=0.2,
    subset='validation',
    seed=seed
)
# The test folder is loaded in full (no split).
raw_test_ds = tf.keras.utils.text_dataset_from_directory(
    test_dir,
    batch_size=batch_size
)

Found 25000 files belonging to 2 classes.
Using 20000 files for training.
Found 25000 files belonging to 2 classes.
Using 5000 files for validation.
Found 25000 files belonging to 2 classes.


In [None]:
# Text-vectorization settings: the vocabulary size cap and the fixed number
# of tokens produced per review.
max_features = 10_000
sequence_length = 250

# Shared preprocessing layer mapping raw strings to integer token ids.
vectorize_layer = layers.TextVectorization(
    output_sequence_length=sequence_length,
    max_tokens=max_features,
)

In [None]:
def vectorize_text(text,label):
  """Vectorize `text` with `vectorize_layer` and pass `label` through.

  A trailing axis is appended to `text` before it is handed to the
  module-level `vectorize_layer`.

  Returns:
    A `(vectorized_text, label)` tuple; `label` is returned unchanged.
  """
  expanded = tf.expand_dims(text, axis=-1)
  token_ids = vectorize_layer(expanded)
  return token_ids, label

In [None]:
# Fit the vectorizer on the training text only; the labels are dropped first.
train_text_only = raw_train_ds.map(lambda text, _label: text)
vectorize_layer.adapt(train_text_only)

### Classification 1D CNN

In [None]:
# Apply the same vectorize -> cache -> prefetch pipeline to every split.
train_ds, val_ds, test_ds = (
    split.map(vectorize_text).cache().prefetch(tf.data.AUTOTUNE)
    for split in (raw_train_ds, raw_val_ds, raw_test_ds)
)


In [None]:
# 1D-CNN classifier over the vectorized token sequences.
# `Sequential` is referenced through `tf.keras` so this cell runs on a fresh
# kernel: the bare name `Sequential` is only imported in a later cell, which
# made this cell raise NameError under Restart & Run All.
model = tf.keras.Sequential([
    layers.Embedding(max_features+1,16),
    layers.Conv1D(8,7,activation="relu"),
    layers.GlobalAveragePooling1D(),
    layers.Dropout(0.2),
    layers.Dense(8,activation="relu"),
    # No activation: the model outputs a single raw score per example.
    layers.Dense(1)
])

model.summary()

Model: "sequential_9"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_9 (Embedding)     (None, None, 16)          160016    
                                                                 
 conv1d_2 (Conv1D)           (None, None, 8)           904       
                                                                 
 global_average_pooling1d_2  (None, 8)                 0         
  (GlobalAveragePooling1D)                                       
                                                                 
 dropout_2 (Dropout)         (None, 8)                 0         
                                                                 
 dense_18 (Dense)            (None, 8)                 72        
                                                                 
 dense_19 (Dense)            (None, 1)                 9         
                                                      

In [None]:
# The final layer of `model` is Dense(1) with no activation, so it emits raw
# logits rather than probabilities. The loss must therefore be told
# from_logits=True, and accuracy must threshold the logit at 0.0 (equivalent
# to probability 0.5). The metric keeps the name "accuracy" so the training
# logs/history keys are unchanged.
model.compile(
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
    optimizer="adam",
    metrics=[tf.keras.metrics.BinaryAccuracy(name="accuracy", threshold=0.0)]
)

In [None]:
# Train the 1D-CNN for two epochs, evaluating on the validation split.
model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=2,
)

Epoch 1/2

KeyboardInterrupt: 

### Classification Bidirectional LSTM

In [None]:
# This model vectorizes text inside the network, so the datasets keep the raw
# strings; only caching and prefetching are applied to each split.
train_ds, val_ds, test_ds = (
    split.cache().prefetch(tf.data.AUTOTUNE)
    for split in (raw_train_ds, raw_val_ds, raw_test_ds)
)


In [None]:
from tensorflow.keras.models import Sequential

In [None]:
# Stacked bidirectional-LSTM classifier that takes raw strings: the
# vectorization layer is the first layer of the model.
model1 = Sequential([
    vectorize_layer,
    layers.Embedding(
        # Embedding table sized to the vocabulary held by the vectorizer.
        input_dim = len(vectorize_layer.get_vocabulary()),
        output_dim = 64,
        mask_zero = True
    ),
    # First recurrent layer returns the full sequence so the second
    # recurrent layer has a sequence to consume.
    layers.Bidirectional(layers.LSTM(64,return_sequences=True)),
    layers.Bidirectional(layers.LSTM(32)),
    layers.Dense(64,activation="relu"),
    # No activation: the model outputs a single raw logit per example.
    layers.Dense(1)
])

# The output layer emits logits, so the loss uses from_logits=True and the
# accuracy metric thresholds at 0.0 (probability 0.5). The metric is named
# "accuracy" so history keys stay the same as with the string shortcut.
model1.compile(
    loss = tf.keras.losses.BinaryCrossentropy(from_logits=True),
    optimizer="adam",
    metrics=[tf.keras.metrics.BinaryAccuracy(name="accuracy", threshold=0.0)]
)

model1.summary()

Model: "sequential_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 text_vectorization_1 (Text  (None, 250)               0         
 Vectorization)                                                  
                                                                 
 embedding_6 (Embedding)     (None, 250, 64)           640000    
                                                                 
 bidirectional_12 (Bidirect  (None, 250, 128)          66048     
 ional)                                                          
                                                                 
 bidirectional_13 (Bidirect  (None, 64)                41216     
 ional)                                                          
                                                                 
 dense_12 (Dense)            (None, 64)                4160      
                                                      

In [None]:
# Train the recurrent model for two epochs and keep the returned history
# object for later inspection.
history = model1.fit(
    train_ds,
    validation_data=val_ds,
    epochs=2,
)

Epoch 1/2

KeyboardInterrupt: 