In [1]:
!nvidia-smi

Mon Feb 26 07:41:31 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   41C    P8              11W /  70W |      0MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [2]:
from google.colab import drive
# Mount Google Drive so files under /content/drive are readable from this Colab runtime.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras
from keras import Model
from keras import layers
from keras.utils import text_dataset_from_directory

In [4]:
# Hyperparameters shared by the data pipeline and the model.
# Samples per batch when loading the text datasets.
BATCH_SIZE = 32
# Vocabulary size cap for TextVectorization; also passed to the model as the Embedding input_dim.
MAX_TOKENS = 5000
# Passed as output_sequence_length: each review is padded/truncated to this many tokens.
MAX_LENGTH = 142

In [5]:
def load_text_dataset(directory_path: str, batch_size: int):
    '''
        Load a labelled text dataset from a directory and print a quick
        sanity check of its first batch.

        Args:
            directory_path (str):
                Path to the directory containing the text data
                (one sub-directory per class).

            batch_size (int):
                Number of samples per batch.

        Returns:
            tf.data.Dataset:
                Batched TensorFlow dataset yielding (texts, labels) pairs.
    '''
    dataset = text_dataset_from_directory(directory_path, batch_size=batch_size)

    # Peek at exactly one batch to verify shape, dtype and label encoding.
    for inputs, targets in dataset.take(1):
        print(f'Shape of inputs: {inputs.shape}')
        print(f'dtype of inputs: {inputs.dtype}')
        print(f'target[0]: {targets[0]}')

    return dataset


In [6]:
def proprocess_text_data(train_ds: tf.data.Dataset,test_ds: tf.data.Dataset,
                         max_token: int, output_mode: str, output_sequence_length: int) -> tuple[tf.data.Dataset, tf.data.Dataset]:
    '''
        Vectorize the text of a training and a test dataset.

        A TextVectorization layer is fitted on the training texts only and
        then applied to both datasets, so the test data never influences
        the vocabulary.

        Args:
            train_ds (tf.data.Dataset):
                Batched training dataset of (text, label) pairs.

            test_ds (tf.data.Dataset):
                Batched test dataset of (text, label) pairs.

            max_token (int):
                Maximum number of tokens in the vocabulary.

            output_mode (str):
                Output mode ('int' for integer-encoded tokens).

            output_sequence_length (int):
                Length every vectorized sample is padded or truncated to.

        Returns:
            tuple[tf.data.Dataset, tf.data.Dataset]:
                The vectorized (training, test) datasets.
    '''
    # Create a TextVectorization layer
    vectorizer = layers.TextVectorization(
        max_tokens=max_token,
        output_mode=output_mode,
        output_sequence_length=output_sequence_length,
    )

    # Fit the vocabulary on the training texts (labels dropped). adapt()
    # accepts an already batched dataset, so there is no need to unbatch and
    # re-batch with a module-level batch size.
    vectorizer.adapt(train_ds.map(lambda text, label: text))

    def vectorize(text, label):
        # Only the text is transformed; the label passes through unchanged.
        return vectorizer(text), label

    # Apply the adapted layer to both splits, letting tf.data pick the parallelism
    int_train_ds = train_ds.map(vectorize, num_parallel_calls=tf.data.AUTOTUNE)
    int_test_ds = test_ds.map(vectorize, num_parallel_calls=tf.data.AUTOTUNE)

    # Print the top tokens in the vocabulary (for data accuracy control)
    print(vectorizer.get_vocabulary()[:10])

    return (int_train_ds, int_test_ds)

In [7]:
def gru_model(max_token: int, train_ds=None, validation_ds=None,
              epochs: int = 10) -> keras.callbacks.History:
    '''
        Build, compile and train a GRU-based binary classification model.

        Args:
            max_token (int):
                Maximum number of tokens in the vocabulary (used as the
                Embedding input dimension).

            train_ds (tf.data.Dataset, optional):
                Vectorized training dataset. When omitted, the notebook-level
                variable `int_train_ds` is used.

            validation_ds (tf.data.Dataset, optional):
                Vectorized validation dataset. When omitted, the
                notebook-level variable `int_test_ds` is used.

            epochs (int):
                Number of training epochs.

        Returns:
            keras.callbacks.History:
                The history returned by `model.fit`; the trained model should
                be reachable through its `model` attribute.
    '''
    # Backward compatibility: earlier callers pass only `max_token` and rely
    # on the datasets being defined at notebook level.
    if train_ds is None:
        train_ds = int_train_ds
    if validation_ds is None:
        validation_ds = int_test_ds

    # Define input layer
    inputs = keras.Input(shape=(None,), dtype='int64')

    # Embedding layer
    embedded = layers.Embedding(input_dim=max_token, output_dim=128)(inputs)

    # Two stacked GRU layers; the first returns the full sequence so the
    # second can consume it.
    # NOTE(review): per the Keras GRU docs a non-zero recurrent_dropout
    # disables the cuDNN kernel, so this trains noticeably slower on GPU.
    x = layers.GRU(units=64, recurrent_dropout=0.5, return_sequences=True)(embedded)
    x = layers.GRU(units=64, recurrent_dropout=0.5)(x)
    x = layers.Dropout(0.5)(x)

    # Output layer
    outputs = layers.Dense(1, activation='sigmoid')(x)

    # Create model
    model = Model(inputs, outputs)

    # Compile the model
    model.compile(optimizer="adam",
                  loss="binary_crossentropy",
                  metrics=["accuracy"])

    model.summary()

    history = model.fit(train_ds, validation_data=validation_ds, epochs=epochs)

    return history

In [8]:
def show_results(history):
    '''
        Plot the loss and accuracy curves recorded during training.

        Args:
            history:
                Object exposing a `history` dict that maps metric names
                ('loss', 'val_loss', 'accuracy', 'val_accuracy') to one
                value per epoch, e.g. the return value of `model.fit`.
    '''
    plt.style.use('ggplot')

    metrics = history.history
    curves = (
        ('loss', 'Train Loss'),
        ('val_loss', 'Validation Loss'),
        ('accuracy', 'Accuracy'),
        ('val_accuracy', 'Validation Accuracy'),
    )

    # The x-axis is derived from the recorded values rather than from a
    # global epoch constant, so it always matches the epochs actually run.
    for key, label in curves:
        values = metrics.get(key)
        if values is None:
            # Metric was not recorded (e.g. training without validation data).
            continue
        plt.plot(np.arange(len(values)), values, label=label)

    plt.legend()
    plt.xlabel('Epoch')
    plt.ylabel('Loss/Accuracy')
    plt.title('IMDB segmentation training')

    plt.show()


In [9]:
# Locations of the IMDB train/test splits on the mounted Google Drive.
TRAIN_DS_PATH= '/content/drive/MyDrive/dataset/aclImdb/train'
TEST_DS_PATH = '/content/drive/MyDrive/dataset/aclImdb/test'

# Load each split as a batched dataset; load_text_dataset also prints one sample batch.
# NOTE(review): the recorded run reports 25021 / 25001 files; the standard aclImdb
# release is presumed to hold 25,000 labelled reviews per split — check the class
# folders for stray files (TODO confirm).
train_ds = load_text_dataset(directory_path= TRAIN_DS_PATH, batch_size= BATCH_SIZE)

test_ds = load_text_dataset(directory_path= TEST_DS_PATH, batch_size= BATCH_SIZE)

Found 25021 files belonging to 2 classes.
Shape of inputs: (32,)
dtype of inputs: <dtype: 'string'>
taeger[0]: 0
Found 25001 files belonging to 2 classes.
Shape of inputs: (32,)
dtype of inputs: <dtype: 'string'>
taeger[0]: 0


In [None]:
# Turn the raw review strings into fixed-length integer token sequences.
int_train_ds, int_test_ds = proprocess_text_data(
    train_ds=train_ds,
    test_ds=test_ds,
    max_token=MAX_TOKENS,
    output_mode='int',
    output_sequence_length=MAX_LENGTH,
)

In [None]:
# Build and train the GRU classifier; gru_model returns the History object produced by model.fit.
history = gru_model(max_token= MAX_TOKENS)