In [1]:
import os

# Step one directory up from wherever the kernel was started.
# NOTE: the target is relative to the current directory, so running this cell
# a second time moves up one more level; restart the kernel before re-running.
os.chdir(os.pardir)

In [2]:
%pwd

'e:\\MyOnlineCourses\\ML_Projects\\arabic-digits-recognition'

In [3]:
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class DataPreparationConfig:
    """Immutable settings consumed by the data-preparation stage."""

    # Working directory of the stage.
    root_dir: Path
    # Location of the original (input) dataset file.
    origin_data_path: str
    # Location for the processed data -- presumably an output path; TODO confirm.
    processed_data_path: str


In [4]:
from src.ard.constants import *
from src.ard.utils.help import read_yaml, create_directories

class ConfigurationManager:
    """Reads the project configuration file and builds per-stage config objects."""

    def __init__(self, config_filepath=CONFIG_FILE_PATH):
        """Load the configuration and create the artifacts root directory.

        Args:
            config_filepath: Path of the configuration file handed to
                ``read_yaml``; defaults to ``CONFIG_FILE_PATH``.
        """
        self.config = read_yaml(config_filepath)

        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_data_preparation_config(self) -> DataPreparationConfig:
        """Assemble the settings for the data-preparation stage.

        The stage's ``root_dir`` is passed to ``create_directories`` before
        the config object is built.

        Returns:
            DataPreparationConfig: Values copied from the ``data_preparation``
            section of the loaded configuration.
        """
        section = self.config.data_preparation

        create_directories([section.root_dir])

        return DataPreparationConfig(
            root_dir=section.root_dir,
            origin_data_path=section.origin_data_path,
            processed_data_path=section.processed_data_path,
        )

In [9]:
import numpy as np
from sklearn.model_selection import train_test_split
#from ard.utils.common import to_categorical
from ard.utils.dataset import SeqDataset
import logging
from ard import logger
from operator import itemgetter
from tensorflow.keras.utils import to_categorical
class DataPreparation:
    """Loads the original NPZ dataset and produces train/test arrays for a model."""

    def __init__(self, config: DataPreparationConfig):
        """Keep the stage configuration and cache its two data paths.

        Args:
            config (DataPreparationConfig): Paths used by this stage.
        """
        self.config = config
        self._origin_data_path = self.config.origin_data_path
        self._processed_data_path = self.config.processed_data_path

    def load_npz_data(self, data_path):
        """
        Load a sequence dataset from an NPZ file.

        The archive must contain the arrays ``features``, ``targets``,
        ``lengths`` and ``classes``. Only sequences whose target is listed in
        ``classes`` are kept.

        Args:
            data_path (str): Path to the NPZ file.

        Returns:
            SeqDataset: Loaded dataset.

        Raises:
            Exception: Any error hit while reading or assembling the data is
                logged and then re-raised unchanged.
        """
        try:
            # NOTE(review): allow_pickle=True lets a crafted archive execute
            # arbitrary code on load; only open files from a trusted source.
            # The context manager closes the file handle that the returned
            # NpzFile would otherwise keep open.
            with np.load(data_path, allow_pickle=True) as data:
                features, targets, lengths, classes = itemgetter(
                    'features', 'targets', 'lengths', 'classes')(data)

            # Positions of the sequences whose label is one of the known classes.
            idx = np.argwhere(np.isin(targets, classes)).flatten()
            # Per-sequence index ranges into `features`, restricted to the kept sequences.
            ranges = SeqDataset._get_idxs(lengths)[idx]

            return SeqDataset(
                features=np.vstack(np.array([x for x in SeqDataset._iter_X(features, ranges)], dtype=object)),
                targets=targets[idx],
                lengths=lengths[idx],
                classes=classes)

        except Exception as e:
            logger.error(f"Error loading NPZ file: {e}")
            raise

    def prepare_data(self, split_size=0.2, shuffle=True, stratify=True):
        """
        Load the original dataset and prepare train/test arrays for a CNN model.

        Args:
            split_size (float): Forwarded to ``SeqDataset.split_data``;
                defaults to 0.2.
            shuffle (bool): Forwarded to ``SeqDataset.split_data``.
            stratify (bool): Forwarded to ``SeqDataset.split_data``.

        Returns:
            tuple: X_train, X_test, y_train, y_test. The feature arrays get a
            trailing channel axis of size 1; the labels are returned as they
            come from the dataset (no one-hot encoding is applied).
        """
        logger.info("Loading data from NPZ file...")
        dataset = self.load_npz_data(self._origin_data_path)
        logger.info("Dataset loaded")

        # Split the data
        logger.info("Preparing data for training...")
        train_data, test_data = dataset.split_data(
            split_size=split_size, shuffle=shuffle, stratify=stratify)
        X_train, y_train, _, _ = train_data._get_data()
        X_test, y_test, _, _ = test_data._get_data()

        # Reshape for CNN (add channel dimension)
        X_train = X_train.reshape(X_train.shape[0], X_train.shape[1], 1)
        X_test = X_test.reshape(X_test.shape[0], X_test.shape[1], 1)

        logger.info(f"Training data shape: {X_train.shape}")
        logger.info(f"Testing data shape: {X_test.shape}")
        logger.info(f"Training labels shape: {y_train.shape}")
        logger.info(f"Testing labels shape: {y_test.shape}")

        return X_train, X_test, y_train, y_test

def pad_sequences(sequences, maxlen):
    """
    Pad or truncate sequences to a specific length along their first axis.

    Sequences longer than ``maxlen`` keep their first ``maxlen`` entries;
    shorter ones are zero-padded at the end. Sequences may have any rank >= 1
    (e.g. 1-D token lists or 2-D ``(time, features)`` arrays); only the first
    axis is changed.

    Args:
        sequences (list): List of array-like sequences.
        maxlen (int): Desired length of sequences.

    Returns:
        np.array: Padded sequences, stacked along a new leading axis.
    """
    padded_sequences = []
    for seq in sequences:
        arr = np.asarray(seq)
        if len(arr) > maxlen:
            padded_sequences.append(arr[:maxlen])
        else:
            # Pad only the leading (time) axis; every trailing axis is left as is.
            pad_width = [(0, maxlen - len(arr))] + [(0, 0)] * (arr.ndim - 1)
            padded_sequences.append(np.pad(arr, pad_width, mode='constant'))
    return np.array(padded_sequences)


In [10]:
# Run the data-preparation stage end to end. Any failure propagates with its
# original traceback, so no try/except wrapper is needed here.
config = ConfigurationManager()
data_preparation_config = config.get_data_preparation_config()
data_preparation = DataPreparation(config=data_preparation_config)
X_train, X_test, y_train, y_test = data_preparation.prepare_data()

[2024-08-09 08:35:27,009: INFO: help: yaml file: config\config.yaml loaded successfully. Content size: 4]
[2024-08-09 08:35:27,014: INFO: help: Total directories created: 1]
[2024-08-09 08:35:27,019: INFO: help: Total directories created: 1]
[2024-08-09 08:35:27,021: INFO: 1094416712: Loading data from NPZ file...]
[2024-08-09 08:35:27,211: INFO: 1094416712: Dataset loaded]
[2024-08-09 08:35:27,215: INFO: 1094416712: Preparing data for training...]
[2024-08-09 08:35:27,225: INFO: 1094416712: Training data shape: (321, 13, 1)]
[2024-08-09 08:35:27,227: INFO: 1094416712: Testing data shape: (81, 13, 1)]
[2024-08-09 08:35:27,229: INFO: 1094416712: Training labels shape: (321,)]
[2024-08-09 08:35:27,230: INFO: 1094416712: Testing labels shape: (81,)]


In [14]:
# Peek at the first training sample and its label.
X_train[0], y_train[0]

(array([[-2.84621406],
        [ 2.68434405],
        [ 0.71497554],
        [ 0.08256976],
        [-0.29053208],
        [-0.18529031],
        [-0.02732126],
        [ 0.16135539],
        [ 0.18251677],
        [-0.05077429],
        [-0.05610316],
        [-0.11398745],
        [ 0.02956609]]),
 6)

In [12]:
from tensorflow import keras

# Build the model architecture: a fully connected classifier on the flattened
# input, with L2 weight regularisation and dropout after each hidden layer.
model = keras.Sequential([
    # Input layer: flatten (features, channels) into one vector per sample.
    keras.layers.Flatten(input_shape=(X_train.shape[1], X_train.shape[2])),
    keras.layers.Dense(512, activation="relu", kernel_regularizer=keras.regularizers.l2(0.01)),
    keras.layers.Dropout(0.3),
    keras.layers.Dense(256, activation="relu", kernel_regularizer=keras.regularizers.l2(0.01)),
    keras.layers.Dropout(0.25),
    keras.layers.Dense(64, activation="relu", kernel_regularizer=keras.regularizers.l2(0.01)),
    keras.layers.Dropout(0.2),
    # Output layer -- assumes 10 target classes; TODO confirm against the dataset.
    keras.layers.Dense(10, activation="softmax"),
])

# Callback: stop when the validation loss has not improved for 10 epochs and
# roll back to the best weights seen. Monitoring the training loss instead
# would ignore the validation data and keep training while the model overfits.
callback = keras.callbacks.EarlyStopping(
    monitor="val_loss",
    patience=10,
    restore_best_weights=True,
    verbose=1,
)

# Compile model. The sparse loss is used because the labels are integer class
# ids rather than one-hot vectors.
model.compile(optimizer=keras.optimizers.Adam(learning_rate=0.01),
              loss="sparse_categorical_crossentropy",
              metrics=["accuracy"])

model.summary()

# NOTE(review): X_test/y_test double as the validation set, so early stopping
# is tuned on the test data; a separate validation split would keep the final
# test score unbiased.
history = model.fit(X_train, y_train,
                    validation_data=(X_test, y_test),
                    epochs=1000, batch_size=32,
                    callbacks=[callback])

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 flatten_2 (Flatten)         (None, 13)                0         
                                                                 
 dense_8 (Dense)             (None, 512)               7168      
                                                                 
 dropout_6 (Dropout)         (None, 512)               0         
                                                                 
 dense_9 (Dense)             (None, 256)               131328    
                                                                 
 dropout_7 (Dropout)         (None, 256)               0         
                                                                 
 dense_10 (Dense)            (None, 64)                16448     
                                                                 
 dropout_8 (Dropout)         (None, 64)               