In [1]:
import os

# Move from the notebook's folder to its parent directory exactly once per kernel.
# A bare relative chdir climbs one more level every time the cell is re-run; the
# module-level marker turns a re-run into a no-op.
if "PROJECT_ROOT" not in globals():
    os.chdir("../")
    PROJECT_ROOT = os.getcwd()

In [2]:
# Sanity check: show the working directory after the chdir in the previous cell.
%pwd

'e:\\MyOnlineCourses\\ML_Projects\\arabic-digits-recognition'

In [3]:
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class DataTrainingConfig:
    """Immutable settings for the data-training stage."""

    # Directory for this stage; it is created when the config object is built.
    root_dir: Path
    # Location of the NPZ dataset that training reads.
    origin_data_path: str
    # Destination path, presumably for the trained model - TODO confirm (no code reads it yet).
    dst_path: Path


In [4]:
from adr.constants import *
from adr.utils.help import read_yaml, create_directories

class ConfigurationManager:
    """Read the project YAML configuration and build per-stage config objects."""

    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH):
        """Load the configuration and create the artifacts root directory.

        Args:
            config_filepath: Path of the YAML configuration file; defaults to
                ``CONFIG_FILE_PATH``.
        """
        # NOTE(review): assumes read_yaml returns an object with attribute access
        # to the YAML keys - confirm against adr.utils.help.
        self.config = read_yaml(config_filepath)

        create_directories([self.config.artifacts_root])

    def get_data_training_config(self) -> DataTrainingConfig:
        """Build the data-training settings and create the stage directory.

        Returns:
            DataTrainingConfig: Settings read from the ``data_training`` section
            of the configuration.
        """
        stage = self.config.data_training

        create_directories([stage.root_dir])

        # DataTrainingConfig declares root_dir and dst_path as Path, so convert the
        # raw configuration values instead of passing them through unchanged.
        return DataTrainingConfig(
            root_dir=Path(stage.root_dir),
            origin_data_path=stage.origin_data_path,
            dst_path=Path(stage.dst_path),
        )

In [5]:
import numpy as np
from sklearn.model_selection import train_test_split
#from ard.utils.common import to_categorical
from adr.utils.dataset import SeqDataset
import logging
from adr import logger
from operator import itemgetter
from tensorflow.keras.utils import to_categorical
import tensorflow as tf
import tensorflow.keras.models as models
import tensorflow.keras.layers as layers
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import GridSearchCV
class DataTraining:
    """Train a CNN on the dataset referenced by a ``DataTrainingConfig``.

    ``process`` is the entry point: it prepares the data, records the input
    shape, then builds, compiles and fits the model.
    """

    def __init__(self,
                 config : DataTrainingConfig
    ):
        """Store the stage configuration.

        Args:
            config (DataTrainingConfig): Settings for this stage; this class
                currently reads only ``origin_data_path`` from it.
        """
        self.config = config
        self._origin_data_path = self.config.origin_data_path
        # Filled in later: num_classes by prepare_data(), input_shape by process(),
        # model by build_and_train_model().
        self.num_classes = None
        self.input_shape = None
        self.model = None

    def load_npz_data(self, data_path):
        """
        Load the dataset stored in an NPZ archive.

        Args:
            data_path (str): Path to the NPZ file. It must contain the arrays
                'features', 'targets', 'lengths' and 'classes'.

        Returns:
            SeqDataset: Dataset restricted to samples whose target is in 'classes'.

        Raises:
            Exception: Any error raised while loading is logged and re-raised.
        """
        try:
            # NOTE(security): allow_pickle=True lets NumPy unpickle object arrays,
            # which can execute arbitrary code. Only load archives you trust.
            # The context manager closes the underlying file once the arrays are read.
            with np.load(data_path, allow_pickle=True) as data:
                features, targets, lengths, classes = itemgetter(
                    'features', 'targets', 'lengths', 'classes')(data)
            logger.info(f"features shape: {features.shape}")

            # Keep only the samples whose target is one of the declared classes.
            idx = np.argwhere(np.isin(targets, classes)).flatten()

            return SeqDataset(
                features=features[idx],
                targets=targets[idx],
                lengths=lengths[idx],
                classes=classes)

        except Exception as e:
            logger.error(f"Error loading NPZ file: {e}")
            raise

    def prepare_data(self):
        """
        Load the dataset, split it into train and test parts, and shape it for a CNN.

        Takes no arguments: the NPZ path comes from the configuration. As a side
        effect, ``self.num_classes`` is set to the number of distinct targets in
        the loaded dataset.

        Returns:
            tuple: X_train, X_test, y_train, y_test - features with a trailing
            channel axis of size 1, labels one-hot encoded.
        """
        logger.info("Loading data from NPZ file...")
        dataset = self.load_npz_data(self._origin_data_path)

        logger.info("Dataset loaded")
        # Captured before the split so the class count covers the whole dataset.
        all_targets = dataset._targets

        # Split the data (split_size=0.2, shuffled, stratified).
        logger.info("Preparing data for training...")
        train_data, test_data = dataset.split_data(split_size=0.2, shuffle=True, stratify=True)

        # _get_data() yields four values; only the first two (features, targets) are used.
        X_train, y_train, _, _ = train_data._get_data()
        X_test, y_test, _, _ = test_data._get_data()
        logger.info(f"X_train shape: {X_train.shape}")
        logger.info(f"y_train shape: {y_train.shape}")

        # Reshape for CNN (add channel dimension)
        X_train = X_train.reshape(X_train.shape[0], X_train.shape[1], X_train.shape[2], 1)
        X_test = X_test.reshape(X_test.shape[0], X_test.shape[1], X_test.shape[2], 1)

        # Convert labels to categorical; both splits use the same encoding width.
        self.num_classes = len(np.unique(all_targets))
        y_train = tf.keras.utils.to_categorical(y_train, num_classes=self.num_classes)
        y_test = tf.keras.utils.to_categorical(y_test, num_classes=self.num_classes)
        logger.info(f"Training data shape: {X_train.shape}")
        logger.info(f"Testing data shape: {X_test.shape}")
        logger.info(f"Training labels shape: {y_train.shape}")
        logger.info(f"Testing labels shape: {y_test.shape}")

        return X_train, X_test, y_train, y_test

    def build_and_train_model(self, X_train, X_test, y_train, y_test):
        """
        Create, compile and fit the CNN.

        The trained model is kept on ``self.model``. Previously the model variable
        was overwritten by the value returned from ``_train_model`` (the fit
        history), so the trained model was lost.

        Args:
            X_train: Training features, including the channel axis.
            X_test: Held-out features, used as validation data during fitting.
            y_train: One-hot training labels.
            y_test: One-hot held-out labels.

        Returns:
            The history object returned by ``model.fit``.
        """
        # When called without process()/prepare_data(), derive the missing
        # dimensions from the arrays instead of building a model from None values.
        if self.input_shape is None:
            self.input_shape = X_train.shape[1:]
        if self.num_classes is None:
            self.num_classes = y_train.shape[1]

        model = self._create_model()
        self._compile_model(model)
        history = self._train_model(model, X_train, X_test, y_train, y_test)
        self.model = model

        # Saving stays disabled, as in the original code:
        # model.save(self.config.dst_path)
        return history

    def _create_model(self, filters=32, kernel_size=(3, 3), dense_units=256, dropout_rate=0.2):
        """
        Build the (uncompiled) convolutional network.

        Args:
            filters (int): Filters of the first convolution only; the second and
                third convolutions are fixed at 128.
            kernel_size (tuple): Kernel size shared by all three convolutions.
            dense_units (int): Units of the hidden dense layer.
            dropout_rate (float): Rate of both dropout layers.

        Returns:
            A Keras ``Sequential`` model ending in a softmax over ``self.num_classes``.

        Raises:
            ValueError: If ``self.num_classes`` has not been set.
        """
        if self.num_classes is None:
            raise ValueError(
                "num_classes is not set; call prepare_data() or process() before building the model."
            )

        model = models.Sequential([
            layers.Conv2D(filters, kernel_size, activation='relu', padding='valid', input_shape=self.input_shape),
            layers.MaxPooling2D(2, padding='same'),
            layers.Conv2D(128, kernel_size, activation='relu', padding='valid'),
            layers.MaxPooling2D(2, padding='same'),
            layers.Dropout(dropout_rate),
            layers.Conv2D(128, kernel_size, activation='relu', padding='valid'),
            layers.MaxPooling2D(2, padding='same'),
            layers.Dropout(dropout_rate),
            layers.GlobalAveragePooling2D(),
            layers.Dense(dense_units, activation='relu'),
            layers.Dense(self.num_classes, activation='softmax')
        ])

        return model

    def _compile_model(self, model):
        """Compile ``model`` with categorical cross-entropy, Adam and accuracy, then print its summary."""
        model.compile(
            loss='categorical_crossentropy',
            optimizer='adam',
            # Keras documents `metrics` as a list; the name 'acc' is kept so the
            # history keys remain 'acc' / 'val_acc'.
            metrics=['acc'],
        )
        model.summary()

    def _train_model(self, model, X_train, X_test, y_train, y_test,
                     epochs=100, batch_size=12, patience=10):
        """
        Fit ``model`` with early stopping on the validation loss.

        Args:
            model: A compiled Keras model.
            X_train, y_train: Training features and one-hot labels.
            X_test, y_test: Features and labels passed as ``validation_data``.
            epochs (int): Maximum number of epochs (default 100, as before).
            batch_size (int): Mini-batch size (default 12, as before).
            patience (int): Epochs without ``val_loss`` improvement before
                stopping (default 10, as before).

        Returns:
            The history object returned by ``model.fit``.
        """
        # NOTE(review): the held-out split doubles as the validation set, so early
        # stopping and best-weight restoration are tuned on the same data that is
        # reported as "test".
        early_stopping = tf.keras.callbacks.EarlyStopping(
            monitor='val_loss', min_delta=0, patience=patience,
            verbose=1, mode='auto', baseline=None,
            restore_best_weights=True)
        history = model.fit(
            X_train, y_train,
            validation_data=(X_test, y_test),
            epochs=epochs,
            callbacks=[early_stopping],
            batch_size=batch_size)
        return history

    def process(self):
        """
        Run the whole stage: prepare the data, then build and train the model.

        Returns:
            The history object returned by ``model.fit``.
        """
        X_train, X_test, y_train, y_test = self.prepare_data()
        # Per-sample shape (everything after the batch axis) feeds the first layer.
        self.input_shape = X_train.shape[1:]
        logger.info(f'Input shape: {self.input_shape}')
        return self.build_and_train_model(X_train, X_test, y_train, y_test)
       

In [6]:
# Run the training stage end to end. Exceptions propagate on their own, so the
# former `try: ... except Exception as e: raise e` wrapper was a no-op and is gone.
config = ConfigurationManager()
data_training_config = config.get_data_training_config()
data_training = DataTraining(config=data_training_config)
# Bind the return value so it is not echoed as the cell output and stays
# available to later cells.
history = data_training.process()

[2024-08-13 17:08:56,088: INFO: help: yaml file: config\config.yaml loaded successfully. Content size: 9]
[2024-08-13 17:08:56,091: INFO: help: Total directories created: 1]
[2024-08-13 17:08:56,094: INFO: help: Total directories created: 1]
[2024-08-13 17:08:56,097: INFO: 1287824526: Loading data from NPZ file...]
[2024-08-13 17:08:56,143: INFO: 1287824526: features shape: (402, 52, 40)]
[2024-08-13 17:08:56,150: INFO: 1287824526: Dataset loaded]
[2024-08-13 17:08:56,152: INFO: 1287824526: Preparing data for training...]
[2024-08-13 17:08:56,160: INFO: 1287824526: X_train shape: (321, 52, 40)]
[2024-08-13 17:08:56,162: INFO: 1287824526: y_train shape: (321,)]
[2024-08-13 17:08:56,165: INFO: 1287824526: Training data shape: (321, 52, 40, 1)]
[2024-08-13 17:08:56,167: INFO: 1287824526: Testing data shape: (81, 52, 40, 1)]
[2024-08-13 17:08:56,168: INFO: 1287824526: Training labels shape: (321, 10)]
[2024-08-13 17:08:56,170: INFO: 1287824526: Testing labels shape: (81, 10)]
[2024-08-13 1

In [7]:
# NOTE: the plots below need a `history` object (the value returned by `model.fit`) in scope.
# import matplotlib.pyplot as plt
# plt.plot(history.history['acc'])
# plt.plot(history.history['val_acc'])
# plt.title('model accuracy')
# plt.ylabel('accuracy')
# plt.xlabel('epoch')
# plt.legend(['train', 'validation'], loc='upper left')
# plt.show()


# plt.plot(history.history['loss'])
# plt.plot(history.history['val_loss'])
# plt.title('model loss')
# plt.ylabel('loss')
# plt.xlabel('epoch')
# plt.legend(['train', 'validation'], loc='upper left')
# plt.show()