In [1]:
import os
os.chdir("../")

In [2]:
%pwd

'e:\\MyOnlineCourses\\ML_Projects\\arabic-digits-recognition'

In [3]:
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class DataIngestionConfig:
    root_dir: Path
    source_path: str
    data_file: Path

In [4]:
from adr.constants import *
from adr.utils.help import read_yaml, create_directories

In [5]:
class ConfigurationManager:
    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH):

        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        create_directories([self.config.artifacts_root])


    
    def get_data_ingestion_config(self) -> DataIngestionConfig:
        config = self.config.data_ingestion

        create_directories([config.root_dir])

        data_ingestion_config = DataIngestionConfig(
            root_dir=config.root_dir,
            source_path=config.source_path,
            data_file=config.data_file,
        )

        return data_ingestion_config, self.params

In [6]:
from typing import Optional, Union, IO
import pathlib, os
import random
import numpy as np 
from src.adr.utils.dataset import SeqDataset
from tqdm import tqdm
import time
import matplotlib.pyplot as plt
from src.adr.utils.common import convert_digits, get_wav_path, get_wav_label, ArdArray, Signal
from src.adr.utils.preprocess import WVLoader, MFCCExtractor
from src.adr.utils.transformer import  MFCC, MinMaxScaler, Standardiser, TransformsChain
from tensorflow.keras.preprocessing.sequence import pad_sequences
from adr import logger

class DataIngestion:
    
    def __init__(self,
                config : DataIngestionConfig,
                params : None
                
                 ):
        
        self.loader = WVLoader()
        self.extractor= MFCCExtractor()
        self.params=params
        self._target_sample_rate = self.params.TARGET_SAMPLE_RATE
        self._num_samples = self.params.NUM_SAMPLES
        self._random_state = self.params.RANDOM_STATE
        self._transform_kwargs = self.params.SPEC_KWARGS
        self.config=config
        self._source_path = self.config.source_path
        self._data_file = self.config.data_file
        self.minmax_scaler = MinMaxScaler(min=0, max=1)
        self.standardiser = Standardiser()
        self.transform_chain = TransformsChain(transforms=[self.standardiser])

        assert self._target_sample_rate > 0, "Sample rate must be a positive integer"
    
    def Load(self): 
        files_list, _inputs, _targets, _lengths = [], [],[], []
        classes = range(10)
        for file in os.listdir(self._source_path):
            if file.lower().endswith(".wav"):
                files_list.append(file)
        
        if files_list:
            random.shuffle(files_list)
            
        with tqdm(total=len(files_list), colour="green", desc="Processing MFCC ", 
                  bar_format="{l_bar}{bar} [time spent: {elapsed}]",
                  leave=True) as pbar:
            for file_name in files_list:
                wav_path = get_wav_path(file_name, self._source_path)
                label = get_wav_label(file_name)
                waveform, sr = self.loader.load(file = wav_path)
                mfcc_signal = self.extractor.mfcc(audio=waveform, sample_rate=sr)
                signal = Signal(name = file_name.split('\\')[-1], data=mfcc_signal, samplerate=self._target_sample_rate, filepath=wav_path)
                signal = self.transform_chain.process(signal)
                # Append the MFCC features to the list
              
               
                _inputs.append(signal.data)
                _targets.append(label)
                _lengths.append(signal.data.shape[0])
                pbar.update(1)
                time.sleep(0.01)
        max_length = max(mfcc.shape[1] for mfcc in _inputs) 
        sequences = pad_sequences([mfcc.T for mfcc in _inputs], maxlen=max_length, padding='post', dtype='float32')
        logger.info(f"Padded MFCC shape: {sequences.shape}")
        logger.info(f"Labels shape: {np.array(_targets).shape}")          
        #sequences = np.array(_inputs, dtype=object)
        lengths = np.array(_lengths, dtype=int)
        idx = np.argwhere(np.isin(_targets, classes)).flatten()
        return SeqDataset(features= sequences[idx], targets = np.array(_targets)[idx],
                          lengths = lengths[idx], classes = classes, path =self._data_file, 
                          random_state = self._random_state)
    
    

[2024-08-13 17:07:07,101: INFO: help: yaml file: params.yaml loaded successfully. Content size: 9]


In [7]:
try:
    config = ConfigurationManager()
    data_ingestion_config, data_ingestion_params = config.get_data_ingestion_config()
    data_ingestion = DataIngestion(config=data_ingestion_config, params=data_ingestion_params)
    dataset = data_ingestion.Load()
    dataset.save(compress=True)

except Exception as e:
    raise e

[2024-08-13 17:07:07,542: INFO: help: yaml file: config\config.yaml loaded successfully. Content size: 9]
[2024-08-13 17:07:07,551: INFO: help: yaml file: params.yaml loaded successfully. Content size: 9]
[2024-08-13 17:07:07,555: INFO: help: Total directories created: 1]
[2024-08-13 17:07:07,559: INFO: help: Total directories created: 1]
[2024-08-13 17:07:07,561: INFO: preprocess: WVLoader is initializing]
[2024-08-13 17:07:07,563: INFO: transformer: Instantiated TransformType.MINMAXSCALER transform]
[2024-08-13 17:07:07,565: INFO: transformer: Instantiated TransformType.STANDARDSCALER transform]


Processing MFCC : 100%|[32m██████████[0m [time spent: 00:25]

[2024-08-13 17:07:32,655: INFO: 2883497174: Padded MFCC shape: (402, 52, 40)]
[2024-08-13 17:07:32,658: INFO: 2883497174: Labels shape: (402,)]





[2024-08-13 17:07:32,858: INFO: dataset: A npz file has been saved]


In [8]:
# Check the dataset
print(f"Dataset size: {len(dataset)}")
print(f"Number of classes: {len(dataset._classes)}")

# Split the dataset
train_data, test_data = dataset.split_data(split_size=0.2, shuffle=True, stratify=True)

print(f"Train set size: {len(train_data)}")
print(f"Test set size: {len(test_data)}")

# Iterate through the dataset
for i, (feature, target, length) in enumerate(dataset):
    print(f"Sample {i}:")
    print(f"  Feature shape: {feature.shape}")
    print(f"  Target: {target}")
    print(f"  Length: {length}")
    if i == 2:  # Print only first 3 samples
        break
class_samples = {}
for features, class_label in dataset.iterator():
    if class_label not in class_samples:
        class_samples[class_label] = 0
    class_samples[class_label] += 1

for class_label, count in class_samples.items():
    print(f"Class {class_label}: {count} samples")



Dataset size: 402
Number of classes: 10
Train set size: 321
Test set size: 81
Sample 0:
  Feature shape: (52, 40)
  Target: 5
  Length: 40
Sample 1:
  Feature shape: (52, 40)
  Target: 8
  Length: 40
Sample 2:
  Feature shape: (52, 40)
  Target: 9
  Length: 40
Class 0: 1 samples
Class 1: 1 samples
Class 2: 1 samples
Class 3: 1 samples
Class 4: 1 samples
Class 5: 1 samples
Class 6: 1 samples
Class 7: 1 samples
Class 8: 1 samples
Class 9: 1 samples
