In [1]:
import os
# Move the working directory one level up (presumably to the project root so the
# `src.` package imports further down resolve — confirm against the repo layout).
# NOTE(review): a relative chdir is not idempotent — re-running this cell without
# restarting the kernel moves up one more level each time.
os.chdir("../")

In [2]:
%pwd

'e:\\MyOnlineCourses\\ML_Projects\\arabic-digits-recognition'

In [3]:
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class DataIngestionConfig:
    """Immutable bundle of the locations used by the data-ingestion stage.

    Instances are built by ``ConfigurationManager.get_data_ingestion_config``
    from the ``data_ingestion`` section of the loaded configuration.
    NOTE(review): the values are forwarded from the config as-is, so at
    runtime they may be plain strings rather than ``Path`` objects — confirm.
    """

    # Directory that ConfigurationManager creates for this stage.
    root_dir: Path
    # Directory that DataIngestion scans for ``.wav`` files.
    source_path: str
    # Passed to SeqDataset as its ``path`` argument.
    metadata_file: Path

In [4]:
from src.ard.constants import *
from src.ard.utils.help import read_yaml, create_directories

In [5]:
class ConfigurationManager:
    """Read the project configuration and parameter files and hand out stage configs.

    On construction both YAML files are loaded with ``read_yaml`` and the
    directory named by ``artifacts_root`` in the configuration is created.
    """

    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH):
        """Load configuration and parameters.

        Args:
            config_filepath: Location of the configuration YAML
                (defaults to ``CONFIG_FILE_PATH``).
            params_filepath: Location of the parameters YAML
                (defaults to ``PARAMS_FILE_PATH``).
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        create_directories([self.config.artifacts_root])

    def get_data_ingestion_config(self) -> tuple:
        """Build the data-ingestion configuration.

        Creates the stage's ``root_dir`` as a side effect.

        Returns:
            A 2-tuple ``(data_ingestion_config, params)`` where the first item
            is a ``DataIngestionConfig`` and the second is the full parameter
            object loaded in ``__init__``. Callers are expected to unpack both.
        """
        config = self.config.data_ingestion

        create_directories([config.root_dir])

        return (
            DataIngestionConfig(
                root_dir=config.root_dir,
                source_path=config.source_path,
                metadata_file=config.metadata_file,
            ),
            self.params,
        )

In [6]:
from typing import Optional, Union, IO
import pathlib, os
import random
#from pydantic import confloat, PositiveInt, Field
import numpy as np 
from src.ard.utils.dataset import SeqDataset
from tqdm import tqdm
import time
import matplotlib.pyplot as plt
from src.ard.utils.common import convert_digits, get_wav_path, get_wav_label, ArdArray, Signal
from src.ard.utils.preprocess import WVLoader, MFCCExtractor
from src.ard.utils.transformer import  MFCC, MinMaxScaler, Standardiser, TransformsChain


class DataIngestion:
    """Turn a folder of WAV recordings into a ``SeqDataset`` of scaled MFCC sequences.

    Each ``.wav`` file found in ``config.source_path`` is loaded at the target
    sample rate, converted to MFCC features, wrapped in a ``Signal`` and passed
    through a ``MinMaxScaler(min=0, max=1)`` -> ``Standardiser`` transform chain.
    """

    def __init__(self,
                 config: DataIngestionConfig,
                 params
                 ):
        """Set up loader, extractor and transforms.

        Args:
            config: Locations for this stage (``source_path`` and
                ``metadata_file`` are read here).
            params: Parameter object exposing ``TARGET_SAMPLE_RATE``,
                ``NUM_SAMPLES``, ``RANDOM_STATE`` and ``SPEC_KWARGS`` as
                attributes.

        Raises:
            AssertionError: If ``TARGET_SAMPLE_RATE`` is not positive.
        """
        self.loader = WVLoader()
        self.extractor = MFCCExtractor()
        self.params = params
        self._target_sample_rate = self.params.TARGET_SAMPLE_RATE
        # Stored for completeness; not used by any method visible in this class.
        self._num_samples = self.params.NUM_SAMPLES
        self._random_state = self.params.RANDOM_STATE
        self._transform_kwargs = self.params.SPEC_KWARGS
        self.config = config
        self._source_path = self.config.source_path
        self._metadata_file = self.config.metadata_file
        self.minmax_scaler = MinMaxScaler(min=0, max=1)
        self.standardiser = Standardiser()
        self.transform_chain = TransformsChain(transforms=[self.minmax_scaler, self.standardiser])

        assert self._target_sample_rate > 0, "Sample rate must be a positive integer"

    def Load(self):
        """Process every ``.wav`` file in the source folder and build the dataset.

        Files are visited in a shuffled order that is reproducible for a given
        ``RANDOM_STATE``. Only samples whose label is one of ``range(10)`` are
        kept.

        Returns:
            SeqDataset: features are the kept sequences stacked row-wise,
            together with per-sequence targets and lengths.

        Raises:
            ValueError: If no ``.wav`` file with a label in ``range(10)`` is found.
        """
        classes = range(10)

        # os.listdir gives no ordering guarantee: sort first so that the seeded
        # shuffle below yields the same order on every run and every machine.
        files_list = sorted(
            file for file in os.listdir(self._source_path)
            if file.lower().endswith(".wav")
        )
        # Use a private RNG seeded with the configured random state instead of
        # the unseeded module-level generator.
        random.Random(self._random_state).shuffle(files_list)

        _inputs, _targets, _lengths = [], [], []
        for file_name in tqdm(files_list, colour="green", desc="Processing MFCC ",
                              bar_format="{l_bar}{bar} [time spent: {elapsed}]",
                              leave=True):
            wav_path = get_wav_path(file_name, self._source_path)
            label = get_wav_label(file_name)
            waveform = self.loader.load(file = wav_path, sample_rate= self._target_sample_rate)
            mfcc_signal = self.extractor.mfcc(data=waveform, spec_kwargs=self._transform_kwargs)
            signal = Signal(name = file_name.split('\\')[-1], data=mfcc_signal,
                            samplerate=self._target_sample_rate, filepath=wav_path)
            scaled_signal = self.transform_chain.process(signal)
            _inputs.append(scaled_signal.data)
            _targets.append(label)
            _lengths.append(scaled_signal.data.shape[0])

        targets = np.array(_targets)
        lengths = np.array(_lengths, dtype=int)
        idx = np.argwhere(np.isin(targets, classes)).flatten()

        if idx.size == 0:
            raise ValueError(
                f"No .wav files with a label in {list(classes)} found in {self._source_path!r}"
            )

        # Select from the Python list rather than from an object ndarray:
        # np.array(list_of_equal_shaped_arrays, dtype=object) collapses into one
        # N-D object array, which would make vstack return object-dtype features.
        features = np.vstack([_inputs[i] for i in idx])

        return SeqDataset(features=features, targets=targets[idx],
                          lengths=lengths[idx], classes=classes, path=self._metadata_file,
                          random_state=self._random_state)
    
    

In [7]:
# Run the data-ingestion stage end to end: read the configuration, process the
# recordings into a dataset and persist it (compressed).
# Any exception propagates unchanged; no try/except wrapper is needed for that.
config = ConfigurationManager()
data_ingestion_config, data_ingestion_params = config.get_data_ingestion_config()
data_ingestion = DataIngestion(config=data_ingestion_config, params=data_ingestion_params)
dataset = data_ingestion.Load()
dataset.save(compress=True)

[2024-08-07 08:55:48,172: INFO: help: yaml file: config\config.yaml loaded successfully. Content size: 3]
[2024-08-07 08:55:48,180: INFO: help: yaml file: params.yaml loaded successfully. Content size: 6]
[2024-08-07 08:55:48,184: INFO: help: Total directories created: 1]
[2024-08-07 08:55:48,187: INFO: help: Total directories created: 1]
[2024-08-07 08:55:48,191: INFO: preprocess: WVLoader is initializing]
[2024-08-07 08:55:48,193: INFO: transformer: Instantiated TransformType.MINMAXSCALER transform]
[2024-08-07 08:55:48,197: INFO: transformer: Instantiated TransformType.STANDARDSCALER transform]


Processing MFCC : 100%|[32m██████████[0m [time spent: 00:30]


In [9]:
# Sanity-check the dataset that was just built.
print(f"Dataset size: {len(dataset)}")
print(f"Number of classes: {len(dataset._classes)}")

# Stratified, shuffled train/test split with split_size=0.2.
train_data, test_data = dataset.split_data(split_size=0.2, shuffle=True, stratify=True)

print(f"Train set size: {len(train_data)}")
print(f"Test set size: {len(test_data)}")

# Show the first three items produced by iterating the dataset.
# NOTE(review): the recorded output shows feature shape (13,) next to lengths in
# the hundreds — check whether iteration is meant to yield single frames.
for sample_idx, sample in enumerate(dataset):
    feature, target, length = sample
    print(
        f"Sample {sample_idx}:\n"
        f"  Feature shape: {feature.shape}\n"
        f"  Target: {target}\n"
        f"  Length: {length}"
    )
    if sample_idx == 2:
        break

# Tally how many items dataset.iterator() yields per class label.
class_samples = {}
for _features, class_label in dataset.iterator():
    class_samples[class_label] = class_samples.get(class_label, 0) + 1

for class_label, count in class_samples.items():
    print(f"Class {class_label}: {count} samples")



Dataset size: 402
Number of classes: 10
Train set size: 321
Test set size: 81
Sample 0:
  Feature shape: (13,)
  Target: 5
  Length: 344
Sample 1:
  Feature shape: (13,)
  Target: 0
  Length: 270
Sample 2:
  Feature shape: (13,)
  Target: 4
  Length: 258
Class 5: 32 samples
Class 0: 48 samples
Class 4: 40 samples
Class 3: 38 samples
Class 7: 40 samples
Class 6: 41 samples
Class 2: 38 samples
Class 9: 43 samples
Class 1: 51 samples
Class 8: 31 samples


In [10]:
# Pull the raw arrays out of the test split for manual inspection.
# NOTE(review): `_get_data` is a private accessor of the dataset object —
# prefer a public API if one exists.
data,targets,lengths = test_data._get_data()

In [12]:
# Display the entry at index 1 of the test-split feature array.
data[1]

array([-1.3101428 ,  2.6180587 , -1.2077682 ,  0.7328182 ,  0.11773837,
       -0.403541  ,  0.12819949, -0.4718498 ,  0.14588565,  0.38098323,
       -0.23878922,  0.12566495,  0.23440695], dtype=float32)