In [1]:
import os
# Move the working directory one level up from wherever the notebook was
# started. NOTE(review): presumably this puts the session at the project root
# so that the `src.ard` / `ard` imports and relative config paths used in the
# later cells resolve - confirm against the repository layout. Re-running this
# cell moves up one more level each time.
os.chdir("../")

In [None]:
%pwd

In [None]:
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class DataPreparationConfig:
    """Immutable settings bundle for the data-preparation step.

    Instances are frozen, so fields cannot be reassigned after construction.
    """

    # Directory associated with this step.
    root_dir: Path
    # Location of the source dataset.
    origin_data_path: str
    # Location for the processed dataset.
    processed_data_path: str


In [None]:
from src.ard.constants import *
from src.ard.utils.help import read_yaml, create_directories

class ConfigurationManager:
    """Load the project configuration and hand out per-step config objects."""

    def __init__(self, config_filepath=CONFIG_FILE_PATH):
        """Read the configuration file and set up the artifacts root.

        Args:
            config_filepath: Path handed to ``read_yaml``; defaults to
                ``CONFIG_FILE_PATH``.
        """
        self.config = read_yaml(config_filepath)

        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_data_preparation_config(self) -> DataPreparationConfig:
        """Build the ``DataPreparationConfig`` from the ``data_preparation`` section.

        The section's ``root_dir`` is passed to ``create_directories`` before
        the config object is built.

        Returns:
            DataPreparationConfig: ``root_dir``, ``origin_data_path`` and
            ``processed_data_path`` copied from the loaded configuration.
        """
        section = self.config.data_preparation
        create_directories([section.root_dir])

        return DataPreparationConfig(
            root_dir=section.root_dir,
            origin_data_path=section.origin_data_path,
            processed_data_path=section.processed_data_path,
        )

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
from ard.utils.dataset import SeqDataset
import logging
from ard import logger

class DataPreoaration:
    """Load the sequence dataset and turn it into CNN-ready train/test arrays.

    NOTE: the class keeps its original (misspelled) name so existing
    references keep working; ``DataPreparation`` is provided right after the
    class as the correctly spelled alias.
    """

    def __init__(self, config: "DataPreparationConfig"):
        """Store the configuration and cache the two data paths it carries.

        Args:
            config: Data-preparation settings; ``origin_data_path`` and
                ``processed_data_path`` are read from it.
        """
        self.config = config
        self._origin_data_path = self.config.origin_data_path
        self._processed_data_path = self.config.processed_data_path

    def load_npz_data(self, data_path):
        """Load a dataset from an NPZ file.

        Args:
            data_path (str): Path to the NPZ file.

        Returns:
            SeqDataset: Dataset built from the loaded features, targets,
            classes and lengths.

        Raises:
            Exception: Any error raised while loading is logged and re-raised
                unchanged.
        """
        try:
            # SECURITY: allow_pickle=True lets np.load unpickle arbitrary
            # objects, which can execute code. Only use on trusted files.
            data = np.load(data_path, allow_pickle=True)
            # NOTE(review): np.load on an .npz archive returns an NpzFile,
            # which does not appear to expose `_get_data()` - confirm how the
            # archive was written and how its arrays should be unpacked.
            features, targets, lengths, classes = data._get_data()

            return SeqDataset(features=features, targets=targets, classes=classes, lengths=lengths)
        except Exception as e:
            logger.error(f"Error loading NPZ file: {e}")
            raise

    @staticmethod
    def _add_channel_axis(batch):
        """Reshape a (samples, rows, cols) array to (samples, rows, cols, 1)."""
        batch = np.asarray(batch)
        if batch.ndim != 3:
            raise ValueError(
                f"Expected a 3-D array (samples, rows, cols), got shape {batch.shape}"
            )
        # The target dimensions come from the array's shape, not from its
        # first two rows.
        return batch.reshape((-1, batch.shape[1], batch.shape[2], 1))

    def prepare_data(self):
        """Prepare data for a CNN model.

        Loads the dataset from the configured origin path, performs a
        stratified 80/20 train/test split (``random_state=42``), appends a
        trailing channel dimension to the features and one-hot encodes the
        labels.

        Returns:
            tuple: X_train, X_test, y_train, y_test

        Raises:
            ValueError: If the feature arrays are not 3-D after splitting.
        """
        logger.info("Loading data from NPZ file...")
        dataset = self.load_npz_data(self._origin_data_path)
        logger.info("Dataset loaded")
        X = dataset._features
        y = dataset._targets

        # Split the data
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42, stratify=y
        )

        # Reshape for CNN (add channel dimension)
        X_train = self._add_channel_axis(X_train)
        X_test = self._add_channel_axis(X_test)

        # Convert labels to categorical
        y_train = to_categorical(y_train)
        y_test = to_categorical(y_test)

        return X_train, X_test, y_train, y_test


# Correctly spelled alias for the class.
DataPreparation = DataPreoaration

# Module-level names kept for backward compatibility: both callables accept
# the DataPreoaration instance as their first positional argument.
load_npz_data = DataPreoaration.load_npz_data
prepare_data = DataPreoaration.prepare_data

def pad_sequences(sequences, maxlen):
    """
    Force every sequence to exactly ``maxlen`` rows.

    Sequences longer than ``maxlen`` are cut to their first ``maxlen`` rows;
    all others get zero rows appended along the first axis. The second axis
    is left untouched, so each sequence is expected to be 2-D.

    Args:
        sequences (list): List of sequences.
        maxlen (int): Desired length of sequences.

    Returns:
        np.array: Padded sequences.
    """

    def _fit(seq):
        # Truncate when too long, otherwise zero-pad at the end of axis 0.
        n_rows = len(seq)
        if n_rows > maxlen:
            return seq[:maxlen]
        missing = maxlen - n_rows
        return np.pad(seq, ((0, missing), (0, 0)), mode='constant')

    return np.array([_fit(seq) for seq in sequences])

def main():
    """Run the data-preparation step and return the train/test splits.

    Builds the data-preparation configuration through
    ``ConfigurationManager``, wraps it in ``DataPreoaration`` and hands that
    object to ``prepare_data``, which loads the dataset from the configured
    origin path and produces the splits.

    Returns:
        tuple: X_train, X_test, y_train, y_test
    """
    # Take the dataset location from the project configuration rather than a
    # placeholder path.
    config = ConfigurationManager().get_data_preparation_config()
    data_preparation = DataPreoaration(config=config)

    # Prepare data for CNN
    logger.info("Preparing data for CNN...")
    # prepare_data takes the DataPreoaration instance as its first argument
    # and performs the loading itself, so no separate load call is needed.
    X_train, X_test, y_train, y_test = prepare_data(data_preparation)

    # Log shapes
    logger.info(f"Training data shape: {X_train.shape}")
    logger.info(f"Testing data shape: {X_test.shape}")
    logger.info(f"Training labels shape: {y_train.shape}")
    logger.info(f"Testing labels shape: {y_test.shape}")

    # At this point, X_train, X_test, y_train, and y_test are ready for your CNN model
    # You can proceed with model creation, compilation, and training
    return X_train, X_test, y_train, y_test