In [1]:
import os
os.chdir("../")

In [2]:
%pwd

'e:\\MyOnlineCourses\\ML_Projects\\arabic-digits-recognition'

In [3]:
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class DataPreprocessConfig:
    root_dir: Path
    source_path: str
    data_file: Path

In [4]:
from adr.constants import *
from adr.utils.help import read_yaml, create_directories

In [5]:
class ConfigurationManager:
    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH):

        self.config = read_yaml(config_filepath)

        create_directories([self.config.artifacts_root])


    
    def get_data_preprocess_config(self) -> DataPreprocessConfig:
        config = self.config.data_preprocessing

        create_directories([config.root_dir])

        data_preprocess_config = DataPreprocessConfig(
            root_dir=config.root_dir,
            source_path=config.source_path,
            data_file=config.data_file,
        )

        return data_preprocess_config

In [6]:
import numpy as np
from adr.utils.dataset import SeqDataset
from sklearn.decomposition import PCA
from operator import itemgetter
from adr import logger
class DataPreprocessing:
    def __init__(self,
                 config : DataPreprocessConfig
    ):
        self.config=config
        self._source_path = self.config.source_path
        self._data_file = self.config.data_file
    def get_data(self):
        data = np.load(self._source_path, allow_pickle=True)

        # Fetch arrays from loaded file
        features, targets, lengths, classes = itemgetter('features', 'targets', 'lengths', 'classes')(data)
        # Assuming features is of shape (n_samples, n_timesteps, n_features)
        n_samples, n_timesteps, n_features = features.shape
        
        pca = PCA(n_components=40)
        # Apply PCA to each time step
        features = np.array([
            pca.fit_transform(features[:, t, :]) for t in range(n_timesteps)
        ]).transpose(1, 0, 2)  # Reshape back to original structure if needed
        
        idx = np.argwhere(np.isin(targets, classes)).flatten()
        logger.info(f"features shape :{features.shape} type:{type(features)}")
        return SeqDataset(
            features = features[idx],
            targets = targets[idx],
            lengths= lengths[idx],
            classes=classes, path =self._data_file)
        
        
 

In [7]:
try:
    config = ConfigurationManager()
    data_preprocess_config = config.get_data_preprocess_config()
    data_preprocess = DataPreprocessing(config=data_preprocess_config)
    dataset = data_preprocess.get_data()
    dataset.save(compress=True)

except Exception as e:
    raise e

[2024-08-13 16:57:50,637: INFO: help: yaml file: config\config.yaml loaded successfully. Content size: 9]
[2024-08-13 16:57:50,641: INFO: help: Total directories created: 1]
[2024-08-13 16:57:50,644: INFO: help: Total directories created: 1]
[2024-08-13 16:57:51,087: INFO: 2230231788: features shape :(402, 52, 40) type:<class 'numpy.ndarray'>]
[2024-08-13 16:57:51,263: INFO: dataset: A npz file has been saved]


In [8]:
# Check the dataset
print(f"Dataset size: {len(dataset)}")
print(f"Number of classes: {len(dataset._classes)}")

# Split the dataset
train_data, test_data = dataset.split_data(split_size=0.2, shuffle=True, stratify=True)

print(f"Train set size: {len(train_data)}")
print(f"Test set size: {len(test_data)}")

# Iterate through the dataset
for i, (feature, target, length) in enumerate(dataset):
    print(f"Sample {i}:")
    print(f"  Feature shape: {feature.shape}")
    print(f"  Target: {target}")
    print(f"  Length: {length}")
    if i == 2:  # Print only first 3 samples
        break
class_samples = {}
for features, class_label in dataset.iterator():
    if class_label not in class_samples:
        class_samples[class_label] = 0
    class_samples[class_label] += 1

for class_label, count in class_samples.items():
    print(f"Class {class_label}: {count} samples")



Dataset size: 402
Number of classes: 10
Train set size: 321
Test set size: 81
Sample 0:
  Feature shape: (52, 40)
  Target: 1
  Length: 40
Sample 1:
  Feature shape: (52, 40)
  Target: 4
  Length: 40
Sample 2:
  Feature shape: (52, 40)
  Target: 4
  Length: 40
Class 0: 1 samples
Class 1: 1 samples
Class 2: 1 samples
Class 3: 1 samples
Class 4: 1 samples
Class 5: 1 samples
Class 6: 1 samples
Class 7: 1 samples
Class 8: 1 samples
Class 9: 1 samples
