In [1]:
import os

# Move from the notebook's folder up to the project root so the `src` package
# imported by later cells can be found. The move is guarded on the presence of
# the `src` directory: an unconditional relative chdir climbs one more level
# every time this cell is re-run, which silently breaks the cells below.
if not os.path.isdir("src"):
    os.chdir("../")

In [2]:
# Display the current working directory to confirm where the kernel is running.
%pwd

'e:\\MyOnlineCourses\\ML_Projects\\arabic-digits-recognition'

In [3]:
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class DataPreprocessConfig:
    """Immutable bundle of settings for the data-preprocessing stage.

    Attributes:
        root_dir: Directory associated with this stage.
        source_path: Location of the input file that gets loaded.
        data_file: Path forwarded to the dataset object that is built.
    """

    root_dir: Path
    source_path: str
    data_file: Path

In [4]:
from src.ard.constants import *
from src.ard.utils.help import read_yaml, create_directories

In [5]:
class ConfigurationManager:
    """Reads the project YAML configuration and builds per-stage config objects."""

    def __init__(self, config_filepath=CONFIG_FILE_PATH):
        """Load the YAML file and create the artifacts root directory.

        Args:
            config_filepath: Path of the YAML configuration file; defaults to
                ``CONFIG_FILE_PATH``.
        """
        self.config = read_yaml(config_filepath)

        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_data_preprocess_config(self) -> DataPreprocessConfig:
        """Build the configuration object for the data-preprocessing stage.

        Creates the stage's ``root_dir`` as a side effect.

        Returns:
            A ``DataPreprocessConfig`` populated from the
            ``data_preprocessing`` section of the loaded configuration.
        """
        section = self.config.data_preprocessing

        create_directories([section.root_dir])

        return DataPreprocessConfig(
            root_dir=section.root_dir,
            source_path=section.source_path,
            data_file=section.data_file,
        )

In [31]:
import numpy as np
from src.ard.utils.dataset import SeqDataset
from sklearn.decomposition import PCA
from operator import itemgetter
class DataPreprocessing:
    """Builds a PCA-reduced ``SeqDataset`` from the configured source file."""

    def __init__(self, config: DataPreprocessConfig):
        """Keep the stage configuration and cache the two paths it provides.

        Args:
            config: Supplies ``source_path`` (file to load) and ``data_file``
                (path forwarded to the resulting ``SeqDataset``).
        """
        self.config = config
        self._source_path = self.config.source_path
        self._data_file = self.config.data_file

    def get_data(self, n_components: int = 13):
        """Load the stored arrays, reduce the features with PCA and wrap them.

        Args:
            n_components: Number of principal components to keep. Defaults to
                13, the value this method has always used.

        Returns:
            A ``SeqDataset`` built from the PCA-projected features of the
            sequences whose target is listed in ``classes``, together with the
            matching targets and lengths.
        """
        # NOTE(security): allow_pickle=True lets the file execute arbitrary code
        # while being unpickled -- only load sources you trust.
        data = np.load(self._source_path, allow_pickle=True)
        try:
            # Fetch arrays from loaded file
            features, targets, lengths, classes = itemgetter(
                'features', 'targets', 'lengths', 'classes'
            )(data)
        finally:
            # An .npz archive keeps its file handle open until it is closed.
            # np.load can also return objects without close(), hence the guard.
            close = getattr(data, 'close', None)
            if callable(close):
                close()

        pca = PCA(n_components=n_components)
        pca.fit(features)
        features = pca.transform(features)

        # Positions of the sequences whose target is one of the known classes.
        idx = np.argwhere(np.isin(targets, classes)).flatten()
        # presumably one index range per sequence into `features` -- TODO confirm
        ranges = SeqDataset._get_idxs(lengths)[idx]

        # Stack the selected per-sequence blocks directly. Going through an
        # object-dtype array first yields an object-dtype result whenever all
        # sequences happen to share the same length.
        selected = list(SeqDataset._iter_X(features, ranges))

        return SeqDataset(
            features=np.vstack(selected),
            targets=targets[idx],
            lengths=lengths[idx],
            classes=classes,
            path=self._data_file,
        )
        
        
 

In [32]:
# Run the preprocessing stage end to end: read the configuration, build the
# dataset and save it. Any error is left to propagate unchanged so the notebook
# shows the original traceback (a wrapper that only re-raises adds nothing).
config = ConfigurationManager()
data_preprocess_config = config.get_data_preprocess_config()
data_preprocess = DataPreprocessing(config=data_preprocess_config)
dataset = data_preprocess.get_data()
dataset.save(compress=True)

[2024-08-07 13:01:08,550: INFO: help: yaml file: config\config.yaml loaded successfully. Content size: 3]
[2024-08-07 13:01:08,553: INFO: help: Total directories created: 1]
[2024-08-07 13:01:08,560: INFO: help: Total directories created: 1]
(126930, 13)
(402,)
[2024-08-07 13:01:15,848: INFO: dataset: A npz file has been saved]


In [33]:
# Report the overall size of the dataset and how many classes it holds.
print(f"Dataset size: {len(dataset)}")
print(f"Number of classes: {len(dataset._classes)}")

# Produce the train/test partitions (shuffled and stratified).
train_data, test_data = dataset.split_data(split_size=0.2, shuffle=True, stratify=True)

print(f"Train set size: {len(train_data)}")
print(f"Test set size: {len(test_data)}")

# Show the first three items yielded when iterating over the dataset.
for i, (feature, target, length) in enumerate(dataset):
    print(f"Sample {i}:")
    print(f"  Feature shape: {feature.shape}")
    print(f"  Target: {target}")
    print(f"  Length: {length}")
    if i >= 2:  # stop right after the third sample
        break

# Count how many items dataset.iterator() yields for each class label.
class_samples = {}
for features, class_label in dataset.iterator():
    class_samples[class_label] = class_samples.get(class_label, 0) + 1

for class_label, count in class_samples.items():
    print(f"Class {class_label}: {count} samples")



Dataset size: 402
Number of classes: 10
Train set size: 321
Test set size: 81
Sample 0:
  Feature shape: (13,)
  Target: 1
  Length: 274
Sample 1:
  Feature shape: (13,)
  Target: 0
  Length: 220
Sample 2:
  Feature shape: (13,)
  Target: 7
  Length: 239
Class 1: 51 samples
Class 0: 48 samples
Class 7: 40 samples
Class 3: 38 samples
Class 8: 31 samples
Class 4: 40 samples
Class 9: 43 samples
Class 6: 41 samples
Class 5: 32 samples
Class 2: 38 samples


In [34]:
# Unpack the three values returned by the test split's _get_data().
data, targets, lengths = test_data._get_data()