In [1]:
import os
import sys
import numpy as np
import pandas as pd


# Import from src/ and config/
from src.preprocessing import preprocess_aachen_dataset
from config.defaults import Config

# Default project configuration; every preprocessing argument below is read from it.
config = Config()


def _preprocess(classification):
    """Run `preprocess_aachen_dataset` with the settings held in `config`.

    Only the `classification` flag differs between the two runs in this
    script; the data path, test cell count, random state and log-transform
    flag are looked up on `config` each time the helper is called.
    """
    return preprocess_aachen_dataset(
        data_path=config.data_path,
        test_cell_count=config.test_cell_count,
        random_state=config.random_state,
        log_transform=config.log_transform,
        classification=classification,
    )


# Classification (CNN) variant of the preprocessed dataset.
preprocessed_classification = _preprocess(classification=True)

# Regression (LSTM) variant of the preprocessed dataset.
preprocessed_regression = _preprocess(classification=False)

# Classification (CNN) output exploration: print shapes, metadata and one sample.
cnn_data = preprocessed_classification
print("# Classification (CNN) Output Exploration")

# Array shapes. Expected: (n_samples, 120, 1) for the X arrays and
# (n_samples, 7) for the one-hot encoded y_train.
for shape_label, array_key in (
    ("X_train shape:", "X_train"),
    ("X_val shape:", "X_val"),
    ("X_test shape:", "X_test"),
    ("y_train shape:", "y_train"),
):
    print(shape_label, cnn_data[array_key].shape)

# Metadata entries returned alongside the arrays.
for meta_label, meta_key in (
    ("y_max:", "y_max"),                              # maximum RUL for scaling
    ("label_mapping:", "label_mapping"),              # RUL bin mappings
    ("max_sequence_length:", "max_sequence_length"),  # should be 120 for classification
):
    print(meta_label, cnn_data[meta_key])

# First training sequence followed by its one-hot label.
for sample_label, sample_key in (
    ("Sample X_train[0]:\n", "X_train"),
    ("Sample y_train[0]:\n", "y_train"),
):
    print(sample_label, cnn_data[sample_key][0])

# Regression (LSTM) output exploration: print shapes, metadata and one sample.
lstm_data = preprocessed_regression
print("\n# Regression (LSTM) Output Exploration")

# Array shapes. Expected: (n_samples, max_seq_len, 1) for the X arrays and
# (n_samples,) for y_train.
for shape_label, array_key in (
    ("X_train shape:", "X_train"),
    ("X_val shape:", "X_val"),
    ("X_test shape:", "X_test"),
    ("y_train shape:", "y_train"),
):
    print(shape_label, lstm_data[array_key].shape)

# Metadata entries returned alongside the arrays.
for meta_label, meta_key in (
    ("y_max:", "y_max"),                              # maximum RUL for scaling
    ("max_sequence_length:", "max_sequence_length"),  # maximum sequence length
):
    print(meta_label, lstm_data[meta_key])

# Dimensions of the first training sequence, then its normalized RUL target.
first_sequence = lstm_data["X_train"][0]
print("Sample X_train[0] shape:", first_sequence.shape)
first_target = lstm_data["y_train"][0]
print("Sample y_train[0]:", first_target)

# Additional exploration: per-class sample counts in the classification
# training labels. Skipped when the label mapping is empty/falsy.
label_mapping = preprocessed_classification["label_mapping"]
if label_mapping:
    # Reduce each one-hot row to the index of its largest entry (the class id).
    one_hot_labels = preprocessed_classification["y_train"]
    y_train_classes = np.argmax(one_hot_labels, axis=1)

    # Tally how many training samples fall into each class.
    class_counts = pd.Series(y_train_classes).value_counts()
    print("\nClassification Class Distribution (Training):\n", class_counts)

# Additional exploration: summary statistics of the regression training
# targets (normalized RUL), printed as mean, std, min, max in that order.
print("\nRegression RUL Statistics (Training):")
rul_targets = preprocessed_regression["y_train"]
for stat_label, stat_fn in (
    ("Mean RUL (normalized):", np.mean),
    ("Std RUL (normalized):", np.std),
    ("Min RUL (normalized):", np.min),
    ("Max RUL (normalized):", np.max),
):
    print(stat_label, stat_fn(rul_targets))

TypeError: Object of type int64 is not JSON serializable