In [None]:
# Multi-dataset processing for RUL prediction
# Add this after your read_hdf function definition

def check_dataset_compatibility(files_to_check):
    """Summarise the shape and unit statistics of each dataset file.

    Every file is loaded through ``read_hdf`` and released again before the
    next one is opened, so only one dataset is held in memory at a time.

    Parameters
    ----------
    files_to_check : iterable
        File paths handed to ``read_hdf`` one at a time.

    Returns
    -------
    dict
        Maps each file path to a dict with the keys ``n_features``,
        ``features``, ``train_samples``, ``test_samples``,
        ``n_units_train`` and ``n_units_test``.
    """
    summary = {}

    for path in files_to_check:
        print(f"\nChecking {path}...")
        train_x, train_y, aux_train, test_x, test_y, aux_test = read_hdf(filename=path)

        # Build the record key by key, in the same order callers display it.
        record = {}
        record['n_features'] = train_x.shape[1]
        record['features'] = train_x.columns.tolist()
        record['train_samples'] = train_x.shape[0]
        record['test_samples'] = test_x.shape[0]
        record['n_units_train'] = aux_train['unit'].nunique()
        record['n_units_test'] = aux_test['unit'].nunique()
        summary[path] = record

        # Drop the loaded frames before reading the next file.
        del train_x, train_y, aux_train, test_x, test_y, aux_test
        gc.collect()

    return summary

def _slice_units_into_windows(df, window_size, stride):
    """Window every unit of *df*; return (list of per-unit window arrays, flat label list)."""
    # Everything except the bookkeeping columns is a model feature.
    # Index.difference returns the names sorted by default, so train and test
    # frames end up with the same feature order.
    feature_cols = df.columns.difference(['unit', 'RUL'])

    windows_per_unit = []
    labels_flat = []
    for un in df['unit'].unique():
        windows_per_unit.append(
            time_window_slicing_sample(df, window_size, un, feature_cols, stride)
        )
        labels = time_window_slicing_label(df, window_size, un, stride=stride)
        labels_flat.extend(labels.tolist())

    return windows_per_unit, labels_flat


def process_multiple_datasets(file_list, window_size=30, stride=15):
    """
    Process multiple HDF files and combine their windowed data.

    For each file the function reads the data with ``read_hdf``, drops rows
    whose RUL is zero or missing, standardises the features with a
    ``StandardScaler`` fitted on that file's training rows, slices every unit
    into windows and finally stacks the windows of all files.

    Parameters:
    -----------
    file_list : list
        List of file paths to process. Must not be empty.
    window_size : int
        Window size for time series slicing
    stride : int
        Stride for window slicing

    Returns:
    --------
    tuple
        ``(train_data, train_labels, test_data, test_labels)``. The data
        arrays are the per-unit window arrays, each transposed with
        ``(2, 0, 1)`` and concatenated along axis 0; the label arrays are
        one-dimensional ``float32`` arrays.

    Raises:
    -------
    ValueError
        If ``file_list`` is empty.
    """
    if len(file_list) == 0:
        # Fail early with a clear message instead of letting np.concatenate
        # complain about an empty sequence after the loop.
        raise ValueError("file_list must contain at least one file to process")

    all_train_windows = []
    all_train_labels = []
    all_test_windows = []
    all_test_labels = []

    # Offsets added to the unit IDs of each file. Each file is windowed on its
    # own frame, so the IDs are only used to select rows within that frame.
    unit_offset_train = 0
    unit_offset_test = 0

    for file_idx, curr_file in enumerate(file_list):
        print(f"\n{'='*60}")
        print(f"Processing file {file_idx+1}/{len(file_list)}: {curr_file}")
        print(f"{'='*60}")

        # Read data
        trainx, trainy, adev, testx, testy, atest = read_hdf(filename=curr_file)

        # Filter zero RUL values; the surviving index labels identify the
        # rows that are kept in the feature and auxiliary frames.
        trainy = trainy[trainy != 0].dropna()
        trainx_l = trainx.loc[trainy.index]
        testy = testy[testy != 0].dropna()
        testx_l = testx.loc[testy.index]

        # Scale data (scaler statistics come from this file's training rows)
        sc = StandardScaler()
        sc.fit(trainx_l.values)
        trainx_scaled = pd.DataFrame(
            sc.transform(trainx_l.values).astype(np.float32),
            columns=trainx.columns,
        )
        testx_scaled = pd.DataFrame(
            sc.transform(testx_l.values).astype(np.float32),
            columns=trainx.columns,
        )

        # Process training data.
        # The scaled frame carries a fresh 0..n-1 index, so the unit IDs must
        # be looked up with the filtered label index (trainy.index); using the
        # scaled frame's own index would select the first n rows of `adev`
        # and misalign units with features once any row has been dropped.
        df_train = trainx_scaled.copy()
        df_train['unit'] = adev.loc[trainy.index, 'unit'].values + unit_offset_train
        df_train['RUL'] = trainy['RUL'].values

        train_windows, train_labels = _slice_units_into_windows(
            df_train, window_size, stride
        )
        all_train_windows.extend(train_windows)
        all_train_labels.extend(train_labels)

        # Process test data (same alignment rule as for the training rows)
        df_test = testx_scaled.copy()
        df_test['unit'] = atest.loc[testy.index, 'unit'].values + unit_offset_test
        df_test['RUL'] = testy['RUL'].values

        test_windows, test_labels = _slice_units_into_windows(
            df_test, window_size, stride
        )
        all_test_windows.extend(test_windows)
        all_test_labels.extend(test_labels)

        # Update unit offsets
        n_train_units = df_train['unit'].nunique()
        n_test_units = df_test['unit'].nunique()
        unit_offset_train += n_train_units
        unit_offset_test += n_test_units

        print(f"✓ Processed {n_train_units} training units")
        print(f"✓ Processed {n_test_units} test units")

        # Clean up before loading the next file
        del trainx, trainy, adev, testx, testy, atest
        del trainx_l, testx_l, trainx_scaled, testx_scaled, df_train, df_test
        gc.collect()

    # Combine all data
    print(f"\n{'='*60}")
    print("Combining all datasets...")
    print(f"{'='*60}")

    # Move each unit's last axis to the front, then stack along that axis.
    combined_train_data = np.concatenate([w.transpose(2, 0, 1) for w in all_train_windows], axis=0)
    combined_train_labels = np.array(all_train_labels, dtype=np.float32)

    combined_test_data = np.concatenate([w.transpose(2, 0, 1) for w in all_test_windows], axis=0)
    combined_test_labels = np.array(all_test_labels, dtype=np.float32)

    print(f"Combined training data shape: {combined_train_data.shape}")
    print(f"Combined training labels shape: {combined_train_labels.shape}")
    print(f"Combined test data shape: {combined_test_data.shape}")
    print(f"Combined test labels shape: {combined_test_labels.shape}")

    return combined_train_data, combined_train_labels, combined_test_data, combined_test_labels


# ============================================================================
# USAGE EXAMPLE
# ============================================================================

# Step 1: Check compatibility
# Pick the files once so that the compatibility check and the processing step
# always run on the same selection (example: first 3 files; adjust as needed).
files_to_process = all_files[:3]

print("Step 1: Checking dataset compatibility...")
compatibility = check_dataset_compatibility(files_to_process)

# Display compatibility info
for file, info in compatibility.items():
    print(f"\n{file}:")
    print(f"  Features: {info['n_features']}")
    print(f"  Training samples: {info['train_samples']}")
    print(f"  Test samples: {info['test_samples']}")
    print(f"  Training units: {info['n_units_train']}")
    print(f"  Test units: {info['n_units_test']}")

# Datasets can only be stacked when they expose the same feature columns.
# Comparing the count alone would accept files with equally many but
# differently named columns, so the column names are compared as well.
n_features = [info['n_features'] for info in compatibility.values()]
feature_name_sets = {frozenset(info['features']) for info in compatibility.values()}

if len(set(n_features)) != 1:
    print(f"\n✗ Datasets have different feature counts: {set(n_features)} - NOT COMPATIBLE")
    can_combine = False
elif len(feature_name_sets) != 1:
    print("\n✗ Datasets have the same feature count but different feature names - NOT COMPATIBLE")
    can_combine = False
else:
    print(f"\n✓ All datasets have {n_features[0]} features - COMPATIBLE")
    can_combine = True

# Step 2: Process multiple datasets (only if compatible)
if can_combine:
    print(f"\nStep 2: Processing {len(files_to_process)} datasets...")
    all_units_data, all_labels, test_units_data, test_labels = process_multiple_datasets(
        files_to_process,
        window_size=WINDOW_SIZE,
        stride=STRIDE
    )

    # Step 3: Create train/val split
    # NOTE(review): this shuffles individual windows. If STRIDE < WINDOW_SIZE
    # the windows presumably overlap, so neighbouring windows of one unit can
    # land in both train and validation - consider a unit-wise split; confirm
    # against the windowing helpers.
    from sklearn.model_selection import train_test_split
    xtrain, xval, ytrain, yval = train_test_split(
        all_units_data, all_labels,
        test_size=0.2,
        random_state=42
    )

    # Labels become column vectors of shape (n, 1).
    ytrain = ytrain.reshape([-1, 1])
    yval = yval.reshape([-1, 1])
    test_labels = test_labels.reshape([-1, 1])

    print("\nFinal shapes:")
    print(f"  Train: {xtrain.shape}")
    print(f"  Val: {xval.shape}")
    print(f"  Test: {test_units_data.shape}")

    # Continue with your existing tf.data pipeline and training...
else:
    print("\n⚠ Datasets are not compatible for combining.")
    print("Consider training separate models for each dataset.")