In [1]:
import pandas as pd
import glob
import os
from pathlib import Path
import numpy as np

def normalize_datasets():
    """
    Load all CSV files from EM_data/, find global absolute maxima across all datasets,
    normalize each dataset by global absolute maxima, and save to EM_data_norm/
    """
    
    # Get all CSV files in EM_data folder
    input_folder = 'test'
    csv_pattern = os.path.join(input_folder, '*.csv')
    csv_files = glob.glob(csv_pattern)
    
    if not csv_files:
        print(f"No CSV files found in {input_folder}/ folder!")
        return
    
    print(f"Found {len(csv_files)} CSV files:")
    for file in csv_files:
        print(f"  - {os.path.basename(file)}")
    
    # Step 1: Load all datasets and concatenate to find global absolute maxima
    print("\nStep 1: Loading all datasets to find global absolute maxima...")
    all_data = []
    total_rows = 0
    
    for file in csv_files:
        try:
            df = pd.read_csv(file)
            all_data.append(df)
            total_rows += len(df)
            print(f"  Loaded {os.path.basename(file)}: {len(df)} rows, {len(df.columns)} columns")
        except Exception as e:
            print(f"  Error loading {file}: {e}")
            continue
    
    if not all_data:
        print("No datasets loaded successfully!")
        return
    
    # Concatenate all data to find global absolute maxima
    print(f"\nCombining {len(all_data)} datasets ({total_rows} total rows)...")
    combined_data = pd.concat(all_data, ignore_index=True)
    
    # Find maximum of absolute values for each column
    global_abs_max = combined_data.abs().max()
    
    print(f"\nGlobal absolute maxima across all datasets:")
    print("=" * 60)
    for col, max_val in global_abs_max.items():
        min_val = combined_data[col].min()
        max_val_orig = combined_data[col].max()
        print(f"{col:15}: max(abs) = {max_val:12.6f} | range: [{min_val:8.3f}, {max_val_orig:8.3f}]")
    print("=" * 60)
    
    # Step 2: Create output folder
    output_folder = 'test_norm'
    Path(output_folder).mkdir(exist_ok=True)
    print(f"\nCreated/verified output folder: {output_folder}/")
    
    # Step 3: Normalize each dataset and save
    print(f"\nStep 2: Normalizing and saving datasets...")
    
    successful_files = 0
    for file in csv_files:
        try:
            # Load dataset
            df = pd.read_csv(file)
            
            # Normalize by global absolute maxima (element-wise division)
            df_norm = df / global_abs_max
            
            # Verify normalization (all values should be between -1 and 1)
            assert (df_norm >= -1.001).all().all(), f"Values < -1 found in {file}"  # Small tolerance for floating point
            assert (df_norm <= 1.001).all().all(), f"Values > 1 found in {file}"   # Small tolerance for floating point
            
            # Create output filename
            filename = os.path.basename(file)
            name_without_ext = os.path.splitext(filename)[0]
            output_filename = f"{name_without_ext}_norm.csv"
            output_path = os.path.join(output_folder, output_filename)
            
            # Save normalized dataset
            df_norm.to_csv(output_path, index=False)
            
            # Print statistics
            min_val = df_norm.min().min()
            max_val = df_norm.max().max()
            print(f"  {filename:25} -> {output_filename:30} | Shape: {df_norm.shape} | Range: [{min_val:7.4f}, {max_val:7.4f}]")
            
            successful_files += 1
            
        except Exception as e:
            print(f"  ❌ Error processing {file}: {e}")
            continue
    
    print(f"\nNormalization complete!")
    print(f"✅ Successfully processed: {successful_files}/{len(csv_files)} files")
    print(f"📁 Normalized datasets saved in {output_folder}/")
    
    # Step 4: Verification summary
    print(f"\nSummary:")
    print(f"- Input folder: {input_folder}/")
    print(f"- Output folder: {output_folder}/")
    print(f"- Files processed: {successful_files}/{len(csv_files)}")
    print(f"- Total rows processed: {total_rows:,}")
    print(f"- Columns: {list(global_abs_max.index)}")
    print(f"- Normalization: col_norm = col / max(abs(col_all_datasets))")
    
    return global_abs_max

def verify_normalization():
    """
    Verify that normalization was done correctly by checking a few samples
    """
    print("\n" + "="*70)
    print("VERIFICATION: Checking normalization correctness")
    print("="*70)
    
    # Get normalized files
    norm_files = glob.glob('test_norm/*_norm.csv')
    
    if not norm_files:
        print("No normalized files found!")
        return
    
    print(f"Found {len(norm_files)} normalized files")
    
    # Check all files for range compliance
    all_in_range = True
    for norm_file in norm_files[:3]:  # Check first 3 files as samples
        df_norm = pd.read_csv(norm_file)
        min_val = df_norm.min().min()
        max_val = df_norm.max().max()
        
        in_range = (-1.001 <= min_val <= 1.001) and (-1.001 <= max_val <= 1.001)
        status = "✅" if in_range else "❌"
        
        print(f"{status} {os.path.basename(norm_file):30} | Range: [{min_val:7.4f}, {max_val:7.4f}]")
        
        if not in_range:
            all_in_range = False
    
    if len(norm_files) > 3:
        print(f"... and {len(norm_files) - 3} more files")
    
    # Detailed check on first file
    if norm_files:
        sample_file = norm_files[0]
        df_norm = pd.read_csv(sample_file)
        
        print(f"\nDetailed statistics for: {os.path.basename(sample_file)}")
        print(f"Shape: {df_norm.shape}")
        print(f"\nColumn ranges (should be within [-1, 1]):")
        for col in df_norm.columns:
            min_val = df_norm[col].min()
            max_val = df_norm[col].max()
            status = "✅" if (-1.001 <= min_val <= 1.001) and (-1.001 <= max_val <= 1.001) else "❌"
            print(f"  {col:15}: [{min_val:7.4f}, {max_val:7.4f}] {status}")
    
    return all_in_range

if __name__ == "__main__":
    print("🚀 Starting dataset normalization process...")
    print("Normalization formula: col_norm = col / max(abs(col_all_datasets))")
    print("Expected range: [-1, 1]\n")
    
    # Run normalization
    global_maxima = normalize_datasets()
    
    if global_maxima is not None:
        # Run verification
        verification_passed = verify_normalization()
        
        if verification_passed:
            print(f"\n🎉 Process completed successfully!")
            print(f"All normalized values are within [-1, 1] range.")
        else:
            print(f"\n⚠️  Process completed with warnings!")
            print(f"Some values may be outside expected range.")
    else:
        print(f"\n❌ Process failed!")

🚀 Starting dataset normalization process...
Normalization formula: col_norm = col / max(abs(col_all_datasets))
Expected range: [-1, 1]

Found 19 CSV files:
  - motor_simulation_Trj_5_e4.csv
  - motor_simulation_Trj_1_e0.csv
  - motor_simulation_Trj_1_e1.csv
  - motor_simulation_Trj_3_e0.csv
  - motor_simulation_Trj_3_e1.csv
  - motor_simulation_Trj_2_e1.csv
  - motor_simulation_Trj_4_e1.csv
  - motor_simulation_Trj_2_e0.csv
  - motor_simulation_Trj_4_e2.csv
  - motor_simulation_Trj_2_e2.csv
  - motor_simulation_Trj_1_e3.csv
  - motor_simulation_Trj_5_e2.csv
  - motor_simulation_Trj_5_e0.csv
  - motor_simulation_Trj_1_e2.csv
  - motor_simulation_Trj_5_e1.csv
  - motor_simulation_Trj_4_e0.csv
  - motor_simulation_Trj_5_e3.csv
  - motor_simulation_Trj_4_e3.csv
  - motor_simulation_Trj_3_e2.csv

Step 1: Loading all datasets to find global absolute maxima...
  Loaded motor_simulation_Trj_5_e4.csv: 10001 rows, 10 columns
  Loaded motor_simulation_Trj_1_e0.csv: 18001 rows, 10 columns
  Loaded