In [5]:
import pandas as pd
import numpy as np
from scipy import stats

def compare_distributions(file1, file2, alpha=0.05):
    """
    Compare distributions between two CSV files using Kolmogorov-Smirnov test

    Columns are paired by position (first with first, and so on); header
    names are not compared. Each column is coerced to numeric, values that
    cannot be parsed are discarded, and a two-sample KS test is run on the
    remaining values.

    Parameters:
    file1 (str): Path to the first CSV file (actual measurements)
    file2 (str): Path to the second CSV file (ideal simulation)
    alpha (float): Significance level; a column is flagged as significantly
        different when its p-value is strictly below this value.
        Defaults to 0.05.

    Returns:
    dict: Kolmogorov-Smirnov test results for each column, keyed by the
        zero-based column position. Each value is a dict with
        'statistic' (float, rounded to 4 decimals),
        'p_value' (float, rounded to 4 decimals) and
        'significant_diff' (bool).

    Raises:
    ValueError: If the files have a different number of columns, or if a
        column contains no numeric values in either file.
    """
    # Read CSV files
    df1 = pd.read_csv(file1)
    df2 = pd.read_csv(file2)

    # Ensure the files have the same number of columns
    if df1.shape[1] != df2.shape[1]:
        raise ValueError("The two CSV files must have the same number of columns")

    # Dictionary to store test results
    ks_results = {}

    # Perform Kolmogorov-Smirnov test for each column
    for column in range(df1.shape[1]):
        # Extract column data, converting to numeric and dropping NaNs
        data1 = pd.to_numeric(df1.iloc[:, column], errors='coerce').dropna()
        data2 = pd.to_numeric(df2.iloc[:, column], errors='coerce').dropna()

        # A column with nothing numeric left cannot be tested. Fail loudly
        # with the column position instead of passing an empty sample on to
        # SciPy, whose handling of empty input is not uniform across versions.
        if data1.empty or data2.empty:
            raise ValueError(
                f"Column {column} has no numeric values in at least one file; "
                "the Kolmogorov-Smirnov test needs non-empty samples"
            )

        # Perform Kolmogorov-Smirnov test
        statistic, p_value = stats.ks_2samp(data1, data2)

        # Store results as built-in float/bool so the dict is plain Python.
        # The significance decision uses the unrounded p-value.
        ks_results[column] = {
            'statistic': round(float(statistic), 4),
            'p_value': round(float(p_value), 4),
            'significant_diff': bool(p_value < alpha),
        }

    return ks_results

# Run the KS comparison on the measured data versus the simulated data
results = compare_distributions('data2.csv', 'data3.csv')

# Per-column report: statistic, p-value and the significance verdict
print("Kolmogorov-Smirnov Test Results:")
for column, result in results.items():
    verdict = 'Yes' if result['significant_diff'] else 'No'
    print(f"\n- **Column {column}**:")
    print(f"  - KS Statistic: {result['statistic']}")
    print(f"  - P-value: {result['p_value']}")
    print(f"  - Significant Difference: {verdict}")

# Summarise: collect the columns whose test was flagged as significant
significant_columns = []
for key, entry in results.items():
    if entry['significant_diff']:
        significant_columns.append(key)

print("\nConclusion:")
if significant_columns:
    flagged = ', '.join(str(key) for key in significant_columns)
    print(f"Columns {flagged} show significant distribution differences. Other columns do not have significant differences.")
else:
    print("All columns have p-values greater than 0.05, so we cannot reject the null hypothesis. This indicates no significant distribution differences between actual measurements and ideal simulations across the columns.")

Kolmogorov-Smirnov Test Results:

- **Column 0**:
  - KS Statistic: 0.1313
  - P-value: 0.3621
  - Significant Difference: No

- **Column 1**:
  - KS Statistic: 0.1212
  - P-value: 0.463
  - Significant Difference: No

- **Column 2**:
  - KS Statistic: 0.1212
  - P-value: 0.463
  - Significant Difference: No

- **Column 3**:
  - KS Statistic: 0.1313
  - P-value: 0.3621
  - Significant Difference: No

Conclusion:
All columns have p-values greater than 0.05, so we cannot reject the null hypothesis. This indicates no significant distribution differences between actual measurements and ideal simulations across the columns.
