In [None]:
import pandas as pd
import numpy as np
from scipy import stats
import os
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns

def analyze_temporal_coverage(obs_data, model_data, total_months=12):
    """Compute coverage and skill metrics for one station's monthly rainfall.

    The two tables are inner-joined on ``Month``, so only months present in
    both inputs are compared. Coverage is counted from the observation column
    of the joined table; the skill metrics are computed only on months where
    BOTH the observation and the model value are present, so every metric is
    based on exactly the same set of pairs.

    Args:
        obs_data: DataFrame with at least ``Month`` and ``Rainfall`` columns
            holding the observations.
        model_data: DataFrame with at least ``Month`` and ``Rainfall`` columns
            holding the model values.
        total_months: Number of months a fully covered record would contain.
            Defaults to 12 (one value per calendar month).

    Returns:
        dict with the keys ``Coverage_Ratio``, ``Valid_Months``,
        ``Missing_Months``, ``r``, ``RMSE``, ``MAE``, ``PBIAS`` and ``NSE``.
        A metric that cannot be computed (no valid pairs, fewer than two
        pairs or constant input for ``r``, zero denominator for ``PBIAS`` /
        ``NSE``) is ``numpy.nan`` instead of raising or returning ``inf``.
    """
    merged_data = pd.merge(obs_data, model_data,
                           on=['Month'],
                           suffixes=('_obs', '_model'))

    obs_present = merged_data['Rainfall_obs'].notna()
    valid_months = int(obs_present.sum())

    # Keys are inserted in a fixed order so result tables keep a stable
    # column layout regardless of which metrics could be computed.
    metrics = {
        'Coverage_Ratio': valid_months / total_months,
        'Valid_Months': valid_months,
        'Missing_Months': total_months - valid_months,
        'r': np.nan,
        'RMSE': np.nan,
        'MAE': np.nan,
        'PBIAS': np.nan,
        'NSE': np.nan,
    }

    # A pair is usable only when both sides are present; otherwise the
    # different reductions would silently operate on different subsets.
    paired = obs_present & merged_data['Rainfall_model'].notna()
    if not paired.any():
        return metrics

    obs_values = merged_data.loc[paired, 'Rainfall_obs'].to_numpy(dtype=float)
    model_values = merged_data.loc[paired, 'Rainfall_model'].to_numpy(dtype=float)
    errors = model_values - obs_values

    metrics['RMSE'] = float(np.sqrt(np.mean(errors ** 2)))
    metrics['MAE'] = float(np.mean(np.abs(errors)))

    # pearsonr raises for fewer than two points and is undefined for a
    # constant series, so both cases are reported as NaN.
    if (len(obs_values) >= 2
            and np.ptp(obs_values) > 0
            and np.ptp(model_values) > 0):
        metrics['r'] = float(stats.pearsonr(obs_values, model_values)[0])

    obs_total = obs_values.sum()
    if obs_total != 0:
        metrics['PBIAS'] = float(100.0 * errors.sum() / obs_total)

    obs_spread = np.sum((obs_values - obs_values.mean()) ** 2)
    if obs_spread > 0:
        metrics['NSE'] = float(1.0 - np.sum(errors ** 2) / obs_spread)

    return metrics

def _first_value_or(frame, column, default):
    """Return the first entry of ``frame[column]``, or ``default`` if the column is absent."""
    return frame[column].iloc[0] if column in frame.columns else default


def process_ensemble_validation(obs_dir, ensemble_models_dir, output_dir):
    """Validate the 6-model ensemble against station observations.

    Every ``Station_*.xlsx`` workbook found in
    ``<ensemble_models_dir>/Ensemble_6Models`` is paired with
    ``<obs_dir>/Station_<id>_Daily_Rainfall.xlsx``. Both workbooks are read
    from their ``Monthly_Averages`` sheet, per-station metrics are computed
    with ``analyze_temporal_coverage``, and the combined results are written
    to ``ensemble_6models_validation_results.xlsx`` in ``output_dir`` before
    the validation plots are generated.

    Args:
        obs_dir: Directory holding the observation workbooks.
        ensemble_models_dir: Directory containing the ``Ensemble_6Models``
            sub-directory with the ensemble workbooks.
        output_dir: Directory for the result workbook and plots; created if
            it does not exist.

    Returns:
        ``(validation_df, coverage_df)`` on success, where ``validation_df``
        has one row per processed station and ``coverage_df`` one row per
        station-month. ``(None, None)`` if the ensemble directory is missing
        or no station could be processed.
    """
    import traceback  # only needed when reporting per-station failures

    print("="*80)
    print("6-MODEL ENSEMBLE VALIDATION ANALYSIS")
    print("="*80)
    print(f"Observations directory: {obs_dir}")
    print(f"Ensemble models directory: {ensemble_models_dir}")
    print(f"Output directory: {output_dir}")
    print("-"*80)

    Path(output_dir).mkdir(parents=True, exist_ok=True)
    validation_results = []
    coverage_patterns = []
    error_stations = []

    # Look for the Ensemble_6Models subdirectory
    ensemble_path = os.path.join(ensemble_models_dir, 'Ensemble_6Models')
    if not os.path.isdir(ensemble_path):
        print(f"ERROR: Ensemble_6Models directory not found at {ensemble_path}")
        return None, None

    print(f"Processing ensemble model: Ensemble_6Models")
    print(f"Ensemble path: {ensemble_path}")

    # Keep only real station workbooks so the reported totals and success
    # rate are not skewed by unrelated files, and sort them because
    # os.listdir returns entries in arbitrary order.
    ensemble_files = sorted(
        name for name in os.listdir(ensemble_path)
        if name.startswith('Station_') and name.endswith('.xlsx')
    )
    total_stations = len(ensemble_files)
    processed_stations = 0

    print(f"Found {total_stations} ensemble station files")

    for position, station_file in enumerate(ensemble_files, start=1):
        # Filename layout: Station_{STATION_ID}_Daily_Rainfall.xlsx. The
        # 'Station_' prefix check above guarantees a second component.
        station_id = station_file.split('_')[1]

        print(f"\nProcessing station {position}/{total_stations}: {station_id}")

        # Check for the observation workbook first to avoid reading the
        # model workbook for a station that cannot be validated anyway.
        obs_file = os.path.join(obs_dir, f'Station_{station_id}_Daily_Rainfall.xlsx')
        if not os.path.exists(obs_file):
            print(f"  Warning: No observation file found for station {station_id}")
            error_stations.append({'Station_ID': station_id, 'Error': 'No observation file'})
            continue

        # A failure in one station must not abort the whole run, so any
        # error is logged, recorded in the error list, and skipped.
        try:
            ensemble_file_path = os.path.join(ensemble_path, station_file)
            model_data = pd.read_excel(ensemble_file_path, sheet_name='Monthly_Averages')
            obs_data = pd.read_excel(obs_file, sheet_name='Monthly_Averages')

            print(f"  Loaded data - Ensemble: {len(model_data)} months, Observations: {len(obs_data)} months")

            metrics = analyze_temporal_coverage(obs_data, model_data)

            print(f"  Metrics calculated - Coverage: {metrics['Coverage_Ratio']:.2f}, r: {metrics['r']:.3f}, NSE: {metrics['NSE']:.3f}")

            # Station metadata is taken from the model workbook when present.
            result = {
                'Model': 'Ensemble_6Models',
                'Station_ID': station_id,
                'Station_Name': _first_value_or(model_data, 'Station_Name', f'Station_{station_id}'),
                'Latitude': _first_value_or(model_data, 'Latitude', np.nan),
                'Longitude': _first_value_or(model_data, 'Longitude', np.nan),
                **metrics
            }

            # Month-by-month availability, used for the coverage heatmap.
            merged_coverage = pd.merge(obs_data[['Month', 'Rainfall']],
                                       model_data[['Month', 'Rainfall']],
                                       on='Month', suffixes=('_obs', '_model'))

            coverage_pattern = pd.DataFrame({
                'Model': 'Ensemble_6Models',
                'Station_ID': station_id,
                'Month': merged_coverage['Month'],
                'Is_Valid': ~np.isnan(merged_coverage['Rainfall_obs']),
                'Has_Model_Data': ~np.isnan(merged_coverage['Rainfall_model']),
                'Obs_Rainfall': merged_coverage['Rainfall_obs'],
                'Model_Rainfall': merged_coverage['Rainfall_model']
            })
        except Exception as e:
            print(f"  ✗ Error processing station {station_id}: {str(e)}")
            error_stations.append({'Station_ID': station_id, 'Error': str(e)})
            print(f"  Error details: {traceback.format_exc()}")
            continue

        # Results are appended only after everything succeeded, so the two
        # lists always describe the same set of stations.
        validation_results.append(result)
        coverage_patterns.append(coverage_pattern)
        processed_stations += 1
        print(f"  ✓ Successfully processed station {station_id}")

    if not validation_results:
        print("ERROR: No stations were successfully processed!")
        return None, None

    # Combine results
    validation_df = pd.DataFrame(validation_results)
    coverage_df = pd.concat(coverage_patterns, ignore_index=True)
    error_df = pd.DataFrame(error_stations)

    print(f"\n{'='*60}")
    print("VALIDATION SUMMARY")
    print(f"{'='*60}")
    print(f"Total stations found: {total_stations}")
    print(f"Successfully processed: {processed_stations}")
    print(f"Errors/missing data: {len(error_stations)}")
    # total_stations is at least 1 here because a station was processed.
    print(f"Success rate: {processed_stations/total_stations*100:.1f}%")

    print(f"\nOverall Performance Statistics:")
    print(f"Average Correlation (r): {validation_df['r'].mean():.3f} ± {validation_df['r'].std():.3f}")
    print(f"Average NSE: {validation_df['NSE'].mean():.3f} ± {validation_df['NSE'].std():.3f}")
    print(f"Average RMSE: {validation_df['RMSE'].mean():.1f} ± {validation_df['RMSE'].std():.1f}")
    print(f"Average Coverage: {validation_df['Coverage_Ratio'].mean():.2f} ± {validation_df['Coverage_Ratio'].std():.2f}")

    # Save results
    output_file = os.path.join(output_dir, 'ensemble_6models_validation_results.xlsx')
    print(f"\nSaving results to: {output_file}")

    with pd.ExcelWriter(output_file, engine='openpyxl') as writer:
        validation_df.to_excel(writer, sheet_name='Validation_Metrics', index=False)
        coverage_df.to_excel(writer, sheet_name='Coverage_Patterns', index=False)
        if not error_df.empty:
            error_df.to_excel(writer, sheet_name='Processing_Errors', index=False)

        summary_stats = validation_df[['r', 'RMSE', 'MAE', 'PBIAS', 'NSE', 'Coverage_Ratio']].describe()
        summary_stats.to_excel(writer, sheet_name='Summary_Statistics')

    # Create validation plots
    create_ensemble_validation_plots(validation_df, coverage_df, output_dir)

    print(f"✓ Validation analysis complete!")

    return validation_df, coverage_df

def _save_and_close(fig, output_dir, filename):
    """Tighten the layout, write ``fig`` to ``output_dir/filename`` and release it."""
    fig.tight_layout()
    fig.savefig(os.path.join(output_dir, filename), dpi=300, bbox_inches='tight')
    plt.close(fig)


def _month_label(month):
    """Return a three-letter month name for integral values 1-12, else ``str(month)``."""
    names = ('Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
             'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec')
    try:
        number = int(month)
    except (TypeError, ValueError):
        return str(month)
    if number == month and 1 <= number <= 12:
        return names[number - 1]
    return str(month)


def _threshold_colors(values, low, high):
    """Map each value to red (< low), orange (< high), green, or light gray for NaN."""
    colors = []
    for value in values:
        if pd.isna(value):
            colors.append('lightgray')
        elif value < low:
            colors.append('red')
        elif value < high:
            colors.append('orange')
        else:
            colors.append('green')
    return colors


def _plot_metric_distributions(validation_df, output_dir):
    """Draw one histogram per validation metric on a 2x3 grid."""
    fig, axes = plt.subplots(2, 3, figsize=(18, 12))
    metrics = ['r', 'RMSE', 'MAE', 'NSE', 'PBIAS', 'Coverage_Ratio']
    colors = ['skyblue', 'lightcoral', 'lightgreen', 'gold', 'violet', 'orange']

    for ax, metric, color in zip(axes.flat, metrics, colors):
        # Histogram binning cannot handle NaN or infinite values.
        valid_data = validation_df[metric].replace([np.inf, -np.inf], np.nan).dropna()
        if len(valid_data) > 0:
            ax.hist(valid_data, bins=15, alpha=0.7, color=color, edgecolor='black')
            ax.set_title(f'{metric} Distribution\n(n={len(valid_data)}, mean={valid_data.mean():.3f})',
                         fontsize=12, pad=10)
            ax.set_xlabel(metric)
            ax.set_ylabel('Frequency')
            ax.grid(True, alpha=0.3)
        else:
            ax.text(0.5, 0.5, 'No data', ha='center', va='center', transform=ax.transAxes)
            ax.set_title(f'{metric} - No Data Available')

    fig.suptitle('6-Model Ensemble Performance Metrics Distribution', fontsize=16, y=0.98)
    _save_and_close(fig, output_dir, 'ensemble_performance_distribution.png')


def _plot_metric_relationships(validation_df, output_dir):
    """Draw the four metric-vs-metric scatter panels on a 2x2 grid."""
    fig, axes = plt.subplots(2, 2, figsize=(15, 12))

    # (x column, y column, x label, y label, title, colour) per panel,
    # in row-major order.
    panels = (
        ('r', 'NSE', 'Correlation (r)', 'Nash-Sutcliffe Efficiency (NSE)',
         'Correlation vs NSE', 'blue'),
        ('Coverage_Ratio', 'r', 'Coverage Ratio', 'Correlation (r)',
         'Coverage vs Correlation', 'green'),
        ('PBIAS', 'NSE', 'PBIAS (%)', 'Nash-Sutcliffe Efficiency (NSE)',
         'PBIAS vs NSE', 'red'),
        ('RMSE', 'MAE', 'RMSE', 'MAE', 'RMSE vs MAE', 'purple'),
    )
    for ax, (x_col, y_col, x_label, y_label, title, color) in zip(axes.flat, panels):
        ax.scatter(validation_df[x_col], validation_df[y_col], alpha=0.6, s=60, color=color)
        ax.set_xlabel(x_label)
        ax.set_ylabel(y_label)
        ax.set_title(title)
        ax.grid(True, alpha=0.3)

    fig.suptitle('6-Model Ensemble Performance Relationships', fontsize=16, y=0.98)
    _save_and_close(fig, output_dir, 'ensemble_performance_relationships.png')


def _plot_coverage_heatmap(coverage_df, output_dir):
    """Draw the station-by-month availability heatmap of the observations."""
    # Cast to float: the pivot holds booleans (and NaN for station-month
    # combinations that are absent), which is not a numeric matrix.
    coverage_matrix = coverage_df.pivot_table(
        values='Is_Valid',
        index='Station_ID',
        columns='Month',
        aggfunc='first'
    ).astype(float)

    if coverage_matrix.empty:
        print("  Warning: no coverage data available, skipping coverage heatmap")
        return

    fig, ax = plt.subplots(figsize=(14, 8))
    # Fixed colour limits keep the 0/1 meaning stable even when every cell
    # has the same value.
    sns.heatmap(coverage_matrix,
                ax=ax,
                cmap='RdYlBu_r',
                vmin=0,
                vmax=1,
                cbar_kws={'label': 'Data Available (1=Yes, 0=No)'},
                linewidths=0.5,
                linecolor='white')

    ax.set_title('Data Coverage Pattern by Station and Month\n6-Model Ensemble vs Observations',
                 fontsize=14, pad=20)
    ax.set_xlabel('Month')
    ax.set_ylabel('Station ID')
    # Heatmap cells are centred on half-integers; label the months that are
    # actually present instead of assuming all twelve columns exist.
    # NOTE(review): assumes numeric Month values are calendar months 1-12.
    ax.set_xticks(np.arange(coverage_matrix.shape[1]) + 0.5)
    ax.set_xticklabels([_month_label(month) for month in coverage_matrix.columns])
    _save_and_close(fig, output_dir, 'ensemble_coverage_pattern.png')


def _plot_station_ranking(validation_df, output_dir):
    """Draw correlation and NSE bars for the lowest- and highest-correlation stations."""
    # Stations without a defined correlation cannot be ranked; leaving them
    # in would sort them last and show them as "top" performers.
    ranked = validation_df.dropna(subset=['r']).sort_values('r', ascending=True)
    if len(ranked) < 10:  # Only if we have enough stations
        return

    n_show = min(10, len(ranked) // 2)
    plot_df = pd.concat([ranked.head(n_show), ranked.tail(n_show)])
    positions = range(len(plot_df))
    station_labels = list(plot_df['Station_ID'])

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, 8))

    # Correlation plot
    ax1.barh(positions, plot_df['r'],
             color=_threshold_colors(plot_df['r'], 0.5, 0.7), alpha=0.7)
    ax1.set_yticks(positions)
    ax1.set_yticklabels(station_labels)
    ax1.set_xlabel('Correlation (r)')
    ax1.set_title(f'Correlation by Station (Top & Bottom {n_show})')
    ax1.grid(True, alpha=0.3)
    ax1.axvline(x=0, color='black', linestyle='-', alpha=0.3)
    ax1.axvline(x=0.5, color='orange', linestyle='--', alpha=0.5, label='r=0.5')
    ax1.axvline(x=0.7, color='green', linestyle='--', alpha=0.5, label='r=0.7')
    ax1.legend()

    # NSE plot
    ax2.barh(positions, plot_df['NSE'],
             color=_threshold_colors(plot_df['NSE'], 0, 0.5), alpha=0.7)
    ax2.set_yticks(positions)
    ax2.set_yticklabels(station_labels)
    ax2.set_xlabel('Nash-Sutcliffe Efficiency (NSE)')
    ax2.set_title(f'NSE by Station (Top & Bottom {n_show})')
    ax2.grid(True, alpha=0.3)
    ax2.axvline(x=0, color='black', linestyle='-', alpha=0.3)
    ax2.axvline(x=0.5, color='orange', linestyle='--', alpha=0.5, label='NSE=0.5')
    ax2.legend()

    fig.suptitle('Station-wise Performance Analysis - 6-Model Ensemble', fontsize=16, y=0.98)
    _save_and_close(fig, output_dir, 'ensemble_station_performance.png')


def create_ensemble_validation_plots(validation_df, coverage_df, output_dir):
    """Create the validation figures for the ensemble model.

    Writes up to four PNG files into ``output_dir``:
    ``ensemble_performance_distribution.png``,
    ``ensemble_performance_relationships.png``,
    ``ensemble_coverage_pattern.png`` and, when at least ten stations have a
    defined correlation, ``ensemble_station_performance.png``.

    Args:
        validation_df: One row per station with the columns ``Station_ID``,
            ``r``, ``RMSE``, ``MAE``, ``NSE``, ``PBIAS`` and
            ``Coverage_Ratio``.
        coverage_df: One row per station-month with the columns
            ``Station_ID``, ``Month`` and ``Is_Valid``.
        output_dir: Destination directory; created if it does not exist.

    Returns:
        None. Note that the matplotlib ``rcParams`` set below stay in effect
        after the call.
    """
    print("\nCreating validation plots...")

    Path(output_dir).mkdir(parents=True, exist_ok=True)

    # Set basic plot style
    plt.rcParams['figure.figsize'] = [12, 8]
    plt.rcParams['axes.grid'] = True
    plt.rcParams['grid.alpha'] = 0.3
    plt.rcParams['grid.linestyle'] = '--'

    _plot_metric_distributions(validation_df, output_dir)
    _plot_metric_relationships(validation_df, output_dir)
    _plot_coverage_heatmap(coverage_df, output_dir)
    _plot_station_ranking(validation_df, output_dir)

    print("✓ All validation plots created successfully!")

# Locations of the observation workbooks, the ensemble model workbooks,
# and the folder that receives the validation outputs.
obs_dir = r'D:\RICAAR\Pr.New.Stations.Selection\OBSERVATIONS\monthly.mean'
ensemble_models_dir = r'D:\RICAAR\Pr.New.Stations.Selection\ensemble.models.6.models\Models\monthly mean.Models'
output_dir = r'D:\RICAAR\Pr.New.Stations.Selection\ensemble.models.6.models\validation_results'

# Script entry point: run the validation and report what was produced.
if __name__ == "__main__":
    validation_results, coverage_patterns = process_ensemble_validation(
        obs_dir, ensemble_models_dir, output_dir
    )

    if validation_results is None:
        print("ERROR: Validation failed!")
    else:
        print(f"\n{'='*60}")
        print("VALIDATION COMPLETED SUCCESSFULLY!")
        print(f"{'='*60}")
        print(f"Results saved to: {output_dir}")
        # Fixed list of artefacts the validation run writes.
        for report_line in (
            "Files created:",
            "  - ensemble_6models_validation_results.xlsx",
            "  - ensemble_performance_distribution.png",
            "  - ensemble_performance_relationships.png",
            "  - ensemble_coverage_pattern.png",
            "  - ensemble_station_performance.png (if ≥10 stations)",
        ):
            print(report_line)

6-MODEL ENSEMBLE VALIDATION ANALYSIS
Observations directory: D:\RICAAR\Pr.New.Stations.Selection\OBSERVATIONS\monthly.mean
Ensemble models directory: D:\RICAAR\Pr.New.Stations.Selection\ensemble.models.6.models\Models\monthly mean according to covering and missingperiodinthestation
Output directory: D:\RICAAR\Pr.New.Stations.Selection\ensemble.models.6.models\validation_results
--------------------------------------------------------------------------------
Processing ensemble model: Ensemble_6Models
Ensemble path: D:\RICAAR\Pr.New.Stations.Selection\ensemble.models.6.models\Models\monthly mean according to covering and missingperiodinthestation\Ensemble_6Models
Found 49 ensemble station files

Processing station 1/49: AB0004
  Loaded data - Ensemble: 8 months, Observations: 12 months
  Metrics calculated - Coverage: 0.67, r: 0.984, NSE: 0.943
  ✓ Successfully processed station AB0004

Processing station 2/49: AD0002
  Loaded data - Ensemble: 8 months, Observations: 12 months
  Metrics

<Figure size 1600x1000 with 0 Axes>