# In [None]:
import pandas as pd
import numpy as np
from scipy import stats
import os
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns

def analyze_temporal_coverage(obs_data, model_data, total_months=12):
    """Compute observation coverage and model-skill metrics for one station.

    The two tables are inner-joined on ``Month``. Both must provide a
    ``Rainfall`` column, which the merge renames to ``Rainfall_obs`` and
    ``Rainfall_model``.

    Args:
        obs_data: DataFrame with ``Month`` and ``Rainfall`` columns (observed).
        model_data: DataFrame with ``Month`` and ``Rainfall`` columns (modelled).
        total_months: Number of months a complete record is expected to hold;
            used as the denominator of the coverage ratio. Must be non-zero.

    Returns:
        dict with the keys ``Coverage_Ratio``, ``Valid_Months``,
        ``Missing_Months``, ``r``, ``RMSE``, ``MAE``, ``PBIAS`` and ``NSE``.
        Coverage figures count months with a non-missing observation. Skill
        metrics use only months where both observation and model are present;
        a metric that is undefined for the available data is ``np.nan``.
    """
    merged = pd.merge(obs_data, model_data,
                      on=['Month'],
                      suffixes=('_obs', '_model'))

    # Coverage describes the observation record only.
    obs_present = merged['Rainfall_obs'].notna()
    valid_months = int(obs_present.sum())
    metrics = {
        'Coverage_Ratio': valid_months / total_months,
        'Valid_Months': valid_months,
        'Missing_Months': total_months - valid_months,
    }

    # Skill metrics need complete pairs: a missing model value must not
    # contribute to one side of a ratio (PBIAS) or poison the correlation.
    paired = obs_present & merged['Rainfall_model'].notna()
    obs = merged.loc[paired, 'Rainfall_obs'].to_numpy(dtype=float)
    sim = merged.loc[paired, 'Rainfall_model'].to_numpy(dtype=float)

    skill = dict.fromkeys(('r', 'RMSE', 'MAE', 'PBIAS', 'NSE'), np.nan)
    if obs.size > 0:
        error = sim - obs
        skill['RMSE'] = np.sqrt(np.mean(error ** 2))
        skill['MAE'] = np.mean(np.abs(error))

        obs_total = obs.sum()
        if obs_total != 0:
            skill['PBIAS'] = 100 * error.sum() / obs_total

        obs_spread = np.sum((obs - obs.mean()) ** 2)
        if obs_spread > 0:
            skill['NSE'] = 1 - np.sum(error ** 2) / obs_spread

        # pearsonr raises for fewer than two samples and is undefined when
        # either series is constant; report NaN instead of failing.
        if obs.size >= 2 and np.ptp(obs) > 0 and np.ptp(sim) > 0:
            skill['r'] = stats.pearsonr(obs, sim)[0]

    metrics.update(skill)
    return metrics

def process_validation(obs_dir, models_dir, output_dir):
    """Validate every model/station workbook against observations and save the results.

    Each sub-directory of ``models_dir`` is treated as one model. Every
    ``.xlsx`` file inside it is read from its ``Monthly_Averages`` sheet; the
    station ID is the second ``_``-separated token of the file name. The
    matching observation workbook is
    ``<obs_dir>/Station_<station_id>_Daily_Rainfall.xlsx`` (same sheet name).

    Args:
        obs_dir: Directory holding the observation workbooks.
        models_dir: Directory holding one sub-directory per model.
        output_dir: Directory for the results workbook and plots; created if
            it does not exist.

    Returns:
        ``(validation_df, coverage_df)``: one row per model/station with the
        computed metrics, and one row per model/station/month flagging whether
        the observation is present. Both are empty DataFrames when no station
        could be processed; in that case nothing is written or plotted.
    """
    Path(output_dir).mkdir(parents=True, exist_ok=True)
    validation_results = []
    coverage_patterns = []

    # Sorted so the row order of the outputs is reproducible across runs.
    for model_name in sorted(os.listdir(models_dir)):
        model_path = os.path.join(models_dir, model_name)
        if not os.path.isdir(model_path):
            continue

        print(f"Processing model: {model_name}")

        for station_file in sorted(os.listdir(model_path)):
            if not station_file.endswith('.xlsx'):
                continue

            # A workbook whose name carries no '_' has no station ID; skip it
            # rather than letting an IndexError abort the whole run.
            name_parts = station_file.split('_')
            if len(name_parts) < 2:
                print(f"Skipping file without station ID: {station_file}")
                continue
            station_id = name_parts[1]
            print(f"Processing station: {station_id}")

            try:
                # Read data
                model_data = pd.read_excel(os.path.join(model_path, station_file),
                                           sheet_name='Monthly_Averages')
                obs_file = os.path.join(obs_dir, f'Station_{station_id}_Daily_Rainfall.xlsx')
                obs_data = pd.read_excel(obs_file, sheet_name='Monthly_Averages')

                # Calculate metrics
                metrics = analyze_temporal_coverage(obs_data, model_data)

                result = {
                    'Model': model_name,
                    'Station_ID': station_id,
                    'Station_Name': model_data['Station_Name'].iloc[0],
                    'Latitude': model_data['Latitude'].iloc[0],
                    'Longitude': model_data['Longitude'].iloc[0],
                    **metrics
                }
                coverage_pattern = pd.DataFrame({
                    'Model': model_name,
                    'Station_ID': station_id,
                    'Month': obs_data['Month'],
                    'Is_Valid': obs_data['Rainfall'].notna()
                })
            except Exception as e:
                # Best-effort batch run: report the station and carry on.
                print(f"Error processing station {station_id}: {str(e)}")
                continue

            # Store both records only once both were built, so the two
            # result tables always describe the same set of stations.
            validation_results.append(result)
            coverage_patterns.append(coverage_pattern)

    if not validation_results:
        # pd.concat raises on an empty list, and there is nothing to plot.
        print("No stations were processed successfully; nothing to save.")
        return pd.DataFrame(), pd.DataFrame()

    # Combine results
    validation_df = pd.DataFrame(validation_results)
    coverage_df = pd.concat(coverage_patterns, ignore_index=True)

    # Save results
    with pd.ExcelWriter(os.path.join(output_dir, 'model_validation_results.xlsx')) as writer:
        validation_df.to_excel(writer, sheet_name='Validation_Metrics', index=False)
        coverage_df.to_excel(writer, sheet_name='Coverage_Patterns', index=False)

    create_validation_plots(validation_df, coverage_df, output_dir)

    return validation_df, coverage_df

def create_validation_plots(validation_df, coverage_df, output_dir):
    """Render the validation figures and save them as PNG files in ``output_dir``.

    Two images are written at 300 dpi:

    * ``model_performance_metrics.png`` - a 2x2 grid of box plots showing
      ``r``, ``RMSE``, ``MAE`` and ``NSE`` from ``validation_df`` grouped by
      its ``Model`` column.
    * ``coverage_pattern.png`` - a heatmap of ``Is_Valid`` from
      ``coverage_df`` with ``Station_ID`` on the rows and ``(Model, Month)``
      on the columns.

    Note that the plot styling is applied through matplotlib's global
    ``rcParams`` and therefore remains in effect after the call.
    """
    # Shared look for the figures
    plt.rcParams.update({
        'figure.figsize': [12, 8],
        'axes.grid': True,
        'grid.alpha': 0.3,
        'grid.linestyle': '--',
    })

    metrics_png = os.path.join(output_dir, 'model_performance_metrics.png')
    coverage_png = os.path.join(output_dir, 'coverage_pattern.png')

    # Figure 1: one box plot panel per skill metric
    _, panels = plt.subplots(2, 2, figsize=(15, 10))
    for panel, metric_name in zip(panels.flat, ('r', 'RMSE', 'MAE', 'NSE')):
        sns.boxplot(data=validation_df, x='Model', y=metric_name, ax=panel)
        tick_labels = panel.get_xticklabels()
        panel.set_xticklabels(tick_labels, rotation=45, ha='right')
        panel.set_title(f'{metric_name} by Model', pad=15)
        panel.grid(True, linestyle='--', alpha=0.3)

    plt.tight_layout()
    plt.savefig(metrics_png, dpi=300, bbox_inches='tight')
    plt.close()

    # Figure 2: station x (model, month) availability heatmap
    plt.figure(figsize=(12, 6))
    station_by_model_month = coverage_df.pivot_table(
        index='Station_ID',
        columns=['Model', 'Month'],
        values='Is_Valid',
        aggfunc='first',
    )
    sns.heatmap(station_by_model_month, cmap='YlOrRd')
    plt.title('Data Coverage Pattern', pad=15)
    plt.tight_layout()
    plt.savefig(coverage_png, dpi=300, bbox_inches='tight')
    plt.close()



# Input and output locations for this validation run
obs_dir = r'D:\RICAAR\Pr.New.Stations.Selection\OBSERVATIONS\monthly.mean'
models_dir = r'D:\RICAAR\Pr.New.Stations.Selection\models\monthly mean.Models'
output_dir = r'D:\RICAAR\Pr.New.Stations.Selection\validation_results3'

# Validate every model against the observations; keep both result tables
validation_results, coverage_patterns = process_validation(
    obs_dir=obs_dir,
    models_dir=models_dir,
    output_dir=output_dir,
)