# ERA5 and PVGIS PV Performance Comparison

This notebook compares PV capacity factors derived from ERA5 solar radiation data with PVGIS reference data for a single site (Bonn, Germany).

In [None]:
import os
import sys
import xarray as xr
import pandas as pd
import plotly.express as px
from pathlib import Path

# Put the parent of the current working directory (the project root) at the
# front of sys.path so the project's `scripts` package can be imported below.
project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
if sys.path.count(project_root) == 0:
    sys.path.insert(0, project_root)

from scripts.analysis.pv_performance import PVPerformanceAnalyzer, PVGISClient

In [None]:
# Configuration
# Site coordinates (Bonn, Germany).
LAT, LON = 50.7, 7.1
# First and last year used for the PVGIS request and in the plot titles.
START_YEAR, END_YEAR = 2021, 2024
# Merged ERA5 file for the site, located under the project root.
ERA5_FILE = os.path.join(project_root, 'data', 'processed', 'era5_merged_Bonn.nc')

# Initialize analyzer for the configured site
analyzer = PVPerformanceAnalyzer(lon=LON, lat=LAT)

In [None]:
# 1. Load and process ERA5 data
#
# Opens the merged ERA5 file, converts it to a time-indexed DataFrame via the
# analyzer, shows a sample and plots the capacity factor. Any failure is
# reported and `era5_ds` is reset to None instead of aborting the notebook.
print("Loading ERA5 data...")

# Drop any `era5_df` left over from an earlier run of this cell. The comparison
# cell only checks that `era5_df` exists, so without this a failed re-run would
# silently compare against stale data from the previous successful run.
globals().pop('era5_df', None)

try:
    era5_ds = xr.open_dataset(ERA5_FILE)
    print("ERA5 data loaded successfully")
    print("Time range:", era5_ds.time.min().values, "to", era5_ds.time.max().values)
    print("Available variables:", list(era5_ds.data_vars.keys()))

    # Process ERA5 data; the result is indexed by its 'time' column
    era5_df = analyzer.process_era5_data(era5_ds)
    era5_df = era5_df.set_index('time')

    print("\nERA5 data sample:")
    display(era5_df[['ssrd', 'ssrd_wm2', 'capacity_factor']].head())

    # Plot 6-hourly data ('time' is restored as a column for plotly)
    fig = px.line(
        era5_df.reset_index(),
        x='time',
        y='capacity_factor',
        title=f'ERA5 6-Hourly Capacity Factor ({START_YEAR}-{END_YEAR})',
        labels={'capacity_factor': 'Capacity Factor', 'time': 'Date'}
    )
    fig.update_layout(height=500)
    fig.show()

except Exception as e:
    # Best-effort cell: report the problem and mark the dataset as unavailable
    print(f"Error loading/processing ERA5 data: {e}")
    era5_ds = None

In [None]:
# 2. Get PVGIS data
#
# Probes the PVGIS API, requests the data through the analyzer (asking for
# sample data when the probe fails) and plots the 'P' column. If anything
# raises, the traceback is printed and PVGISClient's sample data is used.
print("\nFetching PVGIS data...")
try:
    # Test connection first; an unreachable API switches to sample data
    api_reachable = PVGISClient.test_connection(LAT, LON)
    if not api_reachable:
        print("Warning: Could not connect to PVGIS API. Using sample data.")
    else:
        print("PVGIS API is accessible")
    use_sample = not api_reachable

    # Request settings forwarded unchanged to the analyzer
    pv_request = dict(
        start_year=START_YEAR,
        end_year=END_YEAR,
        peak_power=1.0,
        system_loss=14.0,
        angle=35.0,
        aspect=0.0,
        use_sample_data=use_sample,
    )
    pvgis_df = analyzer.get_pvgis_data(**pv_request)

    # Only plot when something usable came back
    if pvgis_df is None or pvgis_df.empty:
        print("No PVGIS data was retrieved.")
    else:
        print("\nPVGIS data retrieved successfully!")
        print("Date range:", pvgis_df.index.min(), "to", pvgis_df.index.max())

        # Plot the data
        plot_kwargs = {
            'x': 'time',
            'y': 'P',
            'title': f'PVGIS Hourly PV Power Output ({START_YEAR}-{END_YEAR})',
            'labels': {'P': 'Power (W)', 'time': 'Date'},
        }
        fig = px.line(pvgis_df.reset_index(), **plot_kwargs)
        fig.update_layout(height=500)
        fig.show()

except Exception as err:
    import traceback

    print(f"Error in PVGIS data retrieval: {err}")
    traceback.print_exc()
    print("Using sample data instead...")
    pvgis_df = PVGISClient._get_sample_data(START_YEAR, END_YEAR)

In [None]:
# 3. Compare ERA5 and PVGIS data if both are available
#
# Runs the analyzer's comparison, plots it, prints summary statistics and a
# histogram of the performance ratio. When an input is missing the cell
# explains which one instead of failing.

# Look both inputs up by name so the cell still runs when an earlier cell was
# skipped or failed and never bound them (a bare `pvgis_df` would raise
# NameError), and so a value of None is handled like a missing one.
era5_input = globals().get('era5_df')
pvgis_input = globals().get('pvgis_df')

# ERA5 needs the capacity-factor column; an empty PVGIS frame counts as
# unavailable, matching the "No PVGIS data was retrieved." case above.
era5_ready = era5_input is not None and 'capacity_factor' in era5_input.columns
pvgis_ready = pvgis_input is not None and not pvgis_input.empty

if era5_ready and pvgis_ready:
    print("\nComparing datasets...")

    try:
        # Compare the datasets
        comparison = analyzer.compare_datasets(
            era5_data=era5_input,
            pvgis_data=pvgis_input,
            era5_cf_col='capacity_factor',
            pvgis_cf_col='capacity_factor'
        )

        # Plot comparison
        fig = analyzer.plot_comparison(
            comparison,
            title=f'ERA5 vs PVGIS Comparison ({START_YEAR}-{END_YEAR})'
        )
        fig.show()

        # Print statistics
        print("\nComparison Statistics:")
        stats = comparison[['era5_cf', 'pvgis_cf', 'performance_ratio']].describe()
        display(stats)

        # Plot performance ratio distribution
        fig = px.histogram(
            comparison,
            x='performance_ratio',
            title='Performance Ratio Distribution (ERA5/PVGIS)',
            labels={'performance_ratio': 'Performance Ratio'},
            nbins=50
        )
        fig.update_layout(height=400, showlegend=False)
        fig.show()

    except Exception as e:
        # Best-effort cell: show the full traceback but keep the notebook running
        print(f"Error during comparison: {e}")
        import traceback
        traceback.print_exc()
else:
    print("\nSkipping comparison - missing required data.")
    if not pvgis_ready:
        print("- PVGIS data not available")
    if era5_input is None:
        print("- ERA5 data not processed")
    elif not era5_ready:
        print("- Capacity factor not calculated in ERA5 data")