# ERA5 and PVGIS PV Performance Comparison

This notebook compares ERA5 solar radiation data with PVGIS reference data.

In [1]:
import os
import sys
import xarray as xr
import pandas as pd
import numpy as np
import plotly.express as px
from pathlib import Path
from plotly.subplots import make_subplots
import plotly.graph_objects as go

import inspect

# Add the project root to sys.path
project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))
if project_root not in sys.path:
    sys.path.insert(0, project_root)

from scripts.analysis.pv_performance import PVPerformanceAnalyzer, PVGISClient, ERA5Processor

In [2]:
# Configuration
LAT = LATITUDE = 50.7  # Bonn, Germany
LON = LONGITUDE = 7.1
START_YEAR = 2021
END_YEAR = 2024
ERA5_FILE = os.path.join(project_root, 'data', 'processed', 'era5_merged_Bonn.nc')

# Initialize analyzer
analyzer = PVPerformanceAnalyzer(lat=LAT, lon=LON)

In [3]:
# 1. Load and process ERA5 data
print("Loading ERA5 data...")
try:
    era5_ds = xr.open_dataset(ERA5_FILE)
    print("ERA5 data loaded successfully")
    print("Time range:", era5_ds.time.min().values, "to", era5_ds.time.max().values)
    print("Available variables:", list(era5_ds.data_vars.keys()))
    
    # Process ERA5 data
    era5_df = analyzer.process_era5_data(era5_ds)
    if not isinstance(era5_df.index, pd.DatetimeIndex):
        era5_df = era5_df.set_index('time')
    
    print("\nERA5 data sample:")
    display(era5_df[['ssrd', 'ssrd_wm2', 'capacity_factor']].head())
    
    # Plot 6-hourly data
    fig = px.line(
        era5_df.reset_index(), 
        x='time', 
        y='capacity_factor',
        title=f'ERA5 6-Hourly Capacity Factor ({START_YEAR}-{END_YEAR})',
        labels={'capacity_factor': 'Capacity Factor', 'time': 'Date'}
    )
    fig.update_layout(height=500)
    fig.show()
    
except Exception as e:
    print(f"Error loading/processing ERA5 data: {e}")
    era5_ds = None

Loading ERA5 data...
ERA5 data loaded successfully
Time range: 2021-01-01T00:00:00.000000000 to 2024-12-31T18:00:00.000000000
Available variables: ['sp', 'strd', 'v10', 'ssrd', 'cbh', 'tcc', 'fdir', 'tsr', 'str', 'ssrdc', 't2m', 'ssr', 'u10']

ERA5 data sample:


  era5_ds = xr.open_dataset(ERA5_FILE)


Unnamed: 0_level_0,ssrd,ssrd_wm2,capacity_factor
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2021-01-01 12:00:00,489100.75,22.643553,0.003397
2021-01-02 12:00:00,233523.1875,10.811258,0.001622
2021-01-03 12:00:00,304985.5625,14.119702,0.002118
2021-01-04 12:00:00,443660.75,20.53985,0.003081
2021-01-05 12:00:00,343334.375,15.89511,0.002384


In [4]:
# 2. Get PVGIS data
print("\nFetching PVGIS data...")
try:
    # Test connection first
    if PVGISClient.test_connection(LAT, LON):
        print("PVGIS API is accessible")
        use_sample = False
    else:
        print("Warning: Could not connect to PVGIS API. Using sample data.")
        use_sample = True
    
    # Get the data
    pvgis_df = analyzer.get_pvgis_data(
        start_year=START_YEAR,
        end_year=END_YEAR,
        peak_power=1.0,
        system_loss=14.0,
        angle=35.0,
        aspect=0.0,
        use_sample_data=use_sample
    )
    
    # Plot the data if successful
    if pvgis_df is not None and not pvgis_df.empty:
        print("\nPVGIS data retrieved successfully!")
        print("Date range:", pvgis_df.index.min(), "to", pvgis_df.index.max())
        
        # Plot the data
        pvgis_plot_df = pvgis_df.reset_index().rename(columns={'index': 'time'})
        fig = px.line(
            pvgis_plot_df, 
            x='time', 
            y='P',
            title=f'PVGIS Hourly PV Power Output ({START_YEAR}-{END_YEAR})',
            labels={'P': 'Power (W)', 'time': 'Date'},
        )
        fig.update_layout(height=500)
        fig.show()
    else:
        print("No PVGIS data was retrieved.")
        
except Exception as e:
    print(f"Error in PVGIS data retrieval: {e}")
    import traceback
    traceback.print_exc()
    print("Using sample data instead...")
    pvgis_df = PVGISClient._get_sample_data(START_YEAR, END_YEAR)


Fetching PVGIS data...
Generating sample PVGIS data...

PVGIS data retrieved successfully!
Date range: 2021-01-01 00:00:00 to 2024-12-31 23:00:00



'H' is deprecated and will be removed in a future version, please use 'h' instead.



In [None]:
# 3. Compare ERA5 and PVGIS data if both are available
if (
    pvgis_df is not None and not pvgis_df.empty and
    'era5_df' in locals() and not era5_df.empty and
    'capacity_factor' in era5_df.columns and
    'capacity_factor' in pvgis_df.columns
):
    print("\nComparing datasets...")
    try:
        # Compare the datasets
        comparison = analyzer.compare_datasets(
            era5_data=era5_df,
            pvgis_data=pvgis_df,
            era5_cf_col='capacity_factor',
            pvgis_cf_col='capacity_factor'
        )

        # Plot comparison
        fig = analyzer.plot_comparison(
            comparison,
            title=f'ERA5 vs PVGIS Comparison ({START_YEAR}-{END_YEAR})'
        )
        fig.show()

        # Print statistics
        print("\nComparison Statistics:")
        stats = comparison[['era5_cf', 'pvgis_cf', 'performance_ratio']].describe()
        display(stats)

        # Plot performance ratio distribution
        pr_clean = comparison['performance_ratio'].dropna()
        pr_range = pr_clean.quantile([0.02, 0.98])
        fig = px.histogram(
            comparison,
            x='performance_ratio',
            title='Performance Ratio Distribution (ERA5/PVGIS)',
            labels={'performance_ratio': 'Performance Ratio'},
            nbins=50,
            opacity=0.8,
            color_discrete_sequence=['#2ca02c']
        )
        # Add median and mean lines
        median_ratio = pr_clean.median()
        mean_ratio = pr_clean.mean()
        fig.add_vline(
            x=median_ratio,
            line=dict(color='red', width=2, dash='dash'),
            annotation_text=f'Median: {median_ratio:.3f}',
            annotation_position='top right'
        )
        fig.add_vline(
            x=mean_ratio,
            line=dict(color='blue', width=2, dash='dash'),
            annotation_text=f'Mean: {mean_ratio:.3f}',
            annotation_position='bottom right'
        )
        fig.update_layout(
            height=400,
            showlegend=False,
            plot_bgcolor='white',
            xaxis=dict(
                title='Performance Ratio (ERA5/PVGIS)',
                range=[max(0, pr_range.iloc[0]), min(3, pr_range.iloc[1])]
            ),
            yaxis_title='Count',
            margin=dict(t=50, b=50, l=50, r=50),
            bargap=0.1
        )
        fig.update_traces(
            marker=dict(
                line=dict(width=0.8, color='white'),
                opacity=0.8
            )
        )
        fig.show()

        # Additional statistics
        print("\nAdditional Statistics:")
        print(f"Percentage of hours where ERA5 > PVGIS: {(pr_clean > 1).mean()*100:.2f}%")
        print(f"Percentage of hours where PVGIS > ERA5: {(pr_clean < 1).mean()*100:.2f}%")
        print("\nHours with highest ERA5 overestimation (Top 5):")
        display(comparison.nlargest(5, 'performance_ratio'))
        print("\nHours with highest PVGIS overestimation (Top 5):")
        display(comparison.nsmallest(5, 'performance_ratio'))

    except Exception as e:
        print(f"Error during comparison: {e}")
        import traceback
        traceback.print_exc()
else:
    print("\nSkipping comparison - missing or invalid data.")
    if pvgis_df is None or pvgis_df.empty:
        print("- PVGIS data not available or empty")
    if 'era5_df' not in locals() or era5_df.empty:
        print("- ERA5 data not available or empty")
    elif 'capacity_factor' not in era5_df.columns:
        print("- Capacity factor not calculated in ERA5 data")
    elif 'capacity_factor' not in pvgis_df.columns:
        print("- Capacity factor not calculated in PVGIS data")


Comparing datasets...
Error during comparison: Only valid with DatetimeIndex, TimedeltaIndex or PeriodIndex, but got an instance of 'Index'


Traceback (most recent call last):
  File "/var/folders/z6/6d6lkl057p12xjjzc2ml33vh0000gn/T/ipykernel_55869/1216365338.py", line 21, in <module>
    era5_daily = era5_processed['capacity_factor'].resample('D').mean().dropna()
  File "/Users/iman/miniforge3/envs/era_energy/lib/python3.10/site-packages/pandas/core/generic.py", line 9771, in resample
    return get_resampler(
  File "/Users/iman/miniforge3/envs/era_energy/lib/python3.10/site-packages/pandas/core/resample.py", line 2050, in get_resampler
    return tg._get_resampler(obj, kind=kind)
  File "/Users/iman/miniforge3/envs/era_energy/lib/python3.10/site-packages/pandas/core/resample.py", line 2272, in _get_resampler
    raise TypeError(
TypeError: Only valid with DatetimeIndex, TimedeltaIndex or PeriodIndex, but got an instance of 'Index'


In [6]:
# Import inspect module


# Let's look at the process_era5_data method
print("ERA5 Processing Method:")
print("----------------------")
print(inspect.getsource(ERA5Processor.process_era5_data))

# Let's also check the ERA5Processor initialization
print("\nERA5Processor __init__ Method:")
print("----------------------------")
print(inspect.getsource(ERA5Processor.__init__))

ERA5 Processing Method:
----------------------
    @staticmethod
    def process_era5_data(
        era5_ds: xr.Dataset,
        lat: float,
        lon: float,
        ssrd_var: str = 'ssrd',
        system_efficiency: float = 0.15,  # Typical PV system efficiency
        tracking: str = 'fixed'  # 'fixed' or 'single_axis'
    ) -> pd.DataFrame:
        """
        Process ERA5 dataset for PV performance analysis with improved capacity factor calculation.
        
        Args:
            era5_ds: xarray Dataset containing ERA5 data
            lat: Target latitude (in degrees, positive for North)
            lon: Target longitude (in degrees, positive for East)
            ssrd_var: Name of the surface solar radiation variable
            system_efficiency: PV system efficiency (0-1)
            tracking: Tracking system type ('fixed' or 'single_axis')
            
        Returns:
            DataFrame with processed ERA5 data
        """
        # Select nearest point
        era5

TypeError: module, class, method, function, traceback, frame, or code object was expected, got wrapper_descriptor

In [None]:
# Configuration
LATITUDE = 50.7  # Bonn, Germany
LONGITUDE = 7.1
START_YEAR = 2021
END_YEAR = 2024


def compare_era5_pvgis(era5_ds, pvgis_df, lat, lon, start_year, end_year):
    """Compare ERA5 and PVGIS data with diagnostic plots and statistics."""
    import matplotlib.pyplot as plt
    import seaborn as sns
    from matplotlib.dates import MonthLocator, DateFormatter
    
    # Process ERA5 data
    era5_processor = ERA5Processor(era5_ds)
    era5_df = era5_processor.process_era5_data()
    
    # Process PVGIS data
    pvgis_hourly = pvgis_df['P'] / 1000  # Convert W to kW for 1kWp system
    pvgis_daily = pvgis_hourly.resample('D').mean()
    
    # Resample ERA5 to daily
    era5_daily = era5_df['capacity_factor'].resample('D').mean()
    
    # Align time indices
    common_index = era5_daily.index.intersection(pvgis_daily.index)
    era5_aligned = era5_daily[common_index]
    pvgis_aligned = pvgis_daily[common_index]
    
    # Create comparison DataFrame
    comparison = pd.DataFrame({
        'era5_cf': era5_aligned,
        'pvgis_cf': pvgis_aligned
    })
    
    # 1. Time Series Comparison
    plt.figure(figsize=(15, 10))
    
    # Plot 30-day rolling mean to see seasonal patterns
    comparison['era5_roll'] = comparison['era5_cf'].rolling(30, min_periods=1).mean()
    comparison['pvgis_roll'] = comparison['pvgis_cf'].rolling(30, min_periods=1).mean()
    
    plt.subplot(2, 2, 1)
    plt.plot(comparison.index, comparison['era5_roll'], label='ERA5 (30d mean)')
    plt.plot(comparison.index, comparison['pvgis_roll'], label='PVGIS (30d mean)')
    plt.title('30-day Rolling Mean Capacity Factor')
    plt.legend()
    plt.grid(True)
    
    # 2. Monthly Averages
    monthly = comparison.resample('M').mean()
    plt.subplot(2, 2, 2)
    monthly[['era5_cf', 'pvgis_cf']].plot()
    plt.title('Monthly Average Capacity Factor')
    plt.grid(True)
    
    # 3. Diurnal Cycle (Hourly)
    comparison_hourly = pd.DataFrame({
        'era5': era5_df['capacity_factor'],
        'pvgis': pvgis_df['P'] / 1000
    }).dropna()
    
    comparison_hourly['hour'] = comparison_hourly.index.hour
    diurnal = comparison_hourly.groupby('hour').mean()
    
    plt.subplot(2, 2, 3)
    diurnal.plot()
    plt.title('Average Diurnal Cycle')
    plt.grid(True)
    
    # 4. Scatter plot
    plt.subplot(2, 2, 4)
    sns.scatterplot(x='era5_cf', y='pvgis_cf', data=comparison, alpha=0.1)
    plt.plot([0, 1], [0, 1], 'r--')
    plt.title('ERA5 vs PVGIS Capacity Factor')
    plt.xlabel('ERA5 CF')
    plt.ylabel('PVGIS CF')
    plt.grid(True)
    
    plt.tight_layout()
    plt.show()
    
    # Print key statistics
    print("\nKey Statistics:")
    print("---------------")
    print("ERA5 CF - Max: {:.3f}, Mean: {:.3f}, Min: {:.3f}".format(
        comparison['era5_cf'].max(),
        comparison['era5_cf'].mean(),
        comparison['era5_cf'].min()
    ))
    print("PVGIS CF - Max: {:.3f}, Mean: {:.3f}, Min: {:.3f}".format(
        comparison['pvgis_cf'].max(),
        comparison['pvgis_cf'].mean(),
        comparison['pvgis_cf'].min()
    ))
    
    # Check for timezone issues
    print("\nTime Range:")
    print("-----------")
    print(f"ERA5: {era5_df.index[0]} to {era5_df.index[-1]}")
    print(f"PVGIS: {pvgis_df.index[0]} to {pvgis_df.index[-1]}")
    
    # Check coordinates
    print("\nCoordinates:")
    print("------------")
    print(f"ERA5 lat: {era5_ds.latitude.values}, lon: {era5_ds.longitude.values}")
    print(f"PVGIS lat: {lat}, lon: {lon}")
    
    return comparison

# Run the comparison
comparison_df = compare_era5_pvgis(era5_ds, pvgis_df, LATITUDE, LONGITUDE, START_YEAR, END_YEAR)

TypeError: ERA5Processor() takes no arguments