In [None]:
%%capture
%pip install openmeteo-requests requests-cache retry-requests numpy pandas matplotlib torch scikit-learn scipy seaborn statsmodels seaborn

In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
import openmeteo_requests
import pandas as pd
import requests_cache
import seaborn as sns
import torch 

from retry_requests import retry
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.stattools import grangercausalitytests
# from statsmodels.tsa.stattools import coint_johansen # ????

In [None]:
# Load solar data: read the CSV, derive a timezone-aware (UTC) "date" column from
# the "Datum" column, then discard "Datum".
# NOTE(review): the path is absolute and machine-specific — consider a configurable data directory.
solar_ts = (
    pd.read_csv("/home/moonchild/PythonProjects/solar-prediction/data/energy_charts.csv", sep=",", header=0)
    .assign(date=lambda frame: pd.to_datetime(frame["Datum"], utc=True))
    .drop(columns=["Datum"])
)

In [None]:
# Setup the Open-Meteo API client with cache and retry on error
cache_session = requests_cache.CachedSession('.cache', expire_after = -1)
retry_session = retry(cache_session, retries = 5, backoff_factor = 0.2)
openmeteo = openmeteo_requests.Client(session = retry_session)

# Request definition for the archive endpoint.
# Make sure all required train variables are listed under "hourly"; their order
# there decides which position each variable has in the response further down.
url = "https://archive-api.open-meteo.com/v1/archive"
params = dict(
    latitude=52.5244,
    longitude=13.4105,
    start_date="2014-01-01",
    end_date="2024-05-01",
    hourly=[
        "temperature_2m",
        "cloud_cover",
        "shortwave_radiation",
        "diffuse_radiation",
        "direct_normal_irradiance",
        "is_day",
        "sunshine_duration",
    ],
    timezone="Europe/Berlin",
)

responses = openmeteo.weather_api(url, params=params)

# Only the first location is processed. Add a for-loop for multiple locations or weather models
response = responses[0]
print(f"Coordinates {response.Latitude()}°N {response.Longitude()}°E")
print(f"Elevation {response.Elevation()} m asl")
print(f"Timezone {response.Timezone()} {response.TimezoneAbbreviation()}")
print(f"Timezone difference to GMT+0 {response.UtcOffsetSeconds()} s")

# Pull the hourly variables out by position; the targets below are listed in the
# same order as params["hourly"], so position i lands in the i-th name.
hourly = response.Hourly()
(
    hourly_temperature_2m,
    hourly_cloud_cover,
    hourly_shortwave_radiation,
    hourly_diffuse_radiation,
    hourly_direct_normal_irradiance,
    hourly_is_day,
    hourly_sunshine_duration,
) = (
    hourly.Variables(position).ValuesAsNumpy()
    for position in range(len(params["hourly"]))
)

# One column per variable plus a UTC timestamp column spanning [Time(), TimeEnd())
# in steps of Interval() seconds.
hourly_data = {
    "date": pd.date_range(
        start = pd.to_datetime(hourly.Time(), unit = "s", utc = True),
        end = pd.to_datetime(hourly.TimeEnd(), unit = "s", utc = True),
        freq = pd.Timedelta(seconds = hourly.Interval()),
        inclusive = "left"
    ),
    "temperature_2m": hourly_temperature_2m,
    "cloud_cover": hourly_cloud_cover,
    "shortwave_radiation": hourly_shortwave_radiation,
    "diffuse_radiation": hourly_diffuse_radiation,
    "direct_normal_irradiance": hourly_direct_normal_irradiance,
    "is_day": hourly_is_day,
    "sunshine_duration": hourly_sunshine_duration,
}

hourly_dataframe = pd.DataFrame(data = hourly_data)


In [None]:
# hourly_dataframe.info()
# hourly_dataframe.describe()
# hourly_dataframe.plot(x = "date", y = "temperature_2m")
# hourly_dataframe.tail()

In [None]:
# merge with solar_ts: inner join on "date" keeps only timestamps present in both
# frames; the is_day and sunshine_duration columns are discarded afterwards.
energy_ts = (
    solar_ts
    .merge(hourly_dataframe, on='date', how='inner')
    .drop(columns=['is_day', 'sunshine_duration'])
)

In [None]:
# Split chronologically on "date" and use it as the index of every split:
#   train: before 2023-01-01 | validation: 2023-01-01 up to (excluding) 2024-01-01 | test: from 2024-01-01
train_ts = energy_ts.loc[energy_ts["date"] < "2023-01-01"].set_index(keys="date", drop=True)
val_ts = energy_ts.loc[
    (energy_ts["date"] >= "2023-01-01") & (energy_ts["date"] < "2024-01-01")
].set_index(keys="date", drop=True)
test_ts = energy_ts.loc[energy_ts["date"] >= "2024-01-01"].set_index(keys="date", drop=True)

**Stationarity**

In [None]:
# Temporarily use the validation data as training data for testing, to save memory.
# Take a copy rather than `train_ts = val_ts`: a plain assignment makes both names
# refer to the same DataFrame, so any in-place change made through train_ts
# (e.g. differencing a column) would silently change val_ts as well.
train_ts = val_ts.copy()

In [None]:
# Stationarity - Augmented Dickey-Fuller Test
def make_stationary(train_ts, val_ts, test_ts, max_diff=2, significance=0.05):
    """
    Difference every column until the Augmented Dickey-Fuller (ADF) test on the
    training data reports stationarity.

    For each column of train_ts the ADF test is run. While its p-value is not
    below `significance`, the training series is differenced and re-tested, at
    most `max_diff` times. The number of differences found on the training data
    is then applied to that column in train_ts, val_ts and test_ts alike, so all
    three frames are transformed identically. Finally, rows that are NaN in any
    differenced column (differencing leaves the leading rows undefined) are
    removed from each frame.

    All frames are modified in place. If the same DataFrame object is passed
    more than once it is transformed only once.

    Parameters:
    train_ts (pd.DataFrame): The training time series data; decides the differencing order.
    val_ts (pd.DataFrame): The validation time series data.
    test_ts (pd.DataFrame): The test time series data.
    max_diff (int): Upper bound on the number of differences tried per column.
    significance (float): p-value threshold below which a series counts as stationary.

    Returns:
    dict: Maps each column name to the number of differences applied to it.
    """
    def perform_adf(series, column_name):
        """Run the ADF test, print its statistics, and return True if the series tests stationary."""
        result = adfuller(series, autolag='AIC')
        print(f'ADF Statistic for {column_name}: {result[0]}')
        print(f'p-value for {column_name}: {result[1]}')
        for key, value in result[4].items():
            print(f'Critical Values for {column_name} {key}: {value}')
        return result[1] < significance

    def find_diff_order(series, column_name):
        """Return how many differences (0..max_diff) were applied before the test passed or the cap was hit."""
        order = 0
        is_stationary = perform_adf(series, column_name)
        # The cap guarantees termination: a p-value that never drops below the
        # threshold (including NaN) would otherwise keep the loop running.
        while not is_stationary and order < max_diff:
            print(f'The time series {column_name} is not stationary. Differencing the series and re-testing...')
            series = series.diff().dropna()
            order += 1
            is_stationary = perform_adf(series, f'{column_name} (Differenced {order})')

        if is_stationary:
            print(f'The time series {column_name} is stationary after differencing {order} time(s).')
        else:
            print(f'The time series {column_name} is still not stationary after differencing {order} time(s).')

        return order

    # Collect each distinct DataFrame once (compared by identity) so that a frame
    # passed under two names is not differenced twice.
    frames = []
    for frame in (train_ts, val_ts, test_ts):
        if not any(frame is seen for seen in frames):
            frames.append(frame)

    diff_orders = {}
    for col in train_ts.columns:
        print(f'Checking stationarity for {col}')
        # Missing values are removed before testing; the order is decided on training data only.
        order = find_diff_order(train_ts[col].dropna(), col)
        diff_orders[col] = order
        # Apply exactly the same number of differences to every frame.
        for frame in frames:
            for _ in range(order):
                frame[col] = frame[col].diff()
        print("\n")  # Add a space between outputs

    # Assigning a shortened series back to a column would only re-insert NaN via
    # index alignment, so the undefined rows are removed at frame level instead.
    differenced_cols = [col for col, order in diff_orders.items() if order > 0]
    if differenced_cols:
        for frame in frames:
            frame.dropna(subset=differenced_cols, inplace=True)

    return diff_orders

# Iterate over each column and perform the combined ADF test
diff_orders = make_stationary(train_ts, val_ts, test_ts)

In [None]:
# Plot and decompose each time series as grid of plots
for column in train_ts.columns:
    # Drop missing values first: differenced columns can hold NaN in their leading
    # rows, and seasonal_decompose does not handle missing values.
    series = train_ts[column].dropna()

    # Additive decomposition with a period of 24 observations (one day of hourly data).
    decomposition = seasonal_decompose(series, model='additive', period=24)

    # One panel per component, top to bottom: raw series, trend, seasonal, residual.
    panels = [
        (series, f"{column} Time Series"),
        (decomposition.trend, f"{column} Trend"),
        (decomposition.seasonal, f"{column} Seasonal"),
        (decomposition.resid, f"{column} Residual"),
    ]

    fig, axes = plt.subplots(len(panels), 1, figsize=(10, 10))
    for panel_ax, (panel_series, panel_title) in zip(axes, panels):
        panel_series.plot(ax=panel_ax)
        panel_ax.set_title(panel_title)
        panel_ax.set_ylabel("Value")
        panel_ax.set_xlabel("Date")
        panel_ax.grid(True)

    plt.tight_layout()
    plt.show()
    # Release the figure so that looping over many columns does not accumulate open figures.
    plt.close(fig)

*Test ACF and PACF*

In [None]:
# Check autocorrelation of each series in the training data and display as grid
# (plot_acf / plot_pacf are imported in the notebook's import cell).
for col in train_ts.columns:
    # Differenced columns can hold NaN in their leading rows; drop them so the
    # correlation estimates are computed on observed values only.
    series = train_ts[col].dropna()

    # Left panel: autocorrelation, right panel: partial autocorrelation, 50 lags each.
    fig, ax = plt.subplots(1, 2, figsize=(15, 5))
    plot_acf(series, ax=ax[0], lags=50, title=f'Autocorrelation of {col}')
    plot_pacf(series, ax=ax[1], lags=50, title=f'Partial Autocorrelation of {col}')
    plt.show()
    # Release the figure so that looping over many columns does not accumulate open figures.
    plt.close(fig)

# MULTIVARIATE ANALYSIS

In [None]:
# Pairwise scatter plots, with a kernel density estimate on the diagonal
sns.pairplot(train_ts, diag_kind="kde")
plt.show()

# Rank the features by the absolute value of their correlation with the first
# column of train_ts (row 0 of the correlation matrix), strongest first.
# NOTE(review): this presumes the target variable is the first column — confirm.
sorted_features = (
    train_ts.corr()
    .iloc[0]
    .abs()
    .sort_values(ascending=False)
    .index
    .tolist()
)

# Correlation matrix with rows/columns in that ranked order, drawn as an annotated heatmap
corr_matrix = train_ts.loc[:, sorted_features].corr()
sns.heatmap(corr_matrix, annot=True, cmap="twilight")
plt.show()

# The same correlation matrix as plain text
print(corr_matrix.to_string())

In [None]:
# Granger Causality Tests for the series in train_ts
# (grangercausalitytests is already imported in the notebook's import cell, so it
# is not imported a second time here.)
max_lag = 12  # identical for every pair, so it is set once outside the loops
granger_results = {}  # (cause, effect) -> result returned by grangercausalitytests

for col in train_ts.columns:
    for col2 in train_ts.columns:
        if col == col2:
            continue
        print(f'Granger Causality Test for {col} and {col2}')
        # grangercausalitytests checks whether the series in the SECOND column
        # Granger-causes the series in the FIRST column, i.e. col2 -> col here.
        # Rows with missing values are dropped (differenced columns can start with NaN).
        data = train_ts[[col, col2]].dropna()
        # NOTE(review): `verbose` is reported as deprecated in recent statsmodels
        # releases — confirm against the installed version.
        results = grangercausalitytests(data, max_lag, verbose=True)
        # Keep every pair's result instead of overwriting it on the next iteration.
        granger_results[(col2, col)] = results
        print("\n")  # Add a space between outputs

# Baseline tbd