In [5]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
spark

In [6]:
import yaml

# load config yaml file
config_file = "/Users/macbook/Development/solar_power_forecast_ML/config.yaml"
with open(config_file, 'r') as f:
    config = yaml.safe_load(f)
    
interim_data_path = config['paths']['interim_data_path']
cleaned_file_name = config['file_names']['cleaned_file_name']
cleaned_data_file =  f"{interim_data_path}/{cleaned_file_name}"

In [7]:

df = spark.read.parquet(cleaned_data_file, header=True, inferSchema=True)

In [8]:
df.printSchema()
df.show(5)

root
 |-- timestamp: timestamp (nullable = true)
 |-- power_output: double (nullable = true)
 |-- direct_irradiance: double (nullable = true)
 |-- diffuse_irradiance: double (nullable = true)
 |-- sun_height: double (nullable = true)
 |-- air_temperature: double (nullable = true)
 |-- wind_speed_at_10m: double (nullable = true)

+-------------------+------------+-----------------+------------------+----------+---------------+-----------------+
|          timestamp|power_output|direct_irradiance|diffuse_irradiance|sun_height|air_temperature|wind_speed_at_10m|
+-------------------+------------+-----------------+------------------+----------+---------------+-----------------+
|2020-01-01 00:11:00|         0.0|              0.0|               0.0|       0.0|            1.8|             2.34|
|2020-01-01 01:11:00|         0.0|              0.0|               0.0|       0.0|           1.48|             2.41|
|2020-01-01 02:11:00|         0.0|              0.0|               0.0|       0.0|  

## Features engineering
- define target variable 
- add lag variables
- add time features

In [9]:
# Create a window specification for the lead function
from pyspark.sql.window import Window
import pyspark.sql.functions as F

# Define window specification, ordered by time
window_spec = Window.orderBy("timestamp")

# Create the target column by getting the NEXT hour's power output
df_target = df.withColumn("power_output_lead_1h",
                          F.lead("power_output", 1).over(window_spec)
                         )
df_target.printSchema()
df_target.show(5)

root
 |-- timestamp: timestamp (nullable = true)
 |-- power_output: double (nullable = true)
 |-- direct_irradiance: double (nullable = true)
 |-- diffuse_irradiance: double (nullable = true)
 |-- sun_height: double (nullable = true)
 |-- air_temperature: double (nullable = true)
 |-- wind_speed_at_10m: double (nullable = true)
 |-- power_output_lead_1h: double (nullable = true)



25/04/14 08:56:05 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/04/14 08:56:05 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/04/14 08:56:05 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/04/14 08:56:05 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/04/14 08:56:05 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


+-------------------+------------+-----------------+------------------+----------+---------------+-----------------+--------------------+
|          timestamp|power_output|direct_irradiance|diffuse_irradiance|sun_height|air_temperature|wind_speed_at_10m|power_output_lead_1h|
+-------------------+------------+-----------------+------------------+----------+---------------+-----------------+--------------------+
|2020-01-01 00:11:00|         0.0|              0.0|               0.0|       0.0|            1.8|             2.34|                 0.0|
|2020-01-01 01:11:00|         0.0|              0.0|               0.0|       0.0|           1.48|             2.41|                 0.0|
|2020-01-01 02:11:00|         0.0|              0.0|               0.0|       0.0|           1.04|             2.34|                 0.0|
|2020-01-01 03:11:00|         0.0|              0.0|               0.0|       0.0|           0.64|             2.21|                 0.0|
|2020-01-01 04:11:00|         0.0|

In [10]:
# Assuming window_spec is defined as above

df_lags = df_target

lags_to_create = {
    "power_output": [1, 2],      # Lag 1 and 2 hours for power
    "direct_irradiance": [1, 2],
    "diffuse_irradiance": [1, 2],
    "sun_height": [1, 2],
    "air_temperature": [1],       # Lag 1 hour for temperature
    "wind_speed_at_10m": [1, 2]    
}

for col_name, lag_periods in lags_to_create.items():
    for period in lag_periods:
        lag_col_name = f"{col_name}_lag_{period}hr"
        df_lags = df_lags.withColumn(
            lag_col_name,
            F.lag(col_name, period).over(window_spec)
        )
df_lags.printSchema()
df_lags.show(5)

root
 |-- timestamp: timestamp (nullable = true)
 |-- power_output: double (nullable = true)
 |-- direct_irradiance: double (nullable = true)
 |-- diffuse_irradiance: double (nullable = true)
 |-- sun_height: double (nullable = true)
 |-- air_temperature: double (nullable = true)
 |-- wind_speed_at_10m: double (nullable = true)
 |-- power_output_lead_1h: double (nullable = true)
 |-- power_output_lag_1hr: double (nullable = true)
 |-- power_output_lag_2hr: double (nullable = true)
 |-- direct_irradiance_lag_1hr: double (nullable = true)
 |-- direct_irradiance_lag_2hr: double (nullable = true)
 |-- diffuse_irradiance_lag_1hr: double (nullable = true)
 |-- diffuse_irradiance_lag_2hr: double (nullable = true)
 |-- sun_height_lag_1hr: double (nullable = true)
 |-- sun_height_lag_2hr: double (nullable = true)
 |-- air_temperature_lag_1hr: double (nullable = true)
 |-- wind_speed_at_10m_lag_1hr: double (nullable = true)
 |-- wind_speed_at_10m_lag_2hr: double (nullable = true)



25/04/14 08:56:06 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/04/14 08:56:06 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/04/14 08:56:06 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/04/14 08:56:07 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/04/14 08:56:07 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


+-------------------+------------+-----------------+------------------+----------+---------------+-----------------+--------------------+--------------------+--------------------+-------------------------+-------------------------+--------------------------+--------------------------+------------------+------------------+-----------------------+-------------------------+-------------------------+
|          timestamp|power_output|direct_irradiance|diffuse_irradiance|sun_height|air_temperature|wind_speed_at_10m|power_output_lead_1h|power_output_lag_1hr|power_output_lag_2hr|direct_irradiance_lag_1hr|direct_irradiance_lag_2hr|diffuse_irradiance_lag_1hr|diffuse_irradiance_lag_2hr|sun_height_lag_1hr|sun_height_lag_2hr|air_temperature_lag_1hr|wind_speed_at_10m_lag_1hr|wind_speed_at_10m_lag_2hr|
+-------------------+------------+-----------------+------------------+----------+---------------+-----------------+--------------------+--------------------+--------------------+---------------------

In [11]:
# count null values
df_lags.select([F.count(F.when(F.col(c).isNull(), c)).alias(c) for c in df_lags.columns]).show()


25/04/14 08:56:07 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/04/14 08:56:07 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/04/14 08:56:07 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/04/14 08:56:07 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


+---------+------------+-----------------+------------------+----------+---------------+-----------------+--------------------+--------------------+--------------------+-------------------------+-------------------------+--------------------------+--------------------------+------------------+------------------+-----------------------+-------------------------+-------------------------+
|timestamp|power_output|direct_irradiance|diffuse_irradiance|sun_height|air_temperature|wind_speed_at_10m|power_output_lead_1h|power_output_lag_1hr|power_output_lag_2hr|direct_irradiance_lag_1hr|direct_irradiance_lag_2hr|diffuse_irradiance_lag_1hr|diffuse_irradiance_lag_2hr|sun_height_lag_1hr|sun_height_lag_2hr|air_temperature_lag_1hr|wind_speed_at_10m_lag_1hr|wind_speed_at_10m_lag_2hr|
+---------+------------+-----------------+------------------+----------+---------------+-----------------+--------------------+--------------------+--------------------+-------------------------+-------------------------

In [12]:
print(f"Rows before dropping NA values: {df_lags.count()}")
df_cleaned = df_lags.na.drop()

print(f"Rows AFTER dropping NA values: {df_cleaned.count()}")

# count null values
df_cleaned.select([F.count(F.when(F.col(c).isNull(), c)).alias(c) for c in df_cleaned.columns]).show()


Rows before dropping NA values: 35064


25/04/14 08:56:08 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/04/14 08:56:08 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/04/14 08:56:08 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/04/14 08:56:08 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


Rows AFTER dropping NA values: 35061


25/04/14 08:56:08 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/04/14 08:56:08 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/04/14 08:56:09 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/04/14 08:56:09 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


+---------+------------+-----------------+------------------+----------+---------------+-----------------+--------------------+--------------------+--------------------+-------------------------+-------------------------+--------------------------+--------------------------+------------------+------------------+-----------------------+-------------------------+-------------------------+
|timestamp|power_output|direct_irradiance|diffuse_irradiance|sun_height|air_temperature|wind_speed_at_10m|power_output_lead_1h|power_output_lag_1hr|power_output_lag_2hr|direct_irradiance_lag_1hr|direct_irradiance_lag_2hr|diffuse_irradiance_lag_1hr|diffuse_irradiance_lag_2hr|sun_height_lag_1hr|sun_height_lag_2hr|air_temperature_lag_1hr|wind_speed_at_10m_lag_1hr|wind_speed_at_10m_lag_2hr|
+---------+------------+-----------------+------------------+----------+---------------+-----------------+--------------------+--------------------+--------------------+-------------------------+-------------------------

### Convert timestamp time features: year, month, day, dayofweek, hour

In [13]:
from pyspark.sql import functions as F

df_time_feats = df_cleaned.withColumn("hour",F.hour("timestamp")      
                         ).withColumn( "dayofweek", F.dayofweek("timestamp")                      
                         ).withColumn("day", F.dayofmonth("timestamp")
                         ).withColumn( "month", F.month("timestamp")
                         ).withColumn("year",F.year("timestamp")
                         ).withColumn("local_timestamp", F.from_utc_timestamp(F.col("timestamp"), "Europe/Berlin")
                                      )

df_time_feats.printSchema()
df_time_feats.show(5)


root
 |-- timestamp: timestamp (nullable = true)
 |-- power_output: double (nullable = true)
 |-- direct_irradiance: double (nullable = true)
 |-- diffuse_irradiance: double (nullable = true)
 |-- sun_height: double (nullable = true)
 |-- air_temperature: double (nullable = true)
 |-- wind_speed_at_10m: double (nullable = true)
 |-- power_output_lead_1h: double (nullable = true)
 |-- power_output_lag_1hr: double (nullable = true)
 |-- power_output_lag_2hr: double (nullable = true)
 |-- direct_irradiance_lag_1hr: double (nullable = true)
 |-- direct_irradiance_lag_2hr: double (nullable = true)
 |-- diffuse_irradiance_lag_1hr: double (nullable = true)
 |-- diffuse_irradiance_lag_2hr: double (nullable = true)
 |-- sun_height_lag_1hr: double (nullable = true)
 |-- sun_height_lag_2hr: double (nullable = true)
 |-- air_temperature_lag_1hr: double (nullable = true)
 |-- wind_speed_at_10m_lag_1hr: double (nullable = true)
 |-- wind_speed_at_10m_lag_2hr: double (nullable = true)
 |-- hour: inte

25/04/14 08:56:09 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/04/14 08:56:09 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/04/14 08:56:09 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/04/14 08:56:09 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/04/14 08:56:09 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


+-------------------+------------+-----------------+------------------+----------+---------------+-----------------+--------------------+--------------------+--------------------+-------------------------+-------------------------+--------------------------+--------------------------+------------------+------------------+-----------------------+-------------------------+-------------------------+----+---------+---+-----+----+-------------------+
|          timestamp|power_output|direct_irradiance|diffuse_irradiance|sun_height|air_temperature|wind_speed_at_10m|power_output_lead_1h|power_output_lag_1hr|power_output_lag_2hr|direct_irradiance_lag_1hr|direct_irradiance_lag_2hr|diffuse_irradiance_lag_1hr|diffuse_irradiance_lag_2hr|sun_height_lag_1hr|sun_height_lag_2hr|air_temperature_lag_1hr|wind_speed_at_10m_lag_1hr|wind_speed_at_10m_lag_2hr|hour|dayofweek|day|month|year|    local_timestamp|
+-------------------+------------+-----------------+------------------+----------+---------------+--

In [14]:
# reorgarnise logically the columns
old_order = df_time_feats.columns

new_order = [
    "timestamp",        
    "local_timestamp",
    "year",
    "month",
    "day",
    "dayofweek",
    "hour",

    "power_output_lead_1h", 

    "power_output",
    "power_output_lag_1hr",
    "power_output_lag_2hr",

    "direct_irradiance",
    "direct_irradiance_lag_1hr",
    "direct_irradiance_lag_2hr",
    "diffuse_irradiance",
    "diffuse_irradiance_lag_1hr",
    "diffuse_irradiance_lag_2hr",

    "sun_height",
    "sun_height_lag_1hr",
    "sun_height_lag_2hr",

    "air_temperature",
    "air_temperature_lag_1hr",
    "wind_speed_at_10m",
    "wind_speed_at_10m_lag_1hr",
    "wind_speed_at_10m_lag_2hr",
]

# 3. Perform the simple set comparison
if set(old_order) == set(new_order):
    print("👍 Old columns & new columns contain the exact same set of names.")
    
else:
    print("ERROR/WARNING: No match.")
    print(f"  Columns ONLY in old columns: {sorted(list(set(old_order) - set(new_order)))}")
    print(f"  Columns ONLY in new column: {sorted(list(set(new_order) - set(old_order)))}")

👍 Old columns & new columns contain the exact same set of names.


In [15]:
# change the order of the columns
df_time_feats = df_time_feats.select(new_order)

df_time_feats.printSchema()
df_time_feats.describe().show()
df_time_feats.show(5)

root
 |-- timestamp: timestamp (nullable = true)
 |-- local_timestamp: timestamp (nullable = true)
 |-- year: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- day: integer (nullable = true)
 |-- dayofweek: integer (nullable = true)
 |-- hour: integer (nullable = true)
 |-- power_output_lead_1h: double (nullable = true)
 |-- power_output: double (nullable = true)
 |-- power_output_lag_1hr: double (nullable = true)
 |-- power_output_lag_2hr: double (nullable = true)
 |-- direct_irradiance: double (nullable = true)
 |-- direct_irradiance_lag_1hr: double (nullable = true)
 |-- direct_irradiance_lag_2hr: double (nullable = true)
 |-- diffuse_irradiance: double (nullable = true)
 |-- diffuse_irradiance_lag_1hr: double (nullable = true)
 |-- diffuse_irradiance_lag_2hr: double (nullable = true)
 |-- sun_height: double (nullable = true)
 |-- sun_height_lag_1hr: double (nullable = true)
 |-- sun_height_lag_2hr: double (nullable = true)
 |-- air_temperature: double (nullable =

25/04/14 08:56:10 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/04/14 08:56:10 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/04/14 08:56:10 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
25/04/14 08:56:10 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/04/14 08:56:10 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/04/14 08:56:12 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance

+-------+------------------+------------------+------------------+------------------+------------------+--------------------+------------------+--------------------+--------------------+-----------------+-------------------------+-------------------------+------------------+--------------------------+--------------------------+------------------+------------------+------------------+------------------+-----------------------+------------------+-------------------------+-------------------------+
|summary|              year|             month|               day|         dayofweek|              hour|power_output_lead_1h|      power_output|power_output_lag_1hr|power_output_lag_2hr|direct_irradiance|direct_irradiance_lag_1hr|direct_irradiance_lag_2hr|diffuse_irradiance|diffuse_irradiance_lag_1hr|diffuse_irradiance_lag_2hr|        sun_height|sun_height_lag_1hr|sun_height_lag_2hr|   air_temperature|air_temperature_lag_1hr| wind_speed_at_10m|wind_speed_at_10m_lag_1hr|wind_speed_at_10m_lag_2hr

25/04/14 08:56:12 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/04/14 08:56:12 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


## Prepare train, eval. & test data


In [16]:
import pandas as pd

# turn Spark df into a pandas dataframe & dropping timestamps features

df_pd_train = df_time_feats.drop("timestamp", "local_timestamp").toPandas()
round(df_pd_train.describe(),2)
df_pd_train.info()

25/04/14 08:56:13 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/04/14 08:56:13 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/04/14 08:56:13 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/04/14 08:56:13 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/04/14 08:56:13 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35061 entries, 0 to 35060
Data columns (total 23 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   year                        35061 non-null  int32  
 1   month                       35061 non-null  int32  
 2   day                         35061 non-null  int32  
 3   dayofweek                   35061 non-null  int32  
 4   hour                        35061 non-null  int32  
 5   power_output_lead_1h        35061 non-null  float64
 6   power_output                35061 non-null  float64
 7   power_output_lag_1hr        35061 non-null  float64
 8   power_output_lag_2hr        35061 non-null  float64
 9   direct_irradiance           35061 non-null  float64
 10  direct_irradiance_lag_1hr   35061 non-null  float64
 11  direct_irradiance_lag_2hr   35061 non-null  float64
 12  diffuse_irradiance          35061 non-null  float64
 13  diffuse_irradiance_lag_1hr  350

In [17]:
from sklearn.model_selection import train_test_split

features = list(df_pd_train.drop("power_output_lead_1h", axis=1).columns)
#print(type(features))
#print(features)
target = "power_output_lead_1h"

X = df_pd_train[features]
y = df_pd_train[target]


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape) 


(28048, 22)
(7013, 22)
(28048,)
(7013,)


In [18]:
import plotly.express as px
template = 'plotly_dark'

fig = px.scatter(X_train, 
              x="direct_irradiance", 
              y="power_output",
              template=template
    )
fig.show()

fig = px.scatter(X_test, 
              x="direct_irradiance", 
              y="power_output",
              template=template
    )
fig.show()

## Scale data with StandardScaler

In [19]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(X_train) 
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

## Model test - Regression
- use a simple linear regression model
- use a gradient boosting regression tree model

In [20]:
from sklearn.linear_model import LinearRegression

# Create linear regression object   
linear_model = LinearRegression()
linear_model.fit(X_train_scaled, y_train)
linear_model.score(X_test_scaled, y_test)


0.8897928455189041

In [21]:
# evaluate performance
from sklearn.metrics import mean_absolute_percentage_error

y_pred = linear_model.predict(X_test)
mean_absolute_percentage_error(y_test, y_pred)


X has feature names, but LinearRegression was fitted without feature names



8.696334452913329e+19

In [22]:
from sklearn.ensemble import HistGradientBoostingRegressor

HGBR_model = HistGradientBoostingRegressor()
HGBR_model.fit(X_train_scaled, y_train)
HGBR_model.score(X_test_scaled, y_test)


0.9121859029173282

In [23]:
# evaluate performance
from sklearn.metrics import mean_absolute_percentage_error

y_pred = HGBR_model.predict(X_test)
mean_absolute_percentage_error(y_test, y_pred)


X has feature names, but HistGradientBoostingRegressor was fitted without feature names



6.909155745748114e+17

## Change the split strategy & use TimeSeriesSplit

In [24]:
from sklearn.model_selection import TimeSeriesSplit

tscv = TimeSeriesSplit(
    n_splits=5,
    gap=48
)

splits = list(tscv.split(X, y))


In [25]:
# Training the model and evaluating its performance based on MAPE.
# --- Loop through the splits ---
for train_idx, test_idx in tscv.split(X):
    X_train = X.iloc[train_idx]
    X_test = X.iloc[test_idx]
    y_train = y.iloc[train_idx]
    y_test = y.iloc[test_idx]

model = HistGradientBoostingRegressor().fit(X_train, y_train)
y_pred = model.predict(X_test)
mean_absolute_percentage_error(y_test, y_pred)

1.675633049465108e+16

## Save data for next notebook

In [30]:
features_egnineered_file_name = config['file_names']['features_engineered_file_name']
processed_data_path = config['paths']['processed_data_path']
features_data_file = f"{processed_data_path}/{features_egnineered_file_name}"

df_time_feats.toPandas().to_parquet(features_data_file, index=False, compression="snappy")