In [None]:
import pandas as pd
import numpy as np
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import train_test_split ,GridSearchCV,cross_val_score
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import os

In [78]:
# Load data
wind_assset_name = 'AKE1 McBride Lake Windfarm'
solar_asset_name = 'BSC1 Brooks Solar'

def clean_df(file_path: os.PathLike):
    df = pd.read_csv(file_path, encoding='unicode_escape')  # Read CSV file with proper encoding
    df = df.loc[:, ~df.columns.str.contains('^Unnamed')]  # Remove any "Unnamed" columns that might be auto-generated
    return df  # Return the cleaned DataFrame

In [79]:
# Read the data
data_wind = clean_df(f'./Data/Merged/{wind_assset_name}_Processed_and_Data.csv')
data_solar = clean_df(f'./Data/Merged/{solar_asset_name}_Processed_and_Data.csv')
data_solar["Is Solar"] = True
data_wind["Is Solar"] = False
data = pd.concat([data_wind, data_solar], axis=0)
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 70144 entries, 0 to 43842
Data columns (total 13 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Volume                        70144 non-null  float64
 1   Maximum Capability            70144 non-null  float64
 2   System Capability             70144 non-null  float64
 3   Date                          70144 non-null  object 
 4   Air Temp. Inst. (ÃÂ°C)       70144 non-null  float64
 5   Humidity Inst. (%)            70144 non-null  float64
 6   Incoming Solar Rad. (W/m2)    70144 non-null  object 
 7   Precip. (mm)                  70144 non-null  float64
 8   Wind Speed 10 m Syno. (km/h)  70144 non-null  float64
 9   Wind Dir. 10 m Syno. (ÃÂ°)   70144 non-null  float64
 10  Wind Speed 10 m Avg. (km/h)   70144 non-null  float64
 11  Wind Dir. 10 m Avg. (ÃÂ°)    70144 non-null  float64
 12  Is Solar                      70144 non-null  bool   
dtypes: boo

In [80]:
# Rename columns to fix encoding issues
data.rename(columns={
    "Air Temp. Inst. (Ã‚Â°C)": "Air Temp. Inst. (°C)",
    "Wind Dir. 10 m Syno. (Ã‚Â°)": "Wind Dir. 10 m Syno. (°)",
    "Wind Dir. 10 m Avg. (Ã‚Â°)": "Wind Dir. 10 m Avg. (°)"
}, inplace=True)

In [81]:
# Convert and fill missing values
data["Incoming Solar Rad. (W/m2)"] = pd.to_numeric(data["Incoming Solar Rad. (W/m2)"], errors="coerce")
data["Incoming Solar Rad. (W/m2)"].fillna(data["Incoming Solar Rad. (W/m2)"].median(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data["Incoming Solar Rad. (W/m2)"].fillna(data["Incoming Solar Rad. (W/m2)"].median(), inplace=True)


In [82]:
# Fix Datetime
data["Date"] = pd.to_datetime(data["Date"])  # Convert the date column to datetime
# data.set_index("Date", inplace=True)  # Set Date as index
data["Year"]  = data["Date"].dt.year
data["Month"]  = data["Date"].dt.month
data["Day"]  = data["Date"].dt.day
data["Hour"]  = data["Date"].dt.hour
data.drop("Date", axis=1, inplace=True)  # Drop the original date column
data.drop("Maximum Capability", axis=1, inplace=True)  # Drop the original date column
data.drop("System Capability", axis=1, inplace=True)  # Drop the original date column
data.sort_index(inplace=True)  # Ensure the data is sorted by time
data.head()

Unnamed: 0,Volume,Air Temp. Inst. (ÃÂ°C),Humidity Inst. (%),Incoming Solar Rad. (W/m2),Precip. (mm),Wind Speed 10 m Syno. (km/h),Wind Dir. 10 m Syno. (ÃÂ°),Wind Speed 10 m Avg. (km/h),Wind Dir. 10 m Avg. (ÃÂ°),Is Solar,Year,Month,Day,Hour
0,0.0,-1.3,83.0,0.0,0.0,9.7,287.0,12.9,279.0,True,2020,1,1,0
0,23.665234,-22.0,62.3,0.0,0.0,21.4,253.4,19.7,251.5,False,2022,1,1,0
1,29.009501,-20.8,64.9,0.0,0.0,28.8,241.0,27.0,246.3,False,2022,1,1,1
1,0.0,-0.6,79.0,0.0,0.0,11.9,314.0,9.9,303.0,True,2020,1,1,1
2,0.0,-1.9,82.0,0.0,0.0,9.6,335.0,11.0,322.0,True,2020,1,1,2


In [83]:
# Define seasons based on month
def get_season(month):
    if month in [12, 1, 2]:
        return "Winter"
    elif month in [3, 4, 5]:
        return "Spring"
    elif month in [6, 7, 8]:
        return "Summer"
    else:
        return "Fall"

data["Season"] = data["Month"].apply(get_season)
data = pd.get_dummies(data, columns=["Season"], drop_first=True)  # One-hot encode

In [84]:
# Cyclical encoding for month and hour
data["Month_sin"] = np.sin(2 * np.pi * data["Month"] / 12)
data["Month_cos"] = np.cos(2 * np.pi * data["Month"] / 12)
data["Hour_sin"] = np.sin(2 * np.pi * data["Hour"] / 24)
data["Hour_cos"] = np.cos(2 * np.pi * data["Hour"] / 24)


In [85]:
# Define the target variable: 'Volume' (the power generated)
data['Incoming Solar Rad. (W/m2)'] = pd.to_numeric(data['Incoming Solar Rad. (W/m2)'], errors='coerce')

# Handle 0 values in the target variable by applying log transformation with +1 (log(x+1)) to avoid issues with log(0)
y = data['Volume'] # Replace zero values with NaN for easier handling

# Create a binary indicator feature for zero values
data['Zero_Volume'] = (data['Volume'] == 0).astype(int)

# Optionally, you can replace NaN values with the mean or median (if using log transformation and there are NaNs)
y = y.fillna(y.mean())  # You can also try using median depending on the data

# Apply log transformation to the target variable (log(x + 1))
y_log_transformed = np.log1p(y)

# Define the features: all columns except 'Volume' (and the new 'Zero_Volume' indicator)
X = data.drop(['Volume'], axis=1)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y_log_transformed, test_size=0.2, random_state=42)

# Checking the data info to verify the changes
X.info()


<class 'pandas.core.frame.DataFrame'>
Index: 70144 entries, 0 to 43842
Data columns (total 21 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Air Temp. Inst. (ÃÂ°C)       70144 non-null  float64
 1   Humidity Inst. (%)            70144 non-null  float64
 2   Incoming Solar Rad. (W/m2)    70144 non-null  float64
 3   Precip. (mm)                  70144 non-null  float64
 4   Wind Speed 10 m Syno. (km/h)  70144 non-null  float64
 5   Wind Dir. 10 m Syno. (ÃÂ°)   70144 non-null  float64
 6   Wind Speed 10 m Avg. (km/h)   70144 non-null  float64
 7   Wind Dir. 10 m Avg. (ÃÂ°)    70144 non-null  float64
 8   Is Solar                      70144 non-null  bool   
 9   Year                          70144 non-null  int32  
 10  Month                         70144 non-null  int32  
 11  Day                           70144 non-null  int32  
 12  Hour                          70144 non-null  int32  
 13  Season

In [None]:
rf = MLPRegressor(solver='lbfgs', alpha=1e-5,
                    hidden_layer_sizes=(5, 2), random_state=1)

rf.fit(X_train, y_train)

In [87]:
from sklearn.metrics import mean_absolute_error, mean_squared_log_error, r2_score

# Predict the target values using the trained model on both training and test sets
y_train_pred = rf.predict(X_train)  # Predictions on the training set
y_test_pred = rf.predict(X_test)    # Predictions on the test set

# Calculate the Mean Absolute Error (MAE) for both training and test sets
# MAE measures the average absolute difference between predicted and actual values.
train_mae = mean_absolute_error(y_train, y_train_pred)
valid_mae = mean_absolute_error(y_test, y_test_pred)

# Calculate the Root Mean Squared Logarithmic Error (RMSLE) for both training and test sets
# RMSLE is often used for regression tasks when predictions can have a wide range.
# It penalizes underestimations more than overestimations.
train_rmsle = np.sqrt(mean_squared_log_error(y_train, y_train_pred))
valid_rmsle = np.sqrt(mean_squared_log_error(y_test, y_test_pred))

# Calculate the R-squared (R^2) score for both training and test sets
# R^2 measures how well the model explains the variance of the target variable.
# Higher values closer to 1 indicate better fit.
train_r2 = r2_score(y_train, y_train_pred)
valid_r2 = r2_score(y_test, y_test_pred)

# Create a dictionary to store and organize all the calculated results
results = {
    'Training MAE': train_mae,  # MAE for training set
    'Valid MAE': valid_mae,     # MAE for validation/test set
    'Training RMSLE': np.float64(train_rmsle),  # RMSLE for training set
    'Valid RMSLE': np.float64(valid_rmsle),    # RMSLE for validation/test set
    'Training R^2': train_r2,  # R^2 for training set
    'Valid R^2': valid_r2     # R^2 for validation/test set
}

# Print the results dictionary containing all the metrics
print(results)

{'Training MAE': 1.267434837663954, 'Valid MAE': 1.2687187009568146, 'Training RMSLE': np.float64(0.6587339436255257), 'Valid RMSLE': np.float64(0.6584453106943468), 'Training R^2': 0.0, 'Valid R^2': -3.481658370607654e-05}


In [88]:
rf.predict(X_test.head())

array([1.23224468, 1.23224468, 1.23224468, 1.23224468, 1.23224468])

In [89]:
y_test.head()

16168    4.268519
31036    2.044210
7458     0.000000
13394    0.000000
7141     0.631863
Name: Volume, dtype: float64