In [2]:
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import os

In [3]:
# Name of the generating asset to analyse; it is interpolated into the path of the merged CSV
# when the data is loaded.
# NOTE(review): the identifier is spelled with three s's ("assset_name"). The data-loading cell
# refers to it by this exact name, so both must be renamed together.
assset_name = 'AKE1 McBride Lake Windfarm'

In [4]:
def clean_df(file_path: os.PathLike):
    """Load a CSV file and discard every column whose header starts with "Unnamed".

    Parameters
    ----------
    file_path : os.PathLike
        Location of the CSV file. It is decoded with the 'unicode_escape' codec.

    Returns
    -------
    pandas.DataFrame
        The file's contents without the "Unnamed..." columns (pandas gives that
        name to header-less columns such as a saved index).
    """
    frame = pd.read_csv(file_path, encoding='unicode_escape')
    # True for the columns to keep, i.e. those whose name does not begin with "Unnamed"
    keep_mask = ~frame.columns.str.contains('^Unnamed')
    return frame.loc[:, keep_mask]

In [5]:
# Load the merged generation + weather table for the selected asset
merged_csv_path = f'./Data/Merged/{assset_name}_Processed_and_Data.csv'
data = clean_df(merged_csv_path)

# Column names, dtypes and non-null counts of the freshly loaded table
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26301 entries, 0 to 26300
Data columns (total 12 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Volume                        26301 non-null  float64
 1   Maximum Capability            26301 non-null  float64
 2   System Capability             26301 non-null  float64
 3   Date                          26301 non-null  object 
 4   Air Temp. Inst. (ÃÂ°C)       26301 non-null  float64
 5   Humidity Inst. (%)            26301 non-null  float64
 6   Incoming Solar Rad. (W/m2)    26301 non-null  object 
 7   Precip. (mm)                  26301 non-null  float64
 8   Wind Speed 10 m Syno. (km/h)  26301 non-null  float64
 9   Wind Dir. 10 m Syno. (ÃÂ°)   26301 non-null  float64
 10  Wind Speed 10 m Avg. (km/h)   26301 non-null  float64
 11  Wind Dir. 10 m Avg. (ÃÂ°)    26301 non-null  float64
dtypes: float64(10), object(2)
memory usage: 2.4+ MB


In [6]:
# Fix Datetime
data["Date"] = pd.to_datetime(data["Date"])  # Parse the timestamp strings into datetime values

# Put the rows in chronological order while the timestamp column still exists.
# The frame is indexed by a plain RangeIndex, so sort_index() cannot order it by time;
# sorting on "Date" does. The stable sort keeps the current relative order of rows that
# share a timestamp, and the index is renumbered to match the new row order.
data = data.sort_values("Date", kind="stable").reset_index(drop=True)

# Split the timestamp into numeric calendar parts
data["Year"] = data["Date"].dt.year
data["Month"] = data["Date"].dt.month
data["Day"] = data["Date"].dt.day
data["Hour"] = data["Date"].dt.hour

# Remove the raw timestamp (now represented by the four columns above) together with the
# two capability columns, which are dropped from the table before features are defined.
data = data.drop(columns=["Date", "Maximum Capability", "System Capability"])
data.head()

Unnamed: 0,Volume,Air Temp. Inst. (ÃÂ°C),Humidity Inst. (%),Incoming Solar Rad. (W/m2),Precip. (mm),Wind Speed 10 m Syno. (km/h),Wind Dir. 10 m Syno. (ÃÂ°),Wind Speed 10 m Avg. (km/h),Wind Dir. 10 m Avg. (ÃÂ°),Year,Month,Day,Hour
0,23.665234,-22.0,62.3,0.0,0.0,21.4,253.4,19.7,251.5,2022,1,1,0
1,29.009501,-20.8,64.9,0.0,0.0,28.8,241.0,27.0,246.3,2022,1,1,1
2,29.921424,-20.2,64.3,0.0,0.0,36.4,255.0,37.0,255.3,2022,1,1,2
3,30.036901,-17.4,63.5,0.0,0.0,34.5,258.3,34.6,255.7,2022,1,1,3
4,30.229873,-16.1,63.4,0.0,0.0,43.9,269.3,38.1,264.6,2022,1,1,4


In [7]:
# The air-temperature header contains mis-decoded degree-sign bytes (visible in the
# data.info() listing), which makes the full name fragile to type as a literal.
# Resolve it from its plain-ASCII prefix instead and fail loudly if that is ambiguous.
temp_matches = [col for col in data.columns if col.startswith("Air Temp. Inst.")]
if len(temp_matches) != 1:
    raise KeyError(f"expected exactly one 'Air Temp. Inst.' column, found {temp_matches}")
temp_col = temp_matches[0]

wind_speed_avg = data["Wind Speed 10 m Avg. (km/h)"]

# 1. Wind Speed Squared (captures nonlinearity)
data["Wind Speed 10m Avg Squared"] = wind_speed_avg ** 2

# 2. Wind Speed × Humidity Interaction
data["Wind Speed × Humidity"] = wind_speed_avg * data["Humidity Inst. (%)"]

# 3. Temperature Difference from Daily Mean
# The per-day mean is kept in a local Series, so no scratch column has to be added to
# `data` and dropped again afterwards.
daily_avg_temp = data.groupby(["Year", "Month", "Day"])[temp_col].transform("mean")
data["Temp Difference"] = data[temp_col] - daily_avg_temp

# 4. Convert Year/Month/Day to "Day of Year"
data["Day of Year"] = pd.to_datetime(data[["Year", "Month", "Day"]]).dt.dayofyear

In [8]:
# Solar radiation was loaded as text (object dtype); coerce it to numbers.
# Entries that cannot be parsed become NaN and are removed by the dropna() below.
data['Incoming Solar Rad. (W/m2)'] = pd.to_numeric(data['Incoming Solar Rad. (W/m2)'], errors='coerce')

# Features: every column except 'Volume', keeping only rows with no missing values
X = data.drop(columns='Volume').dropna()

# Target: 'Volume' (the power generated), restricted by label to the rows kept in X
y = data.loc[X.index, 'Volume']
assert X.index.equals(y.index), "features and target must cover exactly the same rows"

# Hold out 20% of the rows for testing and train on the remaining 80%.
# random_state pins the shuffle so the same split is produced on every run.
# NOTE(review): the rows are consecutive hourly readings and this split is random, so
# neighbouring hours can land on both sides of it - confirm that a chronological
# hold-out is not what is wanted here.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X.info()

<class 'pandas.core.frame.DataFrame'>
Index: 26300 entries, 0 to 26300
Data columns (total 16 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Air Temp. Inst. (ÃÂ°C)       26300 non-null  float64
 1   Humidity Inst. (%)            26300 non-null  float64
 2   Incoming Solar Rad. (W/m2)    26300 non-null  float64
 3   Precip. (mm)                  26300 non-null  float64
 4   Wind Speed 10 m Syno. (km/h)  26300 non-null  float64
 5   Wind Dir. 10 m Syno. (ÃÂ°)   26300 non-null  float64
 6   Wind Speed 10 m Avg. (km/h)   26300 non-null  float64
 7   Wind Dir. 10 m Avg. (ÃÂ°)    26300 non-null  float64
 8   Year                          26300 non-null  int32  
 9   Month                         26300 non-null  int32  
 10  Day                           26300 non-null  int32  
 11  Hour                          26300 non-null  int32  
 12  Wind Speed 10m Avg Squared    26300 non-null  float64
 13  Wind S

In [18]:
from sklearn.ensemble import GradientBoostingRegressor

# Settings for the gradient-boosted tree ensemble.
# (A RandomForestRegressor(random_state=42) was the earlier choice; the variable is still
# called `rf` because the evaluation cells refer to the fitted model by that name.)
gbr_params = dict(
    n_estimators=300,        # number of boosting stages
    learning_rate=0.02,      # shrinks the contribution of each stage
    max_depth=13,            # depth limit of each individual tree
    min_samples_split=7,     # smallest node that may still be split
    min_samples_leaf=7,      # smallest number of samples allowed in a leaf
    max_features='sqrt',     # features considered when searching for a split
    random_state=42,         # fixes the estimator's randomness for repeatable fits
)
rf = GradientBoostingRegressor(**gbr_params)

# Fit on the training portion of the data
rf.fit(X_train, y_train)

# Alternative estimator, left disabled:
# import xgboost as xgb
# rf = xgb.XGBRegressor(objective ='reg:squarederror', colsample_bytree = 0.3, learning_rate = 0.1,
#                 max_depth = 12, alpha = 10, n_estimators = 200)
# rf.fit(X_train,y_train)


In [19]:
from sklearn.metrics import mean_absolute_error, mean_squared_log_error, r2_score

# Model predictions for the rows it was fitted on and for the held-out rows
y_train_pred = rf.predict(X_train)
y_test_pred = rf.predict(X_test)

# (actual, predicted) pairs for each split, reused by every metric below
eval_sets = {
    'train': (y_train, y_train_pred),
    'test': (y_test, y_test_pred),
}

# Mean Absolute Error: average absolute gap between predicted and actual values
train_mae = mean_absolute_error(*eval_sets['train'])
valid_mae = mean_absolute_error(*eval_sets['test'])

# RMSLE is currently switched off:
# train_rmsle = np.sqrt(mean_squared_log_error(y_train, y_train_pred))
# valid_rmsle = np.sqrt(mean_squared_log_error(y_test, y_test_pred))

# R^2: share of the target's variance explained by the model (closer to 1 is better)
train_r2 = r2_score(*eval_sets['train'])
valid_r2 = r2_score(*eval_sets['test'])

# Collect the scores under their report labels, training before validation for each metric
metric_labels = ['Training MAE', 'Valid MAE', 'Training R^2', 'Valid R^2']
metric_values = [train_mae, valid_mae, train_r2, valid_r2]
results = dict(zip(metric_labels, metric_values))

# Show all metrics at once
print(results)

{'Training MAE': 2.4981175198715033, 'Valid MAE': 6.195167936004635, 'Training R^2': 0.9759598749753985, 'Valid R^2': 0.8489869364603974}


In [None]:
# Importance score the fitted model assigns to each input column
feature_importances = rf.feature_importances_

# Column labels of the feature table
feature_names = X.columns

# Pair each feature name with its score and list the most influential features first
importance_df = (
    pd.DataFrame({'Feature': feature_names, 'Importance': feature_importances})
    .sort_values(by='Importance', ascending=False)
)

# Show the ranked table
print(importance_df)

                         Feature  Importance
6    Wind Speed 10 m Avg. (km/h)    0.279048
4   Wind Speed 10 m Syno. (km/h)    0.262486
13         Wind Speed × Humidity    0.143363
12    Wind Speed 10m Avg Squared    0.062944
7     Wind Dir. 10 m Avg. (ÃÂ°)    0.049424
5    Wind Dir. 10 m Syno. (ÃÂ°)    0.034692
1             Humidity Inst. (%)    0.031339
15                   Day of Year    0.027251
0        Air Temp. Inst. (ÃÂ°C)    0.026283
14               Temp Difference    0.018995
10                           Day    0.016060
2     Incoming Solar Rad. (W/m2)    0.014431
11                          Hour    0.013095
9                          Month    0.010600
8                           Year    0.008906
3                   Precip. (mm)    0.001083


: 

In [None]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validated Mean Absolute Error of the model over the full feature table.
# The 'neg_mean_absolute_error' scorer returns negated errors (so that larger is better),
# hence the leading minus sign to turn them back into positive MAE values.
# n_jobs=-1 lets the five fits run in parallel on all available cores; the saved run of
# this cell, which fitted them one after another, ended in a KeyboardInterrupt.
cv_scores = -cross_val_score(
    rf, X, y, cv=5, scoring='neg_mean_absolute_error', n_jobs=-1
)

# Per-fold scores followed by their mean
print("Cross-validation MAE scores:", cv_scores)
print("Average Cross-validation MAE:", np.mean(cv_scores))


KeyboardInterrupt: 

: 