# Random Forest

In [1]:
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import os

#### Metadata

In [2]:
# Asset whose merged CSV is loaded below; this exact spelling is interpolated into the data file path.
assset_name = "BSC1 Brooks Solar"

In [3]:
def clean_df(file_path: os.PathLike, encoding: str = 'unicode_escape') -> pd.DataFrame:
    """Load a CSV file and drop pandas' auto-generated "Unnamed" columns.

    Parameters
    ----------
    file_path : str or os.PathLike
        Location of the CSV file; passed straight to ``pd.read_csv``.
    encoding : str, default 'unicode_escape'
        Text encoding passed to ``pd.read_csv``. The default is unchanged from
        the previous hard-coded value so existing callers get identical
        results. Note that ``unicode_escape`` decodes raw bytes as Latin-1 and
        interprets backslash escapes, so non-ASCII characters stored as UTF-8
        (e.g. the degree sign in a column header) come out garbled. Pass
        ``encoding='utf-8'`` for UTF-8 files to keep such headers intact.

    Returns
    -------
    pd.DataFrame
        The file contents without any column whose name starts with
        ``"Unnamed"`` (typically a saved index column with an empty header).
    """
    df = pd.read_csv(file_path, encoding=encoding)

    # Columns written without a header (e.g. a saved index) are named
    # "Unnamed: <n>" by pandas; they carry no information, so discard them.
    is_unnamed = df.columns.str.contains('^Unnamed')
    return df.loc[:, ~is_unnamed]

##### Read Data

In [4]:
# Load the merged CSV for the selected asset, then summarise its columns and dtypes.
merged_csv_path = f'./Data/Merged/{assset_name}_Processed_and_Data.csv'
data = clean_df(merged_csv_path)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43843 entries, 0 to 43842
Data columns (total 12 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Volume                        43843 non-null  float64
 1   Maximum Capability            43843 non-null  float64
 2   System Capability             43843 non-null  float64
 3   Date                          43843 non-null  object 
 4   Air Temp. Inst. (ÃÂ°C)       43843 non-null  float64
 5   Humidity Inst. (%)            43843 non-null  float64
 6   Incoming Solar Rad. (W/m2)    43843 non-null  float64
 7   Precip. (mm)                  43843 non-null  float64
 8   Wind Speed 10 m Syno. (km/h)  43843 non-null  float64
 9   Wind Dir. 10 m Syno. (ÃÂ°)   43843 non-null  float64
 10  Wind Speed 10 m Avg. (km/h)   43843 non-null  float64
 11  Wind Dir. 10 m Avg. (ÃÂ°)    43843 non-null  float64
dtypes: float64(11), object(1)
memory usage: 4.0+ MB


##### Feature Extraction

In [5]:
# Parse the timestamp column so the .dt accessor can be used below.
data["Date"] = pd.to_datetime(data["Date"])

# Put the rows in chronological order *before* the timestamp is discarded.
# The frame carries a positional RangeIndex, so sort_index() cannot order it by
# time; sort on the parsed Date column instead. mergesort is stable, so rows
# that share a timestamp keep their original relative order.
data = data.sort_values("Date", kind="mergesort").reset_index(drop=True)

# Derive calendar features from the timestamp, then drop the raw column.
# NOTE: this cell consumes "Date"; re-run the load cell before re-running it.
data = data.assign(
    Year=data["Date"].dt.year,
    Month=data["Date"].dt.month,
    Day=data["Date"].dt.day,
    Hour=data["Date"].dt.hour,
).drop(columns="Date")

data.head()

Unnamed: 0,Volume,Maximum Capability,System Capability,Air Temp. Inst. (ÃÂ°C),Humidity Inst. (%),Incoming Solar Rad. (W/m2),Precip. (mm),Wind Speed 10 m Syno. (km/h),Wind Dir. 10 m Syno. (ÃÂ°),Wind Speed 10 m Avg. (km/h),Wind Dir. 10 m Avg. (ÃÂ°),Year,Month,Day,Hour
0,0.0,15.0,15.0,-1.3,83.0,0.0,0.0,9.7,287.0,12.9,279.0,2020,1,1,0
1,0.0,15.0,15.0,-0.6,79.0,0.0,0.0,11.9,314.0,9.9,303.0,2020,1,1,1
2,0.0,15.0,15.0,-1.9,82.0,0.0,0.0,9.6,335.0,11.0,322.0,2020,1,1,2
3,0.0,15.0,15.0,-2.3,86.0,0.0,0.0,9.3,345.0,10.9,341.0,2020,1,1,3
4,0.0,15.0,15.0,-1.8,83.0,0.0,0.0,9.3,347.0,8.7,338.0,2020,1,1,4


##### Split Data into Features and Target & Training and Test Sets

In [6]:
from sklearn.model_selection import train_test_split

# Features are every column except the target; the target is 'Volume' (the power generated).
X = data.drop(columns='Volume')
y = data['Volume']

# Hold out 20% of the rows for testing and train on the remaining 80%.
# A fixed random_state makes the split reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [7]:
from sklearn.ensemble import RandomForestRegressor

# Hyper-parameters chosen to keep the individual trees small and reduce overfitting.
rf_params = dict(
    n_estimators=100,        # number of trees in the forest
    max_depth=10,            # cap on tree depth
    min_samples_split=10,    # a node needs at least this many samples to be split
    min_samples_leaf=5,      # every leaf must keep at least this many samples
    max_features='sqrt',     # consider only a subset of features at each split
    random_state=42,         # reproducible forest
)
rf = RandomForestRegressor(**rf_params)

# Fit the forest on the training split.
rf.fit(X_train, y_train)

In [8]:
from sklearn.metrics import mean_absolute_error, mean_squared_log_error, r2_score


def _regression_scores(actual, predicted):
    """Return ``(MAE, RMSLE, R^2)`` for one set of predictions.

    MAE is the mean absolute difference between actual and predicted values,
    RMSLE is the square root of the mean squared logarithmic error, and
    R^2 is the coefficient of determination.
    """
    return (
        mean_absolute_error(actual, predicted),
        np.sqrt(mean_squared_log_error(actual, predicted)),
        r2_score(actual, predicted),
    )


# Model predictions for the rows it was trained on and for the held-out rows.
y_train_pred = rf.predict(X_train)
y_test_pred = rf.predict(X_test)

# Score each split with the same three metrics.
train_mae, train_rmsle, train_r2 = _regression_scores(y_train, y_train_pred)
valid_mae, valid_rmsle, valid_r2 = _regression_scores(y_test, y_test_pred)

# Collect everything in one mapping, grouped by metric (training value first).
results = {
    'Training MAE': train_mae,
    'Valid MAE': valid_mae,
    'Training RMSLE': np.float64(train_rmsle),
    'Valid RMSLE': np.float64(valid_rmsle),
    'Training R^2': train_r2,
    'Valid R^2': valid_r2,
}

print(results)

{'Training MAE': 0.6761291861549726, 'Valid MAE': 0.7203959622141913, 'Training RMSLE': np.float64(0.29575378883806697), 'Valid RMSLE': np.float64(0.3153953143983385), 'Training R^2': 0.9009117000379785, 'Valid R^2': 0.8796304995529968}


In [9]:
# Per-feature importance scores from the fitted forest, and the matching column names.
feature_importances = rf.feature_importances_
feature_names = X.columns

# Pair each feature with its score and list the most important features first.
importance_df = (
    pd.DataFrame({'Feature': feature_names, 'Importance': feature_importances})
    .sort_values(by='Importance', ascending=False)
)

print(importance_df)

                         Feature  Importance
4     Incoming Solar Rad. (W/m2)    0.485109
13                          Hour    0.233628
3             Humidity Inst. (%)    0.126589
2        Air Temp. Inst. (ÃÂ°C)    0.097027
11                         Month    0.023635
6   Wind Speed 10 m Syno. (km/h)    0.009245
8    Wind Speed 10 m Avg. (km/h)    0.006735
7    Wind Dir. 10 m Syno. (ÃÂ°)    0.006105
9     Wind Dir. 10 m Avg. (ÃÂ°)    0.005670
12                           Day    0.003128
10                          Year    0.001754
5                   Precip. (mm)    0.001374
0             Maximum Capability    0.000000
1              System Capability    0.000000


##### Cross-validation

In [10]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor
import numpy as np

# 5-fold cross-validation over the full dataset.
# The 'neg_mean_absolute_error' scorer reports MAE negated, so flip the sign
# to get ordinary (positive) MAE values, one per fold.
cv_scores = -cross_val_score(
    rf,
    X,
    y,
    cv=5,
    scoring='neg_mean_absolute_error',
)

# Per-fold scores followed by their mean.
print("Cross-validation MAE scores:", cv_scores)
print("Average Cross-validation MAE:", np.mean(cv_scores))


Cross-validation MAE scores: [0.70257867 0.7787032  0.74850695 0.72941372 0.70731347]
Average Cross-validation MAE: 0.7333032010458905
