In [56]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split ,GridSearchCV,cross_val_score
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import os

In [57]:
# Load data
asset_name = "AKE1 McBride Lake Windfarm"

def clean_df(file_path: os.PathLike):
    df = pd.read_csv(file_path, encoding='unicode_escape')  # Read CSV file with proper encoding
    df = df.loc[:, ~df.columns.str.contains('^Unnamed')]  # Remove any "Unnamed" columns that might be auto-generated
    return df  # Return the cleaned DataFrame

In [58]:
# Read the data
data = clean_df(f'./Data/Merged/{asset_name}_Processed_and_Data.csv')
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26301 entries, 0 to 26300
Data columns (total 12 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Volume                        26301 non-null  float64
 1   Maximum Capability            26301 non-null  float64
 2   System Capability             26301 non-null  float64
 3   Date                          26301 non-null  object 
 4   Air Temp. Inst. (ÃÂ°C)       26301 non-null  float64
 5   Humidity Inst. (%)            26301 non-null  float64
 6   Incoming Solar Rad. (W/m2)    26301 non-null  float64
 7   Precip. (mm)                  26301 non-null  float64
 8   Wind Speed 10 m Syno. (km/h)  26301 non-null  float64
 9   Wind Dir. 10 m Syno. (ÃÂ°)   26301 non-null  float64
 10  Wind Speed 10 m Avg. (km/h)   26301 non-null  float64
 11  Wind Dir. 10 m Avg. (ÃÂ°)    26301 non-null  float64
dtypes: float64(11), object(1)
memory usage: 2.4+ MB


In [59]:
# Rename columns to fix encoding issues
data.rename(columns={
    "Air Temp. Inst. (Ã‚Â°C)": "Air Temp. Inst. (°C)",
    "Wind Dir. 10 m Syno. (Ã‚Â°)": "Wind Dir. 10 m Syno. (°)",
    "Wind Dir. 10 m Avg. (Ã‚Â°)": "Wind Dir. 10 m Avg. (°)"
}, inplace=True)

In [60]:
# Convert and fill missing values
data["Incoming Solar Rad. (W/m2)"] = pd.to_numeric(data["Incoming Solar Rad. (W/m2)"], errors="coerce")
data["Incoming Solar Rad. (W/m2)"].fillna(data["Incoming Solar Rad. (W/m2)"].median(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data["Incoming Solar Rad. (W/m2)"].fillna(data["Incoming Solar Rad. (W/m2)"].median(), inplace=True)


In [61]:
# Fix Datetime
data["Date"] = pd.to_datetime(data["Date"])  # Convert the date column to datetime
# data.set_index("Date", inplace=True)  # Set Date as index
data["Year"]  = data["Date"].dt.year
data["Month"]  = data["Date"].dt.month
data["Day"]  = data["Date"].dt.day
data["Hour"]  = data["Date"].dt.hour
data.drop("Date", axis=1, inplace=True)  # Drop the original date column
data.drop("Maximum Capability", axis=1, inplace=True)  # Drop the original date column
data.drop("System Capability", axis=1, inplace=True)  # Drop the original date column
data.sort_index(inplace=True)  # Ensure the data is sorted by time
data.head()

Unnamed: 0,Volume,Air Temp. Inst. (ÃÂ°C),Humidity Inst. (%),Incoming Solar Rad. (W/m2),Precip. (mm),Wind Speed 10 m Syno. (km/h),Wind Dir. 10 m Syno. (ÃÂ°),Wind Speed 10 m Avg. (km/h),Wind Dir. 10 m Avg. (ÃÂ°),Year,Month,Day,Hour
0,23.665234,-32.0,71.0,0.0,0.0,7.3,166.0,7.6,172.0,2022,1,1,0
1,29.009501,-32.5,71.0,0.0,0.0,6.5,175.0,6.8,170.0,2022,1,1,1
2,29.921424,-33.6,69.0,0.0,0.0,6.5,161.0,5.7,178.0,2022,1,1,2
3,30.036901,-32.7,71.0,0.0,0.0,8.2,179.0,7.4,173.0,2022,1,1,3
4,30.229873,-32.3,71.0,0.0,0.0,8.6,177.0,9.6,179.0,2022,1,1,4


In [62]:
# Define seasons based on month
def get_season(month):
    if month in [12, 1, 2]:
        return "Winter"
    elif month in [3, 4, 5]:
        return "Spring"
    elif month in [6, 7, 8]:
        return "Summer"
    else:
        return "Fall"

data["Season"] = data["Month"].apply(get_season)
data = pd.get_dummies(data, columns=["Season"], drop_first=True)  # One-hot encode

In [63]:
# Cyclical encoding for month and hour
data["Month_sin"] = np.sin(2 * np.pi * data["Month"] / 12)
data["Month_cos"] = np.cos(2 * np.pi * data["Month"] / 12)
data["Hour_sin"] = np.sin(2 * np.pi * data["Hour"] / 24)
data["Hour_cos"] = np.cos(2 * np.pi * data["Hour"] / 24)


In [64]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

# Define the target variable: 'Volume' (the power generated)
data['Incoming Solar Rad. (W/m2)'] = pd.to_numeric(data['Incoming Solar Rad. (W/m2)'], errors='coerce')

# Handle 0 values in the target variable by applying log transformation with +1 (log(x+1)) to avoid issues with log(0)
y = data['Volume'].replace(0, np.nan)  # Replace zero values with NaN for easier handling

# Create a binary indicator feature for zero values
data['Zero_Volume'] = (data['Volume'] == 0).astype(int)

# Optionally, you can replace NaN values with the mean or median (if using log transformation and there are NaNs)
y = y.fillna(y.mean())  # You can also try using median depending on the data

# Apply log transformation to the target variable (log(x + 1))
y_log_transformed = np.log1p(y)

# Define the features: all columns except 'Volume' (and the new 'Zero_Volume' indicator)
X = data.drop(['Volume', 'Zero_Volume'], axis=1)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y_log_transformed, test_size=0.2, random_state=42)

# Checking the data info to verify the changes
X.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26301 entries, 0 to 26300
Data columns (total 19 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Air Temp. Inst. (ÃÂ°C)       26301 non-null  float64
 1   Humidity Inst. (%)            26301 non-null  float64
 2   Incoming Solar Rad. (W/m2)    26301 non-null  float64
 3   Precip. (mm)                  26301 non-null  float64
 4   Wind Speed 10 m Syno. (km/h)  26301 non-null  float64
 5   Wind Dir. 10 m Syno. (ÃÂ°)   26301 non-null  float64
 6   Wind Speed 10 m Avg. (km/h)   26301 non-null  float64
 7   Wind Dir. 10 m Avg. (ÃÂ°)    26301 non-null  float64
 8   Year                          26301 non-null  int32  
 9   Month                         26301 non-null  int32  
 10  Day                           26301 non-null  int32  
 11  Hour                          26301 non-null  int32  
 12  Season_Spring                 26301 non-null  bool   
 13  S

In [65]:
from sklearn.ensemble import RandomForestRegressor
# Initialize the Random Forest model
# rf = RandomForestRegressor(random_state=42)
# Reduce complexity by limiting max depth and increasing min samples
rf = RandomForestRegressor(
    n_estimators=100,  # You can try fewer estimators to prevent overfitting
    oob_score= True,  # Increase the minimum number of samples required at a leaf node # Use a subset of features at each split to prevent overfitting
    random_state=1
)

# Train the model on the training dataset (X_train, y_train)
rf.fit(X_train, y_train)

In [66]:
from sklearn.metrics import mean_absolute_error, mean_squared_log_error, r2_score

# Predict the target values using the trained model on both training and test sets
y_train_pred = rf.predict(X_train)  # Predictions on the training set
y_test_pred = rf.predict(X_test)    # Predictions on the test set

# Calculate the Mean Absolute Error (MAE) for both training and test sets
# MAE measures the average absolute difference between predicted and actual values.
train_mae = mean_absolute_error(y_train, y_train_pred)
valid_mae = mean_absolute_error(y_test, y_test_pred)

# Calculate the Root Mean Squared Logarithmic Error (RMSLE) for both training and test sets
# RMSLE is often used for regression tasks when predictions can have a wide range.
# It penalizes underestimations more than overestimations.
train_rmsle = np.sqrt(mean_squared_log_error(y_train, y_train_pred))
valid_rmsle = np.sqrt(mean_squared_log_error(y_test, y_test_pred))

# Calculate the R-squared (R^2) score for both training and test sets
# R^2 measures how well the model explains the variance of the target variable.
# Higher values closer to 1 indicate better fit.
train_r2 = r2_score(y_train, y_train_pred)
valid_r2 = r2_score(y_test, y_test_pred)

# Create a dictionary to store and organize all the calculated results
results = {
    'Training MAE': train_mae,  # MAE for training set
    'Valid MAE': valid_mae,     # MAE for validation/test set
    'Training RMSLE': np.float64(train_rmsle),  # RMSLE for training set
    'Valid RMSLE': np.float64(valid_rmsle),    # RMSLE for validation/test set
    'Training R^2': train_r2,  # R^2 for training set
    'Valid R^2': valid_r2     # R^2 for validation/test set
}

# Print the results dictionary containing all the metrics
print(results)

{'Training MAE': 0.31029916829986104, 'Valid MAE': 0.8439011382319158, 'Training RMSLE': np.float64(0.18911347627032418), 'Valid RMSLE': np.float64(0.40196769123513537), 'Training R^2': 0.900765909761286, 'Valid R^2': 0.29235563245014873}


In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

# Sample data
data = {
    'Date': ['2022-01-01 00:00:00', '2022-01-01 01:00:00', '2022-01-01 02:00:00'],
    'Air Temp. Inst. (Â°C)': [-32.0, -32.5, -33.6],
    'Humidity Inst. (%)': [71.0, 71.0, 69.0],
    'Incoming Solar Rad. (W/m2)': [0.0, 0.0, 0.0],
    'Precip. (mm)': [0.0, 0.0, 0.0],
    'Wind Speed 10 m Syno. (km/h)': [7.3, 6.5, 6.5],
    'Wind Dir. 10 m Syno. (Â°)': [166.0, 175.0, 161.0],
    'Wind Speed 10 m Avg. (km/h)': [7.6, 6.8, 5.7],
    'Wind Dir. 10 m Avg. (Â°)': [172.0, 170.0, 178.0],
    'Volume': [23.665, 29.009, 29.921],
    'Maximum Capability': [73.0, 73.0, 73.0],
    'System Capability': [73.0, 73.0, 73.0]
}

# Create a DataFrame
df = pd.DataFrame(data)

# Convert 'Date' column to datetime
df['Date'] = pd.to_datetime(df['Date'])

# Extract hour and day features
df['Hour'] = df['Date'].dt.hour
df['Day'] = df['Date'].dt.day

# Extract the features (excluding 'Date' and 'Volume')
features = df.drop(columns=['Date', 'Volume'])

# Target variable (volume)
target = df['Volume']

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

# Train a RandomForest model
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Now let's predict the volume for new data (same structure as the original data)
future_data = df[['Date', 'Air Temp. Inst. (Â°C)', 'Humidity Inst. (%)', 'Incoming Solar Rad. (W/m2)', 'Precip. (mm)', 
                  'Wind Speed 10 m Syno. (km/h)', 'Wind Dir. 10 m Syno. (Â°)', 'Wind Speed 10 m Avg. (km/h)', 
                  'Wind Dir. 10 m Avg. (Â°)', 'Maximum Capability', 'System Capability']]

# Extract hour and day for prediction data
future_data['Hour'] = pd.to_datetime(future_data['Date']).dt.hour
future_data['Day'] = pd.to_datetime(future_data['Date']).dt.day

# Predict the volume using the trained RandomForest model
predicted_volume = rf.predict(future_data.drop(columns=['Date']))

# Create the output DataFrame with 'Date', 'Maximum Capability', 'System Capability' and predicted 'Volume'
output = future_data[['Date', 'Maximum Capability', 'System Capability']].copy()
output['Predicted Volume'] = predicted_volume

# Print the final output with 'Date', 'Maximum Capability', 'System Capability' and 'Predicted Volume'
print(output)
import pandas as pd
import numpy as np

# Define the get_season function
def get_season(month):
    if month in [12, 1, 2]:
        return "Winter"
    elif month in [3, 4, 5]:
        return "Spring"
    elif month in [6, 7, 8]:
        return "Summer"
    else:
        return "Fall"

# Generate future data (March 2030)
future_dates = pd.date_range(start="2030-03-01", end="2030-03-31", freq="H")  # Hourly data for March 2030
future_data = pd.DataFrame({'Date': future_dates})
future_data['Month'] = future_data['Date'].dt.month
future_data['Year'] = future_data['Date'].dt.year
future_data['Day'] = future_data['Date'].dt.day
future_data['Hour'] = future_data['Date'].dt.hour

# One-hot encoding for the "Season" feature based on month
future_data['Season'] = future_data['Month'].apply(get_season)
future_data = pd.get_dummies(future_data, columns=["Season"], drop_first=True)  # One-hot encode season

# Generate random data for 'Volume' (target), 'Capacity' and 'Wind Speed'
# You should replace these with realistic values or forecasting methods based on your model
future_data['Volume'] = np.random.uniform(low=0, high=100, size=len(future_data))  # Random generation data
future_data['Wind Speed'] = np.random.uniform(low=0, high=30, size=len(future_data))  # Random wind speed
future_data['Maximum Capability'] = np.full(len(future_data), 73.0)  # Assuming constant max capability
future_data['System Capability'] = np.full(len(future_data), 73.0)  # Assuming constant system capability

# Reorganize the columns as per your request
future_data = future_data[['Date', 'Hour', 'Volume',  'Maximum Capability', 'System Capability', 'Wind Speed']]  # Keep necessary columns

# Save the future forecast data to a CSV file
future_data.to_csv("future_trend_forecasting_2030_march.csv", index=False)

# Check the first few rows of the generated data
future_data.head()


                 Date  Maximum Capability  System Capability  Predicted Volume
0 2022-01-01 00:00:00                73.0               73.0          29.32820
1 2022-01-01 01:00:00                73.0               73.0          29.20964
2 2022-01-01 02:00:00                73.0               73.0          29.68388


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  future_data['Hour'] = pd.to_datetime(future_data['Date']).dt.hour
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  future_data['Day'] = pd.to_datetime(future_data['Date']).dt.day
  future_dates = pd.date_range(start="2030-03-01", end="2030-03-31", freq="H")  # Hourly data for March 2030


KeyError: "['Maximum Capability', 'System Capability'] not in index"