# Solar PV Forecasting - Model Development

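This notebook loads the cleaned hourly dataset, splits it chronologically into training and test sets, and evaluates forecasting models (persistence baseline, Random Forest, LSTM) on the held-out test period.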

## 1. Import Libraries

In [4]:
import pandas as pd
import numpy as np

## 2. Load Cleaned Dataset

In [5]:
# Read the cleaned hourly data; the first CSV column is used as the index
# and pandas is asked to parse it as dates.
df = pd.read_csv(
    'data/solar_pv_clean_hourly.csv',
    index_col=0,
    parse_dates=True,
)

# Size of the frame as (rows, columns)
print(f"Dataset Shape: {df.shape}")

# Column names as a plain Python list
print("\nColumn Names:")
print(list(df.columns))

# Earliest and latest index values
print(
    "\nDate Range:",
    f"Start: {df.index.min()}",
    f"End: {df.index.max()}",
    sep="\n",
)

Dataset Shape: (505, 9)

Column Names:
['PLANT_ID_x', 'DC_POWER', 'AC_POWER', 'DAILY_YIELD', 'TOTAL_YIELD', 'PLANT_ID_y', 'AMBIENT_TEMPERATURE', 'MODULE_TEMPERATURE', 'IRRADIATION']

Date Range:
Start: 2020-05-15 05:00:00
End: 2020-06-17 18:00:00


In [6]:
# Preview the top five rows (rich display, since this is the cell's last expression)
df.head(n=5)

Unnamed: 0_level_0,PLANT_ID_x,DC_POWER,AC_POWER,DAILY_YIELD,TOTAL_YIELD,PLANT_ID_y,AMBIENT_TEMPERATURE,MODULE_TEMPERATURE,IRRADIATION
DATE_TIME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2020-05-15 05:00:00,4135001.0,0.0,0.0,0.0,6450830.0,4136001.0,24.966926,23.906986,0.00071
2020-05-15 06:00:00,4135001.0,558.825893,54.269643,13.0,7160322.0,4136001.0,24.835316,24.682242,0.044983
2020-05-15 07:00:00,4135001.0,1927.08125,188.893661,146.607143,6874186.0,4136001.0,26.323637,31.549644,0.337079
2020-05-15 08:00:00,4135001.0,3986.140476,391.35619,437.780952,6433823.0,4136001.0,28.369425,37.962135,0.540164
2020-05-15 09:00:00,4135001.0,5088.5,498.246429,872.214286,6699904.0,4136001.0,30.419307,43.7004,0.728268


## 3. Train-Test Split

In [7]:
# Share of rows (taken from the start of the frame) used for training,
# and the column the models are asked to predict.
TRAIN_FRACTION = 0.8
TARGET_COL = 'DC_POWER'

# The split below is positional, so it is only chronological when the index
# is already time-ordered. Fail loudly rather than silently letting later
# timestamps end up in the training set.
assert df.index.is_monotonic_increasing, (
    "df index must be sorted by time before a chronological split"
)

# Calculate split point for 80-20 split
split_idx = int(len(df) * TRAIN_FRACTION)

# Split data chronologically (no shuffle)
train_df = df.iloc[:split_idx]
test_df = df.iloc[split_idx:]

# Both partitions must be non-empty and together cover every row exactly once.
assert len(train_df) > 0 and len(test_df) > 0, "train/test split produced an empty partition"
assert len(train_df) + len(test_df) == len(df)

# Separate features and target
# NOTE(review): the feature frames keep every non-target column, including
# AC_POWER, DAILY_YIELD and TOTAL_YIELD. These look like same-timestamp
# measurements of the plant's output -- confirm they would be available at
# forecast time, otherwise they leak the target into the features.
X_train = train_df.drop(columns=TARGET_COL)
y_train = train_df[TARGET_COL]

X_test = test_df.drop(columns=TARGET_COL)
y_test = test_df[TARGET_COL]

# Print shapes
print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"y_test shape: {y_test.shape}")

# Print date ranges for train and test sets
print(f"\nTrain date range: {X_train.index.min()} to {X_train.index.max()}")
print(f"Test date range: {X_test.index.min()} to {X_test.index.max()}")

X_train shape: (404, 8)
X_test shape: (101, 8)
y_train shape: (404,)
y_test shape: (101,)

Train date range: 2020-05-15 05:00:00 to 2020-06-11 09:00:00
Test date range: 2020-06-11 10:00:00 to 2020-06-17 18:00:00


In [8]:
# Sanity check: frame dimensions and column index, one per line
print(df.shape, df.columns, sep="\n")
# Earliest and latest index values on a single line
print(*(edge() for edge in (df.index.min, df.index.max)))


(505, 9)
Index(['PLANT_ID_x', 'DC_POWER', 'AC_POWER', 'DAILY_YIELD', 'TOTAL_YIELD',
       'PLANT_ID_y', 'AMBIENT_TEMPERATURE', 'MODULE_TEMPERATURE',
       'IRRADIATION'],
      dtype='object')
2020-05-15 05:00:00 2020-06-17 18:00:00


In [9]:
# Sanity check: feature-matrix shapes for the train and test partitions
print(*(features.shape for features in (X_train, X_test)))


(404, 8) (101, 8)


In [10]:
# Boundary check: show where the training period ends and the test period begins
last_train_ts = X_train.index.max()
first_test_ts = X_test.index.min()
print(last_train_ts)
print(first_test_ts)

# Guard against temporal overlap instead of relying on reading the printout:
# every training timestamp must come strictly before every test timestamp.
assert last_train_ts < first_test_ts, "train and test periods overlap in time"


2020-06-11 09:00:00
2020-06-11 10:00:00


## 13. Evaluate LSTM Model on Test Set

In [None]:
# Generate predictions on test set
y_pred_lstm_scaled = lstm_model.predict(X_test_lstm_scaled)
y_pred_lstm = y_pred_lstm_scaled.flatten()  # Flatten to 1D array

# Note: We don't need to inverse transform since we didn't scale the target variable
# Only the input features were scaled

# Work with a flat, positional array of the actual values. The predictions are
# a flattened ndarray, so mixing them with a column vector would broadcast the
# manual arithmetic below to an (n, n) matrix, and mixing them with an indexed
# Series would put the two plotted lines on different x-axes.
# y_test_lstm itself is left untouched for any later cells that use it.
y_true_lstm = np.asarray(y_test_lstm).ravel()
assert y_true_lstm.shape == y_pred_lstm.shape, (
    f"actual/predicted length mismatch: {y_true_lstm.shape} vs {y_pred_lstm.shape}"
)

# Compute evaluation metrics
mae_lstm = mean_absolute_error(y_true_lstm, y_pred_lstm)
rmse_lstm = np.sqrt(mean_squared_error(y_true_lstm, y_pred_lstm))
# NOTE(review): the 1e-8 guard avoids a division by zero but makes any sample
# whose actual value is 0 contribute an enormous term, which can dominate the
# mean. The formula is kept unchanged here so it stays comparable with the other
# models' MAPE values; if zero-power samples are masked out, do it for every
# model at once.
mape_lstm = np.mean(np.abs((y_true_lstm - y_pred_lstm) / (y_true_lstm + 1e-8))) * 100

# Print metrics
print("\n" + "="*50)
print("LSTM Model - Test Set Performance")
print("="*50)
print(f"MAE:  {mae_lstm:.4f} kW")
print(f"RMSE: {rmse_lstm:.4f} kW")
print(f"MAPE: {mape_lstm:.2f}%")
print("="*50)

# Plot actual vs predicted (both series share the same 0..n-1 time-step axis)
plt.figure(figsize=(14, 6))
plt.plot(y_true_lstm, label='Actual DC_POWER', linewidth=2, alpha=0.7)
plt.plot(y_pred_lstm, label='LSTM Predicted', linewidth=2, alpha=0.7)
plt.xlabel('Time Step', fontsize=12)
plt.ylabel('DC_POWER (kW)', fontsize=12)
plt.title('LSTM Model: Actual vs Predicted DC_POWER (Test Set)', fontsize=14, fontweight='bold')
plt.legend(fontsize=11)
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

# Plot residuals (positive values mean the model under-predicted)
residuals_lstm = y_true_lstm - y_pred_lstm
plt.figure(figsize=(14, 5))
plt.plot(residuals_lstm, linewidth=1.5, alpha=0.7, color='red')
plt.axhline(y=0, color='black', linestyle='--', linewidth=1)
plt.xlabel('Time Step', fontsize=12)
plt.ylabel('Residual (kW)', fontsize=12)
plt.title('LSTM Model Residuals (Actual - Predicted)', fontsize=14, fontweight='bold')
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

## 14. Final Model Comparison

In [None]:
# Collect the test-set metrics of all three models into one table,
# one row per model and one column per metric.
comparison_df = pd.DataFrame({
    'Model': ['Persistence Baseline', 'Random Forest', 'LSTM'],
    'MAE (kW)': [mae_persistence, mae_rf, mae_lstm],
    'RMSE (kW)': [rmse_persistence, rmse_rf, rmse_lstm],
    'MAPE (%)': [mape_persistence, mape_rf, mape_lstm]
})

# Print the table between two 70-character rules
banner = "=" * 70
print("\n" + banner)
print("FINAL MODEL COMPARISON - Test Set Performance")
print(banner)
print(comparison_df.to_string(index=False))
print(banner)

# For each metric, the winner is the model in the row with the smallest value
model_names = comparison_df['Model']
best_mae = model_names[comparison_df['MAE (kW)'].idxmin()]
best_rmse = model_names[comparison_df['RMSE (kW)'].idxmin()]
best_mape = model_names[comparison_df['MAPE (%)'].idxmin()]

print(f"\nBest Model by MAE:  {best_mae}")
print(f"Best Model by RMSE: {best_rmse}")
print(f"Best Model by MAPE: {best_mape}")

# Visualize comparison: one bar chart per metric, same colour per model in every panel
bar_colors = ['#FF6B6B', '#4ECDC4', '#45B7D1']
panels = [
    ('MAE (kW)', 'Mean Absolute Error'),
    ('RMSE (kW)', 'Root Mean Squared Error'),
    ('MAPE (%)', 'Mean Absolute Percentage Error'),
]

fig, axes = plt.subplots(1, 3, figsize=(15, 5))

for ax, (metric_col, panel_title) in zip(axes, panels):
    ax.bar(comparison_df['Model'], comparison_df[metric_col], color=bar_colors)
    ax.set_ylabel(metric_col, fontsize=11)
    ax.set_title(panel_title, fontsize=12, fontweight='bold')
    ax.tick_params(axis='x', rotation=15)
    ax.grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.show()