# Solar PV Forecasting - Model Development

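This notebook loads the cleaned hourly solar PV dataset and prepares it for model training, starting with a chronological (unshuffled) train-test split with `DC_POWER` as the target.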

## 1. Import Libraries

In [4]:
import pandas as pd
import numpy as np

## 2. Load Cleaned Dataset

In [5]:
# Read the cleaned hourly CSV; the first column becomes the index and is
# parsed as datetimes.
df = pd.read_csv(
    'data/solar_pv_clean_hourly.csv',
    index_col=0,
    parse_dates=True,
)

# Quick overview: dimensions, column list, and the time span covered.
print(f"Dataset Shape: {df.shape}")
print("\nColumn Names:", df.columns.tolist(), sep="\n")
print(
    "\nDate Range:",
    f"Start: {df.index.min()}",
    f"End: {df.index.max()}",
    sep="\n",
)

Dataset Shape: (505, 9)

Column Names:
['PLANT_ID_x', 'DC_POWER', 'AC_POWER', 'DAILY_YIELD', 'TOTAL_YIELD', 'PLANT_ID_y', 'AMBIENT_TEMPERATURE', 'MODULE_TEMPERATURE', 'IRRADIATION']

Date Range:
Start: 2020-05-15 05:00:00
End: 2020-06-17 18:00:00


In [6]:
# Preview the top five rows (rich display as the cell's last expression)
df.head(n=5)

Unnamed: 0_level_0,PLANT_ID_x,DC_POWER,AC_POWER,DAILY_YIELD,TOTAL_YIELD,PLANT_ID_y,AMBIENT_TEMPERATURE,MODULE_TEMPERATURE,IRRADIATION
DATE_TIME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2020-05-15 05:00:00,4135001.0,0.0,0.0,0.0,6450830.0,4136001.0,24.966926,23.906986,0.00071
2020-05-15 06:00:00,4135001.0,558.825893,54.269643,13.0,7160322.0,4136001.0,24.835316,24.682242,0.044983
2020-05-15 07:00:00,4135001.0,1927.08125,188.893661,146.607143,6874186.0,4136001.0,26.323637,31.549644,0.337079
2020-05-15 08:00:00,4135001.0,3986.140476,391.35619,437.780952,6433823.0,4136001.0,28.369425,37.962135,0.540164
2020-05-15 09:00:00,4135001.0,5088.5,498.246429,872.214286,6699904.0,4136001.0,30.419307,43.7004,0.728268


## 3. Train-Test Split

In [7]:
# Chronological hold-out split: the earliest TRAIN_FRACTION of rows form the
# training set and the remainder form the test set. Rows are never shuffled,
# so the test period lies strictly after (or at the boundary of) training.
TRAIN_FRACTION = 0.8      # share of rows used for training (80-20 split)
TARGET_COL = 'DC_POWER'   # column to predict

# A positional split is only chronological when rows are in time order, so
# sort defensively. The stable sort keeps tied timestamps in their original
# order; data that is already time-ordered is unaffected.
df_sorted = df.sort_index(kind='mergesort')

# Calculate split point for 80-20 split
split_idx = int(len(df_sorted) * TRAIN_FRACTION)
assert 0 < split_idx < len(df_sorted), (
    f"Cannot split {len(df_sorted)} rows at fraction {TRAIN_FRACTION}: "
    "the train or test set would be empty"
)

# Split data chronologically (no shuffle)
train_df = df_sorted.iloc[:split_idx]
test_df = df_sorted.iloc[split_idx:]

# No test timestamp may precede the end of the training period.
assert train_df.index.max() <= test_df.index.min(), (
    "train and test periods overlap in time"
)

# Separate features and target
# NOTE(review): AC_POWER, DAILY_YIELD and TOTAL_YIELD remain in the feature
# matrix. They look like quantities measured alongside DC_POWER - confirm
# they are genuinely available at prediction time, otherwise they leak the
# target. PLANT_ID_x / PLANT_ID_y also appear constant in the preview and
# may carry no signal - TODO confirm before modelling.
X_train = train_df.drop(columns=TARGET_COL)
y_train = train_df[TARGET_COL]

X_test = test_df.drop(columns=TARGET_COL)
y_test = test_df[TARGET_COL]

# Print shapes
print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"y_test shape: {y_test.shape}")

# Print date ranges for train and test sets
print(f"\nTrain date range: {X_train.index.min()} to {X_train.index.max()}")
print(f"Test date range: {X_test.index.min()} to {X_test.index.max()}")

X_train shape: (404, 8)
X_test shape: (101, 8)
y_train shape: (404,)
y_test shape: (101,)

Train date range: 2020-05-15 05:00:00 to 2020-06-11 09:00:00
Test date range: 2020-06-11 10:00:00 to 2020-06-17 18:00:00


In [8]:
# Sanity check: frame dimensions, column index, and first/last timestamps
print(df.shape, df.columns, sep="\n")
print(df.index.min(), df.index.max())


(505, 9)
Index(['PLANT_ID_x', 'DC_POWER', 'AC_POWER', 'DAILY_YIELD', 'TOTAL_YIELD',
       'PLANT_ID_y', 'AMBIENT_TEMPERATURE', 'MODULE_TEMPERATURE',
       'IRRADIATION'],
      dtype='object')
2020-05-15 05:00:00 2020-06-17 18:00:00


In [9]:
# Confirm the dimensions of the train and test feature matrices
print(*(features.shape for features in (X_train, X_test)))


(404, 8) (101, 8)


In [10]:
# Show the split boundary: last training timestamp, then first test timestamp
print(X_train.index.max(), X_test.index.min(), sep="\n")


2020-06-11 09:00:00
2020-06-11 10:00:00
