In [11]:
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import numpy as np
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Hyperparameters


In [12]:
COLUMNS_TO_KEEP = [
    "direct_rad:W",
    "clear_sky_rad:W",
    "diffuse_rad:W",
    "direct_rad_1h:J",
    "is_in_shadow:idx",
    "clear_sky_energy_1h:J",
    "diffuse_rad_1h:J",
    "is_day:idx",
    "sun_elevation:d",
    "ceiling_height_agl:m",
    "effective_cloud_cover:p",
    "visibility:m",
    'total_cloud_cover:p',
    'air_density_2m:kgm3',
    'wind_speed_v_10m:ms',
    'dew_point_2m:K',
    'wind_speed_u_10m:ms',
    't_1000hPa:K',
    'absolute_humidity_2m:gm3',
    'snow_water:kgm2',
    'relative_humidity_1000hPa:p',
    'fresh_snow_24h:cm',
    'cloud_base_agl:m',
    'fresh_snow_12h:cm',
    'snow_depth:cm',
    'dew_or_rime:idx',
    'fresh_snow_6h:cm',
    'super_cooled_liquid_water:kgm2',
    'fresh_snow_3h:cm',
    'rain_water:kgm2',
    'precip_type_5min:idx',
    'precip_5min:mm',
    'fresh_snow_1h:cm',
    'sun_azimuth:d',
    'msl_pressure:hPa',
    'pressure_100m:hPa',
    'pressure_50m:hPa',
    'sfc_pressure:hPa',
    'prob_rime:p',
    'wind_speed_10m:ms',
    'elevation:m',
    'snow_density:kgm3',
    'snow_drift:idx',
    'snow_melt_10min:mm',
    'wind_speed_w_1000hPa:ms',
    # "date_calc", something wrong with this column
    "pv_measurement",
]
LEARNING_RATE = 0.001
NUM_EPOCHS = 100
BATCH_SIZE = 32
NUM_FEATURES = len(COLUMNS_TO_KEEP) - 1  # -1 because pv_measurement is the target
FEATURE_SIZE = 4  # 7 days of hourly data
WEIGHT_DECAY = 0.01
SEQUENCE_LENGTH = 14*24

# Neural net


In [13]:
def create_sequences(data, sequence_length):
    """
    Converts time series data into overlapping sequences/windows.
    """
    sequences = []
    target_length = 1
    for i in range(len(data) - sequence_length - target_length + 1):
        seq = data[i : i + sequence_length]
        sequences.append(seq)
    return np.array(sequences)

class SolarPredictionNet(nn.Module):
    def __init__(self, sequence_length, num_channels):
        super(SolarPredictionNet, self).__init__()

        self.conv1 = nn.Conv1d(num_channels, 32, kernel_size=sequence_length, stride=1)
        self.conv2 = nn.Conv1d(32, 64, kernel_size=1, stride=1)
        self.fc1 = nn.Linear(64, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 1)
        self.relu = nn.ReLU()

    def forward(self, x):
        x = self.relu(self.conv1(x))
        x = self.relu(self.conv2(x))
        x = x.view(x.size(0), -1)  # Flatten
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        return self.fc3(x)

# Load dataset


In [14]:
# Load data from Parquet files
df_data = pd.read_parquet("data/B/X_train_observed.parquet")
df_target = pd.read_parquet("data/B/train_targets.parquet")

# Merge the datasets
df_merged = pd.merge(
    df_data, df_target, left_on="date_forecast", right_on="time", how="inner"
)

# Downsampling the dataframe to hourly intervals
df_merged = df_merged.resample('H', on="date_forecast").mean()

df_merged = df_merged[COLUMNS_TO_KEEP]

# Set all NaN values to 0
df_merged.fillna(0, inplace=True)



y = df_merged["pv_measurement"]
X = df_merged.drop("pv_measurement", axis=1)

# Convert dataframes to sequences
X_sequences = create_sequences(X.values, SEQUENCE_LENGTH)
# Adjust the sequence creation for y
y_sequences = y.values[SEQUENCE_LENGTH-1:-1]  # Aligned with the end of each sequence and remove the last element

# Sequential Split
train_size = int(0.8 * len(X_sequences))
X_train, X_val = X_sequences[:train_size], X_sequences[train_size:]
y_train, y_val = y_sequences[:train_size], y_sequences[train_size:]

# Normalize the data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train.reshape(-1, X_train.shape[-1])).reshape(X_train.shape)
X_val = scaler.transform(X_val.reshape(-1, X_val.shape[-1])).reshape(X_val.shape)


# Convert data to PyTorch tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.float32).transpose(1, 2)  # Adjust shape to [batch, channels, sequence]
y_train_tensor = torch.tensor(y_train, dtype=torch.float32).unsqueeze(1)
X_val_tensor = torch.tensor(X_val, dtype=torch.float32).transpose(1, 2)  # Adjust shape to [batch, channels, sequence]
y_val_tensor = torch.tensor(y_val, dtype=torch.float32).unsqueeze(1)


# Create a custom dataset
class SolarDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]


# Create datasets for training and validation
train_dataset = SolarDataset(X_train_tensor, y_train_tensor)
val_dataset = SolarDataset(X_val_tensor, y_val_tensor)

# Create data loaders
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=False)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)

# ... [Neural Network and Training code from previous messages]


# Training Loop


In [15]:
def train_model(model, train_loader, val_loader):
    criterion = nn.L1Loss()
    optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)

    for epoch in range(NUM_EPOCHS):
        model.train()
        for batch_idx, (data, target) in enumerate(train_loader):
            optimizer.zero_grad()
            output = model(data)
            loss = criterion(output, target)
            loss.backward()
            optimizer.step()

        # Evaluate the model on the validation set
        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for data, target in val_loader:
                output = model(data)
                loss = criterion(output, target)
                val_loss += loss.item()

        # Average validation loss
        val_loss /= len(val_loader)

        print(
            f"Epoch {epoch + 1}/{NUM_EPOCHS}, Training Loss: {loss.item()}, Validation Loss: {val_loss}"
        )

    print("Training complete!")

In [16]:
model = SolarPredictionNet(SEQUENCE_LENGTH, NUM_FEATURES)
train_model(model, train_loader, val_loader)

Epoch 1/100, Training Loss: 101.9988784790039, Validation Loss: 34.25285643849583
Epoch 2/100, Training Loss: 90.08866119384766, Validation Loss: 33.40768072122286
Epoch 3/100, Training Loss: 121.97625732421875, Validation Loss: 34.845036467671065
Epoch 4/100, Training Loss: 131.37075805664062, Validation Loss: 34.34315785760502
Epoch 5/100, Training Loss: 134.6566619873047, Validation Loss: 33.92464612052102
Epoch 6/100, Training Loss: 139.3184814453125, Validation Loss: 34.459578983732676
Epoch 7/100, Training Loss: 173.90777587890625, Validation Loss: 38.732122387839944
Epoch 8/100, Training Loss: 183.73184204101562, Validation Loss: 39.253562171838006
Epoch 9/100, Training Loss: 136.56338500976562, Validation Loss: 33.251044010496194
Epoch 10/100, Training Loss: 150.40426635742188, Validation Loss: 40.194606494890444
Epoch 11/100, Training Loss: 137.59530639648438, Validation Loss: 36.05945878553729
Epoch 12/100, Training Loss: 131.32803344726562, Validation Loss: 45.36662857802533