In [241]:
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import numpy as np
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Hyperparameters


In [242]:
# Columns retained from the merged weather/target frame before modelling.
# Every entry except the final "pv_measurement" is used as an input feature;
# "pv_measurement" is the regression target and is split off later.
# Entries that are commented out are excluded from the feature set.
COLUMNS_TO_KEEP = [
    "direct_rad:W",
    "clear_sky_rad:W",
    "diffuse_rad:W",
    "direct_rad_1h:J",
    "is_in_shadow:idx",
    "clear_sky_energy_1h:J",
    "diffuse_rad_1h:J",
    "is_day:idx",
    "sun_elevation:d",
    "ceiling_height_agl:m",
    "effective_cloud_cover:p",
    "visibility:m",
    'total_cloud_cover:p',
    'air_density_2m:kgm3',
    'wind_speed_v_10m:ms',
    'dew_point_2m:K',
    'wind_speed_u_10m:ms',
    # 't_1000hPa:K',
    'absolute_humidity_2m:gm3',
    # 'snow_water:kgm2',
    'relative_humidity_1000hPa:p',
    # 'fresh_snow_24h:cm',
    'cloud_base_agl:m',
    # 'fresh_snow_12h:cm',
    # 'snow_depth:cm',
    # 'dew_or_rime:idx',
    # 'fresh_snow_6h:cm',
    # 'super_cooled_liquid_water:kgm2',
    # 'fresh_snow_3h:cm',
    'rain_water:kgm2',
    # 'precip_type_5min:idx',
    # 'precip_5min:mm',
    # 'fresh_snow_1h:cm',
    # 'sun_azimuth:d',
    'msl_pressure:hPa',
    # 'pressure_100m:hPa',
    # 'pressure_50m:hPa',
    # 'sfc_pressure:hPa',
    # 'prob_rime:p',
    # 'wind_speed_10m:ms',
    # 'elevation:m',
    # 'snow_density:kgm3',
    # 'snow_drift:idx',
    # 'snow_melt_10min:mm',
    # 'wind_speed_w_1000hPa:ms',
    # "date_calc",
    "pv_measurement",  # target column (must stay in the list; see NUM_FEATURES)
]
# Step size handed to the Adam optimiser in train_model.
LEARNING_RATE = 0.00008
# Number of full passes over the training loader.
NUM_EPOCHS = 100
# Mini-batch size used for both the training and validation DataLoader.
BATCH_SIZE = 32
NUM_FEATURES = len(COLUMNS_TO_KEEP) - 1  # -1 because pv_measurement is the target
# NOTE(review): FEATURE_SIZE is not referenced anywhere in the visible notebook, and its
# former description ("7 days of hourly data") does not match the value 4 -- confirm
# whether it can be removed.
FEATURE_SIZE = 4
# Passed to Adam as weight_decay (an L2 penalty on the parameters).
WEIGHT_DECAY = 0.12
# Number of consecutive hourly rows per input window; also used as the kernel size of
# the first Conv1d layer when the model is constructed.
SEQUENCE_LENGTH = 24

# Neural net


In [243]:
def create_sequences(data, sequence_length):
    """
    Converts time series data into overlapping sequences/windows.

    Window ``i`` contains rows ``i`` .. ``i + sequence_length - 1`` of ``data``,
    so consecutive windows are shifted by exactly one row.

    Args:
        data: Array-like whose first axis is time (e.g. ``[time, features]``).
        sequence_length: Number of consecutive rows per window; must be >= 1.

    Returns:
        ``np.ndarray`` of shape
        ``(len(data) - sequence_length + 1, sequence_length, *data.shape[1:])``.
        If ``data`` has fewer than ``sequence_length`` rows, an empty array with
        shape ``(0, sequence_length, *data.shape[1:])`` is returned so that
        downstream reshapes still see the window/feature dimensions.

    Raises:
        ValueError: If ``sequence_length`` is smaller than 1.
    """
    if sequence_length < 1:
        raise ValueError(f"sequence_length must be >= 1, got {sequence_length}")

    data = np.asarray(data)
    num_windows = len(data) - sequence_length + 1
    if num_windows <= 0:
        # Too few rows for a single window: keep the trailing dimensions intact
        # instead of returning a shapeless (0,) array.
        return np.empty((0, sequence_length) + data.shape[1:], dtype=data.dtype)

    return np.array(
        [data[start : start + sequence_length] for start in range(num_windows)]
    )

def apply_pca(data, q=None, center=True, niter=2):
    """
    Project ``data`` onto the principal directions found by ``torch.pca_lowrank``.

    Args:
        data: Input tensor handed to ``torch.pca_lowrank``.
        q: Forwarded to ``torch.pca_lowrank`` (number of components to estimate).
        center: Forwarded to ``torch.pca_lowrank``.
        niter: Forwarded to ``torch.pca_lowrank``.

    Returns:
        Tuple ``(projected, components)`` where ``components`` is the ``V`` matrix
        returned by ``torch.pca_lowrank`` and ``projected`` is ``data @ components``.
        Note that the projection multiplies the data exactly as passed in; no
        centring is applied to it here.
    """
    # Only V (third element of the result) is needed; U and S are discarded.
    components = torch.pca_lowrank(data, q=q, center=center, niter=niter)[2]
    projected = data @ components
    return projected, components


class SolarPredictionNet(nn.Module):
    """
    Small 1-D convolutional regressor that maps a window of features to one value.

    The first convolution uses a kernel as long as ``sequence_length`` (stride 1,
    no padding), so an input with exactly ``sequence_length`` time steps is reduced
    to a single position; the flattened result then has 64 entries, matching ``fc1``.

    Args:
        sequence_length: Kernel size of the first convolution.
        num_channels: Number of input channels of the first convolution.
    """

    def __init__(self, sequence_length, num_channels):
        super().__init__()

        # Convolutional feature extractor.
        self.conv1 = nn.Conv1d(
            in_channels=num_channels,
            out_channels=32,
            kernel_size=sequence_length,
            stride=1,
        )
        self.conv2 = nn.Conv1d(in_channels=32, out_channels=64, kernel_size=1, stride=1)

        # Regularisation applied after the first dense layer.
        self.dropout = nn.Dropout(p=0.5)

        # Dense regression head ending in a single output.
        self.fc1 = nn.Linear(in_features=64, out_features=128)
        self.fc2 = nn.Linear(in_features=128, out_features=64)
        self.fc3 = nn.Linear(in_features=64, out_features=1)

        self.relu = nn.ReLU()

    def forward(self, x):
        """Run the network on ``x`` and return the output of the last linear layer."""
        conv_out = self.relu(self.conv1(x))
        conv_out = self.relu(self.conv2(conv_out))

        # Collapse everything except the batch dimension.
        flat = conv_out.reshape(conv_out.shape[0], -1)

        hidden = self.dropout(self.relu(self.fc1(flat)))
        hidden = self.relu(self.fc2(hidden))
        return self.fc3(hidden)
    

# Load dataset


In [244]:
# Load data from Parquet files
location = "A"

# 1. Load the observed/estimated weather features and the PV targets
df_observed = pd.read_parquet(f"data/{location}/X_train_observed.parquet")
df_estimated = pd.read_parquet(f"data/{location}/X_train_estimated.parquet")
df_target = pd.read_parquet(f"data/{location}/train_targets.parquet")

# 2. Combine observed and estimated datasets in chronological order
df_combined = pd.concat([df_observed, df_estimated], axis=0).sort_values(by='date_forecast')

# 3. Merge with target data (inner join: only forecast timestamps that have a
#    matching target row survive)
df_merged = pd.merge(
    df_combined, df_target, left_on="date_forecast", right_on="time", how="inner"
)

# Put the frame on a regular hourly grid (hours without data become NaN rows)
df_merged = df_merged.resample('H', on="date_forecast").mean()

# Keep only relevant columns and replace missing values with 0.
# NOTE(review): this also zero-fills missing pv_measurement targets -- confirm intended.
df_merged = df_merged[COLUMNS_TO_KEEP].fillna(0)

# 4. Extract features and target
y = df_merged["pv_measurement"]
X = df_merged.drop("pv_measurement", axis=1)

# Window i covers rows i .. i + SEQUENCE_LENGTH - 1 of X ...
X_sequences = create_sequences(X.values, SEQUENCE_LENGTH)

# ... and is paired with the target of the window's last row.
y_sequences = y.values[SEQUENCE_LENGTH-1:]

# Sequential split: first 80% of the windows for training, the rest for validation
train_size = int(0.8 * len(X_sequences))
X_train, X_val = X_sequences[:train_size], X_sequences[train_size:]
y_train, y_val = y_sequences[:train_size], y_sequences[train_size:]

# Normalize per feature; the scaler is fitted on the training windows only and
# reused for validation (and later for inference in make_predictions).
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train.reshape(-1, X_train.shape[-1])).reshape(X_train.shape)
X_val = scaler.transform(X_val.reshape(-1, X_val.shape[-1])).reshape(X_val.shape)

# Convert data to PyTorch tensors shaped [batch, channels, sequence].
# No PCA is applied here: the model is trained on the scaled features directly,
# which is also what make_predictions feeds it at inference time.
X_train_tensor = torch.tensor(X_train, dtype=torch.float32).transpose(1, 2)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32).unsqueeze(1)
X_val_tensor = torch.tensor(X_val, dtype=torch.float32).transpose(1, 2)
y_val_tensor = torch.tensor(y_val, dtype=torch.float32).unsqueeze(1)

# Keep the model's input-channel count tied to the prepared data
NUM_FEATURES = X_train_tensor.shape[1]




# Create a custom dataset
class SolarDataset(Dataset):
    """
    Pairs each input sample with its target for use with a ``DataLoader``.

    Args:
        X: Indexable collection of input samples.
        y: Indexable collection of targets; must have the same length as ``X``.

    Raises:
        ValueError: If ``X`` and ``y`` do not contain the same number of samples.
    """

    def __init__(self, X, y):
        # A length mismatch would otherwise surface later as silently ignored
        # samples or an IndexError in the middle of an epoch.
        if len(X) != len(y):
            raise ValueError(
                f"X and y must contain the same number of samples, got {len(X)} and {len(y)}"
            )
        self.X = X
        self.y = y

    def __len__(self):
        """Number of (sample, target) pairs."""
        return len(self.y)

    def __getitem__(self, idx):
        """Return the ``(sample, target)`` pair stored at position ``idx``."""
        return self.X[idx], self.y[idx]


# Create datasets for training and validation
train_dataset = SolarDataset(X_train_tensor, y_train_tensor)
val_dataset = SolarDataset(X_val_tensor, y_val_tensor)

# Create data loaders
# shuffle=False: both loaders serve batches in dataset order.
# NOTE(review): the training batches are therefore never reshuffled between
# epochs -- confirm this is intended.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=False)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)

# The network (SolarPredictionNet) is defined above; the training loop is defined in the next cell.


# Training Loop


In [245]:
def train_model(model, train_loader, val_loader, num_epochs=None, learning_rate=None, weight_decay=None):
    """
    Train ``model`` with Adam on an L1 (mean absolute error) loss and report
    the training and validation loss after every epoch.

    Args:
        model: ``nn.Module`` to optimise in place.
        train_loader: Iterable of ``(data, target)`` batches used for training.
        val_loader: Iterable of ``(data, target)`` batches used for validation.
        num_epochs: Number of epochs; ``None`` uses the module-level ``NUM_EPOCHS``.
        learning_rate: Adam learning rate; ``None`` uses ``LEARNING_RATE``.
        weight_decay: Adam weight decay; ``None`` uses ``WEIGHT_DECAY``.

    Returns:
        Dict with the keys ``"train_loss"`` and ``"val_loss"``, each a list holding
        one per-sample average loss per epoch. An epoch over an empty loader is
        recorded as ``nan`` instead of raising ``ZeroDivisionError``.
    """
    # Resolve defaults at call time so edits to the notebook constants take effect
    # without re-running this definition.
    if num_epochs is None:
        num_epochs = NUM_EPOCHS
    if learning_rate is None:
        learning_rate = LEARNING_RATE
    if weight_decay is None:
        weight_decay = WEIGHT_DECAY

    criterion = nn.L1Loss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
    history = {"train_loss": [], "val_loss": []}

    for epoch in range(num_epochs):
        model.train()
        train_loss_sum, train_samples = 0.0, 0

        for data, target in train_loader:
            optimizer.zero_grad()
            output = model(data)
            loss = criterion(output, target)
            loss.backward()
            optimizer.step()

            # criterion returns the batch mean; weight it by the batch size so a
            # smaller final batch does not skew the epoch average.
            batch_size = len(target)
            train_loss_sum += loss.item() * batch_size
            train_samples += batch_size

        avg_train_loss = train_loss_sum / train_samples if train_samples else float("nan")

        # Evaluate on the validation set without tracking gradients
        model.eval()
        val_loss_sum, val_samples = 0.0, 0
        with torch.no_grad():
            for data, target in val_loader:
                output = model(data)
                loss = criterion(output, target)
                batch_size = len(target)
                val_loss_sum += loss.item() * batch_size
                val_samples += batch_size

        avg_val_loss = val_loss_sum / val_samples if val_samples else float("nan")

        history["train_loss"].append(avg_train_loss)
        history["val_loss"].append(avg_val_loss)

        print(
            f"Epoch {epoch + 1}/{num_epochs}, Training Loss: {avg_train_loss}, Validation Loss: {avg_val_loss}"
        )

    print("Training complete!")
    return history



In [246]:
def save_model(model, location):
    """
    Write the model's ``state_dict`` to ``model_location_<location>.pt`` (relative
    to the current working directory) and print the name of the written file.

    Args:
        model: Module whose ``state_dict()`` is saved.
        location: Value interpolated into the file name.
    """
    checkpoint_path = f"model_location_{location}.pt"
    weights = model.state_dict()
    torch.save(weights, checkpoint_path)
    print(f"Model saved as {checkpoint_path}")

# Build the network, train it on the loaders created earlier, and save the
# weights for the current `location`.
# SEQUENCE_LENGTH becomes the first Conv1d kernel size, NUM_FEATURES its input channels.

model = SolarPredictionNet(SEQUENCE_LENGTH, NUM_FEATURES)
train_model(model, train_loader, val_loader)
save_model(model, location) 

Epoch 1/100, Training Loss: 657.371711291764, Validation Loss: 409.4694804670506
Epoch 2/100, Training Loss: 337.93757997465383, Validation Loss: 220.64079688122106
Epoch 3/100, Training Loss: 282.26717600056, Validation Loss: 207.31011888093727
Epoch 4/100, Training Loss: 269.8601480572112, Validation Loss: 197.38974555522032
Epoch 5/100, Training Loss: 258.3044326064745, Validation Loss: 188.43943860306902
Epoch 6/100, Training Loss: 251.32838731162607, Validation Loss: 182.5164016463301
Epoch 7/100, Training Loss: 246.44788876933978, Validation Loss: 178.72871884952008
Epoch 8/100, Training Loss: 243.46175483234288, Validation Loss: 173.59500411878068
Epoch 9/100, Training Loss: 239.8860886778374, Validation Loss: 174.44583752670553
Epoch 10/100, Training Loss: 236.2460063343635, Validation Loss: 170.11998238904914
Epoch 11/100, Training Loss: 234.99973709505733, Validation Loss: 169.50905488640018
Epoch 12/100, Training Loss: 233.8180434154727, Validation Loss: 167.91540117676783
E

In [240]:

def load_model(location):
    """
    Rebuild a ``SolarPredictionNet`` and load the weights saved for ``location``.

    Args:
        location: Value interpolated into the checkpoint name
            ``model_location_<location>.pt`` (relative to the working directory).

    Returns:
        The model with the stored weights loaded, switched to evaluation mode.
    """
    model = SolarPredictionNet(SEQUENCE_LENGTH, NUM_FEATURES)
    # map_location="cpu" lets a checkpoint that was written on a GPU machine load on a
    # CPU-only host; make_predictions feeds the model CPU tensors.
    # NOTE: torch.load unpickles the file -- only load checkpoints you created yourself.
    state_dict = torch.load(f"model_location_{location}.pt", map_location="cpu")
    model.load_state_dict(state_dict)
    model.eval()
    return model

def pad_data(data, sequence_length):
    """
    Prepend ``sequence_length - 1`` rows of zeros to a 2-D array.

    With this padding, windowing the result with ``sequence_length`` yields exactly
    as many windows as ``data`` has rows.

    Args:
        data: 2-D array shaped ``[rows, columns]``.
        sequence_length: Window length the padding is meant for.

    Returns:
        Array with ``sequence_length - 1`` zero rows followed by the rows of ``data``.
    """
    num_pad_rows = sequence_length - 1
    num_columns = data.shape[1]
    zero_rows = np.zeros((num_pad_rows, num_columns))
    return np.concatenate((zero_rows, data), axis=0)


def make_predictions(location, df_test):
    """
    Predict one value per hourly row of ``df_test`` with the model saved for ``location``.

    The caller's frame is left untouched; all preprocessing happens on a copy.

    Args:
        location: Passed to ``load_model`` to select the checkpoint.
        df_test: DataFrame with a ``date_forecast`` column plus the feature
            columns listed in ``COLUMNS_TO_KEEP``.

    Returns:
        1-D ``np.ndarray`` with one prediction per hourly row that remains after
        resampling and dropping all-NaN rows.

    Notes:
        Features are scaled with the module-level ``scaler``, so that scaler must
        already be fitted when this function is called.
    """
    # Load model
    model = load_model(location)

    # Resample to 1-hour means on a copy of the input; hours in which every
    # column is NaN are dropped and the datetime index is discarded.
    hourly = (
        df_test
        .assign(date_forecast=pd.to_datetime(df_test["date_forecast"]))
        .set_index("date_forecast")
        .resample("1H")
        .mean()
        .dropna(how="all")
        .reset_index(drop=True)
    )

    # Keep only the columns used during training (minus the target column)
    relevant_columns = [col for col in COLUMNS_TO_KEEP if col != "pv_measurement"]
    features = hourly[relevant_columns].fillna(0)

    # Zero-pad at the start so every hourly row gets its own window, then scale
    # each feature with the scaler fitted during training.
    padded_data = pad_data(features.values, SEQUENCE_LENGTH)
    test_sequences = create_sequences(padded_data, SEQUENCE_LENGTH)
    test_sequences = scaler.transform(
        test_sequences.reshape(-1, test_sequences.shape[-1])
    ).reshape(test_sequences.shape)

    # Model expects [batch, channels, sequence]
    test_tensor = torch.tensor(test_sequences, dtype=torch.float32).transpose(1, 2)

    # Make predictions
    with torch.no_grad():
        predictions = model(test_tensor)
        predictions = predictions.numpy().flatten()
    return predictions



# Read the Kaggle test.csv to get the location and ids
df_submission = pd.read_csv("data/test.csv")

locations = ["A", "B", "C"]

# NOTE(review): make_predictions loads "model_location_<loc>.pt" for every location
# and scales with the module-level `scaler`, while the visible training cell fits the
# scaler and saves a checkpoint for a single `location` only -- confirm that a
# checkpoint exists for each location and that sharing one scaler is intended.
# Iterate over the locations and fill in the predictions
for loc in locations:
    print(loc)
    # Load forecasted weather data for testing for the current location
    df_loc = pd.read_parquet(f"data/{loc}/X_test_estimated.parquet")
    preds = make_predictions(loc, df_loc)
    # Assign the predictions to df_submission for the current location.
    # `mask` selects that location's rows; `preds` is expected to contain exactly
    # one value per selected row.
    mask = df_submission["location"] == loc
    df_submission.loc[mask, "prediction"] = preds

# Save the results to a new submission file
df_submission[["id", "prediction"]].to_csv("sample_kaggle_submission.csv", index=False)


A
B
C


 2.15334606e+00 2.21422253e+01 1.55246033e+02 4.59206421e+02
 8.66337158e+02 1.12080969e+03 1.14144531e+03 1.17497144e+03
 9.77783936e+02 1.09333643e+03 1.41139917e+03 1.06533069e+03
 7.09244446e+02 3.79455627e+02 1.15042870e+02 3.28770921e-06
 3.27935004e-06 3.25304791e-06 3.20458685e-06 3.17543822e-06
 3.16835940e-06 3.20221170e-06 3.26968757e-06 1.82681717e+02
 1.05808057e+03 1.82512134e+03 2.56051221e+03 3.44787695e+03
 4.23896631e+03 4.56325049e+03 4.31683398e+03 4.41301416e+03
 4.23042139e+03 3.89182812e+03 3.40924219e+03 2.76311084e+03
 1.70245239e+03 8.12387451e+02 3.90298096e+02 8.35193100e+01
 8.19496095e-01 3.31730052e-06 3.31015895e-06 3.30547459e-06
 3.30810849e-06 3.31492106e-06 3.34504013e+01 2.50453979e+02
 8.10433716e+02 1.92697290e+03 3.18616870e+03 4.25847168e+03
 5.15312500e+03 5.57746973e+03 5.75160156e+03 5.29172852e+03
 4.33784570e+03 3.76835767e+03 2.83749976e+03 1.71526575e+03
 9.24757935e+02 6.71833801e+02 3.61337036e+02 1.52849197e+02
 3.29362933e-06 3.293660