In [1]:
import pandas as pd
# Location and layout of the ERA5 point exports (tab-delimited text files).
# DATA_DIR is the only value to change when the files live somewhere else.
DATA_DIR = '/content'
DATETIME_COL = 'Date/time [UTC]'
POINT_IDS = (1, 2, 3, 4)


def _load_point(point_id):
    """Read one ERA5 point file and convert its timestamp column to datetimes."""
    frame = pd.read_csv(f'{DATA_DIR}/ERA5_Point {point_id}.txt', delimiter='\t')
    frame[DATETIME_COL] = pd.to_datetime(frame[DATETIME_COL])
    return frame


def _rows_in_years(frame, years, keep_time):
    """Return the rows of `frame` whose timestamp year is in `years`.

    The timestamp column is dropped from the result unless `keep_time` is true.
    """
    subset = frame[frame[DATETIME_COL].dt.year.isin(years)]
    return subset if keep_time else subset.drop(columns=DATETIME_COL)


# Read the tab-delimited files and change the time column to the correct type
df_point1, df_point2, df_point3, df_point4 = (_load_point(i) for i in POINT_IDS)

# Splitting data into training and test
training_years = [2005, 2006, 2007, 2008]
test_years = [2011, 2012, 2013, 2014]
# Training years MUST be prior to test years -- enforced here instead of
# relying on the comment alone.
if max(training_years) >= min(test_years):
    raise ValueError("training_years must all be earlier than test_years")

_point_frames = (df_point1, df_point2, df_point3, df_point4)

# Training data. Only point 1 keeps its timestamp column, so the column
# appears exactly once when the four frames are concatenated column-wise.
train_1, train_2, train_3, train_4 = (
    _rows_in_years(frame, training_years, keep_time=(position == 0))
    for position, frame in enumerate(_point_frames)
)

# Test data, split the same way as the training data
test_1, test_2, test_3, test_4 = (
    _rows_in_years(frame, test_years, keep_time=(position == 0))
    for position, frame in enumerate(_point_frames)
)

FileNotFoundError: [Errno 2] No such file or directory: '/content/ERA5_Point 1.txt'

In [None]:
# Place the four per-point training frames side by side (column-wise)
train_data_total = pd.concat((train_1, train_2, train_3, train_4), axis=1)

# Timestamp of every training row, as datetimes
time_data = pd.to_datetime(train_data_total['Date/time [UTC]'])

# Calendar / clock components derived from each timestamp.
# day_of_week follows the pandas convention: 0=Monday, 6=Sunday.
X_time_features = pd.DataFrame({
    column: getattr(time_data.dt, accessor)
    for column, accessor in (
        ("year", "year"),
        ("month", "month"),
        ("day_of_year", "dayofyear"),
        ("day_of_week", "weekday"),
        ("hour", "hour"),
        ("minute", "minute"),
    )
})

# The time features stay in their own frame; only the raw timestamp column
# is removed from the weather feature matrix.
train_data_total = train_data_total.drop(columns='Date/time [UTC]')

# Report the shape of the resulting feature matrix
print(f"Data is {train_data_total.shape}")

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np

class WeatherDataset(Dataset):
    """Dataset pairing weather feature rows with their temporal feature rows.

    Each item is a ``(weather_row, temporal_row)`` tuple of float32 tensors,
    where ``weather_row`` comes from the (optionally) normalized data and
    ``temporal_row`` is returned exactly as supplied.

    Attributes:
        data: The raw weather features as a float tensor.
        temporal_features: The temporal features as a float tensor.
        data_median: Per-feature median used for centering, or ``None`` when
            ``normalize`` is false.
        data_iqr: Per-feature inter-quartile range (q75 - q25), or ``None``
            when ``normalize`` is false.
        data_scale: The divisor actually applied per feature, or ``None``
            when ``normalize`` is false.
        normalized_data: ``(data - data_median) / data_scale`` when
            ``normalize`` is true, otherwise ``data`` itself.
    """

    def __init__(self, data, temporal_features, normalize=True):
        """
        Args:
            data: Array-like of weather features, one row per sample.
            temporal_features: Array-like of temporal features with the same
                number of rows as ``data``.
            normalize: When true, apply per-feature robust (median / IQR)
                scaling to ``data``.

        Raises:
            ValueError: If ``data`` and ``temporal_features`` have a different
                number of rows.
        """
        self.data = torch.FloatTensor(data)
        self.temporal_features = torch.FloatTensor(temporal_features)

        # __getitem__ indexes both tensors with the same idx, so a length
        # mismatch would otherwise only surface as an IndexError mid-epoch.
        if len(self.data) != len(self.temporal_features):
            raise ValueError(
                f"data has {len(self.data)} rows but temporal_features has "
                f"{len(self.temporal_features)}"
            )

        # Always define the statistics so callers can test them for None.
        self.data_median = None
        self.data_iqr = None
        self.data_scale = None

        if normalize:
            # Per-feature normalization with robust scaling
            self.data_median = self.data.median(dim=0).values
            self.data_iqr = (self.data.quantile(0.75, dim=0) -
                             self.data.quantile(0.25, dim=0))

            # A feature whose quartiles coincide (constant, or mostly one
            # value with a few outliers) has IQR == 0. Dividing by
            # (0 + 1e-7) would inflate its outliers by a factor of 1e7, so
            # such features are centered but left unscaled instead.
            eps = 1e-7
            self.data_scale = torch.where(
                self.data_iqr > eps,
                self.data_iqr + eps,
                torch.ones_like(self.data_iqr),
            )

            # Robust normalization to handle outliers
            self.normalized_data = (self.data - self.data_median) / self.data_scale
        else:
            self.normalized_data = self.data

    def __len__(self):
        """Number of samples (rows of ``data``)."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(normalized weather row, temporal row)`` for ``idx``."""
        return (
            self.normalized_data[idx],
            self.temporal_features[idx]
        )

class Swish(nn.Module):
    """Element-wise Swish activation: the input multiplied by its own sigmoid."""

    def forward(self, x):
        # The sigmoid acts as a smooth gate on the input itself.
        gate = torch.sigmoid(x)
        return x * gate

class ResidualBlock(nn.Module):
    """Width-preserving block whose transformed output is added back onto its input."""

    def __init__(self, in_features):
        super().__init__()
        # Linear -> BatchNorm -> Swish -> Linear -> BatchNorm, all at the
        # same width so the result can be summed with the input.
        layers = [
            nn.Linear(in_features, in_features),
            nn.BatchNorm1d(in_features),
            Swish(),
            nn.Linear(in_features, in_features),
            nn.BatchNorm1d(in_features),
        ]
        self.block = nn.Sequential(*layers)

    def forward(self, x):
        # Skip connection: input plus the block's transformation of it.
        transformed = self.block(x)
        return x + transformed

class WeatherVAE(nn.Module):
    """Conditional VAE over weather features, conditioned on temporal features.

    The encoder sees the weather features concatenated with the temporal
    features; the decoder sees the latent code concatenated with the same
    temporal features and outputs a reconstruction of the weather features.
    """

    def __init__(self,
                 input_dim=16,
                 temporal_dim=6,
                 hidden_dims=(64, 32),
                 latent_dim=10,
                 num_residual_blocks=2):
        """
        Args:
            input_dim: Number of weather features per sample.
            temporal_dim: Number of temporal (conditioning) features per sample.
            hidden_dims: Widths of the encoder stages, in order. The decoder
                uses the same widths in reverse. Any sequence works; the
                default is a tuple so no mutable object is shared between calls.
            latent_dim: Size of the latent code.
            num_residual_blocks: Residual blocks appended to each stage.
        """
        super().__init__()
        self.latent_dim = latent_dim
        # Combine weather and temporal features
        combined_dim = input_dim + temporal_dim

        # Encoder with residual connections
        encoder_layers = []
        prev_dim = combined_dim
        for hidden_dim in hidden_dims:
            encoder_layers.extend([
                nn.Linear(prev_dim, hidden_dim),
                nn.BatchNorm1d(hidden_dim),
                Swish(),
                *[ResidualBlock(hidden_dim) for _ in range(num_residual_blocks)]
            ])
            prev_dim = hidden_dim

        self.encoder = nn.Sequential(*encoder_layers)

        # Latent space heads: mean and log-variance of q(z | x, temporal)
        self.fc_mu = nn.Linear(prev_dim, latent_dim)
        self.fc_logvar = nn.Linear(prev_dim, latent_dim)

        # Initialize latent space layers with small weights
        nn.init.xavier_normal_(self.fc_mu.weight, gain=0.01)
        nn.init.xavier_normal_(self.fc_logvar.weight, gain=0.01)

        # Decoder with residual connections
        decoder_layers = []
        prev_dim = latent_dim + temporal_dim
        for hidden_dim in reversed(hidden_dims):
            decoder_layers.extend([
                nn.Linear(prev_dim, hidden_dim),
                nn.BatchNorm1d(hidden_dim),
                Swish(),
                *[ResidualBlock(hidden_dim) for _ in range(num_residual_blocks)]
            ])
            prev_dim = hidden_dim

        decoder_layers.append(nn.Linear(prev_dim, input_dim))
        self.decoder = nn.Sequential(*decoder_layers)

    def encode(self, x, temporal):
        """Encode input to latent distribution; returns ``(mu, logvar)``."""
        combined = torch.cat([x, temporal], dim=1)
        h = self.encoder(combined)
        return self.fc_mu(h), self.fc_logvar(h)

    def reparameterize(self, mu, logvar):
        """Reparameterization trick; logvar is clamped to [-10, 2] before exponentiation."""
        std = torch.exp(0.5 * torch.clamp(logvar, min=-10, max=2))
        eps = torch.randn_like(std)
        return mu + eps * std

    def decode(self, z, temporal):
        """Decode a latent code, conditioned on the temporal features."""
        combined = torch.cat([z, temporal], dim=1)
        return self.decoder(combined)

    def forward(self, x, temporal):
        """Forward pass; returns ``(reconstruction, mu, logvar)``."""
        mu, logvar = self.encode(x, temporal)
        z = self.reparameterize(mu, logvar)
        return self.decode(z, temporal), mu, logvar

    def sample(self, temporal, num_samples=1):
        """Draw samples from the prior, conditioned on ``temporal``.

        Args:
            temporal: Tensor of temporal features, either one row of shape
                ``(temporal_dim,)`` or a batch ``(B, temporal_dim)``.
            num_samples: How many times the temporal batch is tiled; one
                latent code is drawn per resulting row.

        Returns:
            Tensor with ``B * num_samples`` rows (``num_samples`` rows for a
            single temporal row), one decoded sample per row. For ``B > 1``
            the rows follow ``Tensor.repeat`` ordering: the whole temporal
            batch is repeated ``num_samples`` times.
        """
        # Follow the model's device/dtype so a CPU tensor can be passed to a
        # model that has been moved to the GPU.
        reference = next(self.parameters())
        temporal = temporal.to(device=reference.device, dtype=reference.dtype)
        if temporal.dim() == 1:
            temporal = temporal.unsqueeze(0)
        temporal = temporal.repeat(num_samples, 1)

        # One latent per conditioning row; using num_samples here would not
        # match the tiled temporal batch whenever B > 1.
        z = torch.randn(temporal.shape[0], self.latent_dim,
                        device=reference.device, dtype=reference.dtype)

        # Decode in eval mode: in training mode BatchNorm1d would normalize
        # with the statistics of the sampled batch, update its running
        # estimates, and reject a batch containing a single row.
        was_training = self.training
        self.eval()
        try:
            with torch.no_grad():
                samples = self.decode(z, temporal)
        finally:
            self.train(was_training)

        return samples

def advanced_vae_loss(recon_x, x, mu, logvar, beta=1.0):
    """
    VAE objective: summed smooth-L1 reconstruction error plus a weighted KL term.

    The KL weight is ``beta`` multiplied by the current reconstruction error
    per element of ``x``. That factor is read out as a plain Python number,
    so no gradient flows through the weight itself.

    Args:
        recon_x: Reconstruction produced by the decoder.
        x: Target tensor the reconstruction is compared against.
        mu: Mean of the approximate posterior.
        logvar: Log-variance of the approximate posterior.
        beta: Base multiplier for the KL term.

    Returns:
        Scalar tensor ``reconstruction + weight * KL``.
    """
    # Smooth-L1 (Huber-style) error, summed over every element
    reconstruction = F.smooth_l1_loss(recon_x, x, reduction='sum')

    # KL divergence between N(mu, exp(logvar)) and the standard normal
    kl_terms = 1 + logvar - mu.pow(2) - logvar.exp()
    divergence = -0.5 * torch.sum(kl_terms)

    # Scale the KL weight by the per-element reconstruction error
    per_element_error = reconstruction.item() / (x.numel() + 1e-8)
    kl_weight = beta * per_element_error

    return reconstruction + kl_weight * divergence

def train_vae(model, dataloader, optimizer, device, epochs=50,
              early_stopping_patience=10):
    """
    Train `model` with early stopping and plateau-based learning-rate decay.

    Each epoch runs one pass over `dataloader`, clipping the gradient norm to
    1.0 before every optimizer step. The epoch loss is the mean of the
    per-batch losses; it drives both the ReduceLROnPlateau scheduler
    (factor 0.5, patience 5) and the early-stopping counter.

    Args:
        model: Module called as ``model(features, temporal)`` and returning
            ``(reconstruction, mu, logvar)``.
        dataloader: Yields ``(features, temporal)`` batches.
        optimizer: Optimizer over `model`'s parameters.
        device: Device the model and every batch are moved to.
        epochs: Maximum number of epochs.
        early_stopping_patience: Stop after this many consecutive epochs
            without a new best epoch loss.

    Returns:
        The same `model` object, holding the weights of the last epoch run.

    Raises:
        ValueError: If `dataloader` yields no batches.
    """
    num_batches = len(dataloader)
    if num_batches == 0:
        # Otherwise the epoch average below divides by zero.
        raise ValueError("dataloader yields no batches; nothing to train on")

    model.to(device)

    # Learning rate scheduler. The `verbose` flag is deprecated since
    # PyTorch 2.2, so rate changes are reported explicitly further down.
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(
        optimizer,
        mode='min',
        factor=0.5,
        patience=5
    )

    best_loss = float('inf')
    patience_counter = 0

    for epoch in range(epochs):
        model.train()
        total_loss = 0

        for batch_features, batch_temporal in dataloader:
            batch_features = batch_features.to(device)
            batch_temporal = batch_temporal.to(device)

            optimizer.zero_grad()

            # Forward pass
            recon_batch, mu, logvar = model(batch_features, batch_temporal)

            # Compute loss
            loss = advanced_vae_loss(recon_batch, batch_features, mu, logvar)

            # Backward pass with gradient clipping
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()

            total_loss += loss.item()

        # Average loss and learning rate scheduling
        avg_loss = total_loss / num_batches
        lrs_before = [group['lr'] for group in optimizer.param_groups]
        scheduler.step(avg_loss)
        for group_idx, (old_lr, group) in enumerate(
                zip(lrs_before, optimizer.param_groups)):
            if group['lr'] != old_lr:
                print(f"Epoch {epoch + 1}: reducing learning rate of group "
                      f"{group_idx} to {group['lr']:.4e}.")

        # Early stopping bookkeeping
        if avg_loss < best_loss:
            best_loss = avg_loss
            patience_counter = 0
        else:
            patience_counter += 1

        print(f'Epoch [{epoch+1}/{epochs}], Loss: {avg_loss:.4f}')

        # Stop if no improvement
        if patience_counter >= early_stopping_patience:
            print("Early stopping triggered")
            break

    return model

In [None]:
# Train the VAE on the prepared training frames
def main():
    """Train a WeatherVAE on the notebook's training data and draw a few samples.

    Reads the notebook-level `train_data_total` (weather features) and
    `X_time_features` (temporal features).

    Returns:
        ``(trained_model, generated_samples)``: the trained model, left in
        eval mode, and 5 samples conditioned on the first training row's
        temporal features. The samples are in the dataset's normalized units.
    """
    weather_data = np.array(train_data_total)   # (n_samples, n_weather_features)
    temporal_data = np.array(X_time_features)   # (n_samples, n_temporal_features)

    # Create dataset and dataloader
    dataset = WeatherDataset(weather_data, temporal_data)
    # NOTE(review): batches are drawn in file order (shuffle=False); confirm
    # this is intended for training.
    dataloader = DataLoader(dataset, batch_size=32, shuffle=False)

    # Size the model from the data instead of relying on the class defaults
    model = WeatherVAE(input_dim=weather_data.shape[1],
                       temporal_dim=temporal_data.shape[1])
    optimizer = optim.Adam(model.parameters(), lr=1e-3)

    # Train model
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    trained_model = train_vae(model, dataloader, optimizer, device=device)

    # Draw the samples here: the original returned a `generated_samples`
    # name that was never defined inside this function.
    trained_model.eval()
    conditioning = dataset.temporal_features[:1].to(device)
    generated_samples = trained_model.sample(conditioning, num_samples=5)
    return trained_model, generated_samples

if __name__ == "__main__":
    model, samples = main()

Epoch [1/50], Loss: 144.3283
Epoch [2/50], Loss: 143.2419
Epoch [3/50], Loss: 140.2427
Epoch [4/50], Loss: 135.9487
Epoch [5/50], Loss: 132.9676
Epoch [6/50], Loss: 132.3508
Epoch [7/50], Loss: 131.3245
Epoch [8/50], Loss: 130.1719
Epoch [9/50], Loss: 128.9714
Epoch [10/50], Loss: 128.6170
Epoch [11/50], Loss: 127.9933
Epoch [12/50], Loss: 127.0288
Epoch [13/50], Loss: 125.9158
Epoch [14/50], Loss: 125.0675
Epoch [15/50], Loss: 124.7369
Epoch [16/50], Loss: 124.8794
Epoch [17/50], Loss: 124.2744
Epoch [18/50], Loss: 124.3570
Epoch [19/50], Loss: 123.3692
Epoch [20/50], Loss: 123.0800
Epoch [21/50], Loss: 121.3355
Epoch [22/50], Loss: 121.1839
Epoch [23/50], Loss: 122.6173
Epoch [24/50], Loss: 121.0485
Epoch [25/50], Loss: 120.9201
Epoch [26/50], Loss: 120.6899
Epoch [27/50], Loss: 119.5133
Epoch [28/50], Loss: 120.1273
Epoch [29/50], Loss: 121.0304
Epoch [30/50], Loss: 121.2323
Epoch [31/50], Loss: 121.1112
Epoch [32/50], Loss: 117.7377
Epoch [33/50], Loss: 115.3766
Epoch [34/50], Loss

In [None]:
import pandas as pd

def filter_data_by_datetime(df, datetime_col, year=None, month=None, day_of_year=None,
                            day_of_week=None, hour=None, minute=None):
    """
    Keep only the rows whose timestamp matches every datetime component given.

    Components left as None are not constrained; with no components at all,
    every row is returned. The input frame is never modified.

    Args:
        df (pd.DataFrame): Frame to filter.
        datetime_col (str): Column holding the timestamps (parsed with
            ``pd.to_datetime`` on a copy).
        year (int, optional): Calendar year.
        month (int, optional): Month, 1-12.
        day_of_year (int, optional): Day of the year, 1-366.
        day_of_week (int, optional): Day of the week, 0=Monday .. 6=Sunday.
        hour (int, optional): Hour, 0-23.
        minute (int, optional): Minute, 0-59.

    Returns:
        pd.DataFrame: Copy of the matching rows, with `datetime_col` as datetimes.
    """
    frame = df.copy()  # work on a copy so the caller's frame is untouched
    frame[datetime_col] = pd.to_datetime(frame[datetime_col])
    stamps = frame[datetime_col].dt

    # (pandas .dt attribute, requested value) for every supported component
    constraints = (
        ("year", year),
        ("month", month),
        ("dayofyear", day_of_year),
        ("dayofweek", day_of_week),
        ("hour", hour),
        ("minute", minute),
    )

    keep = pd.Series(True, index=frame.index)  # every row passes initially
    for attribute, wanted in constraints:
        if wanted is None:
            continue
        keep = keep & (getattr(stamps, attribute) == wanted)

    return frame[keep]

In [None]:
  # Example sampling
# Timestamp to condition on. The feature vector is derived from it so the
# components cannot contradict each other (2009-01-01 is a Thursday, i.e.
# day_of_week == 3, not 0).
sample_time = pd.Timestamp('2009-01-01 06:00')

# Same column order as X_time_features:
# year, month, day_of_year, day_of_week, hour, minute
model_device = next(model.parameters()).device
sample_temporal = torch.tensor(
    [[sample_time.year, sample_time.month, sample_time.dayofyear,
      sample_time.weekday(), sample_time.hour, sample_time.minute]],
    dtype=torch.float32,
    device=model_device,  # the model may have been moved to the GPU during training
)

# Sample in eval mode so BatchNorm uses its running statistics rather than
# the statistics of the five sampled rows.
model.eval()
generated_samples = model.sample(sample_temporal, num_samples=5)

test = df_point1[df_point1['Date/time [UTC]'].dt.year.isin([sample_time.year])]

actual_data = filter_data_by_datetime(
    test,
    datetime_col='Date/time [UTC]',
    year=sample_time.year,
    month=sample_time.month,
    day_of_year=sample_time.dayofyear,
    hour=sample_time.hour,
    minute=sample_time.minute)

# NOTE(review): the model is trained on WeatherDataset's normalized values,
# so the generated samples are in normalized units while `actual_data` is in
# raw units -- the two printouts are not directly comparable.
print(f"Actual Data: {actual_data}")
print(generated_samples[0:4])

AttributeError: 'WeatherVAE' object has no attribute 'latent_dim'