<a href="https://colab.research.google.com/github/Kanav30/PBL-Project/blob/main/PBL%20Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import h5py
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader

In [None]:
import os

# Locate the METR-LA HDF5 file and list its top-level keys.
# NOTE(review): the path points inside Google Drive, so Drive presumably has to be
# mounted in the Colab session before this cell runs — confirm.
file_path = "/content/drive/MyDrive/metr-la.h5"

if not os.path.exists(file_path):
    # Build the hint from file_path itself so the message always names the
    # file and directory that were actually checked.
    print(f"Error: The file '{file_path}' was not found.")
    print(
        f"Please make sure Google Drive is mounted and that "
        f"'{os.path.basename(file_path)}' exists in '{os.path.dirname(file_path)}'."
    )
else:
    # Open read-only; the context manager closes the file handle afterwards.
    with h5py.File(file_path, "r") as f:
        print(list(f.keys()))

['df']


In [None]:
# Read the traffic speed matrix out of the HDF5 file.
with h5py.File(file_path, "r") as h5_file:
    # "df" is the top-level group; its "block0_values" dataset holds the speed values.
    # The [:] slice reads the dataset's contents while the file is still open.
    speed_data = h5_file["df"]["block0_values"][:]

print(speed_data.shape)

(34272, 207)


In [None]:
# First row of the matrix: row index 0, every column.
print(speed_data[0, :])

[64.375      67.625      67.125      61.5        66.875      68.75
 65.125      67.125      59.625      62.75       55.5        66.5
 64.25       68.5        60.375      67.5        37.75       63.125
 59.75       62.125      67.25       41.25       54.625      58.125
 65.125      64.25       61.25       62.75       66.875      47.375
 57.         67.75       65.125      66.875      64.         62.
 64.375      60.75       60.75       62.375      64.5        66.
 62.         64.875      55.875      67.75       63.         60.125
 60.25       62.25       42.25       48.25       65.125      63.5
 55.75       60.5        51.57142857 60.25       63.875      59.875
 64.625      64.         52.5        66.25       68.25       68.875
 66.375      64.5        67.875      67.25       63.125      66.25
 64.625      59.125      68.25       52.25       58.125      61.25
 66.125      69.375      69.75       63.5        64.25       68.
 56.875      64.375      65.25       65.75       61.25       63.

In [None]:
# Missing values
# Replace every NaN with the mean of the non-NaN readings (np.nanmean skips NaNs),
# printing the NaN count before and after the replacement.
# NOTE(review): only NaN is treated as missing here; if the dataset marks missing
# readings some other way (e.g. as 0), they pass through untouched — confirm.

nan_mask = np.isnan(speed_data)
print("NaNs before:", nan_mask.sum())

mean_value = np.nanmean(speed_data)
speed_data = np.nan_to_num(speed_data, nan=mean_value)

print("NaNs after:", np.isnan(speed_data).sum())


NaNs before: 0
NaNs after: 0


In [None]:
# Normalizing
# Z-score the whole matrix using one global mean and one global standard deviation.
# Caution: speed_data is overwritten, so re-running this cell recomputes `mean` and
# `std` from the already-normalized values and the original statistics are lost.
mean, std = speed_data.mean(), speed_data.std()

speed_data = (speed_data - mean) / std

In [None]:
# Samples of the data: sliding-window sizes handed to create_samples.

# Number of consecutive past time steps in each input window.
INPUT_WINDOW = 12
# Number of consecutive future time steps in each target window.
OUTPUT_WINDOW = 12

def create_samples(data, input_window, output_window):
    """Slice a time series into (past, future) sliding-window pairs.

    For every split point ``t`` the input is ``data[t - input_window:t]`` and
    the target is ``data[t:t + output_window]``. Split points cover every
    position where both windows fit entirely inside ``data``, including the
    last one, whose target ends on the final row.

    Args:
        data: Array-like whose first axis is time.
        input_window: Number of time steps in each input window.
        output_window: Number of time steps in each target window.

    Returns:
        Tuple ``(X, Y)`` of numpy arrays with shapes
        ``(num_samples, input_window, *data.shape[1:])`` and
        ``(num_samples, output_window, *data.shape[1:])``. If ``data`` is too
        short to hold a single pair, ``num_samples`` is 0 and the remaining
        dimensions are still present.
    """
    data = np.asarray(data)
    T = data.shape[0]

    # The +1 keeps the final pair, whose target window ends exactly at row T - 1.
    num_samples = T - input_window - output_window + 1
    if num_samples <= 0:
        # Return correctly-shaped empty arrays so downstream reshaping still works.
        feature_shape = data.shape[1:]
        return (
            np.empty((0, input_window) + feature_shape, dtype=data.dtype),
            np.empty((0, output_window) + feature_shape, dtype=data.dtype),
        )

    split_points = range(input_window, input_window + num_samples)
    X = np.stack([data[t - input_window:t] for t in split_points])   # past steps
    Y = np.stack([data[t:t + output_window] for t in split_points])  # next steps
    return X, Y


In [None]:
# Build the input/target windows from the normalized speed matrix.
X, Y = create_samples(
    data=speed_data,
    input_window=INPUT_WINDOW,
    output_window=OUTPUT_WINDOW,
)

print("X shape:", X.shape)
print("Y shape:", Y.shape)


X shape: (34248, 12, 207)
Y shape: (34248, 12, 207)


In [None]:
# Append a trailing axis of size 1 to both arrays.
# Caution: X and Y are rebound, so each re-run of this cell adds one more axis.
X, Y = (windows[..., np.newaxis] for windows in (X, Y))

print(X.shape)
print(Y.shape)


(34248, 12, 207, 1)
(34248, 12, 207, 1)


In [None]:
# Convert the sample arrays to float32 tensors.
# torch.as_tensor is used instead of torch.tensor so the cell is safe to re-run:
# when X / Y are already float32 tensors it hands them back as-is, whereas
# torch.tensor would copy them again and emit a copy-construct UserWarning.
X = torch.as_tensor(X, dtype=torch.float32)
Y = torch.as_tensor(Y, dtype=torch.float32)


In [None]:
class METRLADataset(Dataset):
    """Pairs each input sample in ``X`` with the target at the same index in ``Y``."""

    def __init__(self, X, Y):
        """Store the paired samples.

        Args:
            X: Indexable collection of input samples.
            Y: Indexable collection of targets, aligned with ``X`` by index.

        Raises:
            ValueError: If ``X`` and ``Y`` do not contain the same number of samples.
        """
        # Fail fast: __len__ only looks at X, so a mismatch would otherwise show up
        # later as an IndexError during iteration or as silently unused targets.
        if len(X) != len(Y):
            raise ValueError(
                f"X and Y must contain the same number of samples, "
                f"got {len(X)} and {len(Y)}."
            )
        self.X = X
        self.Y = Y

    def __len__(self):
        """Return the number of (input, target) pairs."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return the ``(input, target)`` pair stored at position ``idx``."""
        return self.X[idx], self.Y[idx]


In [None]:
# Wrap the tensors in the dataset and serve them in shuffled mini-batches of 32.
# No random seed is set in this cell, so the batch order can differ between runs.
dataset = METRLADataset(X, Y)

dataloader = DataLoader(dataset, batch_size=32, shuffle=True)


In [None]:
# Sanity check: print the shapes of the first batch only.
# Nothing is printed when the loader yields no batches.
first_batch = next(iter(dataloader), None)
if first_batch is not None:
    xb, yb = first_batch
    print("Input batch:", xb.shape)
    print("Target batch:", yb.shape)


Input batch: torch.Size([32, 12, 207, 1])
Target batch: torch.Size([32, 12, 207, 1])
