Merge pull request #5 from JonathanCrabbe/nasa
Nasa Dataset
JonathanCrabbe committed Jan 15, 2024
2 parents 60a9c1c + 2438d31 commit dcf0a1e
Showing 5 changed files with 1,855 additions and 1 deletion.
8 changes: 8 additions & 0 deletions cmd/conf/datamodule/nasa.yaml
@@ -0,0 +1,8 @@
_target_: fdiff.dataloaders.datamodules.NASADatamodule
data_dir: ${hydra:runtime.cwd}/data
random_seed: ${random_seed}
fourier_transform: ${fourier_transform}
standardize: ${standardize}
subdataset: charge
remove_outlier_feature: True
batch_size: 16
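
For context (not part of the diff): Hydra builds the object named by _target_ and passes the remaining keys as keyword arguments. A minimal sketch of that mechanism, with literal stand-ins for the ${...} interpolations (which Hydra normally fills from the top-level config):

from hydra.utils import instantiate
from omegaconf import OmegaConf

# Literal values below stand in for the ${...} interpolations in the YAML.
cfg = OmegaConf.create(
    {
        "_target_": "fdiff.dataloaders.datamodules.NASADatamodule",
        "data_dir": "data",          # ${hydra:runtime.cwd}/data
        "random_seed": 42,           # ${random_seed}
        "fourier_transform": False,  # ${fourier_transform}
        "standardize": False,        # ${standardize}
        "subdataset": "charge",
        "remove_outlier_feature": True,
        "batch_size": 16,
    }
)
datamodule = instantiate(cfg)  # equivalent to NASADatamodule(**kwargs)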
1,681 changes: 1,681 additions & 0 deletions notebooks/nasa_exploration.ipynb

Large diffs are not rendered by default.

74 changes: 73 additions & 1 deletion src/fdiff/dataloaders/datamodules.py
@@ -12,7 +12,11 @@

from fdiff.utils.dataclasses import collate_batch
from fdiff.utils.fourier import dft
from fdiff.utils.preprocessing import mimic_preprocess, nasdaq_preprocess
from fdiff.utils.preprocessing import (
    mimic_preprocess,
    nasdaq_preprocess,
    nasa_preprocess,
)


class DiffusionDataset(Dataset):
@@ -385,3 +389,71 @@ def download_data(self) -> None:
    @property
    def dataset_name(self) -> str:
        return "nasdaq"


class NASADatamodule(Datamodule):
    def __init__(
        self,
        data_dir: Path | str = Path.cwd() / "data",
        random_seed: int = 42,
        batch_size: int = 32,
        fourier_transform: bool = False,
        standardize: bool = False,
        subdataset: str = "charge",
        remove_outlier_feature: bool = True,
    ) -> None:
        self.subdataset = subdataset
        self.remove_outlier_feature = remove_outlier_feature

        super().__init__(
            data_dir=data_dir,
            random_seed=random_seed,
            batch_size=batch_size,
            fourier_transform=fourier_transform,
            standardize=standardize,
        )

    def setup(self, stage: str = "fit") -> None:
        if (
            not (self.data_dir / self.subdataset / "X_train.pt").exists()
            or not (self.data_dir / self.subdataset / "X_test.pt").exists()
        ):
            logging.info(
                f"Preprocessed tensors for {self.dataset_name}_{self.subdataset} not found. "
                f"Now running the preprocessing pipeline."
            )
            nasa_preprocess(
                data_dir=self.data_dir,
                subdataset=self.subdataset,
                random_seed=self.random_seed,
            )
            logging.info(
                f"Preprocessing pipeline finished, tensors saved in {self.data_dir}."
            )

        # Load preprocessed tensors
        self.X_train = torch.load(self.data_dir / self.subdataset / "X_train.pt")
        self.X_test = torch.load(self.data_dir / self.subdataset / "X_test.pt")

        if self.remove_outlier_feature and self.subdataset == "charge":
            # Drop the third feature (index 2), which has an outlier range,
            # and keep every other timestep (501 -> 251)
            self.X_train = self.X_train[:, ::2, [0, 1, 3, 4]]
            self.X_test = self.X_test[:, ::2, [0, 1, 3, 4]]

        assert self.X_train.shape[2] == self.X_test.shape[2] == 4
        assert self.X_train.shape[1] == 251
        assert self.X_test.shape[1] == 251
        assert isinstance(self.X_train, torch.Tensor)
        assert isinstance(self.X_test, torch.Tensor)

    def download_data(self) -> None:
        import kaggle

        kaggle.api.authenticate()
        kaggle.api.dataset_download_files(
            "patrickfleith/nasa-battery-dataset", path=self.data_dir, unzip=True
        )

    @property
    def dataset_name(self) -> str:
        return "nasa"
1 change: 1 addition & 0 deletions src/fdiff/utils/extraction.py
@@ -11,6 +11,7 @@
def get_training_params(datamodule: Datamodule, trainer: pl.Trainer) -> dict[str, Any]:
    params = datamodule.dataset_parameters
    params["num_training_steps"] *= trainer.max_epochs
    params["num_training_steps"] /= trainer.accumulate_grad_batches
    assert isinstance(params, dict)
    return params
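
The added division makes the step count reflect optimizer updates rather than raw batches under gradient accumulation. A toy check with hypothetical numbers:

# Hypothetical numbers: 100 batches per epoch, 10 epochs, accumulation over 4.
num_training_steps = 100          # from datamodule.dataset_parameters
num_training_steps *= 10          # trainer.max_epochs
num_training_steps /= 4           # trainer.accumulate_grad_batches
assert num_training_steps == 250  # optimizer steps the scheduler should plan for

Note that the in-place /= leaves a float behind, so downstream consumers expecting an integer step count may need to round.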

92 changes: 92 additions & 0 deletions src/fdiff/utils/preprocessing.py
@@ -256,3 +256,95 @@ def nasdaq_preprocess(
    # Save the preprocessed tensors.
    for X, name in zip([X_train, X_test], ["train", "test"]):
        torch.save(X, data_dir / f"X_{name}.pt")


def nasa_preprocess(
    data_dir: Path,
    subdataset: str = "charge",
    train_frac: float = 0.9,
    random_seed: int = 42,
):
    if subdataset == "charge":
        features = [
            "Voltage_measured",
            "Current_measured",
            "Temperature_measured",
            "Current_charge",
            "Voltage_charge",
        ]
        sub_dataset = "charge"
        interval_bin = 10
        cutoff_time = 5000 - 5000 % interval_bin
    elif subdataset == "discharge":
        features = [
            "Voltage_measured",
            "Current_measured",
            "Temperature_measured",
            "Current_load",
            "Voltage_load",
        ]
        sub_dataset = "discharge"
        interval_bin = 15
        cutoff_time = 2000 - 2000 % interval_bin

    else:
        raise ValueError(f"Unknown subdataset {subdataset}")

    # Read the metadata
    metadata = pd.read_csv(data_dir / "cleaned_dataset" / "metadata.csv")
    files = metadata[metadata["type"] == f"{sub_dataset}"]["filename"].values

    full_df = pd.DataFrame()

    for filename in tqdm(files):
        data = pd.read_csv(data_dir / "cleaned_dataset" / "data" / filename)

        # Only keep series whose recording extends past the cutoff time
        if data["Time"].max() > cutoff_time:
            # Skip series whose largest sampling gap exceeds the bin size
            interval = data["Time"].diff().max()
            if interval > interval_bin:
                continue

            # Remove the rows such that the time is greater than the cutoff time
            data = data[data["Time"] < cutoff_time]

            # Bin the time axis into fixed-width intervals
            data["Time_Bin"] = pd.cut(
                data["Time"],
                bins=range(
                    -interval_bin, int(cutoff_time + interval_bin), interval_bin
                ),
            )

            # Group by custom bins and calculate the mean for each group
            result_df = data.groupby("Time_Bin", observed=False).mean().reset_index()
            result_df["Time_Bin"] = result_df.index

            result_df["filename"] = filename
            full_df = pd.concat([full_df, result_df])

    df_pivot = full_df.pivot(index="filename", columns="Time_Bin", values=features)

    num_timesteps = cutoff_time // interval_bin + 1
    X_full = torch.tensor(df_pivot.values, dtype=torch.float32)
    # Rearrange to get a 3D tensor of shape (num_samples, num_timesteps, num_features)
    X_reshaped = X_full.reshape(X_full.shape[0], -1, num_timesteps)
    # Permute the last two dimensions
    X = X_reshaped.permute(0, 2, 1)

    # Train-test split
    torch.manual_seed(random_seed)
    num_train = int(train_frac * len(X))
    perm_idx = torch.randperm(len(X))
    train_idx, test_idx = perm_idx[:num_train], perm_idx[num_train:]
    X_train, X_test = X[train_idx], X[test_idx]

    # Create the directory if it does not exist
    folder = data_dir / subdataset
    folder.mkdir(parents=True, exist_ok=True)

    # Save the preprocessed tensors.
    for X, name in zip([X_train, X_test], ["train", "test"]):
        torch.save(X, data_dir / subdataset / f"X_{name}.pt")
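
The heart of the pipeline is the fixed-width time binning; a toy illustration of the same pd.cut + groupby pattern on hypothetical data (not part of the diff):

import pandas as pd

# Six hypothetical samples, binned into 10 s intervals and averaged per bin.
data = pd.DataFrame(
    {"Time": [0, 3, 7, 12, 18, 25], "Voltage_measured": [4.2, 4.1, 4.0, 3.9, 3.8, 3.7]}
)
interval_bin, cutoff_time = 10, 30
data["Time_Bin"] = pd.cut(
    data["Time"], bins=range(-interval_bin, cutoff_time + interval_bin, interval_bin)
)
binned = data.groupby("Time_Bin", observed=False).mean().reset_index()
# Bin edges are (-10, 0], (0, 10], (10, 20], (20, 30]; each row of `binned`
# holds the mean Time and Voltage_measured of the samples falling in that bin.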
