Merge pull request #5 from JonathanCrabbe/nasa
Nasa Dataset
JonathanCrabbe committed Jan 15, 2024
2 parents 60a9c1c + 2438d31 commit dcf0a1e
Showing 5 changed files with 1,855 additions and 1 deletion.
8 changes: 8 additions & 0 deletions cmd/conf/datamodule/nasa.yaml
@@ -0,0 +1,8 @@
_target_: fdiff.dataloaders.datamodules.NASADatamodule
data_dir: ${hydra:runtime.cwd}/data
random_seed: ${random_seed}
fourier_transform: ${fourier_transform}
standardize: ${standardize}
subdataset: charge
remove_outlier_feature: True
batch_size: 16
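
For context (not part of the diff): Hydra builds the object named by _target_ and passes the remaining keys as keyword arguments. A minimal sketch of that mechanism, with literal stand-ins for the ${...} interpolations (which Hydra normally fills from the top-level config):

from hydra.utils import instantiate
from omegaconf import OmegaConf

# Literal values below stand in for the ${...} interpolations in the YAML.
cfg = OmegaConf.create(
    {
        "_target_": "fdiff.dataloaders.datamodules.NASADatamodule",
        "data_dir": "data",          # ${hydra:runtime.cwd}/data
        "random_seed": 42,           # ${random_seed}
        "fourier_transform": False,  # ${fourier_transform}
        "standardize": False,        # ${standardize}
        "subdataset": "charge",
        "remove_outlier_feature": True,
        "batch_size": 16,
    }
)
datamodule = instantiate(cfg)  # equivalent to NASADatamodule(**kwargs)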
1,681 changes: 1,681 additions & 0 deletions notebooks/nasa_exploration.ipynb

Large diffs are not rendered by default.

74 changes: 73 additions & 1 deletion src/fdiff/dataloaders/datamodules.py
@@ -12,7 +12,11 @@

from fdiff.utils.dataclasses import collate_batch
from fdiff.utils.fourier import dft
from fdiff.utils.preprocessing import mimic_preprocess, nasdaq_preprocess
from fdiff.utils.preprocessing import (
    mimic_preprocess,
    nasdaq_preprocess,
    nasa_preprocess,
)


class DiffusionDataset(Dataset):
@@ -385,3 +389,71 @@ def download_data(self) -> None:
    @property
    def dataset_name(self) -> str:
        return "nasdaq"


class NASADatamodule(Datamodule):
    def __init__(
        self,
        data_dir: Path | str = Path.cwd() / "data",
        random_seed: int = 42,
        batch_size: int = 32,
        fourier_transform: bool = False,
        standardize: bool = False,
        subdataset: str = "charge",
        remove_outlier_feature: bool = True,
    ) -> None:
        self.subdataset = subdataset
        self.remove_outlier_feature = remove_outlier_feature

        super().__init__(
            data_dir=data_dir,
            random_seed=random_seed,
            batch_size=batch_size,
            fourier_transform=fourier_transform,
            standardize=standardize,
        )

    def setup(self, stage: str = "fit") -> None:
        if (
            not (self.data_dir / self.subdataset / "X_train.pt").exists()
            or not (self.data_dir / self.subdataset / "X_test.pt").exists()
        ):
            logging.info(
                f"Preprocessed tensors for {self.dataset_name}_{self.subdataset} not found. "
                f"Now running the preprocessing pipeline."
            )
            nasa_preprocess(
                data_dir=self.data_dir,
                subdataset=self.subdataset,
                random_seed=self.random_seed,
            )
            logging.info(
                f"Preprocessing pipeline finished, tensors saved in {self.data_dir}."
            )

        # Load preprocessed tensors
        self.X_train = torch.load(self.data_dir / self.subdataset / "X_train.pt")
        self.X_test = torch.load(self.data_dir / self.subdataset / "X_test.pt")

        if self.remove_outlier_feature and self.subdataset == "charge":
            # Drop the third feature (index 2), which has an outlier range,
            # and keep every other timestep (501 -> 251)
            self.X_train = self.X_train[:, ::2, [0, 1, 3, 4]]
            self.X_test = self.X_test[:, ::2, [0, 1, 3, 4]]

        assert self.X_train.shape[2] == self.X_test.shape[2] == 4
        assert self.X_train.shape[1] == 251
        assert self.X_test.shape[1] == 251
        assert isinstance(self.X_train, torch.Tensor)
        assert isinstance(self.X_test, torch.Tensor)

    def download_data(self) -> None:
        import kaggle

        kaggle.api.authenticate()
        kaggle.api.dataset_download_files(
            "patrickfleith/nasa-battery-dataset", path=self.data_dir, unzip=True
        )

    @property
    def dataset_name(self) -> str:
        return "nasa"
1 change: 1 addition & 0 deletions src/fdiff/utils/extraction.py
@@ -11,6 +11,7 @@
def get_training_params(datamodule: Datamodule, trainer: pl.Trainer) -> dict[str, Any]:
    params = datamodule.dataset_parameters
    params["num_training_steps"] *= trainer.max_epochs
    params["num_training_steps"] /= trainer.accumulate_grad_batches
    assert isinstance(params, dict)
    return params
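
The added division makes the step count reflect optimizer updates rather than raw batches under gradient accumulation. A toy check with hypothetical numbers:

# Hypothetical numbers: 100 batches per epoch, 10 epochs, accumulation over 4.
num_training_steps = 100          # from datamodule.dataset_parameters
num_training_steps *= 10          # trainer.max_epochs
num_training_steps /= 4           # trainer.accumulate_grad_batches
assert num_training_steps == 250  # optimizer steps the scheduler should plan for

Note that the in-place /= leaves a float behind, so downstream consumers expecting an integer step count may need to round.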

92 changes: 92 additions & 0 deletions src/fdiff/utils/preprocessing.py
@@ -256,3 +256,95 @@ def nasdaq_preprocess(
    # Save the preprocessed tensors.
    for X, name in zip([X_train, X_test], ["train", "test"]):
        torch.save(X, data_dir / f"X_{name}.pt")


def nasa_preprocess(
    data_dir: Path,
    subdataset: str = "charge",
    train_frac: float = 0.9,
    random_seed: int = 42,
):
    if subdataset == "charge":
        features = [
            "Voltage_measured",
            "Current_measured",
            "Temperature_measured",
            "Current_charge",
            "Voltage_charge",
        ]
        sub_dataset = "charge"
        interval_bin = 10
        cutoff_time = 5000 - 5000 % interval_bin
    elif subdataset == "discharge":
        features = [
            "Voltage_measured",
            "Current_measured",
            "Temperature_measured",
            "Current_load",
            "Voltage_load",
        ]
        sub_dataset = "discharge"
        interval_bin = 15
        cutoff_time = 2000 - 2000 % interval_bin

    else:
        raise ValueError(f"Unknown subdataset {subdataset}")

    # Read the metadata
    metadata = pd.read_csv(data_dir / "cleaned_dataset" / "metadata.csv")
    files = metadata[metadata["type"] == f"{sub_dataset}"]["filename"].values

    full_df = pd.DataFrame()

    for filename in tqdm(files):
        data = pd.read_csv(data_dir / "cleaned_dataset" / "data" / filename)

        # Only keep series whose recording extends past the cutoff time
        if data["Time"].max() > cutoff_time:
            # Skip series whose largest sampling gap exceeds the bin size
            interval = data["Time"].diff().max()
            if interval > interval_bin:
                continue

            # Remove the rows such that the time is greater than the cutoff time
            data = data[data["Time"] < cutoff_time]

            # Bin the time axis into fixed-width intervals
            data["Time_Bin"] = pd.cut(
                data["Time"],
                bins=range(
                    -interval_bin, int(cutoff_time + interval_bin), interval_bin
                ),
            )

            # Group by custom bins and calculate the mean for each group
            result_df = data.groupby("Time_Bin", observed=False).mean().reset_index()
            result_df["Time_Bin"] = result_df.index

            result_df["filename"] = filename
            full_df = pd.concat([full_df, result_df])

    df_pivot = full_df.pivot(index="filename", columns="Time_Bin", values=features)

    num_timesteps = cutoff_time // interval_bin + 1
    X_full = torch.tensor(df_pivot.values, dtype=torch.float32)
    # Rearrange to get a 3D tensor of shape (num_samples, num_timesteps, num_features)
    X_reshaped = X_full.reshape(X_full.shape[0], -1, num_timesteps)
    # Permute the last two dimensions
    X = X_reshaped.permute(0, 2, 1)

    # Train-test split
    torch.manual_seed(random_seed)
    num_train = int(train_frac * len(X))
    perm_idx = torch.randperm(len(X))
    train_idx, test_idx = perm_idx[:num_train], perm_idx[num_train:]
    X_train, X_test = X[train_idx], X[test_idx]

    # Create the directory if it does not exist
    folder = data_dir / subdataset
    folder.mkdir(parents=True, exist_ok=True)

    # Save the preprocessed tensors.
    for X, name in zip([X_train, X_test], ["train", "test"]):
        torch.save(X, data_dir / subdataset / f"X_{name}.pt")
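
The heart of the pipeline is the fixed-width time binning; a toy illustration of the same pd.cut + groupby pattern on hypothetical data (not part of the diff):

import pandas as pd

# Six hypothetical samples, binned into 10 s intervals and averaged per bin.
data = pd.DataFrame(
    {"Time": [0, 3, 7, 12, 18, 25], "Voltage_measured": [4.2, 4.1, 4.0, 3.9, 3.8, 3.7]}
)
interval_bin, cutoff_time = 10, 30
data["Time_Bin"] = pd.cut(
    data["Time"], bins=range(-interval_bin, cutoff_time + interval_bin, interval_bin)
)
binned = data.groupby("Time_Bin", observed=False).mean().reset_index()
# Bin edges are (-10, 0], (0, 10], (10, 20], (20, 30]; each row of `binned`
# holds the mean Time and Voltage_measured of the samples falling in that bin.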
