Nasa Dataset #5

Merged · 4 commits · Jan 15, 2024
8 changes: 8 additions & 0 deletions cmd/conf/datamodule/nasa.yaml
@@ -0,0 +1,8 @@
_target_: fdiff.dataloaders.datamodules.NASADatamodule
data_dir: ${hydra:runtime.cwd}/data
random_seed: ${random_seed}
fourier_transform: ${fourier_transform}
standardize: ${standardize}
subdataset: charge
remove_outlier_feature: True
batch_size: 16
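
For context, a minimal sketch of how Hydra would instantiate this config. The `_target_` class and keys come from the YAML above; the resolved values filled in for the `${...}` interpolations are assumptions, since those resolvers only run inside a Hydra app:

from hydra.utils import instantiate
from omegaconf import OmegaConf

# Assumed resolved values for the ${...} interpolations in nasa.yaml.
cfg = OmegaConf.create(
    {
        "_target_": "fdiff.dataloaders.datamodules.NASADatamodule",
        "data_dir": "data",
        "random_seed": 42,
        "fourier_transform": False,
        "standardize": False,
        "subdataset": "charge",
        "remove_outlier_feature": True,
        "batch_size": 16,
    }
)
datamodule = instantiate(cfg)  # builds NASADatamodule(**kwargs) from _target_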
1,681 changes: 1,681 additions & 0 deletions notebooks/nasa_exploration.ipynb

Large diffs are not rendered by default.

74 changes: 73 additions & 1 deletion src/fdiff/dataloaders/datamodules.py
@@ -12,7 +12,11 @@

from fdiff.utils.dataclasses import collate_batch
from fdiff.utils.fourier import dft
-from fdiff.utils.preprocessing import mimic_preprocess, nasdaq_preprocess
+from fdiff.utils.preprocessing import (
+    mimic_preprocess,
+    nasdaq_preprocess,
+    nasa_preprocess,
+)


class DiffusionDataset(Dataset):
@@ -385,3 +389,71 @@ def download_data(self) -> None:
    @property
    def dataset_name(self) -> str:
        return "nasdaq"


class NASADatamodule(Datamodule):
    def __init__(
        self,
        data_dir: Path | str = Path.cwd() / "data",
        random_seed: int = 42,
        batch_size: int = 32,
        fourier_transform: bool = False,
        standardize: bool = False,
        subdataset: str = "charge",
        remove_outlier_feature: bool = True,
    ) -> None:
        self.subdataset = subdataset
        self.remove_outlier_feature = remove_outlier_feature

        super().__init__(
            data_dir=data_dir,
            random_seed=random_seed,
            batch_size=batch_size,
            fourier_transform=fourier_transform,
            standardize=standardize,
        )

    def setup(self, stage: str = "fit") -> None:
        if (
            not (self.data_dir / self.subdataset / "X_train.pt").exists()
            or not (self.data_dir / self.subdataset / "X_test.pt").exists()
        ):
            logging.info(
                f"Preprocessed tensors for {self.dataset_name}_{self.subdataset} not found. "
                f"Now running the preprocessing pipeline."
            )
            nasa_preprocess(
                data_dir=self.data_dir,
                subdataset=self.subdataset,
                random_seed=self.random_seed,
            )
            logging.info(
                f"Preprocessing pipeline finished, tensors saved in {self.data_dir}."
            )

        # Load the preprocessed tensors
        self.X_train = torch.load(self.data_dir / self.subdataset / "X_train.pt")
        self.X_test = torch.load(self.data_dir / self.subdataset / "X_test.pt")

        if self.remove_outlier_feature and self.subdataset == "charge":
            # Drop the third feature (index 2, Temperature_measured), whose range
            # is an outlier, and subsample every other time step (501 -> 251)
            self.X_train = self.X_train[:, ::2, [0, 1, 3, 4]]
            self.X_test = self.X_test[:, ::2, [0, 1, 3, 4]]

        # These checks assume the default charge configuration (251 steps, 4 features)
        assert self.X_train.shape[2] == self.X_test.shape[2] == 4
        assert self.X_train.shape[1] == 251
        assert self.X_test.shape[1] == 251
        assert isinstance(self.X_train, torch.Tensor)
        assert isinstance(self.X_test, torch.Tensor)

    def download_data(self) -> None:
        import kaggle

        # Requires Kaggle API credentials (e.g. ~/.kaggle/kaggle.json)
        kaggle.api.authenticate()
        kaggle.api.dataset_download_files(
            "patrickfleith/nasa-battery-dataset", path=self.data_dir, unzip=True
        )

    @property
    def dataset_name(self) -> str:
        return "nasa"
1 change: 1 addition & 0 deletions src/fdiff/utils/extraction.py
@@ -11,6 +11,7 @@
def get_training_params(datamodule: Datamodule, trainer: pl.Trainer) -> dict[str, Any]:
    params = datamodule.dataset_parameters
    params["num_training_steps"] *= trainer.max_epochs
    # With gradient accumulation, an optimizer step only happens every
    # accumulate_grad_batches batches, so scale the step count down accordingly.
    params["num_training_steps"] /= trainer.accumulate_grad_batches
    assert isinstance(params, dict)
    return params
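
For example, with 100 training steps per epoch in `dataset_parameters`, `max_epochs=10`, and `accumulate_grad_batches=4`, the resulting `num_training_steps` is 100 * 10 / 4 = 250 optimizer steps (the figures here are illustrative, not from the repo).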

92 changes: 92 additions & 0 deletions src/fdiff/utils/preprocessing.py
@@ -256,3 +256,95 @@ def nasdaq_preprocess(
    # Save the preprocessed tensors.
    for X, name in zip([X_train, X_test], ["train", "test"]):
        torch.save(X, data_dir / f"X_{name}.pt")


def nasa_preprocess(
    data_dir: Path,
    subdataset: str = "charge",
    train_frac: float = 0.9,
    random_seed: int = 42,
) -> None:
    if subdataset == "charge":
        features = [
            "Voltage_measured",
            "Current_measured",
            "Temperature_measured",
            "Current_charge",
            "Voltage_charge",
        ]
        interval_bin = 10
        cutoff_time = 5000 - 5000 % interval_bin
    elif subdataset == "discharge":
        features = [
            "Voltage_measured",
            "Current_measured",
            "Temperature_measured",
            "Current_load",
            "Voltage_load",
        ]
        interval_bin = 15
        cutoff_time = 2000 - 2000 % interval_bin
    else:
        raise ValueError(f"Unknown subdataset {subdataset}")

    # Read the metadata and keep only the files of the requested type
    metadata = pd.read_csv(data_dir / "cleaned_dataset" / "metadata.csv")
    files = metadata[metadata["type"] == subdataset]["filename"].values

    full_df = pd.DataFrame()

    for filename in tqdm(files):
        data = pd.read_csv(data_dir / "cleaned_dataset" / "data" / filename)

        # Keep only files whose recording extends beyond the cutoff time
        if data["Time"].max() > cutoff_time:
            # Skip files whose largest sampling interval exceeds the bin size
            interval = data["Time"].diff().max()
            if interval > interval_bin:
                continue

            # Drop the rows beyond the cutoff time
            data = data[data["Time"] < cutoff_time]

            # Bin the time axis into intervals of length interval_bin
            data["Time_Bin"] = pd.cut(
                data["Time"],
                bins=range(
                    -interval_bin, int(cutoff_time + interval_bin), interval_bin
                ),
            )

            # Group by bin and average each feature within the bin
            result_df = data.groupby("Time_Bin", observed=False).mean().reset_index()
            result_df["Time_Bin"] = result_df.index

            result_df["filename"] = filename
            full_df = pd.concat([full_df, result_df])

    # One row per file, one column per (feature, time bin) pair
    df_pivot = full_df.pivot(index="filename", columns="Time_Bin", values=features)

    num_timesteps = cutoff_time // interval_bin + 1
    X_full = torch.tensor(df_pivot.values, dtype=torch.float32)
    # Rearrange into a 3D tensor of shape (num_samples, num_timesteps, num_features)
    X_reshaped = X_full.reshape(X_full.shape[0], -1, num_timesteps)
    # Permute the last two dimensions
    X = X_reshaped.permute(0, 2, 1)

    # Train-test split
    torch.manual_seed(random_seed)
    num_train = int(train_frac * len(X))
    perm_idx = torch.randperm(len(X))
    train_idx, test_idx = perm_idx[:num_train], perm_idx[num_train:]
    X_train, X_test = X[train_idx], X[test_idx]

    # Create the output directory if it does not exist
    folder = data_dir / subdataset
    folder.mkdir(parents=True, exist_ok=True)

    # Save the preprocessed tensors.
    for X, name in zip([X_train, X_test], ["train", "test"]):
        torch.save(X, data_dir / subdataset / f"X_{name}.pt")
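
To make the reshape/permute step above concrete, a self-contained sketch with toy shapes (the sizes are illustrative; `df_pivot.values` is laid out feature-major, one block of `num_timesteps` columns per feature):

import torch

num_samples, num_features, num_timesteps = 3, 5, 501
# Stand-in for df_pivot.values: (samples, features * timesteps), feature-major
X_full = torch.arange(
    num_samples * num_features * num_timesteps, dtype=torch.float32
).reshape(num_samples, num_features * num_timesteps)
X_reshaped = X_full.reshape(X_full.shape[0], -1, num_timesteps)  # (3, 5, 501)
X = X_reshaped.permute(0, 2, 1)  # (3, 501, 5): (samples, timesteps, features)
assert X.shape == (3, 501, 5)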