Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use a statistical model fitted to the original dataset to synthesize data #179

Open
wants to merge 21 commits into
base: develop
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,6 @@ pyrealm.egg-info

# Data
pyrealm_build_data/inputs_data_24.25.nc
pyrealm_build_data/eda.py

# Profiling
prof/
2 changes: 1 addition & 1 deletion poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Binary file added pyrealm_build_data/data_model.nc
Binary file not shown.
107 changes: 107 additions & 0 deletions pyrealm_build_data/synth_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
"""This script uses a parametrized model to compress the input dataset.

It fits a time series model to the input data and stores the model parameters.
The dataset can then be reconstructed from the model parameters using the `reconstruct`
function, provided with a custom time index.
"""
tztsai marked this conversation as resolved.
Show resolved Hide resolved
from typing import Sequence

import numpy as np
import pandas as pd
import xarray as xr


def make_time_features(t: Sequence[float]) -> pd.DataFrame:
    """Build the regression design matrix for a given time index.

    The model can be written as
    g(t) = a₀ + a₁ t + ∑_i b_i sin(2π f_i t) + c_i cos(2π f_i t),
    where t is the time index, f_i are the frequencies, and a₀, a₁, b_i, c_i are the
    model parameters.

    Args:
        t: An array of datetime values.
    """
    index = pd.to_datetime(t).rename("time")
    feats = pd.DataFrame(index=index)
    feats["const"] = 1.0

    # Time in (Julian) years since 2000-01-01; this is the "t" of the model.
    feats["linear"] = (index - pd.Timestamp("2000-01-01")) / pd.Timedelta("365.25d")

    # Frequencies in cycles per year: 730.5/365.25 resolve sub-daily/daily
    # cycles, the rest capture annual down to multi-year periodicities.
    frequencies = (730.5, 365.25, 12, 6, 4, 3, 2, 1, 1 / 2, 1 / 3, 1 / 4, 1 / 6)
    for freq in frequencies:
        phase = 2 * np.pi * freq * feats["linear"]
        feats[f"freq_{freq:.2f}_sin"] = np.sin(phase)
        feats[f"freq_{freq:.2f}_cos"] = np.cos(phase)

    return feats


def fit_ts_model(da: xr.DataArray, fs: pd.DataFrame) -> xr.DataArray:
    """Fit a linear time series model to the data.

    Solves, for every location, the least-squares problem X @ a ≈ y, where X
    holds the time features and y the (gap-filled) observations, and returns
    the fitted coefficients.

    Args:
        da: A DataArray with the input data (dimensions include ``time``).
        fs: A DataFrame with the time features, indexed by time.
    """
    print("Fitting", da.name)

    # Downsample along time by a factor of 4 to keep the fit cheap.
    da = da.isel(time=slice(None, None, 4))
    da = da.dropna("time", how="all")
    # Fill remaining gaps with the per-location temporal mean.
    da = da.fillna(da.mean("time"))
    df = da.to_series().unstack("time").T

    Y = df.values  # (times, locs)
    X = fs.loc[df.index].values  # (times, feats)
    A, res, *_ = np.linalg.lstsq(X, Y, rcond=None)  # (feats, locs)

    # lstsq returns an *empty* residual array when X is rank-deficient or the
    # system is underdetermined; report NaN instead of np.mean([]) warning.
    loss = np.mean(res) / len(X) / np.var(Y) if res.size else np.nan
    pars = pd.DataFrame(A.T, index=df.columns, columns=fs.columns)

    print("Loss:", loss)
    return pars.to_xarray().to_dataarray()


def reconstruct(
    ds: xr.Dataset, dt: Sequence[float], bounds: dict | None = None
) -> xr.Dataset:
    """Reconstruct the full dataset from the model parameters.

    Args:
        ds: A Dataset with the model parameters.
        dt: An array of datetime values at which to evaluate the model.
        bounds: A dictionary mapping variable names to (min, max) clipping
            bounds for the reconstructed variables. Defaults to physically
            plausible ranges for the standard driver variables. Variables
            without an entry are left unclipped (previously this raised
            ``KeyError``).
    """
    if bounds is None:
        bounds = dict(
            temp=(-25, 80),
            patm=(3e4, 11e4),
            vpd=(0, 1e4),
            co2=(0, 1e3),
            fapar=(0, 1),
            ppfd=(0, 1e4),
        )
    x = make_time_features(dt).to_xarray().to_dataarray()
    out = {}
    for k, a in ds.items():
        da = a @ x  # project parameters back onto the time features
        if k in bounds:
            # Clip only when bounds are known for this variable.
            da = da.clip(*bounds[k])
        out[k] = da
    return xr.Dataset(out)


if __name__ == "__main__":
    # Load the original (large) input dataset to be compressed.
    ds = xr.open_dataset("pyrealm_build_data/inputs_data_24.25.nc")

    # Keep only locations where no variable is entirely NaN.
    valid = ~ds.isnull().all("time").to_dataarray().any("variable")
    ds = ds.where(valid, drop=True)

    # Some variables need only a reduced feature set: pressure is roughly
    # constant per location and CO2 is modelled as a linear trend.
    reduced_features = dict(
        patm=["const"],
        co2=["const", "linear"],
    )

    features = make_time_features(ds.time)

    model = xr.Dataset()
    for name in ds.data_vars:
        selected = reduced_features.get(name, features.columns)
        model[name] = fit_ts_model(ds[name], features[selected])

    # NaN parameters (all-NaN locations per variable) become zeros on disk.
    model = model.fillna(0.0)
    model.to_netcdf("pyrealm_build_data/data_model.nc")
46 changes: 46 additions & 0 deletions tests/regression/data/test_synth_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
"""Test the quality of the synthetic data generated from the model parameters."""

import numpy as np
import pytest
import xarray as xr

try:
    DATASET = xr.open_dataset("pyrealm_build_data/inputs_data_24.25.nc")
    VARS = DATASET.data_vars
except (ValueError, OSError):
    # An un-smudged LFS pointer file makes the netCDF backend raise ValueError,
    # while a missing file raises FileNotFoundError (an OSError) — skip the
    # whole module in either case rather than erroring at collection time.
    pytest.skip("Original LFS dataset not checked out.", allow_module_level=True)


def r2_score(y_true: xr.DataArray, y_pred: xr.DataArray) -> float:
    """Compute the coefficient of determination (R²) of a prediction."""
    residual_ss = ((y_true - y_pred) ** 2).sum()
    total_ss = ((y_true - y_true.mean()) ** 2).sum()
    return 1 - residual_ss / total_ss


@pytest.fixture
def syndata(modelpath="pyrealm_build_data/data_model.nc"):
    """Synthetic dataset reconstructed from the stored model parameters."""
    from pyrealm_build_data.synth_data import reconstruct

    params = xr.open_dataset(modelpath)
    times = xr.date_range("2012-01-01", "2018-01-01", freq="12h")
    return reconstruct(params, times)


@pytest.fixture
def dataset(syndata):
    """The original dataset, restricted to the synthetic time index."""
    aligned = DATASET.sel(time=syndata.time)
    return aligned


@pytest.mark.parametrize("var", VARS)
def test_synth_data_quality(dataset, syndata, var):
    """Test the quality of the synthetic data.

    Compares a random (but seeded, hence reproducible) subsample of times and
    latitudes between the original and reconstructed data and requires an R²
    above 0.85 for each variable.
    """
    # Fixed seed: an unseeded sample makes the 0.85 threshold flaky run-to-run.
    rng = np.random.default_rng(42)
    times = syndata.time[rng.choice(syndata.time.size, 1000, replace=False)]
    lats = syndata.lat[rng.choice(syndata.lat.size, 100, replace=False)]
    t = dataset[var].sel(lat=lats, time=times)
    p = syndata[var].sel(lat=lats, time=times)
    s = r2_score(t, p)
    print(f"R2 score for {var} is {s:.2f}")
    assert s > 0.85
Loading