## Icechunk Version Control and Branching
A showcase of adding new data to an Icechunk store over time, "time traveling" back to earlier snapshots, and creating new branches.

In [None]:
import warnings

from dotenv import load_dotenv

from icefabric.builds import IcechunkRepo, S3Path
from icefabric.helpers import virtualize_and_concat_archival_files_on_time
from icefabric.schemas import FileType, NGWPCLocations

# Silence all warnings so the cell outputs below stay readable
warnings.filterwarnings("ignore")

# Load AWS credentials from a .env file located in this directory (create it before running this cell)
# NOTE - if you authenticate with AWS SSO instead, comment out the load_dotenv() call below
load_dotenv()

# Target location of the new icechunk repo: s3://hydrofabric-data/ic_testing/snodas_yearly_append_test
# NOTE - this S3 directory must not exist before this cell is run
new_repo_s3_path = S3Path(
    prefix="ic_testing/snodas_yearly_append_test",
    bucket="hydrofabric-data",
)
new_repo = IcechunkRepo(location=new_repo_s3_path)

In [2]:
# Print the repo's snapshot history (ancestry); a newly created repo shows only its initial snapshot
new_repo.print_history()

Snapshot ID:	MWPZ0C0SCM3TRRKNF8D0
Timestamp:	2025-06-16 23:59:40.242436+00:00
Message:	Repository initialized



In [None]:
# Virtualize the first five 2009 SNODAS NetCDF files and concatenate them on time into one dataset
snodas_09_vds = virtualize_and_concat_archival_files_on_time(
    file_type=FileType.NETCDF,
    location=NGWPCLocations.SNODAS_REF.path,
    manual_file_pattern="zz_ssmv11034tS__T0001TTNATS2009*.nc",
    file_date_pattern="zz_ssmv11034tS__T0001TTNATS*05HP001.nc",
    testing_file_quantity=5,
    loadable_vars=["crs"],
)

# Commit the 2009 dataset to the SNODAS repo, which records a new snapshot
new_repo.write_dataset(
    commit="First commit! 09 data added.",
    ds=snodas_09_vds,
    virtualized=True,
)

zz_ssmv11034tS__T0001TTNATS2009*.nc


Opening files as virtual datasets.: 100%|[38;2;55;182;189m███████████████████████████████████████████████████[0m| 5/5 [01:27<00:00, 17.52s/files][0m


Dataset is uploaded. Commit: 9CNDKXE65PVGCHK4CHBG


In [4]:
# Reprint the repo ancestry - the snapshot created by the 2009 commit now sits on top of the initial one
new_repo.print_history()

Snapshot ID:	9CNDKXE65PVGCHK4CHBG
Timestamp:	2025-06-17 00:01:17.067141+00:00
Message:	First commit! 09 data added.

Snapshot ID:	MWPZ0C0SCM3TRRKNF8D0
Timestamp:	2025-06-16 23:59:40.242436+00:00
Message:	Repository initialized



In [5]:
# Retrieve the dataset currently stored in the SNODAS repo and print its summary
# (at this point it holds only the five 2009 time steps)
snodas_data = new_repo.retrieve_dataset()
print(snodas_data)

<xarray.Dataset> Size: 671MB
Dimensions:  (lon: 8192, time: 5, lat: 4096)
Coordinates:
  * lon      (lon) float64 66kB -130.5 -130.5 -130.5 ... -62.27 -62.26 -62.25
  * lat      (lat) float64 33kB 24.1 24.11 24.12 24.13 ... 58.21 58.22 58.23
  * time     (time) datetime64[ns] 40B 2009-12-09 2009-12-10 ... 2009-12-13
Data variables:
    Band1    (time, lat, lon) float32 671MB dask.array<chunksize=(1, 4096, 8192), meta=np.ndarray>
    crs      (time) object 40B dask.array<chunksize=(5,), meta=np.ndarray>
Attributes:
    Conventions:  CF-1.5
    GDAL:         GDAL 3.11.0dev-f1386937cde9e540784909294fdd45cda3ee65d2, re...
    history:      Tue Feb 04 18:40:28 2025: GDAL CreateCopy( /data/unmasked/2...


In [None]:
# Same collection step as for 2009, this time matching the 2010 SNODAS files
snodas_10_vds = virtualize_and_concat_archival_files_on_time(
    file_type=FileType.NETCDF,
    location=NGWPCLocations.SNODAS_REF.path,
    manual_file_pattern="zz_ssmv11034tS__T0001TTNATS2010*.nc",
    file_date_pattern="zz_ssmv11034tS__T0001TTNATS*05HP001.nc",
    testing_file_quantity=5,
    loadable_vars=["crs"],
)

# Extend the repo's existing data along the time dimension; the append is committed as a new snapshot
new_repo.append_virt_data_to_store(
    commit="Appended new data from the year 2010",
    append_dim="time",
    vds=snodas_10_vds,
)

zz_ssmv11034tS__T0001TTNATS2010*.nc


Opening files as virtual datasets.: 100%|[38;2;55;182;189m███████████████████████████████████████████████████[0m| 5/5 [01:27<00:00, 17.54s/files][0m


Dataset has been appended on the time dimension. Commit: 6FMVKF4CS7N5RW6PC86G


In [7]:
# Reprint the repo ancestry - the 2010 append shows up as the newest snapshot
new_repo.print_history()

Snapshot ID:	6FMVKF4CS7N5RW6PC86G
Timestamp:	2025-06-17 00:03:27.965202+00:00
Message:	Appended new data from the year 2010

Snapshot ID:	9CNDKXE65PVGCHK4CHBG
Timestamp:	2025-06-17 00:01:17.067141+00:00
Message:	First commit! 09 data added.

Snapshot ID:	MWPZ0C0SCM3TRRKNF8D0
Timestamp:	2025-06-16 23:59:40.242436+00:00
Message:	Repository initialized



In [8]:
# Retrieve the repo's dataset again and print it - it now contains both the 2009 and 2010 data
snodas_data = new_repo.retrieve_dataset()
print(snodas_data)

<xarray.Dataset> Size: 1GB
Dimensions:  (time: 10, lat: 4096, lon: 8192)
Coordinates:
  * lon      (lon) float64 66kB -130.5 -130.5 -130.5 ... -62.27 -62.26 -62.25
  * time     (time) datetime64[ns] 80B 2009-12-09 2009-12-10 ... 2010-01-05
  * lat      (lat) float64 33kB 24.1 24.11 24.12 24.13 ... 58.21 58.22 58.23
Data variables:
    Band1    (time, lat, lon) float32 1GB dask.array<chunksize=(1, 4096, 8192), meta=np.ndarray>
    crs      (time) object 80B dask.array<chunksize=(5,), meta=np.ndarray>
Attributes:
    Conventions:  CF-1.5
    GDAL:         GDAL 3.11.0dev-f1386937cde9e540784909294fdd45cda3ee65d2, re...
    history:      Tue Feb 04 19:54:02 2025: GDAL CreateCopy( /data/unmasked/2...


In [9]:
# "Time travel": retrieve and print the data from the previous snapshot, i.e. the repo's
# state before the 2010 data was appended
prev_snap_snodas_data = new_repo.retrieve_prev_snapshot()
print(prev_snap_snodas_data)

<xarray.Dataset> Size: 671MB
Dimensions:  (time: 5, lat: 4096, lon: 8192)
Coordinates:
  * lon      (lon) float64 66kB -130.5 -130.5 -130.5 ... -62.27 -62.26 -62.25
  * lat      (lat) float64 33kB 24.1 24.11 24.12 24.13 ... 58.21 58.22 58.23
  * time     (time) datetime64[ns] 40B 2009-12-09 2009-12-10 ... 2009-12-13
Data variables:
    Band1    (time, lat, lon) float32 671MB dask.array<chunksize=(1, 4096, 8192), meta=np.ndarray>
    crs      (time) object 40B dask.array<chunksize=(5,), meta=np.ndarray>
Attributes:
    Conventions:  CF-1.5
    GDAL:         GDAL 3.11.0dev-f1386937cde9e540784909294fdd45cda3ee65d2, re...
    history:      Tue Feb 04 18:40:28 2025: GDAL CreateCopy( /data/unmasked/2...


In [None]:
# Create a feature branch that will receive the 2011 data
new_repo.create_new_branch(name="2011_feature")

# Same collection step as for 2009 and 2010, this time matching the 2011 SNODAS files
snodas_11_vds = virtualize_and_concat_archival_files_on_time(
    file_type=FileType.NETCDF,
    location=NGWPCLocations.SNODAS_REF.path,
    manual_file_pattern="zz_ssmv11034tS__T0001TTNATS2011*.nc",
    file_date_pattern="zz_ssmv11034tS__T0001TTNATS*05HP001.nc",
    testing_file_quantity=5,
    loadable_vars=["crs"],
)

# Append the 2011 data along time, committing the new snapshot to the feature branch
new_repo.append_virt_data_to_store(
    branch="2011_feature",
    commit="Appended new data from the year 2011",
    append_dim="time",
    vds=snodas_11_vds,
)

zz_ssmv11034tS__T0001TTNATS2011*.nc


Opening files as virtual datasets.: 100%|[38;2;55;182;189m███████████████████████████████████████████████████[0m| 5/5 [01:28<00:00, 17.79s/files][0m


Dataset has been appended on the time dimension. Commit: T30V4R9GM9TCKHFFBQFG


In [11]:
# Show the snapshot history of each branch under its own banner; the feature branch
# carries the extra 2011 commit
for banner, branch_name in (
    ("NEW BRANCH =====================================", "2011_feature"),
    ("MAIN BRANCH ====================================", "main"),
):
    print(banner)
    new_repo.print_history(branch=branch_name)

Snapshot ID:	T30V4R9GM9TCKHFFBQFG
Timestamp:	2025-06-17 00:05:10.559344+00:00
Message:	Appended new data from the year 2011

Snapshot ID:	6FMVKF4CS7N5RW6PC86G
Timestamp:	2025-06-17 00:03:27.965202+00:00
Message:	Appended new data from the year 2010

Snapshot ID:	9CNDKXE65PVGCHK4CHBG
Timestamp:	2025-06-17 00:01:17.067141+00:00
Message:	First commit! 09 data added.

Snapshot ID:	MWPZ0C0SCM3TRRKNF8D0
Timestamp:	2025-06-16 23:59:40.242436+00:00
Message:	Repository initialized

Snapshot ID:	6FMVKF4CS7N5RW6PC86G
Timestamp:	2025-06-17 00:03:27.965202+00:00
Message:	Appended new data from the year 2010

Snapshot ID:	9CNDKXE65PVGCHK4CHBG
Timestamp:	2025-06-17 00:01:17.067141+00:00
Message:	First commit! 09 data added.

Snapshot ID:	MWPZ0C0SCM3TRRKNF8D0
Timestamp:	2025-06-16 23:59:40.242436+00:00
Message:	Repository initialized



In [12]:
# Print both branches' datasets - notice that only the feature branch has the 2011 data
snodas_data_feat_branch = new_repo.retrieve_dataset(branch="2011_feature")
# Read "main" explicitly rather than reusing a dataset fetched in an earlier cell, so the
# comparison always reflects the branch itself even if cells were run out of order
snodas_data_main_branch = new_repo.retrieve_dataset(branch="main")
print("NEW BRANCH ========================================================")
print(snodas_data_feat_branch)
print("===================================================================")
print("MAIN BRANCH =======================================================")
print(snodas_data_main_branch)
print("===================================================================")

<xarray.Dataset> Size: 2GB
Dimensions:  (time: 15, lon: 8192, lat: 4096)
Coordinates:
  * time     (time) datetime64[ns] 120B 2009-12-09 2009-12-10 ... 2011-01-05
  * lon      (lon) float64 66kB -130.5 -130.5 -130.5 ... -62.27 -62.26 -62.25
  * lat      (lat) float64 33kB 24.1 24.11 24.12 24.13 ... 58.21 58.22 58.23
Data variables:
    crs      (time) object 120B dask.array<chunksize=(5,), meta=np.ndarray>
    Band1    (time, lat, lon) float32 2GB dask.array<chunksize=(1, 4096, 8192), meta=np.ndarray>
Attributes:
    Conventions:  CF-1.5
    GDAL:         GDAL 3.11.0dev-f1386937cde9e540784909294fdd45cda3ee65d2, re...
    history:      Tue Feb 04 19:00:12 2025: GDAL CreateCopy( /data/unmasked/2...
<xarray.Dataset> Size: 1GB
Dimensions:  (time: 10, lat: 4096, lon: 8192)
Coordinates:
  * lon      (lon) float64 66kB -130.5 -130.5 -130.5 ... -62.27 -62.26 -62.25
  * time     (time) datetime64[ns] 80B 2009-12-09 2009-12-10 ... 2010-01-05
  * lat      (lat) float64 33kB 24.1 24.11 24.12 24.13

In [13]:
# Cleanup - delete the test repo entirely from S3, so the notebook can be re-run against
# a location that does not exist yet
# NOTE(review): quiet=True presumably skips an interactive confirmation - confirm against delete_repo
new_repo.delete_repo(quiet=True)

Icechunk repo @ s3://hydrofabric-data/ic_testing/snodas_yearly_append_test in its entirety was successfully deleted.
