In [6]:
import os
import glob
import numpy as np
import pandas as pd
import xarray as xr
from tqdm import tqdm
#from darts import TimeSeries
#from darts.models import DLinearModel

In [8]:
def check_data_folder(folder: str) -> bool:
    """
    Tell whether `folder` is an existing directory.

    Parameters
    ----------
    folder : str
        Path to check.

    Returns
    -------
    bool
        True if the path exists and is a directory; False if it is missing
        or is something other than a directory (e.g. a regular file).
    """
    # os.path.isdir already returns False for a path that does not exist,
    # so a separate os.path.exists call adds nothing.
    return os.path.isdir(folder)

def load_data(file_path: str) -> "xr.Dataset":
    """
    Load data from a NetCDF file, trying netcdf4 then h5netcdf.

    Parameters
    ----------
    file_path : str
        Path of the file to open.

    Returns
    -------
    xr.Dataset
        Whatever ``xr.open_dataset`` returns for the first engine that
        succeeds. The caller is responsible for closing it (for example by
        using it as a context manager).

    Raises
    ------
    FileNotFoundError
        If ``file_path`` does not exist.
    RuntimeError
        If both engines fail. The message contains the text of both errors,
        and the h5netcdf failure is attached as ``__cause__`` (which in turn
        carries the netcdf4 failure as its ``__context__``), so the full
        traceback chain is available for debugging.
    """
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"File not found: {file_path}")
    # Preferred engine: netcdf4.
    try:
        return xr.open_dataset(file_path, engine="netcdf4")
    except Exception as e1:
        # Broad catch on purpose: any netcdf4 failure falls through to the
        # second engine. Fallback engine: h5netcdf.
        try:
            return xr.open_dataset(file_path, engine="h5netcdf")
        except Exception as e2:
            # Chain explicitly so the underlying error is recorded as the
            # direct cause instead of only being flattened into the message.
            raise RuntimeError(
                f"Failed to open {file_path} with netcdf4 and h5netcdf.\n"
                f"e1: {e1}\n"
                f"e2: {e2}\n"
                "請確認這個環境有安裝 netCDF4 或 h5netcdf。"
            ) from e2

# ================= main program =================

data_folder = "nc4"
var_name = "TLML"  # Target variable name (confirm the files really use this name)

# 1. Make sure the data folder exists.
if not check_data_folder(data_folder):
    raise FileNotFoundError(f"Data folder not found: {data_folder}")
print(f"Data folder found: {data_folder}")

# 2. Collect every *.nc4 file in the folder (names look like 1980-01.nc4),
#    in sorted filename order.
pattern = os.path.join(data_folder, "*.nc4")
file_list = sorted(glob.glob(pattern))

if not file_list:
    raise FileNotFoundError(f"No nc4 files found with pattern: {pattern}")

print(f"Found {len(file_list)} files.")
print("First 5 files:", file_list[:5])

# 3. Use the first file to confirm the variable exists and to read the
#    lat/lon coordinates that every other file is compared against.
with load_data(file_list[0]) as sample_data:
    if var_name not in sample_data:
        raise KeyError(f"Variable '{var_name}' not found in file: {file_list[0]}")
    lat = sample_data["lat"].values
    lon = sample_data["lon"].values

nlat = lat.shape[0]
nlon = lon.shape[0]
print(f"Lat: {nlat}, Lon: {nlon}")

# 4. Read the files one by one, accumulating data and timestamps in lists.
data_list = []
time_list = []

for f in tqdm(file_list, desc="Combining"):
    with load_data(f) as ds:
        # Validate first so a malformed file fails before its data is loaded.
        if "time" not in ds:
            raise KeyError(f"'time' coordinate not found in file: {f}")

        da = ds[var_name]

        # Safety check: the grid must have the same size as the first file.
        if da.sizes["lat"] != nlat or da.sizes["lon"] != nlon:
            raise ValueError(f"Lat/Lon size mismatch in file: {f}")

        # Enforce (time, lat, lon) axis order. The reshape in step 6 relies
        # on it, and a file storing the dims in another order would otherwise
        # be scrambled silently. Raises if the variable's dims are not exactly
        # these three.
        da = da.transpose("time", "lat", "lon")

        # Cast to float32 to save memory.
        data_list.append(da.values.astype(np.float32))

        # decode_cf so the time values are real datetimes.
        t = xr.decode_cf(ds)["time"].values
        time_list.append(t.astype("datetime64[ns]"))

# 5. Concatenate -> (ntot, lat, lon) array and a DatetimeIndex.
combined = np.concatenate(data_list, axis=0)   # (ntot, nlat, nlon)
time_array = np.concatenate(time_list, axis=0) # (ntot,)

if combined.shape[0] != time_array.shape[0]:
    raise ValueError(
        f"time length ({time_array.shape[0]}) "
        f"!= data length ({combined.shape[0]})"
    )

# Sort by time (normally already sorted; this is a safeguard). A stable sort
# keeps the file order for any identical timestamps.
sort_idx = np.argsort(time_array, kind="stable")
combined = combined[sort_idx]
time_array = time_array[sort_idx]

time_index = pd.to_datetime(time_array)

print(f"Combined data shape: {combined.shape}")
print(f"Time index: {time_index[0]} -> {time_index[-1]} (len={len(time_index)})")

# 6. Flatten to cell x time.
ntot, nlat, nlon = combined.shape
ncell = nlat * nlon

# (cell, time): row-major flattening of (lat, lon), so cell = ilat * nlon + ilon.
y_all = combined.reshape(ntot, ncell).T

# Build the (lon, lat) pair of every cell. meshgrid(lon, lat) has shape
# (nlat, nlon), so ravel() follows the same cell order as y_all.
lon_grid, lat_grid = np.meshgrid(lon, lat)
gg = np.column_stack([lon_grid.ravel(), lat_grid.ravel()])  # (cell, 2)

print(f"y_all shape: {y_all.shape}  (cells x time)")
print(f"gg shape: {gg.shape}        (cells x [lon, lat])")

Data folder found: nc4
Found 548 files.
First 5 files: ['nc4/1980-01.nc4', 'nc4/1980-02.nc4', 'nc4/1980-03.nc4', 'nc4/1980-04.nc4', 'nc4/1980-05.nc4']


RuntimeError: Failed to open nc4/1980-01.nc4 with netcdf4 and h5netcdf.
e1: unrecognized engine netcdf4 must be one of: ['scipy', 'store']
e2: unrecognized engine h5netcdf must be one of: ['scipy', 'store']
請確認這個環境有安裝 netCDF4 或 h5netcdf。