In [76]:
import glob
import xarray as xr
import numpy as np
from datetime import datetime
import re
from collections import defaultdict
import os
# ---------------------------
# 1. Collect every target file with glob and sort the result
# ---------------------------
patterns = ['/N/project/Zli_lab/gongg/stage4_data/regrid_cons/*',]

# Flatten the matches of every pattern into one list, sorted by path.
all_files = sorted(
    path
    for pattern in patterns
    for path in glob.glob(pattern)
)

In [77]:
# 2. Bucket the files by (year, month), both kept as zero-padded strings.
# The regex pulls an 8-digit date such as 20020601 out of ".../20020601.nc".
pattern = re.compile(r".*/(\d{8})\.nc$")
month_groups = defaultdict(list)

for fpath in all_files:
    match = pattern.match(fpath)
    if match is None:
        # Path does not end in <8 digits>.nc -> not a daily file, ignore it.
        continue
    date_str = match.group(1)                      # e.g. '20020601'
    year, month = date_str[:4], date_str[4:6]      # e.g. '2002', '06'
    month_groups[(year, month)].append(fpath)

# 3. Open the reference dataset ds_ref; it is used later as a spatial mask.
ds_ref = xr.open_dataset("/N/project/Zli_lab/gongg/regrid/ref_.1deg.nc")

# Longitude bands and the UTC offset (in hours) paired with each band.
# The two lists are consumed together with zip(), so they must stay the
# same length and in the same order.
lon_ranges = [
    (-np.inf, -112.5),
    (-112.5, -97.5),
    (-97.5, -82.5),
    (-82.5, np.inf),
]
utc_offsets = [
    -8,
    -7,
    -6,
    -5,
]

# Output directory for this stage; created if it does not exist yet.
output_dir = "/N/project/Zli_lab/gongg/stage4_data/LST"
os.makedirs(output_dir, exist_ok=True)


In [102]:
# Split every month of data into longitude bands and shift each band's time
# coordinate by that band's UTC offset, writing one NetCDF file per
# (month, band) named "<YYYY><MM>_U<signed offset>.nc".
#
# Reads month_groups, ds_ref, lon_ranges, utc_offsets and output_dir from the
# cells above.

# Cells that are NaN in the first time step of ds_ref["tp"] are masked out of
# every month. The mask does not depend on the loop variables, so build it once.
mask_nan = ds_ref["tp"].isel(time=0).isnull()

global_counter = 0  # number of files written so far

for (year, month), file_list in sorted(month_groups.items()):
    # The context manager closes the underlying files once the month has been
    # written; otherwise handles pile up across all iterations.
    with xr.open_mfdataset(file_list, combine='by_coords') as ds_raw:
        ds_stage4 = ds_raw.where(~mask_nan)

        for (lon_min, lon_max), offset in zip(lon_ranges, utc_offsets):
            # Half-open band [lon_min, lon_max) so that no column is
            # assigned to two bands.
            mask = (ds_stage4.lon >= lon_min) & (ds_stage4.lon < lon_max)
            ds_lon_subset = ds_stage4.where(mask, drop=True)

            # Nothing falls inside this band: skip it without writing a file.
            if ds_lon_subset.lon.size == 0 or ds_lon_subset.lat.size == 0:
                continue

            # Shift the timestamps by the band's offset in hours.
            # NOTE(review): this assumes the input times are UTC - confirm.
            original_times = ds_lon_subset.time
            adjusted_times = original_times + np.timedelta64(offset, 'h')
            ds_lon_subset = ds_lon_subset.assign_coords(time=adjusted_times)

            output_filename = f"{year}{month}_U{offset:+d}.nc"
            output_path = os.path.join(output_dir, output_filename)
            ds_lon_subset.to_netcdf(output_path)

            # Report progress once per 100 files actually written. The check
            # lives inside the write path so a skipped band cannot re-print
            # the same milestone (or print at 0).
            global_counter += 1
            if global_counter % 100 == 0:
                print(datetime.now())


2025-03-27 13:44:33.808465
2025-03-27 13:49:26.436096
2025-03-27 13:56:00.943471


In [119]:
# Input and output directories.
# NOTE(review): output_dir and all_files reuse names assigned by earlier
# cells, so re-running those earlier cells after this one will pick up these
# values instead of the original ones.
input_dir = "/N/project/Zli_lab/gongg/stage4_data/LST"
output_dir = "/N/project/Zli_lab/gongg/stage4_data/JJA_LST"

# Every .nc file in the input directory, sorted for a reproducible order.
all_files = sorted(glob.glob(os.path.join(input_dir, "*.nc")))

# Base names look like "200506_U-8": 4-digit year, 2-digit month, "_U", then
# a signed integer UTC offset.
LST_NAME_RE = re.compile(r"(\d{4})\d{2}_U([+-]?\d+)")


def parse_lst_filename(path):
    """Extract ``(year, utc_offset)`` from a path such as ``.../200506_U-8.nc``.

    Parameters
    ----------
    path : str
        File path or bare file name; the directory and extension are ignored.

    Returns
    -------
    tuple[str, int] or None
        The 4-character year string and the integer offset (``-8`` for
        ``"U-8"``), or ``None`` when the name does not follow the
        ``<YYYY><MM>_U<offset>`` layout.
    """
    name, _ext = os.path.splitext(os.path.basename(path))
    matched = LST_NAME_RE.fullmatch(name)
    if matched is None:
        return None
    return matched.group(1), int(matched.group(2))


# Group the files by (year, utc_offset).
groups = {}
for file in all_files:
    key = parse_lst_filename(file)
    if key is None:
        # Not one of the monthly LST files: skip it instead of crashing on
        # int() while parsing the offset.
        continue
    groups.setdefault(key, []).append(file)


# For every (year, offset) group: open its files as one dataset, keep only
# the time steps whose month is June, July or August, and write the result
# to output_dir as "<year>_U_<abs(offset)>.nc".

# Make sure the destination exists before the first write.
os.makedirs(output_dir, exist_ok=True)

counter = 0  # number of groups written so far
for (year, offset), files in sorted(groups.items()):
    # The context manager releases the file handles after each group.
    with xr.open_mfdataset(sorted(files), combine='by_coords') as ds:
        # Select along time with a boolean index rather than where(): where()
        # would broadcast variables that have no time dimension and promote
        # integer data to float, while sel() leaves the variables untouched.
        is_jja = ds.time.dt.month.isin([6, 7, 8])
        ds_jja = ds.sel(time=is_jja)

        output_filename = f"{year}_U_{abs(offset)}.nc"
        output_path = os.path.join(output_dir, output_filename)
        ds_jja.to_netcdf(output_path)

    # Progress report: one timestamp per 10 groups written.
    counter += 1
    if counter % 10 == 0:
        print(datetime.now())

2025-03-27 14:13:15.416715
2025-03-27 14:13:40.301944
2025-03-27 14:14:04.504215
2025-03-27 14:14:33.163424
2025-03-27 14:15:03.051175
2025-03-27 14:15:38.043198
2025-03-27 14:16:12.114299
2025-03-27 14:16:36.234252
2025-03-27 14:17:01.042848
