# Data Concatenator

## 필요한 Data
- surface level ERA5, precipitation 제외
- surface level ERA5, precipitation 포함
- pressure level ERA5
- (optional) TOA solar incident radiation

In [13]:
import xarray as xr
import numpy as np
import his_utils

In [14]:
# set up dataset path 
ds_surface = "testdata/2022-01-01/2022-01-01_surface.grib"
ds_surface_precip = "testdata/2022-01-01/2022-01-01_tp.grib"
ds_surface_precip_prior = "testdata/2022-01-01/2021-01-01_tp prior.grib"
ds_pressure_level = "testdata/2022-01-01/2022-01-01_pressure level.grib"
ds_TOA = "testdata/2022-01-01/2022-01-01 TOA.grib"

# 1. Surface Data w/o Precipitation

precipitation 제외하고 받은 6-hrly data 전처리 하는 과정

## TODO
- dimension 이름 변경
- 불필요한 coordinate 삭제
- variable 이름 변경 -> 나중에 합치고 한 번에 진행 가능

In [15]:
ds1 = xr.open_dataset(ds_surface, engine='cfgrib')

Ignoring index file 'testdata/2022-01-01/2022-01-01_surface.grib.5b7b6.idx' incompatible with GRIB file
skipping variable: paramId==212 shortName='tisr'
Traceback (most recent call last):
  File "/home/hiskim1/.conda/envs/hiskim1_graphcast/lib/python3.11/site-packages/cfgrib/dataset.py", line 721, in build_dataset_components
    dict_merge(variables, coord_vars)
  File "/home/hiskim1/.conda/envs/hiskim1_graphcast/lib/python3.11/site-packages/cfgrib/dataset.py", line 639, in dict_merge
    raise DatasetBuildError(
cfgrib.dataset.DatasetBuildError: key present and new value is different: key='time' value=Variable(dimensions=('time',), data=array([1640995200, 1641016800, 1641038400, 1641060000, 1641081600,
       1641103200, 1641124800, 1641146400, 1641168000, 1641189600,
       1641211200, 1641232800, 1641254400, 1641276000, 1641297600,
       1641319200])) new_value=Variable(dimensions=('time',), data=array([1640973600, 1641016800, 1641060000, 1641103200, 1641146400,
       1641189600, 

In [16]:
ds1 = ds1.drop_vars(['number', 'step', 'surface', 'valid_time'])
ds1 = ds1.rename({"z" : "geopotential_at_surface"})

# 2. Precipitation Data

Accumulates 6-hour precipitation into the next time step?
or into previous step?

## TODO
- coordinate 정리
- 강수량 합치기

In [17]:
# 2-1. the time after
ds2 = xr.open_dataset(ds_surface_precip, engine='cfgrib')
prior = xr.open_dataset(ds_surface_precip_prior, engine='cfgrib')

def sync_tp_coords(dataset: xr.Dataset):
    dataset = dataset.stack(new_time=['time', 'step'])
    dataset = dataset.assign_coords(new_time=dataset.valid_time.values)
    dataset = dataset.rename({'new_time': 'time'})
    dataset = dataset.drop_vars(['number', 'surface'])
    return dataset

ds2 = sync_tp_coords(ds2)
ds2 = ds2.isel(time=slice(5,None)) #                                          <------- FIX HERE

Ignoring index file 'testdata/2022-01-01/2022-01-01_tp.grib.5b7b6.idx' incompatible with GRIB file
Ignoring index file 'testdata/2022-01-01/2021-01-01_tp prior.grib.5b7b6.idx' incompatible with GRIB file
  dataset = dataset.assign_coords(new_time=dataset.valid_time.values)


In [18]:
# 2-2. the time before

prior = sync_tp_coords(prior)
prior = prior.isel(time=slice(5,29)) #                                        <------- FIX HERE

  dataset = dataset.assign_coords(new_time=dataset.valid_time.values)


In [19]:
# 2-3. merge the two datasets

ds2 = xr.concat([prior, ds2], dim='time')
ds2 = ds2.sortby('time')

ds2 = ds2.resample(time='6h', closed='right', label='right').sum()
ds2 = ds2.isel(time=slice(4, 20)) #                                           <------- FIX HERE

# 3. Pressure Level Data

37 level data를 처리하는 과정. 17 level이어도 동일한 방식으로 작동하도록 최대한 해보자

## TODO
- coordinate 정리

In [20]:
ds3 = xr.open_dataset(ds_pressure_level, engine='cfgrib')

Ignoring index file 'testdata/2022-01-01/2022-01-01_pressure level.grib.5b7b6.idx' incompatible with GRIB file


In [21]:
ds3 = ds3.drop_vars(['number', 'step', 'valid_time'])
ds3 = ds3.rename({"isobaricInhPa" : "level"})
ds3 = ds3.sortby('level', ascending=True)

level = ds3.level.values
level = level.astype(np.int32)

ds3 = ds3.assign_coords(level = ('level', level))

# (optional) 4. TOA 가공하기

구글에서 만든 거랑 내가 다운받은거랑 같다면 상관 없음.
다를 경우에는 이거 사용해야 함.

$\therefore$ $\exists$ noise
$\Rightarrow$ 결과가 미묘하게 달라지지만 유의미해보이지는 않다

In [22]:
# 4th. TOA
ds4 = xr.open_dataset(ds_TOA, engine='cfgrib')

ds4 = ds4.stack(new_time=['time', 'step'])
ds4 = ds4.assign_coords(new_time=ds4.valid_time.values)
ds4 = ds4.rename({'new_time': 'time'})
ds4 = ds4.drop_vars(['number', 'surface', 'valid_time'])

Ignoring index file 'testdata/2022-01-01/2022-01-01 TOA.grib.5b7b6.idx' incompatible with GRIB file
  ds4 = ds4.assign_coords(new_time=ds4.valid_time.values)


# 4. 3개의 데이터셋을 하나로 합성하기

*ds1*, *ds2*, *ds3*, (*ds4*)를 하나로 합치고 GC에 잘 들어가도록 다듬어주기

## TODO
- 한 장씩 합치는 게 좋을지, 여러 장 한 번에 합치는 게 좋을지 for memory efficiency
- 합쳐서 GC에 잘 들어가는 지까지 확인

In [23]:
# merge all datasets
ds_list = [ds1, ds2, ds3, ds4]

result = xr.merge(ds_list)

for ds in ds_list:
    ds.close()

result = his_utils.transform_dataset(result).copy()



result = result.reindex(lat=result.lat[::-1])  #                        <------- FIX HERE

## 4-1. 파일 명 정하기

In [24]:
result_path = 'testdata/ERA5_2022-01-01.nc'  #                                          <------- FIX HERE

# 5. Output: GC로 준비 갈 완료!

In [None]:
result.to_netcdf(result_path)
result

---
---
---
---
---
---
---
---
---
---
---
---

# 번외: xarray.resample은 어떻게 작동하는가?

| 1 | 2 | 3 | 4 | 5 | 6 | 7 | value

| 0 | 1 | 2 | 3 | 4 | 5 | 6 | hr

`closed=` 어느 쪽을 닫힌 구간으로 쓸 것인가 $\Rightarrow$ 어느 쪽을 포함하고 반대쪽을 제외할까

`label=` sample한 거를 어느 쪽에 할당할 것인가

- case 1) `xarray.resample("6h")`

    1 + ... + 6을 0hr에 할당

- case 2) `xarray.resample("6h", closed='right', label='right')`

    2 + ... + 7을 6hr에 할당

In [None]:
import pandas as pd

# 샘플 데이터 생성 (0시부터 23시까지)
date_range = pd.date_range(start='2021-12-31T19:00:00.000000000', end='2022-01-05T06:00:00.000000000', freq='h')
data = np.arange(1, 109, 1)
ds = pd.Series(data, index=date_range)

# 6시간 간격으로 리샘플링
ds_resampled = ds.resample('6h', closed='right', label='right').sum()

print("\n리샘플링 결과:")
print(ds_resampled)

# 각 리샘플링 구간의 시작과 끝 확인
for i, value in ds_resampled.items():
    start = i
    end = i + pd.Timedelta(hours=5)
    original_data = ds[start:end]
    print(f"\n{i}의 리샘플링 구간:")
    print(f"시작: {start}, 끝: {end}")
    print("포함된 원본 데이터:")
    print(original_data)
    print(f"합계: {value}")