In [1]:
from ecmwfapi import ECMWFDataServer

server = ECMWFDataServer()

server.retrieve({
    "class": "s2",
    "dataset": "s2s",
    "date": "2024-01-01/to/2024-01-31",
    "expver": "prod",
    "levtype": "sfc",
    "model": "glob",
    "origin": "ecmf",
    "param": "151",
    "step": "24",
    "stream": "enfo",
    "time": "00:00:00",
    "type": "cf",
    "target": "output"
})


2025-08-01 16:08:38 ECMWF API python library 1.6.3
2025-08-01 16:08:38 ECMWF API at https://api.ecmwf.int/v1
2025-08-01 16:08:39 Welcome Huisu Kim
2025-08-01 16:08:42 In case of problems, please check https://confluence.ecmwf.int/display/WEBAPI/Web+API+FAQ or contact servicedesk@ecmwf.int
2025-08-01 16:08:42 Access to this dataset is transitioning to a new interface, dates to be announced soon
2025-08-01 16:08:42 For more information on how to access this data in the future, visit https://confluence.ecmwf.int/x/-wUiEw
2025-08-01 16:08:42 ---------------------------------
2025-08-01 16:08:43 Request submitted
2025-08-01 16:08:43 Request id: 688c67fb3750a2799e512f21
2025-08-01 16:08:43 Request is submitted
2025-08-01 16:08:45 Calling 'nice mars /tmp/20250801-0700/aa/tmp-_mars-J7I2wG-70095729e3402ed42d409167af42a697.req'
2025-08-01 16:08:45 Forcing MIR_CACHE_PATH=/data/ec_coeff
2025-08-01 16:08:45 mars - WARN -
2025-08-01 16:08:45 mars - WARN -
2025-08-01 16:08:45 MIR environment variable

In [2]:
import xarray as xr

ds = xr.open_dataset('/geodata1/weather-quest/data/ifs/ifs_cf_228_2024-01-01.nc')

In [3]:
ds

# Model Configure

In [1]:
import xarray as xr
import numpy as np

from torch.utils.data import Subset
from sklearn.model_selection import train_test_split 
import xarray as xr
import numpy as np
import torch, os, random
from torch.utils.data import Dataset, DataLoader
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.utils.tensorboard import SummaryWriter
import torch.nn.functional as F
from glob import glob

seed = 42
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

CONFIG = {
    # 1) Variables & levels
    "input_vars": ["u_component_of_wind", "v_component_of_wind", "geopotential"],           # any list of 2D fields
    "input_levels": [200, 300, 500, 700, 850],# pressure levels for those vars
    "target_var": "mean_sea_level_pressure",                     # mean sea level pressure
    
    # 2) Date range
    "data_start": "2015-01-01",
    "data_end":   "2019-12-31",
    
    # 3) Temporal settings
    "resample_freq": "6h",                   # resample hourly → 6-hourly
    "context_days": 14,                      # days of input before t0
    "lead_start": 19,                        # forecast from t0 + 19 days...
    "lead_end":   32,                        # ...through t0 + 32 days
    
    # 4) Grid size (1.0° global)
    "n_lon": 360,
    "n_lat": 181,
    
    # 5) Model/training
    "embed_dim":        512,
    "enc_layers":       6,
    "dec_layers":       6,
    "n_heads":          8,
    "ff_dim":           1024,

    "batch_size":       2,
    "lr":               3e-5,
    "weight_decay":     1e-5,
    "epochs":           100,


    # 7) EarlyStopping / Scheduler -------------------------------------
    "use_early_stopping": True,       # EarlyStopping 사용 여부
    "patience":           10,         # EarlyStopping 기준 에포크 수
    "es_min_delta":       1e-5,       # 개선으로 간주할 최소 변화량

    "use_reduce_on_plateau": True,    # ReduceLROnPlateau 사용 여부
    "rlp_factor":          0.5,       # ReduceLROnPlateau 감소 비율
    "rlp_patience":       5,          # ReduceLROnPlateau 대기 에포크 수

    # 8) Hyperparameter Tuning (Optuna)
    "do_hyperopt":        False,      # Optuna로 HP 탐색을 할 것인지 여부
    "hp_trials":          20,         # Optuna 탐색 횟수

    # 9) Activation Function 선택 --------------------------------------
    "activation":        "relu",      # ["relu", "gelu", "leaky_relu", "elu", ...]
}

def get_activation(name: str):
    """
    CONFIG['activation']에 따라 torch.nn.functional 모듈의 활성화 함수를 반환.
    """
    name = name.lower()
    if name == "relu":
        return F.relu
    elif name == "gelu":
        return F.gelu
    elif name == "leaky_relu":
        return lambda x: F.leaky_relu(x, negative_slope=0.01)
    elif name == "elu":
        return F.elu
    elif name == "silu" or name == "swish":
        return F.silu
    else:
        raise ValueError(f"Unknown activation: {name}")

ModuleNotFoundError: No module named 'tensorboard'

# Input

# Model Class

In [7]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class S2SForecastModel(nn.Module):
    def __init__(self):
        super().__init__()
        in_ch = len(CONFIG["input_vars"]) * len(CONFIG["input_levels"])
        H, W  = CONFIG["n_lat"], CONFIG["n_lon"]
        E     = CONFIG["embed_dim"]

        # conv encoder
        self.enc_conv1 = nn.Conv2d(in_ch, 64, kernel_size=5, stride=2, padding=2)
        self.enc_conv2 = nn.Conv2d(64,  128, 3, 2, 1)
        self.enc_conv3 = nn.Conv2d(128, 128, 3, 1, 1)

        # compute conv feature size dynamically
        dummy = torch.zeros(1, in_ch, H, W)
        feat  = self.enc_conv3(self.enc_conv2(self.enc_conv1(dummy)))
        self.conv_feat_size = feat.numel()
        self.enc_linear = nn.Linear(self.conv_feat_size, E)

        # target-side flat-grid → embedding
        self.tgt_linear = nn.Linear(H * W, E)

        # positional encodings
        self.pos_in  = nn.Parameter(torch.randn(100, 1, E))
        self.pos_out = nn.Parameter(torch.randn(100, 1, E))

        # transformer
        self.trans = nn.Transformer(
            d_model=E,
            nhead=CONFIG["n_heads"],
            num_encoder_layers=CONFIG["enc_layers"],
            num_decoder_layers=CONFIG["dec_layers"],
            dim_feedforward=CONFIG["ff_dim"],
            dropout=0.1,
            batch_first=False
        )

        # output decoder: E → H*W
        self.dec_linear = nn.Linear(E, H * W)

        # 활성화 함수 객체 저장 (CONFIG["activation"]에 따라)
        self.act = get_activation(CONFIG["activation"])

    def forward(self, x_inputs, y_inputs=None):
        """
        x_inputs: [B, T_in, 15, H, W]
        y_inputs: [B, T_out, 1, H, W] or None
        """
        B, T_in, _, H, W = x_inputs.shape
        if y_inputs is not None:
            T_out = y_inputs.shape[1]
        else:
            T_out = (CONFIG["lead_end"] - CONFIG["lead_start"]) * 4 + 4  # 56

        # 1) Encoder: Conv → Flatten → Linear → Positional Encoding
        x = x_inputs.view(B * T_in, 15, H, W)
        x = self.act(self.enc_conv1(x))   # ← ReLU 대신 CONFIG["activation"]
        x = self.act(self.enc_conv2(x))
        x = self.act(self.enc_conv3(x))
        x = x.view(x.size(0), -1)
        x = self.enc_linear(x)
        x_enc = x.view(B, T_in, -1).permute(1, 0, 2)  # [T_in, B, E]
        x_enc = x_enc + self.pos_in[:T_in]

        # 2) Decoder input 생성 (Teacher forcing)
        if y_inputs is not None:
            bos     = torch.zeros(B, 1, H, W, device=x_inputs.device)
            y_shift = torch.cat([bos, y_inputs[:, :-1]], dim=1)  # [B, T_out, H, W]
            
            # flat → embed + positional
            y_flat = y_shift.reshape(B * T_out, -1)  # [B*T_out, H*W]
            y_emb  = self.tgt_linear(y_flat)
            y_emb  = y_emb.view(B, T_out, -1).permute(1, 0, 2)  # [T_out, B, E]
        else:
            y_emb = torch.zeros(T_out, B, self.enc_linear.out_features, device=x_inputs.device)

        y_emb = y_emb + self.pos_out[:T_out, ...]
        tgt_mask = nn.Transformer.generate_square_subsequent_mask(T_out, device=x_inputs.device)

        # 3) Transformer forward
        memory  = self.trans.encoder(x_enc)
        out_seq = self.trans.decoder(tgt=y_emb, memory=memory, tgt_mask=tgt_mask)

        # 4) Output projection
        out_seq  = out_seq.permute(1, 0, 2)  # [B, T_out, E]
        out_flat = self.dec_linear(out_seq)  # [B, T_out, H*W]
        out_grid = out_flat.view(B, T_out, H, W)
        return out_grid

# Instantiate model
model = S2SForecastModel()
print("Total parameters:", sum(p.numel() for p in model.parameters()))




Total parameters: 370002312


# Model Validation

In [8]:
# import torch

# # 1) init
# model = S2SForecastModel().cuda()
# model.eval()  # we’re just forward-testing

# # 2) dummy input
# T_in  = int(CONFIG["context_days"]*24 / 6) + 1   # e.g. 14 days at 6h → 57 steps
# n_ch  = len(CONFIG["input_vars"]) * len(CONFIG["input_levels"])
# B     = 1  # single-sample test
# H, W  = CONFIG["n_lat"], CONFIG["n_lon"]
# dummy_x = torch.randn(B, T_in, n_ch, H, W, device="cuda")
# print("dummy input shape: (batch, time, channel, lat, lon) = ", dummy_x.shape)
# # 3) forward
# with torch.no_grad():
#     out = model(dummy_x)   # should be [B, T_out, H, W]
# print("dummy output shape:", out.shape)
# # Expect: (1, (lead_end-lead_start)*4, 181, 360) → (1, 56, 181a, 360)


# Output Wrapper

In [9]:
# import numpy as np
# import torch
# import xarray as xr

# def preprocess_for_model(ds6, t0):
#     """Slice ds6 around t0, normalize, and return a torch.Tensor [1,Tin,Ch,H,W]."""
#     inp, _ = get_sample(t0)               # reuse get_sample but ignore target
#     # inp is a numpy array [Tin,Ch,H,W], normalized
#     x = torch.from_numpy(inp).unsqueeze(0).cuda()
#     return x

# def forecast_to_xarray(model, ds6, t0):
#     """Run model and return xarray DataArray with coords time, lat, lon."""
#     model.eval()
#     x = preprocess_for_model(ds6, t0)
#     with torch.no_grad():
#         y_norm = model(x)                # [1,Tout,H,W]
#     # denormalize
#     mean, std = ds6[CONFIG["target_var"]].mean(), ds6[CONFIG["target_var"]].std()
#     y = y_norm.cpu().numpy()*std + mean
#     # build forecast lead times
#     t0_np = np.datetime64(t0)
#     lead0 = t0_np + np.timedelta64(CONFIG["lead_start"], "D")
#     steps = int((CONFIG["lead_end"]-CONFIG["lead_start"])*4 + 1)
#     times = lead0 + np.arange(steps)*np.timedelta64(6, "h")
#     # wrap in DataArray
#     da = xr.DataArray(
#         y[0], 
#         dims=("time","lat","lon"),
#         coords={"time": times,
#                 "lat": ds6.lat, 
#                 "lon": ds6.lon},
#         name="msl_forecast")
#     return da

# Usage:
# ds6 = …  # your preprocessed xarray dataset
# model = S2SForecastModel(); model.load_state_dict(torch.load("best.pth")); model.cuda()
# da_fcst = forecast_to_xarray(model, ds6, "2025-08-01T00:00:00")
# da_fcst.to_netcdf("mslp_19-32d_forecast.nc")


# TestBed

In [10]:


# ---------- 1. open & light-preprocess  ---------------------------------

paths = sorted(glob("/home/work/output/era5_1p0/201*.zarr"))
ds    = xr.open_mfdataset(paths, engine="zarr")
ds = ds.sel(level=CONFIG["input_levels"])                       # keep 5 pressure lvls
ds = ds[CONFIG["input_vars"] + [CONFIG["target_var"]]]          # keep only wanted vars
ds = ds.sortby("time")                                          # make sure time is monotonic
ds6 = ds.resample(time=CONFIG["resample_freq"]).nearest().compute()       # hourly → 6-hourly

# ---------- optional: simple per-field normalisation -------------

# FIXME: climatology required
mu  = ds6.mean(("time","latitude","longitude"))
sig = ds6.std (("time","latitude","longitude"))
ds6 = (ds6 - mu) / sig

# ---------- 2. PyTorch-friendly sliding-window dataset -----------

T_IN           = int(CONFIG["context_days"]*24/6) + 1          # 57
LEAD_START_ST  = CONFIG["lead_start"] * 4                      # 19d → 76
LEAD_END_ST    = CONFIG["lead_end"]   * 4 + 3                  # 32d18UTC → 131
T_OUT          = LEAD_END_ST - LEAD_START_ST + 1               # 56
STRIDE         = 4                                             # 1 day

def to_tensor(da: xr.DataArray) -> torch.Tensor:
    return torch.from_numpy(da.values).float()

class ERA5Window(Dataset):
    def __init__(self, dset):
        self.x = dset[CONFIG["input_vars"]].to_array("f")
        self.y = dset[[CONFIG["target_var"]]].to_array("f")
        N      = self.x.time.size

        # 사전에 “끝 인덱스 ≤ N-1” 조건을 만족하는 시작점만 저장
        self.starts = [s for s in range(0, N, STRIDE)
                       if s + T_IN - 1 + LEAD_END_ST < N]

    def __len__(self):  return len(self.starts)

    def __getitem__(self, idx):
        s  = self.starts[idx]
        t0 = s + T_IN - 1                                      # reference time

        # -------- context (Tin steps) ---------------------------------
        x = (self.x.isel(time=s + np.arange(T_IN))             # (f,l,T,H,W)
                .stack(c=("f","level"))
                .transpose("time","c","latitude","longitude"))
        # -------- target (Tout=56 steps) ------------------------------
        y_idx = t0 + LEAD_START_ST + np.arange(T_OUT)
        y = (self.y.isel(time=y_idx)            ### ←
                .squeeze("f")                   # drop feature dim
                .transpose("time","latitude","longitude"))  ### ← lat-lon 맞춤

        return to_tensor(x), to_tensor(y)




In [11]:


# ---------- 3. build loader ---------------------------------------------
ds_torch = ERA5Window(ds6)
print("valid samples:", len(ds_torch))
x, y = ds_torch[0]
print(x.shape, y.shape)  # (57, 15, 181, 360) (56, 181, 360)

train_idx, test_idx = train_test_split(
    np.arange(len(ds_torch)),      # 전체 샘플 인덱스
    test_size   = 0.2,             # 20 % test
    shuffle     = True,
    random_state= 42
)

train_set = Subset(ds_torch, train_idx)   # x_train / y_train
test_set  = Subset(ds_torch, test_idx)    # x_test  / y_test

train_loader = DataLoader(
    train_set,
    batch_size  = CONFIG["batch_size"],
    shuffle     = True,
    num_workers = 1,
    pin_memory  = True
)
test_loader = DataLoader(                 # 🔧 NEW
    test_set,
    batch_size  = CONFIG["batch_size"],
    shuffle     = False,
    num_workers = 1,
    pin_memory  = True
)



valid samples: 3605
torch.Size([57, 15, 181, 360]) torch.Size([56, 181, 360])


In [None]:

# ---------- 4. quick smoke-train ----------------------------------------
device  = "cuda" if torch.cuda.is_available() else "cpu"
model   = S2SForecastModel().to(device)
opt     = torch.optim.AdamW(model.parameters(),
                            lr=CONFIG["lr"],
                            weight_decay=CONFIG["weight_decay"])
if CONFIG["use_reduce_on_plateau"]:
    scheduler = ReduceLROnPlateau(opt,
                                  mode="min",
                                  factor=CONFIG["rlp_factor"],
                                  patience=CONFIG["rlp_patience"])
else:
    scheduler = None

writer = SummaryWriter(log_dir="/home/work/graphcast/result")

if CONFIG["use_early_stopping"]:
    best_val_loss     = float("inf")
    epochs_no_improve = 0
    patience          = CONFIG["patience"]
    min_delta         = CONFIG["es_min_delta"]

for epoch in range(1, CONFIG["epochs"] + 1):
    model.train()
    train_running = 0.0

    for xb, yb in train_loader:
        xb, yb = xb.to(device), yb.to(device)
        xb = torch.nan_to_num(xb); yb = torch.nan_to_num(yb)

        opt.zero_grad()
        pred = model(xb, yb)
        loss = F.mse_loss(pred, yb)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        opt.step()
        train_running += loss.item() * xb.size(0)
    
    train_loss = train_running / len(train_set)

    # --- Validation 단계 -----------------------------------------------
    model.eval()
    val_running = 0.0
    with torch.no_grad():
        for xb, yb in test_loader:
            xb, yb = xb.to(device), yb.to(device)
            xb = torch.nan_to_num(xb); yb = torch.nan_to_num(yb)
            pred = model(xb, yb)
            val_running += F.mse_loss(pred, yb).item() * xb.size(0)

    val_loss = val_running / len(test_set)

    # --- Scheduler 조정 -------------------------------------------------
    if scheduler is not None:
        # ReduceLROnPlateau 기준은 val_loss (에포크 단위)
        scheduler.step(val_loss)

    # --- EarlyStopping 체크 ---------------------------------------------
    if CONFIG["use_early_stopping"]:
        if val_loss < best_val_loss - min_delta:
            best_val_loss     = val_loss
            epochs_no_improve = 0
            # 최적 모델 가중치 저장
            torch.save(model.state_dict(), "best_model.pt")
        else:
            epochs_no_improve += 1
            if epochs_no_improve >= patience:
                print(f"Epoch {epoch}: Validation loss 개선 없음 ({patience} epoch) → 학습 종료")
                break

    # --- TensorBoard 기록 및 로그 출력 ----------------------------------
    writer.add_scalar("Loss/Train", train_loss, epoch)
    writer.add_scalar("Loss/Val",   val_loss,   epoch)
    print(f"Epoch {epoch:03d} | Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f} | LR: {opt.param_groups[0]['lr']:.2e}")

writer.close()



Epoch 001 | Train Loss: 0.4493 | Val Loss: 0.3416 | LR: 3.00e-05
Epoch 002 | Train Loss: 0.3400 | Val Loss: 0.3232 | LR: 3.00e-05
Epoch 003 | Train Loss: 0.3035 | Val Loss: 0.2649 | LR: 3.00e-05
Epoch 004 | Train Loss: 0.2488 | Val Loss: 0.2165 | LR: 3.00e-05
Epoch 005 | Train Loss: 0.2056 | Val Loss: 0.1791 | LR: 3.00e-05
Epoch 006 | Train Loss: 0.1745 | Val Loss: 0.1526 | LR: 3.00e-05
Epoch 007 | Train Loss: 0.1514 | Val Loss: 0.1317 | LR: 3.00e-05
Epoch 008 | Train Loss: 0.1329 | Val Loss: 0.1160 | LR: 3.00e-05
Epoch 009 | Train Loss: 0.1185 | Val Loss: 0.1046 | LR: 3.00e-05
Epoch 010 | Train Loss: 0.1068 | Val Loss: 0.0935 | LR: 3.00e-05
Epoch 011 | Train Loss: 0.0961 | Val Loss: 0.0837 | LR: 3.00e-05
Epoch 012 | Train Loss: 0.0871 | Val Loss: 0.0767 | LR: 3.00e-05
Epoch 013 | Train Loss: 0.0798 | Val Loss: 0.0702 | LR: 3.00e-05
Epoch 014 | Train Loss: 0.0733 | Val Loss: 0.0648 | LR: 3.00e-05
Epoch 015 | Train Loss: 0.0676 | Val Loss: 0.0595 | LR: 3.00e-05
Epoch 016 | Train Loss: 0

In [None]:

# ---------- 5. shape sanity-check ---------------------------------------
with torch.no_grad():
    dummy_x = torch.randn(1, T_IN, 15, CONFIG["n_lat"], CONFIG["n_lon"], device=device)
    out     = model(dummy_x)
print("dummy input :", dummy_x.shape)
print("dummy output:", out.shape)