In [1]:
# extract sit from TOPAZ4b FreeRun for 2000-2010 application

import os
import xarray as xr
import pandas as pd
import numpy as np
import yaml
import pickle as pkl
from datetime import datetime
from datetime import timedelta
from glob import glob

import src.utils.load_config as load_config
from src.data_preparation import load_data
import src.feature_extraction.extract_pca as extract_pca

import src.utils.tardisml_utils as tardisml_utils
# Base directory; used below as the prefix for freerun_dir and pca_dir.
rootdir = tardisml_utils.get_rootdir()

In [2]:
# Path to config file
file_config = '../config/config_default_2023.yaml'

# Directory layout and run parameters, both read from the same config file.
(nosit_dir, withsit_dir, _, forcing_bdir, pca_dir,
 res_dir, fig_dir, ml_dir, freerun_dir) = load_config.load_filename(file_config)
(timeofday, target_field, forcing_fields, covar_fields,
 lim_idm, lim_jdm, n_comp) = load_config.load_config_params(file_config)


# -------------------------------------------------
# selection files

# [first, last] year to keep; BOTH bounds are inclusive in the filter below.
# NOTE(review): the notebook header says "2000-2010" but 2011 is selected too
# (the last files listed further down are from 2011-12) -- confirm this is intended.
years = [2000, 2011]
# years = [2011, 2012]

# Join the path components with os.path.join (as done for the PCA file) so the
# glob pattern does not depend on rootdir ending with a path separator.
listfile_a = sorted(glob(os.path.join(rootdir, freerun_dir, '*.nc')))
# The first 4 characters of each file name are read as the year.
listyear = [os.path.basename(name)[:4] for name in listfile_a]  # read years
# selection corresponding years
listfile_a = [fname for fname, year in zip(listfile_a, listyear)
              if years[0] <= int(year) <= years[-1]]

# -------------------------------------------------

data_kind = "nosit"
n_components = load_config.get_n_components(data_kind, file_config)

In [3]:
# Sanity check: display the resolved root directory.
rootdir

'/scratch/project_465000269/edelleo1/'

In [4]:
# Sanity check: display the free-run sub-directory read from the config.
freerun_dir

'Leo/Jiping_2023/TP4b_freerun/'

In [5]:
# Last three selected files, to check the upper bound of the year selection.
listfile_a[-3:]

['/scratch/project_465000269/edelleo1/Leo/Jiping_2023/TP4b_freerun/20111229_dm-12km-NERSC-MODEL-TOPAZ4B-ARC-RAN.fv2.0.nc',
 '/scratch/project_465000269/edelleo1/Leo/Jiping_2023/TP4b_freerun/20111230_dm-12km-NERSC-MODEL-TOPAZ4B-ARC-RAN.fv2.0.nc',
 '/scratch/project_465000269/edelleo1/Leo/Jiping_2023/TP4b_freerun/20111231_dm-12km-NERSC-MODEL-TOPAZ4B-ARC-RAN.fv2.0.nc']

In [6]:
# Load the TOPAZ4b Free Run fields and the matching chronology.
# NOTE(review): only the first 20 files are read (listfile_a[:20]) -- this looks
# like a quick-test subset; confirm before running on the full period.
nc_sel_na, chrono_na = extract_pca.load_TOPAZ(
    listfile_a[:20],
    target_field=target_field,
    lim_idm=lim_idm,
    lim_jdm=lim_jdm,
)

Define chronology from .nc files...
Loading .nc ...
Variable selection...
Spatial selection...


In [8]:
# NOTE: this rebinds the name `datetime` from the class (imported with
# `from datetime import datetime` in the first cell) to the module; the cells
# below rely on the module binding and call `datetime.datetime(...)`.
import datetime

In [9]:
# if extend_3m:
# Candidate bounds for a time sub-selection. None of the four values below is
# used in the visible cells: the only line referencing them (end of this cell)
# is commented out.
# NOTE(review): the name says "first_oct" but the date is December 1st of
# years[0], and last_march is taken in years[-1] -- confirm the intended dates.
first_oct = datetime.datetime(years[0], 12, 1)
last_march = datetime.datetime(years[-1], 3, 31)
# NOTE(review): hard-coded time indices with no visible derivation -- confirm.
idx_start = 200
idx_end = 3000

#nc_sel_na = nc_sel_na.isel(time=slice(idx_start, idx_end))

In [14]:
# Date table returned alongside the fields by load_TOPAZ.
chrono_na

Unnamed: 0,date
0,2000-01-01
1,2000-01-02
2,2000-01-03
3,2000-01-04
4,2000-01-05
5,2000-01-06
6,2000-01-07
7,2000-01-08
8,2000-01-09
9,2000-01-10


In [15]:
# Row index of the first entry of the chronology equal to 2000-01-12.
np.where(chrono_na==datetime.datetime(2000, 1, 12))[0][0]

11

In [17]:
# Keep three consecutive time steps (indices 11, 12, 13); 11 is the index of
# 2000-01-12 returned by the lookup in the previous cell (hard-coded here).
nca_sel_na_ext = nc_sel_na.isel(time=slice(11,14))

In [18]:
# Inspect the 3-step subset.
nca_sel_na_ext

Unnamed: 0,Array,Chunk
Bytes,2.47 MiB,841.99 kiB
Shape,"(3, 479, 450)","(1, 479, 450)"
Dask graph,3 chunks in 43 graph layers,3 chunks in 43 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 2.47 MiB 841.99 kiB Shape (3, 479, 450) (1, 479, 450) Dask graph 3 chunks in 43 graph layers Data type float32 numpy.ndarray",450  479  3,

Unnamed: 0,Array,Chunk
Bytes,2.47 MiB,841.99 kiB
Shape,"(3, 479, 450)","(1, 479, 450)"
Dask graph,3 chunks in 43 graph layers,3 chunks in 43 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,841.99 kiB,841.99 kiB
Shape,"(479, 450)","(479, 450)"
Dask graph,1 chunks in 96 graph layers,1 chunks in 96 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 841.99 kiB 841.99 kiB Shape (479, 450) (479, 450) Dask graph 1 chunks in 96 graph layers Data type float32 numpy.ndarray",450  479,

Unnamed: 0,Array,Chunk
Bytes,841.99 kiB,841.99 kiB
Shape,"(479, 450)","(479, 450)"
Dask graph,1 chunks in 96 graph layers,1 chunks in 96 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,841.99 kiB,841.99 kiB
Shape,"(479, 450)","(479, 450)"
Dask graph,1 chunks in 96 graph layers,1 chunks in 96 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 841.99 kiB 841.99 kiB Shape (479, 450) (479, 450) Dask graph 1 chunks in 96 graph layers Data type float32 numpy.ndarray",450  479,

Unnamed: 0,Array,Chunk
Bytes,841.99 kiB,841.99 kiB
Shape,"(479, 450)","(479, 450)"
Dask graph,1 chunks in 96 graph layers,1 chunks in 96 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray


In [32]:
# --------- TOPAZ4b FREERUN ------------------
# Load the previously saved PCA of the free run.
# NOTE(review): earlier comments referred to "2010-2020" while the file name
# below says 2011_2019 -- confirm which training period is meant.
# NOTE(security): the file is a .pkl; if load_pca unpickles it, only load files
# from a trusted source (unpickling can execute arbitrary code).
data_kind = "nosit"
n_components = load_config.get_n_components(data_kind, file_config)
filename = os.path.join(
    rootdir,
    pca_dir,
    f"pca_{target_field}_{n_components}N_noSITass_2011_2019_FreeRun.pkl",
)
pca_na = load_data.load_pca(filename)

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [33]:
# Project the loaded free-run fields onto the PCA loaded above.
Xna = nc_sel_na

# Grid points whose values are finite at every time step.
maskok = np.isfinite(Xna).all(dim='time')
# Collapse (y, x) into a single stacked dimension z and evaluate the mask.
maskok1d = maskok.stack(z=('y', 'x')).compute()
PCs_na = extract_pca.pca_to_PC(pca_na, Xna, maskok1d)


KeyboardInterrupt: 

In [None]:
# Run tag inserted in the output file name built in the next cell.
str_na = 'FreeRun'  # 'noSITass' or 'FreeRun'

In [None]:
# Save PCs_na for years[0]..years[-1] as a .pkl file.
# NOTE(review): filename has no directory part, unlike the PCA file that is
# loaded from rootdir/pca_dir -- confirm where save_pca writes it.

filename = f'pca_{target_field}_TOPAZ4b23_{years[0]}_{years[-1]}_{str_na}.pkl'
extract_pca.save_pca(filename, PCs_na)