# Header

In [1]:
import glob
import re
import xarray as xr
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
from multiprocessing import Pool
from lib.his_preprocess import *

# Preprocess

In [2]:

for target_var in ["2m_temperature", 'geopotential']:
    # p_1: "global" scale runs for any 11-character code. The pattern is the
    # same for both target variables, so it is globbed once per iteration.
    p_1 = sorted(glob.glob('/data/GC_output/2021-06-21/GC_???????????_global_scale*.nc'))

    # p_2 / p_3: scale runs (any region) for two fixed 0/1 codes that are
    # bitwise complements of each other; the pair depends on the target variable.
    if target_var == '2m_temperature':
        p_2 = sorted(glob.glob('/data/GC_output/2021-06-21/GC_00100000000_*_scale*.nc'))
        p_3 = sorted(glob.glob('/data/GC_output/2021-06-21/GC_11011111111_*_scale*.nc'))
    elif target_var == 'geopotential':
        p_2 = sorted(glob.glob('/data/GC_output/2021-06-21/GC_00000100000_*_scale*.nc'))
        p_3 = sorted(glob.glob('/data/GC_output/2021-06-21/GC_11111011111_*_scale*.nc'))

    # Base color assigned to each partition name
    partition_colors = {'p_1': 'blue', 'p_2': 'green', 'p_3': 'red', 'p_4': 'purple'}

    # Function to extract perturbation type and value from filename
    def extract_perturbation_info(filename):
        """Build a short label from a perturbation output filename.

        The name must end with
        ``_<11 characters of 0/1>_<region>_<scale|wipeout>_<value>.nc``.

        Args:
            filename: File name or full path to inspect.

        Returns:
            ``"<value>_<region>_<code>"`` when the name matches, otherwise
            ``None``. The ``scale``/``wipeout`` token has to be present for a
            match but is not included in the label.
        """
        found = re.search(r'_([01][01][01][01][01][01][01][01][01][01][01])_(.*?)_(scale|wipeout)_([\d.eE+-]+)\.nc$', filename)
        if found is None:
            return None
        # Groups in order: 0/1 code, region, perturbation kind (unused), value
        var, region, _, value = found.groups()
        return f"{value}_{region}_{var}"

    # Build the work list: one (label, color, path) tuple per recognised file
    perturb_files = []
    for partition_name, partition_files in (('p_1', p_1), ('p_2', p_2), ('p_3', p_3)):
        base_color = partition_colors[partition_name]
        num_files = len(partition_files)
        # One shade per file: request two extra shades and drop both ends of the palette
        colors = sns.light_palette(base_color, n_colors=num_files + 2)[1:-1]
        for i, file in enumerate(partition_files):
            perturb_info = extract_perturbation_info(file)
            if not perturb_info:
                # Name did not match the expected pattern; leave the file out
                continue
            label = f"{partition_name} {perturb_info}"
            color = colors[i % len(colors)]
            perturb_files.append((label, color, file))

    perturb_datasets = []  # placeholder; replaced by the pool result below

    def process_file(file_info):
        """Open one perturbation file and reduce it with the preprocessing helpers.

        Args:
            file_info: ``(label, color, file)`` tuple as stored in ``perturb_files``.

        Returns:
            ``(label, color, dataset)`` where ``dataset`` is
            ``weighted_mean(preprocess_GC(...))`` applied to the opened file.
        """
        label, color, file = file_info
        opened = xr.open_dataset(file)
        # target_var is not a parameter; it is read from the enclosing loop variable
        prepared = preprocess_GC(opened, target_var)
        return label, color, weighted_mean(prepared)

    # Distribute the files over 35 worker processes
    with Pool(35) as pool:
        perturb_datasets = pool.map(process_file, perturb_files)

    def piping(dataset:xr.Dataset, target_var):
        """Run preprocess_nwp on one dataset, then reduce it with weighted_mean."""
        preprocessed = preprocess_nwp(dataset, target_var)
        return weighted_mean(preprocessed)

    from functools import partial
    # Single-argument callable with the current target variable bound
    pipe = partial(piping, target_var = target_var)

    # Locate the NWP files for the current target variable
    if target_var == '2m_temperature':
        files = sorted(glob.glob('/geodata2/S2S/ECMWF_Perturbed/Dailyaveraged/t2m/nc/*/Temperature2m_2021-06-21.nc'))
    elif target_var == 'geopotential':
        files = sorted(glob.glob('/geodata2/S2S/ECMWF_Perturbed/InstantaneousAccumulated/z/nc/*/Z_2021-06-21.nc'))

    # The parent directory name is parsed as an integer; keep entries up to
    # 24 * 7 (presumably lead hours, i.e. 7 days - confirm against the data layout)
    max_lead = 24 * 7
    files = [path for path in files if int(path.split('/')[-2]) <= max_lead]

    # Every file goes through `pipe` before the pieces are combined by coordinates
    nwp = xr.open_mfdataset(files, combine='by_coords', preprocess=pipe)
    if target_var == "2m_temperature":
        nwp = nwp.rename({"2t":"2m_temperature"})

    nwp = nwp.compute()
    # Flat table of the target variable; not used again within this cell
    df = nwp[target_var].to_dataframe().reset_index()

    # Choose the output locations, then write the NWP result and the pickled
    # perturbation results
    if target_var == '2m_temperature':
        nwp_out = '/data/GC_output/analysis/nwp_t2m_GlobAvg.nc'
        gc_out = '/data/GC_output/analysis/GC_t2m_GlobAvg.pkl'
    elif target_var == 'geopotential':
        nwp_out = '/data/GC_output/analysis/nwp_z500_GlobAvg.nc'
        gc_out = '/data/GC_output/analysis/GC_z500_GlobAvg.pkl'

    nwp.to_netcdf(nwp_out)
    with open(gc_out, 'wb') as f:
        pickle.dump(perturb_datasets, f)
    

# The raw NWP data has a redundant dimension; the cells below handle that

In [3]:
import glob
import re
import xarray as xr
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
from multiprocessing import Pool
from lib.his_preprocess import *

In [9]:
for target_var in ["2m_temperature", "geopotential"]:
    def piping(dataset: xr.Dataset, target_var):
        """Apply preprocess_nwp to one dataset for the given target variable."""
        return preprocess_nwp(dataset, target_var)

    from functools import partial
    # Callable with the target variable bound; nothing below in this cell calls it
    pipe = partial(piping, target_var=target_var)

    # Locate the NWP files for the current target variable
    if target_var == '2m_temperature':
        files = glob.glob('/geodata2/S2S/ECMWF_Perturbed/Dailyaveraged/t2m/nc/*/Temperature2m_2021-06-21.nc')
    elif target_var == 'geopotential':
        files = glob.glob('/geodata2/S2S/ECMWF_Perturbed/InstantaneousAccumulated/z/nc/*/Z_2021-06-21.nc')

    # Extract the numeric part of the path (the parent directory name) and
    # sort by its integer value rather than as text
    files = sorted(files, key=lambda x: int(x.split('/')[-2]))

    # Keep only data up to 7 days
    max_lead = 24 * 7
    files = [path for path in files if int(path.split('/')[-2]) <= max_lead]

    # Check the sorted order
    for f in files:
        print(f.split('/')[-2])  # for debugging

    # Build the datasets in that order
    datasets = [preprocess_nwp(xr.open_dataset(path), target_var) for path in files]

    # Merge along the "date" dimension, keeping the file order
    nwp = xr.concat(datasets, dim="date")

    # Sort along the "date" dimension in case it is needed
    nwp = nwp.sortby('date')

    if target_var == "2m_temperature":
        nwp = nwp.rename({"2t": "2m_temperature"})

    nwp = nwp.compute()
    # Flat table of the target variable; not used again within this cell
    df = nwp[target_var].to_dataframe().reset_index()

    # Choose the output location, then write the result
    if target_var == '2m_temperature':
        nwp_out = '/data/GC_output/analysis/percent/nwp_t2m_Globraw.nc'
    elif target_var == 'geopotential':
        nwp_out = '/data/GC_output/analysis/percent/nwp_z500_Globraw.nc'
    nwp.to_netcdf(nwp_out)

24
48
72
96
120
144
168
24
48
72
96
120
144
168
