# Header

Preprocess the GC output to generate the dataset under `/data/GC_output/analysis/percent2`.

Forked from `preprocess.ipynb`.

In [9]:
import glob
import re
import xarray as xr
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
from multiprocessing import Pool
from concurrent.futures import ThreadPoolExecutor
from lib.his_preprocess import *

In [10]:
# GC data preprocessing

###
# Legend for the naming used in this notebook family:
#   p_1: 11111111111 / all variables
#   p_2: 00000000001 / target variable only
#   p_3: 11111111110 / all variables except the target
#   mean: global average / raw: original data
###

scale_list = [1, 3, 5, 7, 9]

# Maps a partition index to the sorted list of files found for that partition.
file_dic = {}
for target_var in ["2m_temperature"]:

    if target_var == '2m_temperature':
        # file_dic[0] = sorted(glob.glob('/data/GC_output/percent2/GC_11111111111_250_*.nc'))
        # One partition per entry of scale_list; the scale is the trailing
        # token of the file name.
        for slot, scale in enumerate(scale_list):
            scale_pattern = f'/data/GC_output/percent2/GC_11111111111_250_{scale}.nc'
            file_dic[slot] = sorted(glob.glob(scale_pattern))
            print(file_dic[slot])
    # elif target_var == 'geopotential':
    #     p_1 = sorted(glob.glob('/data/GC_output/percent2/GC_11111111111_*_*.nc'))
    #     p_2 = sorted(glob.glob('/data/GC_output/percent2/GC_00000100000_*_*.nc'))
    #     p_3 = sorted(glob.glob('/data/GC_output/percent2/GC_11111011111_*_*.nc'))

    # Base colors, one per partition (hex code followed by its common name).
    colors_list = [
        '#008000',  # green
        '#FF00FF',  # magenta
        '#0000FF',  # blue
        '#800000',  # maroon
        '#800080',  # purple
        '#FF0000',  # red
        '#00FF00',  # lime
        '#00FFFF',  # cyan
        '#FFA500',  # orange
        '#FF69B4',  # hot pink
        '#4B0082',  # indigo
        '#FFD700',  # gold
        '#8B4513',  # saddle brown
        '#FF4500',  # orange red
        '#00FF7F',  # spring green
        '#1E90FF',  # dodger blue
        '#FF1493',  # deep pink
        '#7B68EE',  # medium slate blue
        '#20B2AA',  # light sea green
        '#DAA520',  # goldenrod
    ]

    def extract_perturbation_info(filename):
        """Build a ``value_region_var`` label from a GC output file name.

        The name must end in ``GC_<11 characters of 0/1>_<number>_<suffix>.nc``.
        The resulting label is printed as well as returned. Returns ``None``
        when the file name does not follow that pattern.
        """
        parsed = re.search(r'GC_([01]{11})_([\d.eE+-]+)_(.*?)\.nc$', filename)
        if not parsed:
            return None
        # Capture groups, in order: perturbation code, value (e.g. 0.001),
        # region code (e.g. 9p).
        var, value, region = parsed.groups()
        info = f"{value}_{region}_{var}"
        print(info)
        return info

    # Collect one (label, color, file) triple per perturbation file whose name
    # can be parsed by extract_perturbation_info.
    perturb_files = []

    for partition_idx, partition_files in file_dic.items():
        # Wrap around the whole palette rather than a hard-coded 10, so every
        # entry of colors_list is reachable.
        base_color = colors_list[partition_idx % len(colors_list)]
        num_files = len(partition_files)
        # Ask for two extra shades and trim the first and last entries of the
        # generated ramp, leaving one shade per file.
        colors = sns.light_palette(base_color, n_colors=num_files + 2)[1:-1]
        # A separate index name keeps the partition index from being shadowed.
        for file_idx, file in enumerate(partition_files):
            perturb_info = extract_perturbation_info(file)
            if perturb_info:
                color = colors[file_idx % len(colors)]
                perturb_files.append((perturb_info, color, file))

    # Relabel entries whose label contains 'p_0' or 'p_9': a '_<digits>p'
    # token keeps its digits and has the trailing 'p' replaced by '0'
    # (e.g. '_1p' -> '_10', '_3p' -> '_30').
    for entry_idx, (label, color, file) in enumerate(perturb_files):
        if 'p_0' in label or 'p_9' in label:
            new_label = re.sub(r'_(\d+)p', r'_\g<1>0', label)
            perturb_files[entry_idx] = (new_label, color, file)

    perturb_datasets_raw, perturb_datasets_mean = [], []

    def process_file(file_info):
        """Open and preprocess one perturbation file.

        ``file_info`` is a ``(label, color, file)`` triple. The file is opened
        with ``xr.open_dataset`` and passed, together with the current
        ``target_var``, to ``preprocess_GC``; ``weighted_mean`` is then applied
        to that result. Both helpers are not defined in this cell -- presumably
        they come from ``lib.his_preprocess`` (TODO confirm).

        Returns a dict with two ``(label, color, data)`` tuples: ``"mean"``
        holds the ``weighted_mean`` output, ``"raw"`` the preprocessed dataset.
        """
        label, color, file = file_info
        opened = xr.open_dataset(file)
        dataset_raw = preprocess_GC(opened, target_var)
        mean_entry = (label, color, weighted_mean(dataset_raw))
        raw_entry = (label, color, dataset_raw)
        return {"mean": mean_entry, "raw": raw_entry}

    # Fan the files out over worker processes; pool.map keeps input order.
    with Pool(processes=10) as pool:
        results = pool.map(process_file, perturb_files)

    # Split the per-file dicts back into two parallel lists.
    perturb_datasets_mean = [entry["mean"] for entry in results]
    perturb_datasets_raw = [entry["raw"] for entry in results]

    if target_var == '2m_temperature':
        # (output path, payload) pairs, written in this order: the averaged
        # results first, then the raw preprocessed datasets.
        pickle_targets = (
            ('/data/GC_output/analysis/percent2/zoom_shuffle_percent_GC_t2m_GlobAvg.pkl',
             perturb_datasets_mean),
            ('/data/GC_output/analysis/percent2/zoom_shuffle_percent_GC_t2m_Globraw.pkl',
             perturb_datasets_raw),
        )
        for out_path, payload in pickle_targets:
            with open(out_path, 'wb') as f:
                pickle.dump(payload, f)

['/data/GC_output/percent2/GC_11111111111_250_1.nc']
['/data/GC_output/percent2/GC_11111111111_250_3.nc']
['/data/GC_output/percent2/GC_11111111111_250_5.nc']
['/data/GC_output/percent2/GC_11111111111_250_7.nc']
['/data/GC_output/percent2/GC_11111111111_250_9.nc']
250_1_11111111111
250_3_11111111111
250_5_11111111111
250_7_11111111111
250_9_11111111111
