In [3]:
import xarray as xr

ds = xr.open_dataset(r"data\raw\BRB_Combined_Consecutive.nc")
print(ds)


<xarray.Dataset> Size: 363MB
Dimensions:              (time: 300, lat: 197, lon: 128)
Coordinates:
  * lat                  (lat) float64 2kB -28.95 -28.9 -28.85 ... -19.2 -19.15
  * lon                  (lon) float64 1kB 146.0 146.1 146.1 ... 152.3 152.3
  * time                 (time) datetime64[ns] 2kB 2000-01-01 ... 2024-12-01
Data variables:
    monthly_rain         (time, lat, lon) float64 61MB ...
    max_temp             (time, lat, lon) float64 61MB ...
    min_temp             (time, lat, lon) float64 61MB ...
    radiation            (time, lat, lon) float64 61MB ...
    spi_1                (time, lat, lon) float64 61MB ...
    consecutive_drought  (time, lat, lon) float64 61MB ...
Attributes:
    long_name:     Monthly rainfall
    units:         mm
    valid_min:     -32765
    valid_max:     32767
    grid_mapping:  spatial_ref


In [None]:
import os
import numpy as np
import xarray as xr
from scipy import stats

# Directories
RAW_DATA_DIR = r"data\raw"
PATCHES_DIR = r"data\patches"
PATCH_SIZE = 32
STRIDE = 16

# Ensure patch directory exists
os.makedirs(PATCHES_DIR, exist_ok=True)

def extract_patches(region_file):
    # Load region data
    ds = xr.open_dataset(os.path.join(RAW_DATA_DIR, region_file))
    region_name = region_file.split("_Combined_Consecutive.nc")[0]
    region_dir = os.path.join(PATCHES_DIR, region_name)
    os.makedirs(region_dir, exist_ok=True)

    # Extract variables
    variables = ["monthly_rain", "max_temp", "min_temp", "radiation", "spi_1", "consecutive_drought"]
    data = np.stack([ds[var].values for var in variables], axis=-1)

    lat_size, lon_size, _ = data.shape

    # Patch extraction
    for lat in range(0, lat_size - PATCH_SIZE + 1, STRIDE):
        for lon in range(0, lon_size - PATCH_SIZE + 1, STRIDE):
            patch = data[lat:lat + PATCH_SIZE, lon:lon + PATCH_SIZE, :]

            # Handle small patches with padding
            if patch.shape[0] < PATCH_SIZE or patch.shape[1] < PATCH_SIZE:
                padded_patch = np.zeros((PATCH_SIZE, PATCH_SIZE, patch.shape[2]))
                padded_patch[:patch.shape[0], :patch.shape[1], :] = patch
                patch = padded_patch

            # Extract label using mode of consecutive_drought
            label_patch = patch[:, :, -1]  # consecutive_drought
            patch_label = stats.mode(label_patch.flatten(), keepdims=False).mode

            # Save patch and label
            patch_filename = f"{region_name}_{lat}_{lon}.npy"
            label_filename = f"{region_name}_{lat}_{lon}_label.npy"

            np.save(os.path.join(region_dir, patch_filename), patch)
            np.save(os.path.join(region_dir, label_filename), patch_label)

if __name__ == "__main__":
    # Process each region
    for file in os.listdir(RAW_DATA_DIR):
        if file.endswith("_Combined_Consecutive.nc"):
            print(f"Processing {file}...")
            extract_patches(file)
            print(f"Finished processing {file}.")


In [4]:
import os
import numpy as np
import xarray as xr
from scipy import stats

# Directories
RAW_DATA_DIR = r"data\raw"
PATCHES_DIR = r"data\patches"
PATCH_SIZE = 32
STRIDE = 16

# Ensure patch directory exists
os.makedirs(PATCHES_DIR, exist_ok=True)

def extract_patches(region_file):
    # Load region data
    ds = xr.open_dataset(os.path.join(RAW_DATA_DIR, region_file))
    region_name = region_file.split("_Combined_Consecutive.nc")[0]
    region_dir = os.path.join(PATCHES_DIR, region_name)
    os.makedirs(region_dir, exist_ok=True)

    # Extract variables
    variables = ["monthly_rain", "max_temp", "min_temp", "radiation", "spi_1", "consecutive_drought"]

    # Iterate over each time step
    for time in range(ds.dims['time']):
        stacked_data = [ds[var].isel(time=time).values for var in variables]
        data = np.stack(stacked_data, axis=-1)

        lat_size, lon_size = data.shape[0], data.shape[1]

        # Patch extraction
        for lat in range(0, lat_size - PATCH_SIZE + 1, STRIDE):
            for lon in range(0, lon_size - PATCH_SIZE + 1, STRIDE):
                patch = data[lat:lat + PATCH_SIZE, lon:lon + PATCH_SIZE, :]

                # Handle small patches with padding
                if patch.shape[0] < PATCH_SIZE or patch.shape[1] < PATCH_SIZE:
                    padded_patch = np.zeros((PATCH_SIZE, PATCH_SIZE, patch.shape[2]))
                    padded_patch[:patch.shape[0], :patch.shape[1], :] = patch
                    patch = padded_patch

                # Extract label using mode of consecutive_drought
                label_patch = patch[:, :, -1]  # consecutive_drought
                patch_label = stats.mode(label_patch.flatten(), keepdims=False).mode

                # Save patch and label
                patch_filename = f"{region_name}_t{time}_{lat}_{lon}.npy"
                label_filename = f"{region_name}_t{time}_{lat}_{lon}_label.npy"

                np.save(os.path.join(region_dir, patch_filename), patch)
                np.save(os.path.join(region_dir, label_filename), patch_label)

if __name__ == "__main__":
    # Process each region
    for file in os.listdir(RAW_DATA_DIR):
        if file.endswith("_Combined_Consecutive.nc"):
            print(f"Processing {file}...")
            extract_patches(file)
            print(f"Finished processing {file}.")

Processing BRB_Combined_Consecutive.nc...


  for time in range(ds.dims['time']):


Finished processing BRB_Combined_Consecutive.nc.
Processing CHC_Combined_Consecutive.nc...


  for time in range(ds.dims['time']):


Finished processing CHC_Combined_Consecutive.nc.
Processing CQC_Combined_Consecutive.nc...


  for time in range(ds.dims['time']):


Finished processing CQC_Combined_Consecutive.nc.
Processing CYP_Combined_Consecutive.nc...


  for time in range(ds.dims['time']):


Finished processing CYP_Combined_Consecutive.nc.
Processing DEU_Combined_Consecutive.nc...


  for time in range(ds.dims['time']):


Finished processing DEU_Combined_Consecutive.nc.
Processing EIU_Combined_Consecutive.nc...


  for time in range(ds.dims['time']):


Finished processing EIU_Combined_Consecutive.nc.
Processing GUP_Combined_Consecutive.nc...


  for time in range(ds.dims['time']):


Finished processing GUP_Combined_Consecutive.nc.
Processing MGD_Combined_Consecutive.nc...


  for time in range(ds.dims['time']):


Finished processing MGD_Combined_Consecutive.nc.
Processing MUL_Combined_Consecutive.nc...


  for time in range(ds.dims['time']):


Finished processing MUL_Combined_Consecutive.nc.
Processing NET_Combined_Consecutive.nc...


  for time in range(ds.dims['time']):


Finished processing NET_Combined_Consecutive.nc.
Processing NWH_Combined_Consecutive.nc...


  for time in range(ds.dims['time']):


Finished processing NWH_Combined_Consecutive.nc.
Processing SEQ_Combined_Consecutive.nc...


  for time in range(ds.dims['time']):


Finished processing SEQ_Combined_Consecutive.nc.
Processing WET_Combined_Consecutive.nc...


  for time in range(ds.dims['time']):


Finished processing WET_Combined_Consecutive.nc.


In [1]:
import os
import numpy as np
import xarray as xr
from scipy import stats

# Directories
RAW_DATA_DIR = r"data\raw"
PATCHES_DIR = r"data\patches with padding_v02"
PATCH_SIZE = 32
STRIDE = 16

# Ensure patch directory exists
os.makedirs(PATCHES_DIR, exist_ok=True)

# Patch Extraction with NaN Handling
def extract_patches(region_file):
    # Load region data
    ds = xr.open_dataset(os.path.join(RAW_DATA_DIR, region_file))
    region_name = region_file.split("_Combined_Consecutive.nc")[0]
    region_dir = os.path.join(PATCHES_DIR, region_name)
    os.makedirs(region_dir, exist_ok=True)

    # Extract variables
    variables = ["monthly_rain", "max_temp", "min_temp", "radiation", "spi_1", "consecutive_drought"]
    total_patches, valid_patches = 0, 0

    # Iterate over each time step
    for time in range(ds.dims['time']):
        stacked_data = [ds[var].isel(time=time).values for var in variables]
        data = np.stack(stacked_data, axis=-1)

        lat_size, lon_size = data.shape[0], data.shape[1]

        # Patch extraction
        for lat in range(0, lat_size, STRIDE):
            for lon in range(0, lon_size, STRIDE):
                patch = data[lat:lat + PATCH_SIZE, lon:lon + PATCH_SIZE, :]

                # Ensure patch is 32x32 with padding
                padded_patch = np.zeros((PATCH_SIZE, PATCH_SIZE, patch.shape[2]))
                padded_patch[:patch.shape[0], :patch.shape[1], :] = np.nan_to_num(patch, nan=0.0)  # Replace NaNs with 0

                # Calculate label (mode of non-NaN values)
                label_patch = padded_patch[:, :, -1]  # consecutive_drought
                label_patch = label_patch[~np.isnan(label_patch)]  # Remove NaNs

                if len(label_patch) == 0:  # Skip if all are NaN
                    continue

                patch_label = stats.mode(label_patch.flatten(), keepdims=False).mode

                # Save only valid patches
                if not np.isnan(patch_label):
                    patch_filename = f"{region_name}_t{time}_{lat}_{lon}.npy"
                    label_filename = f"{region_name}_t{time}_{lat}_{lon}_label.npy"
                    np.save(os.path.join(region_dir, patch_filename), padded_patch)
                    np.save(os.path.join(region_dir, label_filename), patch_label)
                    valid_patches += 1

                total_patches += 1

    print(f"{region_name}: Total Patches = {total_patches}, Valid Patches = {valid_patches}")

if __name__ == "__main__":
    # Process each region
    for file in os.listdir(RAW_DATA_DIR):
        if file.endswith("_Combined_Consecutive.nc"):
            print(f"Processing {file}...")
            extract_patches(file)
            print(f"Finished processing {file}.")


Processing BRB_Combined_Consecutive.nc...


  for time in range(ds.dims['time']):


BRB: Total Patches = 31200, Valid Patches = 31200
Finished processing BRB_Combined_Consecutive.nc.
Processing CHC_Combined_Consecutive.nc...


  for time in range(ds.dims['time']):


CHC: Total Patches = 21600, Valid Patches = 21600
Finished processing CHC_Combined_Consecutive.nc.
Processing CQC_Combined_Consecutive.nc...


  for time in range(ds.dims['time']):


CQC: Total Patches = 4800, Valid Patches = 4800
Finished processing CQC_Combined_Consecutive.nc.
Processing CYP_Combined_Consecutive.nc...


  for time in range(ds.dims['time']):


CYP: Total Patches = 12000, Valid Patches = 12000
Finished processing CYP_Combined_Consecutive.nc.
Processing DEU_Combined_Consecutive.nc...


  for time in range(ds.dims['time']):


DEU: Total Patches = 5400, Valid Patches = 5400
Finished processing DEU_Combined_Consecutive.nc.
Processing EIU_Combined_Consecutive.nc...


  for time in range(ds.dims['time']):


EIU: Total Patches = 14700, Valid Patches = 14700
Finished processing EIU_Combined_Consecutive.nc.
Processing GUP_Combined_Consecutive.nc...


  for time in range(ds.dims['time']):


GUP: Total Patches = 24300, Valid Patches = 24300
Finished processing GUP_Combined_Consecutive.nc.
Processing MGD_Combined_Consecutive.nc...


  for time in range(ds.dims['time']):


MGD: Total Patches = 36000, Valid Patches = 36000
Finished processing MGD_Combined_Consecutive.nc.
Processing MUL_Combined_Consecutive.nc...


  for time in range(ds.dims['time']):


MUL: Total Patches = 16200, Valid Patches = 16200
Finished processing MUL_Combined_Consecutive.nc.
Processing NET_Combined_Consecutive.nc...


  for time in range(ds.dims['time']):


NET: Total Patches = 1200, Valid Patches = 1200
Finished processing NET_Combined_Consecutive.nc.
Processing NWH_Combined_Consecutive.nc...


  for time in range(ds.dims['time']):


NWH: Total Patches = 8400, Valid Patches = 8400
Finished processing NWH_Combined_Consecutive.nc.
Processing SEQ_Combined_Consecutive.nc...


  for time in range(ds.dims['time']):


SEQ: Total Patches = 8400, Valid Patches = 8400
Finished processing SEQ_Combined_Consecutive.nc.
Processing WET_Combined_Consecutive.nc...


  for time in range(ds.dims['time']):


WET: Total Patches = 4500, Valid Patches = 4500
Finished processing WET_Combined_Consecutive.nc.


In [None]:
import os
import numpy as np
import xarray as xr
from scipy import stats
import matplotlib.pyplot as plt

# Directories
RAW_DATA_DIR = r"data\raw"
PATCHES_DIR = r"data\patches"
PATCH_SIZE = 32
STRIDE = 16

# Ensure patch directory exists
os.makedirs(PATCHES_DIR, exist_ok=True)

# Patch Extraction with NaN Handling and Label Distribution Plotting
def extract_patches(region_file):
    # Load region data
    ds = xr.open_dataset(os.path.join(RAW_DATA_DIR, region_file))
    region_name = region_file.split("_Combined_Consecutive.nc")[0]
    region_dir = os.path.join(PATCHES_DIR, region_name)
    os.makedirs(region_dir, exist_ok=True)

    # Extract variables
    variables = ["monthly_rain", "max_temp", "min_temp", "radiation", "spi_1", "consecutive_drought"]
    total_patches, valid_patches = 0, 0
    label_distribution = []

    # Iterate over each time step
    for time in range(ds.dims['time']):
        stacked_data = [ds[var].isel(time=time).values for var in variables]
        data = np.stack(stacked_data, axis=-1)

        lat_size, lon_size = data.shape[0], data.shape[1]

        # Patch extraction
        for lat in range(0, lat_size, STRIDE):
            for lon in range(0, lon_size, STRIDE):
                patch = data[lat:lat + PATCH_SIZE, lon:lon + PATCH_SIZE, :]

                # Ensure patch is 32x32 with padding
                padded_patch = np.zeros((PATCH_SIZE, PATCH_SIZE, patch.shape[2]))
                padded_patch[:patch.shape[0], :patch.shape[1], :] = np.nan_to_num(patch, nan=0.0)

                # Calculate label (mode of non-NaN values)
                label_patch = padded_patch[:, :, -1]  # consecutive_drought
                label_patch = label_patch[~np.isnan(label_patch)]  # Remove NaNs

                if len(label_patch) == 0:  # Skip if all are NaN
                    continue

                patch_label = stats.mode(label_patch.flatten(), keepdims=False).mode

                # Save only valid patches
                if not np.isnan(patch_label):
                    patch_filename = f"{region_name}_t{time}_{lat}_{lon}.npy"
                    label_filename = f"{region_name}_t{time}_{lat}_{lon}_label.npy"
                    np.save(os.path.join(region_dir, patch_filename), padded_patch)
                    np.save(os.path.join(region_dir, label_filename), patch_label)
                    valid_patches += 1
                    label_distribution.append(patch_label)

                total_patches += 1

    # Plotting Label Distribution
    plt.figure(figsize=(6, 4))
    plt.hist(label_distribution, bins=3, edgecolor='black')
    plt.title(f"Label Distribution for {region_name}")
    plt.xlabel("Label (0 = No Drought, 1 = Consecutive Drought)")
    plt.ylabel("Frequency")
    plt.show()

    print(f"{region_name}: Total Patches = {total_patches}, Valid Patches = {valid_patches}")

if __name__ == "__main__":
    # Process each region
    for file in os.listdir(RAW_DATA_DIR):
        if file.endswith("_Combined_Consecutive.nc"):
            print(f"Processing {file}...")
            extract_patches(file)
            print(f"Finished processing {file}.")
