<a href="https://colab.research.google.com/github/MatP-DS/MasterThesis/blob/main/npz_pipeline_full_features.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install rasterio numpy

import os
import numpy as np
import rasterio
from tqdm import tqdm

Collecting rasterio
  Downloading rasterio-1.4.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.1 kB)
Collecting affine (from rasterio)
  Downloading affine-2.4.0-py3-none-any.whl.metadata (4.0 kB)
Collecting cligj>=0.5 (from rasterio)
  Downloading cligj-0.7.2-py3-none-any.whl.metadata (5.0 kB)
Collecting click-plugins (from rasterio)
  Downloading click_plugins-1.1.1.2-py2.py3-none-any.whl.metadata (6.5 kB)
Downloading rasterio-1.4.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (22.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m22.2/22.2 MB[0m [31m46.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading cligj-0.7.2-py3-none-any.whl (7.1 kB)
Downloading affine-2.4.0-py3-none-any.whl (15 kB)
Downloading click_plugins-1.1.1.2-py2.py3-none-any.whl (11 kB)
Installing collected packages: cligj, click-plugins, affine, rasterio
Successfully installed affine-2.4.0 click-plugins-1.1.1.2 cligj-0.7.2 rasterio-1.4.3


In [None]:
# Mount Google Drive at /content/drive, skipping the call when that path is
# already a mount point so the cell can be re-run safely.
import os
from google.colab import drive

drive_is_mounted = os.path.ismount('/content/drive')
if not drive_is_mounted:
    drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
# -----------------------------
# STEP 2: Configuration
# -----------------------------
# Directory the GeoTIFFs are read from, and directory the exported arrays go to.
input_folder = "/content/drive/MyDrive/MasterThesis/01_raw_data/01_sentinel2/full_feature_stack"
output_folder = "/content/drive/MyDrive/MasterThesis/02_preprocessed_data"

# Number of feature bands: B2, B3, B4, B8, B11, B12, EVI, NDWI, NBR, SAVI, NDVI
n_bands = 11

# Create the output directory if needed; an existing directory is not an error.
os.makedirs(output_folder, exist_ok=True)

# Lexicographically sorted names of every directory entry ending in ".tif".
# NOTE(review): this order becomes the time axis downstream - presumably the
# file names sort chronologically; confirm.
tif_files = sorted(
    entry for entry in os.listdir(input_folder) if entry.endswith(".tif")
)

# -----------------------------
# STEP 3: Load and stack monthly images
# -----------------------------
# Every GeoTIFF is read as a float32 array of shape (bands, rows, cols), reduced
# to the pixels selected by `valid_mask`, and appended to `monthly_arrays` as an
# (n_valid, bands) array.
#
# `valid_mask` is derived from the FIRST raster only: a pixel is kept when none
# of its bands is NaN and the sum over its bands is non-zero. The same mask is
# then applied to every later raster so all arrays share one pixel set.
# NOTE(review): NaNs that appear in later rasters at kept pixels are not
# filtered and end up in the exported arrays - confirm this is intended.
monthly_arrays = []
valid_mask = None

for tif_file in tqdm(tif_files, desc="Loading GeoTIFFs"):
    path = os.path.join(input_folder, tif_file)
    # The dataset is only needed for the read; close it before the array work.
    with rasterio.open(path) as src:
        img = src.read().astype(np.float32)  # shape: (bands, rows, cols)

    # Fail early with a clear message instead of letting a wrong band count
    # surface later as a failed (or silently scrambled) reshape.
    if img.shape[0] != n_bands:
        raise ValueError(
            f"{tif_file}: expected {n_bands} bands, found {img.shape[0]}"
        )

    if valid_mask is None:
        valid_mask = np.all(~np.isnan(img), axis=0) & (np.sum(img, axis=0) != 0)
        rows, cols = valid_mask.shape
    elif img.shape[1:] != valid_mask.shape:
        # The boolean mask can only index rasters on the same pixel grid.
        raise ValueError(
            f"{tif_file}: raster size {img.shape[1:]} differs from the first "
            f"raster's size {valid_mask.shape}"
        )

    img_masked = img[:, valid_mask]  # shape: (bands, n_valid)
    monthly_arrays.append(np.transpose(img_masked, (1, 0)))  # shape: (n_valid, bands)

# An empty folder would otherwise only fail later, when the arrays are stacked.
if not monthly_arrays:
    raise FileNotFoundError(f"No .tif files found in {input_folder}")


Loading GeoTIFFs: 100%|██████████| 46/46 [00:55<00:00,  1.22s/it]


In [None]:
# -----------------------------
# STEP 4: Reshape and Save
# -----------------------------
# Stack the per-file (n_valid, bands) arrays along a new axis 1, one slot per file.
X_time = np.stack(monthly_arrays, axis=1)  # shape: (n_valid, time, bands)

# Check the band axis before flattening: reshape(-1, n_bands) succeeds whenever
# the total size is divisible by n_bands, so a wrong band count could otherwise
# mix values from different bands/timesteps without raising.
if X_time.shape[-1] != n_bands:
    raise ValueError(
        f"Expected {n_bands} bands on the last axis, got array of shape {X_time.shape}"
    )

# Collapse the pixel and time axes into a single sample axis.
X_flat = X_time.reshape(-1, n_bands)       # shape: (n_valid * time, bands)

# Write both layouts as compressed .npz archives, each under the key "X".
np.savez_compressed(os.path.join(output_folder, "X_time_series_full.npz"), X=X_time)
np.savez_compressed(os.path.join(output_folder, "X_flattened_full.npz"), X=X_flat)


print("Export completed:")
print(f" - X_time_series_full.npz: {X_time.shape}")
print(f" - X_flattened_full.npz:   {X_flat.shape}")

Export completed:
 - X_time_series_full.npz: (466708, 46, 11)
 - X_flattened_full.npz:   (21468568, 11)
