In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

### Step 0 — Dataset Validation

Before preprocessing, we validated the structure of both datasets used for SEN12MS. 

The Sentinel-2 RGB+NIR tiles (from `deepnir-nir-rgb-sen12ms-dataset`) were confirmed to follow the naming pattern `ROIs<ROI>_<season>_s<scene>_p<patch>.png` and are organized into train/val/test splits with RGB in `_A` folders and NIR in `_B` folders. 

The LC masks uploaded from the official SEN12MS dataset were confirmed to be TIFF files stored inside nested `lc_<sceneID>` folders, following the pattern `ROIs<ROI>_<season>_lc_<sceneID>_p<PATCHID>.tif`. 

These patterns match perfectly and allow deterministic mapping between S2 image tiles and LC masks for segmentation.


In [None]:
import os
# Folder of the Sentinel-2 RGB+NIR dataset under /kaggle/input.
root = "/kaggle/input/deepnir-nir-rgb-sen12ms-dataset"
# Prints True when that path exists in the current session.
print("Dataset accessible:", os.path.exists(root))

In [None]:
# Entries directly under the Sentinel-2 dataset folder.
!echo "=== S2 ROOT ==="
!ls -d /kaggle/input/deepnir-nir-rgb-sen12ms-dataset/sen12ms_All_seasons/*

# First entries of train_A (the RGB folder of the train split).
!echo ""
!echo "=== Example files from train_A ==="
!ls /kaggle/input/deepnir-nir-rgb-sen12ms-dataset/sen12ms_All_seasons/train_A | head

# First entries of train_B (the NIR folder of the train split).
!echo ""
!echo "=== Example files from train_B ==="
!ls /kaggle/input/deepnir-nir-rgb-sen12ms-dataset/sen12ms_All_seasons/train_B | head

In [None]:
# Folder of the uploaded LandCover (LC) masks under /kaggle/input.
# Relies on `os` having been imported by an earlier cell.
root1 = "/kaggle/input/sen12ms-lc"
# Prints True when that path exists in the current session.
print("Dataset accessible:", os.path.exists(root1))

In [None]:
# Entries directly under the spring LC folder.
!echo ""
!echo "=== LC spring folder ==="
!ls -d /kaggle/input/sen12ms-lc/ROIs1158_spring_lc/*

# First entries of the nested ROIs1158_spring folder.
!echo ""
!echo "=== Example LC files (spring) ==="
!ls /kaggle/input/sen12ms-lc/ROIs1158_spring_lc/ROIs1158_spring | head

In [None]:
# First entries of two scene folders (lc_1 and lc_100).
!echo "=== Inside lc_1 ==="
!ls /kaggle/input/sen12ms-lc/ROIs1158_spring_lc/ROIs1158_spring/lc_1 | head

!echo ""
!echo "=== Inside lc_100 ==="
!ls /kaggle/input/sen12ms-lc/ROIs1158_spring_lc/ROIs1158_spring/lc_100 | head

# Recursive search for regular files anywhere under the spring LC folder (first results only).
!echo ""
!echo "=== Try to find any mask file in the spring LC folder ==="
!find /kaggle/input/sen12ms-lc/ROIs1158_spring_lc/ROIs1158_spring -type f | head


### Step 1 — Create Processing Directory
A dedicated directory `sen12ms_processed` was created inside the Kaggle working environment. All generated outputs (bands, masks, metadata) will be stored here for easy tracking and export.

In [None]:
import os

# Create the output folders with os.makedirs, the same call the later cells use
# for their own directories. exist_ok=True keeps this cell safe to re-run.
for out_dir in ("/kaggle/working/sen12ms_processed", "/kaggle/working/indexes"):
    os.makedirs(out_dir, exist_ok=True)
    print(f"Created {out_dir}")

### Step 2 — Full Dataset Indexing in Safe Batches
To avoid performance issues when scanning more than 40,000 SEN12MS tiles, we processed the dataset in controlled batches of 10,000 files per iteration. Each batch extracted the ROI, season, scene ID, and patch ID from the S2 filename, resolved the corresponding LC path, and wrote a partial CSV file. These partial index files will be merged into a final unified mapping table used for downstream preprocessing.


In [None]:
import os
import pandas as pd

S2_ROOT = "/kaggle/input/deepnir-nir-rgb-sen12ms-dataset/sen12ms_All_seasons"
LC_ROOT = "/kaggle/input/sen12ms-lc"

def parse_s2_filename(filename):
    """Split a Sentinel-2 tile file name into its metadata fields.

    Expected pattern: ``ROIs<ROI>_<season>_s<scene>_p<patch>.png``.

    Parameters
    ----------
    filename : str
        File name (not a full path) of a tile.

    Returns
    -------
    tuple
        ``(roi, season, scene, patch)``; ``roi`` and ``season`` are the first
        two underscore-separated fields as strings, ``scene`` and ``patch``
        are ints.

    Raises
    ------
    ValueError
        If the name has fewer than four fields, the scene/patch fields do not
        start with ``s``/``p``, or their numeric part is not an integer.
    """
    # Strip only a trailing ".png"; str.replace would also remove the text
    # from the middle of a name.
    suffix = ".png"
    base = filename[:-len(suffix)] if filename.endswith(suffix) else filename

    parts = base.split("_")
    # Check the field prefixes instead of blindly dropping the first character.
    if len(parts) < 4 or not parts[2].startswith("s") or not parts[3].startswith("p"):
        raise ValueError(f"Unexpected S2 filename: {filename!r}")

    roi = parts[0]
    season = parts[1]
    scene = int(parts[2][1:])
    patch = int(parts[3][1:])
    return roi, season, scene, patch

def build_batch(split_name, start_idx, end_idx):
    """Index one slice of a split's RGB tiles against their NIR tile and LC mask.

    Parameters
    ----------
    split_name : str
        Split prefix; tiles are read from ``<split_name>_A`` (RGB) and
        ``<split_name>_B`` (NIR) under ``S2_ROOT``.
    start_idx, end_idx : int
        Slice bounds applied to the sorted list of ``.png`` names in the RGB folder.

    Returns
    -------
    pandas.DataFrame
        One row per RGB tile whose LC mask file exists, with columns
        ``rgb``, ``nir``, ``lc``, ``roi``, ``season``, ``scene``, ``patch``.
        The columns are present even when no row matched.
    """
    # Fixed column list: an empty batch still gets a header when written to CSV,
    # so reading it back does not fail.
    columns = ["rgb", "nir", "lc", "roi", "season", "scene", "patch"]

    rgb_folder = os.path.join(S2_ROOT, f"{split_name}_A")
    nir_folder = os.path.join(S2_ROOT, f"{split_name}_B")

    rgb_files = sorted([f for f in os.listdir(rgb_folder) if f.endswith(".png")])
    rgb_files = rgb_files[start_idx:end_idx]

    rows = []

    for f in rgb_files:
        roi, season, scene, patch = parse_s2_filename(f)

        rgb_path = os.path.join(rgb_folder, f)
        # The NIR tile is looked up under the same file name in the _B folder.
        nir_path = os.path.join(nir_folder, f)

        # LC layout: <LC_ROOT>/<roi>_<season>_lc/<roi>_<season>/lc_<scene>/<roi>_<season>_lc_<scene>_p<patch>.tif
        lc_dir = f"{roi}_{season}"
        lc_scene_dir = os.path.join(LC_ROOT, f"{lc_dir}_lc", lc_dir, f"lc_{scene}")
        lc_filename = f"{lc_dir}_lc_{scene}_p{patch}.tif"
        lc_path = os.path.join(lc_scene_dir, lc_filename)

        # Tiles without a matching LC file are left out of the index.
        if os.path.exists(lc_path):
            rows.append({
                "rgb": rgb_path,
                "nir": nir_path,
                "lc": lc_path,
                "roi": roi,
                "season": season,
                "scene": scene,
                "patch": patch
            })

    return pd.DataFrame(rows, columns=columns)

# Process all splits
splits = ["train", "val", "test"]
batch_size = 10000
partial_files = []

# The merge step globs every index_*.csv in this folder, so make sure the folder
# exists and holds no partial files left over from an earlier run (for example
# one made with a different batch_size, which would duplicate rows).
index_dir = "/kaggle/working/indexes"
os.makedirs(index_dir, exist_ok=True)
for stale in os.listdir(index_dir):
    if stale.startswith("index_") and stale.endswith(".csv"):
        os.remove(os.path.join(index_dir, stale))

for split in splits:
    print(f"\n=== Processing split: {split.upper()} ===")

    # Listed here only to know how many batches the split needs;
    # build_batch lists the folder again for its own slice.
    rgb_folder = os.path.join(S2_ROOT, f"{split}_A")
    all_files = sorted([f for f in os.listdir(rgb_folder) if f.endswith(".png")])
    total = len(all_files)

    print(f"{total} files found in {split}_A")

    for start in range(0, total, batch_size):
        end = min(start + batch_size, total)
        print(f"  → Batch {start} to {end}")

        df_batch = build_batch(split, start, end)

        # One partial CSV per batch, named after the split and the slice bounds.
        out_csv = f"/kaggle/working/indexes/index_{split}_{start}_{end}.csv"
        df_batch.to_csv(out_csv, index=False)
        partial_files.append(out_csv)

print("\nBatched index build complete.")
partial_files

### Step 2.1 — Merge Partial Index Files
After building the SEN12MS S2→LC mapping in safe batches, all partial index CSVs were merged into a single unified index file. This file lists the full paths for RGB and NIR Sentinel-2 tiles along with their matching LandCover masks and metadata fields (ROI, season, scene, patch). This consolidated index enables efficient and fully aligned preprocessing in later stages.


In [None]:
import pandas as pd
import glob
import os

# Folder with partial CSVs
INDEX_DIR = "/kaggle/working/indexes"

# Find partial CSVs inside the folder
partial_csvs = sorted(glob.glob(os.path.join(INDEX_DIR, "index_*.csv")))

print("Found", len(partial_csvs), "partial CSVs")

dfs = []
for f in partial_csvs:
    print("Merging:", f)
    try:
        part = pd.read_csv(f)
    except pd.errors.EmptyDataError:
        # A batch in which no tile had an LC mask can be written without a
        # header line; pandas refuses to parse such a file.
        print("  (no rows, skipped)")
        continue
    if not part.empty:
        dfs.append(part)

# pd.concat([]) fails with an unhelpful message; say what is actually missing.
if not dfs:
    raise RuntimeError(
        f"No index rows found in {INDEX_DIR}; run the batched indexing step first."
    )

df_full = pd.concat(dfs, ignore_index=True)
print("\nFinal merged shape:", df_full.shape)

# Save final merged CSV OUTSIDE the folder, in /kaggle/working
df_full.to_csv("/kaggle/working/sen12ms_index.csv", index=False)

print("\nSaved final index as /kaggle/working/sen12ms_index.csv")

### Step 3 — 4-Band S2 Extraction (B2, B3, B4, B8)
Using the unified index, the Sentinel-2 samples were processed in batches of 10,000 images. The RGB PNG (containing B2, B3, B4) and NIR PNG (containing B8) were read, normalized to [0, 1], and stacked into 4-band tensors of shape (64×64×4). The LandCover TIFF paths stay in the index; this step reads and saves only the band tensors. Each batch was saved to disk to avoid memory overload.

In [None]:
import numpy as np
import pandas as pd
import cv2
from tqdm import tqdm
import os
import tifffile

# Output folder for the per-batch band archives (no error if it already exists)
os.makedirs("/kaggle/working/sen12ms_batches", exist_ok=True)

# Read the merged index written by the previous step
df = pd.read_csv("/kaggle/working/sen12ms_index.csv")
print("Total samples:", len(df))

# Number of index rows stacked into each saved batch
batch_size = 10000

def load_rgb(path):
    """Read an RGB tile and return it as float32 scaled by 1/255.

    Parameters
    ----------
    path : str
        Path of the RGB PNG.

    Returns
    -------
    numpy.ndarray
        64x64 image in R, G, B channel order, dtype float32.
        The division by 255 assumes 8-bit input — TODO confirm for this dataset.

    Raises
    ------
    ValueError
        If the file cannot be read.
    """
    img = cv2.imread(path, cv2.IMREAD_UNCHANGED)

    # cv2.imread returns None instead of raising when a file is missing or
    # unreadable; fail here with the path, the same way load_nir does.
    if img is None:
        raise ValueError("RGB image missing: " + path)

    # OpenCV decodes colour images as BGR; reorder to RGB.
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

    if img.shape[:2] != (64, 64):
        img = cv2.resize(img, (64, 64), interpolation=cv2.INTER_NEAREST)

    return img.astype(np.float32) / 255.0

def load_nir(path):
    """Read a NIR tile and return it as a (64, 64, 1) float32 array scaled by 1/255.

    Raises ValueError when the file cannot be read. A 3-dimensional image is
    converted to a single grey channel, and anything that is not 64x64 is
    resized with nearest-neighbour interpolation.
    """
    nir = cv2.imread(path, cv2.IMREAD_UNCHANGED)
    if nir is None:
        raise ValueError("NIR image missing: " + path)

    # Collapse a colour-encoded file to one channel.
    if nir.ndim == 3:
        nir = cv2.cvtColor(nir, cv2.COLOR_BGR2GRAY)

    if nir.shape != (64, 64):
        nir = cv2.resize(nir, (64, 64), interpolation=cv2.INTER_NEAREST)

    scaled = nir.astype(np.float32) / 255.0
    # Trailing axis so the result can be concatenated with the RGB channels.
    return scaled.reshape(64, 64, 1)

bands_dir = "/kaggle/working/sen12ms_batches"

# Later cells pick up every file with this prefix, so remove band batches left
# behind by an earlier run before writing the new ones.
for stale in os.listdir(bands_dir):
    if stale.startswith("sen12ms_bands_batch"):
        os.remove(os.path.join(bands_dir, stale))

batch_id = 0

for start in range(0, df.shape[0], batch_size):
    end = min(start + batch_size, df.shape[0])
    print(f"\n=== Processing BAND batch {batch_id}: {start} to {end} ===")

    rows = df.iloc[start:end]

    # Preallocate the batch so only one copy of it is held in memory
    # (a list followed by np.array keeps two).
    bands_batch = np.empty((len(rows), 64, 64, 4), dtype=np.float32)

    for i, (rgb_path, nir_path) in enumerate(
            tqdm(zip(rows["rgb"], rows["nir"]), total=len(rows))):
        rgb = load_rgb(rgb_path)
        nir = load_nir(nir_path)

        bands_batch[i] = np.concatenate([rgb, nir], axis=-1)   # (64,64,4)

    # Save ONLY bands. The id is zero-padded so that a plain sorted() over the
    # file names returns the batches in index order ("batch_10" would otherwise
    # sort before "batch_2").
    np.savez_compressed(
        os.path.join(bands_dir, f"sen12ms_bands_batch_{batch_id:04d}.npz"),
        bands=bands_batch
    )

    print(f"Saved BAND batch {batch_id}")
    batch_id += 1

print("\nAll BAND batches completed.")


In [None]:
import os
import numpy as np
from skimage.filters import threshold_otsu
from tqdm import tqdm

BATCH_DIR = "/kaggle/working/sen12ms_batches"

def compute_ndwi(tile):
    """
    Normalized Difference Water Index of one tile.

    NDWI = (Green - NIR) / (Green + NIR)

    tile is indexed as (row, col, channel): channel 1 is read as Green and
    channel 3 as NIR. A 1e-8 term in the denominator avoids division by zero.
    Returns a float32 array with the tile's first two dimensions.
    """
    g = tile[:, :, 1].astype("float32")
    n = tile[:, :, 3].astype("float32")
    difference = g - n
    total = g + n + 1e-8
    return difference / total


# ---- FIND ALL BAND BATCHES ----
band_batches = sorted([f for f in os.listdir(BATCH_DIR) if f.startswith("sen12ms_bands_batch")])
print("Found band batches:", len(band_batches))

# The merge step picks up every file with the mask prefix, so remove mask
# batches left behind by an earlier run.
for stale in os.listdir(BATCH_DIR):
    if stale.startswith("sen12ms_masks_batch"):
        os.remove(os.path.join(BATCH_DIR, stale))

for batch_idx, bb in enumerate(band_batches):
    print(f"\n=== Processing MASKS for batch {batch_idx}: {bb} ===")

    batch_path = os.path.join(BATCH_DIR, bb)
    # Close the archive as soon as the array has been read.
    with np.load(batch_path) as archive:
        tiles = archive["bands"]    # (N, 64, 64, 4)

    masks_out = np.empty(tiles.shape[:3], dtype="uint8")

    for i in tqdm(range(tiles.shape[0])):
        # 1) NDWI
        ndwi = compute_ndwi(tiles[i])

        # 2) OTSU threshold, computed per tile. Some scikit-image versions
        #    reject a single-valued image with ValueError; fall back to 0.0
        #    for those tiles only instead of swallowing every exception.
        try:
            thr = threshold_otsu(ndwi)
        except ValueError:
            thr = 0.0

        # 3) Binary mask: water=1, land=0
        masks_out[i] = ndwi > thr

    # ---- SAVE MASK BATCH ----
    # Zero-pad the index: a plain sorted() over the mask files then returns them
    # in the order the band batches were visited here. Unpadded, "batch_10"
    # sorts before "batch_2" and the merged masks no longer line up with the bands.
    out_file = os.path.join(BATCH_DIR, f"sen12ms_masks_batch_{batch_idx:04d}.npz")
    np.savez_compressed(out_file, masks=masks_out)

    print("Saved:", out_file, masks_out.shape)

### Step 4 — Merge All SEN12MS Batches Into Final Dataset

After generating all `sen12ms_bands_batch_*.npz` and `sen12ms_masks_batch_*.npz` files, we now merge them into one unified dataset. This produces `sen12ms_bands.h5`, `sen12ms_masks.h5`, and `sen12ms_metadata.json`.

We create one HDF5 file per array and write each batch into its own section of the dataset. This avoids RAM overflows and disk overflows. These outputs match the EuroSAT format and can be directly merged later.

In [None]:
import os
import json
import numpy as np
import h5py
from tqdm import tqdm

BATCH_DIR = "/kaggle/working/sen12ms_batches"
OUT_DIR = "/kaggle/working/sen12ms_processed"

os.makedirs(OUT_DIR, exist_ok=True)

# -------------------------------------------------------
# PASS 1 — MERGE BAND BATCHES INTO sen12ms_bands.h5
# -------------------------------------------------------

bands_files = sorted([f for f in os.listdir(BATCH_DIR) if f.startswith("sen12ms_bands_batch")])
print("Found", len(bands_files), "band batches")

# Count total samples (needed up front to size the HDF5 dataset)
total_samples = 0
for bf in bands_files:
    # The context manager closes each .npz archive once its count has been read.
    with np.load(os.path.join(BATCH_DIR, bf)) as arr:
        total_samples += arr["bands"].shape[0]

print("Total band samples:", total_samples)

bands_h5_path = os.path.join(OUT_DIR, "sen12ms_bands.h5")

with h5py.File(bands_h5_path, "w") as h5f:
    ds = h5f.create_dataset("bands",
                            shape=(total_samples, 64, 64, 4),
                            dtype="float32")

    # Batches are written back to back, in the order of bands_files.
    write_ptr = 0
    for bf in tqdm(bands_files):
        with np.load(os.path.join(BATCH_DIR, bf)) as archive:
            b = archive["bands"]
        bs = b.shape[0]
        ds[write_ptr:write_ptr+bs] = b
        write_ptr += bs

print("Bands merged →", bands_h5_path)

# -------------------------------------------------------
# SAVE METADATA JSON
# -------------------------------------------------------

# NOTE(review): the masks written by this notebook come from an NDWI/Otsu
# threshold, not from the LC files — confirm "LandCover" is the intended label.
metadata = {
    "total_samples": total_samples,
    "image_size": [64, 64],
    "bands": ["R", "G", "B", "NIR"],
    "label_type": "LandCover",
}

meta_path = os.path.join(OUT_DIR, "sen12ms_metadata.json")
with open(meta_path, "w") as f:
    json.dump(metadata, f, indent=4)

print("\nMetadata saved →", meta_path)

print("\n=== ALL MERGES COMPLETE ===")


In [None]:
import os
import numpy as np
import h5py
from tqdm import tqdm

BATCH_DIR = "/kaggle/working/sen12ms_batches"
# Same folder as sen12ms_bands.h5; the check cell reads sen12ms_masks.h5 from
# here, so writing to /kaggle/working/processed made that check fail.
OUT_DIR = "/kaggle/working/sen12ms_processed"
os.makedirs(OUT_DIR, exist_ok=True)

mask_files = sorted([f for f in os.listdir(BATCH_DIR) if f.startswith("sen12ms_masks_batch")])
print("Found mask batches:", len(mask_files))

# ---------- Step 1: Count total samples ----------
total = 0
for mf in mask_files:
    # The context manager closes each .npz archive once its count has been read.
    with np.load(os.path.join(BATCH_DIR, mf)) as archive:
        total += archive["masks"].shape[0]

print("Total mask samples:", total)

# ---------- Step 2: Create final HDF5 ----------
out_path = os.path.join(OUT_DIR, "sen12ms_masks.h5")

# `with` closes the HDF5 file even if a batch fails to load part-way through.
with h5py.File(out_path, "w") as h5f:
    ds = h5f.create_dataset(
        "masks",
        shape=(total, 64, 64),
        dtype="int16"
    )

    # ---------- Step 3: Stream batches into HDF5 ----------
    # Batches are written back to back, in the order of mask_files.
    write_ptr = 0

    for mf in tqdm(mask_files):
        with np.load(os.path.join(BATCH_DIR, mf)) as archive:
            m = archive["masks"]  # (N, 64, 64)
        n = m.shape[0]

        ds[write_ptr : write_ptr + n] = m
        write_ptr += n

print("\nMask merge completed.")
print("Saved to:", out_path)


In [None]:
import h5py

bands_file = "/kaggle/working/sen12ms_processed/sen12ms_bands.h5"
masks_file = "/kaggle/working/sen12ms_processed/sen12ms_masks.h5"

# (header, file, dataset name) for each merged HDF5 file, in print order
checks = (
    ("BANDS:", bands_file, "bands"),
    ("\nMASKS:", masks_file, "masks"),
)

# Open each file read-only and report its top-level keys and dataset shape.
for header, h5_path, dataset_name in checks:
    with h5py.File(h5_path, "r") as f:
        print(header)
        print("Keys:", list(f.keys()))
        print("Shape:", f[dataset_name].shape)

# Final Dataset Merge: EuroSAT + SEN12MS

This is the final merge step where we combine the pre-processed 
**SEN12MS dataset** (stored in HDF5 format) with the **EuroSAT dataset** (stored in NPY format) 
to create one unified machine-learning–ready dataset.

---

## 📁 Input Datasets

### **1. SEN12MS (HDF5)**
Located in Kaggle input:
- `sen12ms_bands.h5` → `(N1, 64, 64, 4)`
- `sen12ms_masks.h5` → `(N1, 64, 64)`
- `sen12ms_metadata.json`

These files are produced after batch processing and merging.

### **2. EuroSAT (NPY)**
Uploaded in Kaggle input:
- `eurosat_bands.npy` → `(N2, 64, 64, 4)`
- `eurosat_masks.npy` → `(N2, 64, 64)`

---

## 📦 Output (Final Combined Dataset)

After merging:

- `final_bands_merged.npz` → `(N1 + N2, 64, 64, 4)`
- `final_masks_merged.npz` → `(N1 + N2, 64, 64)`
- `final_metadata.json`


In [None]:
import numpy as np
import h5py
import os
from tqdm import tqdm

OUT = "/kaggle/working/final_dataset/final_bands.npz"
TMP = "/kaggle/working/final_dataset/_bands_tmp"

# Create temporary directory to store small chunks
os.makedirs(TMP, exist_ok=True)

# Chunk files written by THIS run, in dataset order. Using this list instead of
# sorted(os.listdir(TMP)) keeps "chunk_10" after "chunk_9" and ignores any
# chunk files left in TMP by an earlier run.
chunk_paths = []

# 1. Load EuroSAT bands (fits in RAM)
eurosat_bands = np.load("/kaggle/input/eurosat/eurosat_bands.npy")
chunk_paths.append(os.path.join(TMP, "chunk_0.npy"))
np.save(chunk_paths[-1], eurosat_bands)

# 2. Stream Sen12MS bands in chunks (opened read-only)
with h5py.File("/kaggle/input/sen12ms/sen12ms_bands.h5", "r") as f:
    ds = f["bands"]
    total = ds.shape[0]
    chunk = 10000  # adjustable

    chunk_id = 1

    for i in tqdm(range(0, total, chunk)):
        part = ds[i:i+chunk]
        chunk_paths.append(os.path.join(TMP, f"chunk_{chunk_id}.npy"))
        np.save(chunk_paths[-1], part)
        chunk_id += 1

# 3. Write NPZ with all chunks. Keys are zero-padded so that sorting them as
#    strings gives the dataset order. Every chunk is loaded into memory for
#    this call.
np.savez_compressed(
    OUT,
    **{f"arr_{i:05d}": np.load(p) for i, p in enumerate(chunk_paths)}
)

print("Final BANDS saved:", OUT)

In [None]:
# Delete the temporary band chunk folder (and everything in it) now that the NPZ has been written.
!rm -rf /kaggle/working/final_dataset/_bands_tmp

In [None]:
import numpy as np

# Load the chunked NPZ
data = np.load("/kaggle/working/final_dataset/final_bands.npz")

# Concatenate in correct order: keys look like "arr_<n>", so order them by the
# numeric suffix. Plain string sorting would put "arr_10" before "arr_2".
ordered_keys = sorted(data.files, key=lambda k: int(k.rsplit("_", 1)[-1]))
all_arrays = [data[k] for k in ordered_keys]
merged = np.concatenate(all_arrays, axis=0)

print("FINAL merged shape:", merged.shape)

# Save final merged dataset as one clean NPZ
np.savez_compressed(
    "/kaggle/working/final_dataset/final_bands_merged.npz",
    bands=merged
)

print("Saved FINAL merged dataset as final_bands_merged.npz")

In [None]:
import numpy as np

# Reopen the merged band archive and print each stored array's name and shape.
data = np.load("/kaggle/working/final_dataset/final_bands_merged.npz")

for k in data.files:
    # data[k] reads the array from the archive in order to report its shape.
    print(k, data[k].shape)

In [None]:
import numpy as np
import h5py
import os
from tqdm import tqdm

OUT = "/kaggle/working/final_dataset/final_masks.npz"
TMP = "/kaggle/working/final_dataset/_masks_tmp"

os.makedirs(TMP, exist_ok=True)

# Chunk files written by THIS run, in dataset order. Using this list instead of
# sorted(os.listdir(TMP)) keeps "chunk_10" after "chunk_9" and ignores any
# chunk files left in TMP by an earlier run.
chunk_paths = []

# 1. EuroSAT masks
eurosat_masks = np.load("/kaggle/input/eurosat/eurosat_masks.npy")
chunk_paths.append(os.path.join(TMP, "chunk_0.npy"))
np.save(chunk_paths[-1], eurosat_masks)

# 2. Sen12MS masks streamed (opened read-only)
with h5py.File("/kaggle/input/sen12ms/sen12ms_masks.h5", "r") as f:
    ds = f["masks"]
    total = ds.shape[0]
    chunk = 5000

    chunk_id = 1

    for i in tqdm(range(0, total, chunk)):
        part = ds[i:i+chunk]
        chunk_paths.append(os.path.join(TMP, f"chunk_{chunk_id}.npy"))
        np.save(chunk_paths[-1], part)
        chunk_id += 1

# 3. Write NPZ output. Keys are zero-padded so that sorting them as strings
#    gives the dataset order. Every chunk is loaded into memory for this call.
np.savez_compressed(
    OUT,
    **{f"arr_{i:05d}": np.load(p) for i, p in enumerate(chunk_paths)}
)

print("Final MASKS saved:", OUT)

In [None]:
# Delete the temporary mask chunk folder (and everything in it) now that the NPZ has been written.
!rm -rf /kaggle/working/final_dataset/_masks_tmp

In [None]:
import numpy as np

# Load the chunked NPZ
data = np.load("/kaggle/working/final_dataset/final_masks.npz")

# Concatenate in correct order: keys look like "arr_<n>", so order them by the
# numeric suffix. Plain string sorting would put "arr_10" before "arr_2".
ordered_keys = sorted(data.files, key=lambda k: int(k.rsplit("_", 1)[-1]))
all_arrays = [data[k] for k in ordered_keys]
merged = np.concatenate(all_arrays, axis=0)

print("FINAL merged shape:", merged.shape)

# Save final merged dataset as one clean NPZ
np.savez_compressed(
    "/kaggle/working/final_dataset/final_masks_merged.npz",
    masks=merged
)

print("Saved FINAL merged dataset as final_masks_merged.npz")

In [None]:
import numpy as np

# Reopen the merged mask archive and print each stored array's name and shape.
data = np.load("/kaggle/working/final_dataset/final_masks_merged.npz")

for k in data.files:
    # data[k] reads the array from the archive in order to report its shape.
    print(k, data[k].shape)

In [None]:
!pip install --upgrade huggingface_hub

In [None]:
from huggingface_hub import login
# Authenticate this session with the Hugging Face Hub. No token is passed in
# code, so the library asks for one interactively; keep tokens out of the notebook.
login()

In [None]:
from huggingface_hub import HfApi

# Client object used by the upload cells.
api = HfApi()

# Target repository on the Hub; the upload cells address it with repo_type="dataset".
repo_id = "mayeraa/water_body_detection"

In [None]:
# Upload the merged band archive to the dataset repo, keeping its file name.
api.upload_file(
    path_or_fileobj="/kaggle/working/final_dataset/final_bands_merged.npz",
    path_in_repo="final_bands_merged.npz",
    repo_id=repo_id,
    repo_type="dataset"
)

In [None]:
# Upload the merged mask archive to the dataset repo, keeping its file name.
api.upload_file(
    path_or_fileobj="/kaggle/working/final_dataset/final_masks_merged.npz",
    path_in_repo="final_masks_merged.npz",
    repo_id=repo_id,
    repo_type="dataset"
)