# Step 1: Bare Earth + Official SpectralGPT Precompute (Colab GPU)

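Run this notebook first, before the Step 2 notebook.
It uses `data/processed/features_normalized_points.csv` as the source training table
(building it from `features.csv` and `features_normalized.csv` if it is missing),
samples GA Barest Earth for each point, runs the official pretrained SpectralGPT model on the Bare Earth pixels,
and builds a fused `feat_*` table for ResNet training.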


## Runtime

In Colab, set `Runtime -> Change runtime type -> T4/A100 GPU`, then run all cells top to bottom.


In [None]:
from google.colab import drive
import os
import shutil
import subprocess
from pathlib import Path

# Cell 1: mount Google Drive and place a fresh copy of the repository at
# PROJECT_DIR, then make it the working directory for every later cell.
print("[1/9] Mount + clone repo")

drive.mount('/content/drive', force_remount=True)

# True  -> clone from REPO_GIT_URL.
# False -> copy an existing checkout from DRIVE_REPO_DIR on Drive.
USE_GIT_CLONE = True
REPO_GIT_URL = "https://github.com/JackOnThePaddock/soil-resnet-model.git"
DRIVE_REPO_DIR = "/content/drive/MyDrive/soil-resnet-model"
PROJECT_DIR = "/content/soil-resnet-model"

if os.path.exists(PROJECT_DIR):
    # When this cell is re-run, the kernel's cwd is PROJECT_DIR itself
    # (set by os.chdir at the end of this cell). Removing the cwd leaves the
    # process in a deleted directory, which can make os.getcwd() and child
    # processes such as git fail, so step out to the parent first.
    os.chdir(os.path.dirname(PROJECT_DIR))
    shutil.rmtree(PROJECT_DIR)

if USE_GIT_CLONE:
    # check=True raises CalledProcessError if the clone fails.
    subprocess.run(["git", "clone", REPO_GIT_URL, PROJECT_DIR], check=True)
else:
    if not os.path.exists(DRIVE_REPO_DIR):
        raise FileNotFoundError(f"Repo not found at {DRIVE_REPO_DIR}")
    shutil.copytree(DRIVE_REPO_DIR, PROJECT_DIR)

# All relative paths used by later cells are resolved against the repo root.
os.chdir(PROJECT_DIR)
print("Project dir:", os.getcwd())


In [None]:
# Cell 2: install the project into the Colab runtime.
print("[2/9] Install dependencies")
# Print the interpreter version for the run log.
!python -V
# Upgrade pip (quiet mode) before installing the project.
!pip -q install --upgrade pip
# Editable install of the package in the current directory; cell 1 has
# already changed the working directory to the repository root.
!pip -q install -e .


In [None]:
# Cell 3: report the installed PyTorch version and whether CUDA is usable.
print("[3/9] Check GPU")
import torch

_cuda_ok = torch.cuda.is_available()
print("Torch:", torch.__version__)
print("CUDA available:", _cuda_ok)
if _cuda_ok:
    # Device index 0 is the first visible CUDA device.
    print("GPU:", torch.cuda.get_device_name(0))


In [None]:
# Cell 4: define run settings and file locations, and make sure the
# normalized points table exists (building it from its two inputs if not).
print("[4/9] Configure paths")
import json
import re
import pandas as pd
from pathlib import Path

# Settings passed to scripts/pull_bare_earth_embeddings.py in cell 6.
WCS_WORKERS = 16
WCS_TIMEOUT = 120
WCS_RETRIES = 3
SPECTRAL_DIM = 16
RANDOM_SEED = 42

# Logical name -> location, relative to the repository root.
paths = {
    key: Path(rel)
    for key, rel in (
        ("raw", "data/processed/features.csv"),
        ("normalized", "data/processed/features_normalized.csv"),
        ("normalized_points", "data/processed/features_normalized_points.csv"),
        ("be_sgpt", "data/processed/features_normalized_points_be_sgpt.csv"),
        ("sgpt_only", "data/processed/features_normalized_points_sgpt_embeddings.csv"),
        ("sgpt_raw", "data/processed/features_normalized_points_sgpt_official_raw.csv"),
        ("fused_feat", "data/processed/features_normalized_points_fused_feat.csv"),
        ("fused_meta", "data/processed/features_normalized_points_fused_meta.json"),
    )
}

points_csv = paths["normalized_points"]
if not points_csv.exists():
    print("normalized_points missing; attempting to build from features.csv + features_normalized.csv")
    # Report the first missing input, checking raw before normalized.
    absent = [src for src in (paths["raw"], paths["normalized"]) if not src.exists()]
    if absent:
        raise FileNotFoundError(
            f"Missing required input: {absent[0]}. Cannot build {points_csv}."
        )

    coords = pd.read_csv(paths["raw"], usecols=["id", "lat", "lon"])
    normalized = pd.read_csv(paths["normalized"])
    n_raw, n_norm = len(coords), len(normalized)
    if n_raw != n_norm:
        raise ValueError(f"Row mismatch raw={n_raw} normalized={n_norm}")

    # The two tables are joined side by side purely by row position; only the
    # row counts are compared, not any key column.
    norm_points = pd.concat(
        [frame.reset_index(drop=True) for frame in (coords, normalized)],
        axis=1,
    )
    points_csv.parent.mkdir(parents=True, exist_ok=True)
    norm_points.to_csv(points_csv, index=False)
    print("Created", points_csv, norm_points.shape)

print("Input ready:", points_csv)



In [None]:
# Cell 5: fail early unless the input table has id/lat/lon and band_0..band_63.
print("[5/9] Validate normalized points schema")
points_df = pd.read_csv(paths["normalized_points"])

required_cols = ["id", "lat", "lon", *(f"band_{i}" for i in range(64))]
_present = set(points_df.columns)
missing = [col for col in required_cols if col not in _present]
if missing:
    # Only the first ten names are shown to keep the message readable.
    raise ValueError(f"normalized_points is missing columns: {missing[:10]}")

_n_band = sum(1 for col in points_df.columns if col.startswith('band_'))
print("Rows:", len(points_df))
print("Columns:", len(points_df.columns))
print("Band columns:", _n_band)



In [None]:
# Cell 6: run the repository's embedding script as a child process. It is
# given the normalized points table plus three output locations.
print("[6/9] Pull Bare Earth + official SpectralGPT embeddings")

# (flag, value) pairs in the exact order they appear on the command line.
cli_options = [
    ("--normalized-csv", paths["normalized_points"]),
    ("--output-csv", paths["be_sgpt"]),
    ("--output-embeddings-csv", paths["sgpt_only"]),
    ("--output-official-raw-csv", paths["sgpt_raw"]),
    ("--workers", WCS_WORKERS),
    ("--timeout", WCS_TIMEOUT),
    ("--retries", WCS_RETRIES),
    ("--spectral-backend", "official_pretrained"),
    ("--official-request-chunk-size", "64"),
    ("--spectral-dim", SPECTRAL_DIM),
    ("--seed", RANDOM_SEED),
]

cmd = ["python", "scripts/pull_bare_earth_embeddings.py"]
for flag, value in cli_options:
    cmd.extend((flag, str(value)))

print("Running:", " ".join(cmd))
# check=True raises CalledProcessError if the script exits non-zero.
subprocess.run(cmd, check=True)

for key in ("be_sgpt", "sgpt_only", "sgpt_raw"):
    print("Saved", paths[key])



In [None]:
# Cell 7: start from the table written to paths["be_sgpt"] by cell 6.
print("[7/9] Build fused feat_* training table")
_be_sgpt_csv = paths["be_sgpt"]
fused = pd.read_csv(_be_sgpt_csv)

def sort_numeric_suffix(cols):
    """Return the names in *cols* as a new list ordered by trailing integer.

    Names ending in digits come first, ordered by the integer value of that
    suffix (so ``band_2`` precedes ``band_10``). Names without a trailing
    integer come last. Ties within either group are broken by the name itself.
    """
    trailing_int = re.compile(r"(\d+)$")

    def rank(name):
        hit = trailing_int.search(name)
        if hit is None:
            return (1, -1, name)
        return (0, int(hit.group(1)), name)

    return sorted(cols, key=rank)

# Collect the three source feature groups (case-insensitive prefix match),
# each ordered by its numeric suffix.
band_cols = sort_numeric_suffix([c for c in fused.columns if c.lower().startswith("band_")])
be_cols = sort_numeric_suffix([c for c in fused.columns if c.lower().startswith("be_")])
sgpt_cols = sort_numeric_suffix([c for c in fused.columns if c.lower().startswith("sgpt_")])

if len(band_cols) != 64:
    raise ValueError(f"Expected 64 band cols, got {len(band_cols)}")
if not sgpt_cols:
    raise ValueError("No sgpt_* columns found")

# feat_000.. are copies of the source columns in the order band, be, sgpt.
source_cols = band_cols + be_cols + sgpt_cols
feat_names = [f"feat_{i:03d}" for i in range(len(source_cols))]

# Build every feat_* column in one frame and attach it with a single concat.
# Inserting the columns one at a time fragments the DataFrame and can trigger
# pandas' "DataFrame is highly fragmented" PerformanceWarning on wide tables.
feat_block = fused[source_cols].copy()
feat_block.columns = feat_names
# Drop feat_* columns already present in the input so the result never holds
# duplicate names (per-column assignment would have overwritten them).
fused = pd.concat(
    [fused.drop(columns=feat_names, errors="ignore"), feat_block],
    axis=1,
)

paths["fused_feat"].parent.mkdir(parents=True, exist_ok=True)
fused.to_csv(paths["fused_feat"], index=False)

# Sidecar JSON recording which source columns make up each feature group.
meta = {
    "alpha_cols": band_cols,
    "bareearth_cols": be_cols,
    "spectral_cols": sgpt_cols,
    "feat_cols": feat_names,
}
paths["fused_meta"].write_text(json.dumps(meta, indent=2), encoding="utf-8")

print("Saved", paths["fused_feat"], fused.shape)
print("Counts:", {"band": len(band_cols), "be": len(be_cols), "sgpt": len(sgpt_cols), "feat": len(meta["feat_cols"])})


In [None]:
# Cell 8: reload the fused table from disk and print basic health statistics.
print("[8/9] Sanity checks")
import numpy as np

fused = pd.read_csv(paths["fused_feat"])
feat_cols = [name for name in fused.columns if name.startswith("feat_")]

arr = fused[feat_cols].to_numpy(dtype=np.float64)
# Count NaN and +/-inf entries across all feature columns (reported only).
_finite_mask = np.isfinite(arr)
_n_non_finite = int((~_finite_mask).sum())

print("fused rows:", len(fused), "features:", len(feat_cols))
print("non-finite values:", _n_non_finite)

print("sample columns:", feat_cols[:5], "...", feat_cols[-5:])
# Last expression of the cell: the notebook renders this preview.
fused[["id", "lat", "lon", *feat_cols[:3]]].head()


In [None]:
# Cell 9: copy this run's artifacts into a new timestamped folder on Drive.
print("[9/9] Save outputs to Drive")
from datetime import datetime

# A per-run timestamp keeps earlier runs from being overwritten.
stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
drive_out = Path(f"/content/drive/MyDrive/soil-resnet-outputs/step1_bareearth_sgpt_{stamp}")
drive_out.mkdir(parents=True, exist_ok=True)

copy_targets = [
    paths["normalized_points"],
    paths["be_sgpt"],
    paths["sgpt_only"],
    paths["sgpt_raw"],
    paths["fused_feat"],
    paths["fused_meta"],
]

# Copying stays best-effort, but files that are absent are reported instead
# of being skipped silently, so an incomplete folder is visible immediately.
not_copied = []
for p in copy_targets:
    if p.exists():
        shutil.copy2(p, drive_out / p.name)
    else:
        not_copied.append(p)

print("Saved to:", drive_out)
if not_copied:
    print("WARNING: not copied (file missing):", ", ".join(str(p) for p in not_copied))
print("Use this folder in Step 2 notebook as DRIVE_PRECOMPUTED_DIR")
