# Soil Bare Earth + SpectralGPT (Colab GPU)

This notebook runs your pipeline on Colab:
1. Load your `soil-resnet-model` repo.
2. Install dependencies.
3. Pull GA Barest Earth values for normalized training points.
4. Generate SpectralGPT embeddings from those bands.
5. Save outputs to Drive.

## 1) Enable GPU runtime

In Colab, open `Runtime -> Change runtime type` and select a GPU (e.g. T4 or A100) before running the cells below.

In [None]:
from google.colab import drive
import os
import shutil
import subprocess

# Mount Google Drive (source of the repo when USE_GIT_CLONE is False).
drive.mount('/content/drive')

# ---- Configure these paths ----
USE_GIT_CLONE = True  # True: clone REPO_GIT_URL; False: copy DRIVE_REPO_DIR
REPO_GIT_URL = "https://github.com/JackOnThePaddock/soil-resnet-model.git"
DRIVE_REPO_DIR = "/content/drive/MyDrive/soil-resnet-model"
PROJECT_DIR = "/content/soil-resnet-model"  # absolute path of the working copy

# Validate the chosen source BEFORE deleting anything, so a misconfiguration
# does not wipe an existing working copy and then fail.
if USE_GIT_CLONE:
    if not REPO_GIT_URL:
        raise ValueError("Set REPO_GIT_URL or set USE_GIT_CLONE=False")
elif not os.path.exists(DRIVE_REPO_DIR):
    raise FileNotFoundError(f"Repo not found at {DRIVE_REPO_DIR}")

# This cell ends with os.chdir(PROJECT_DIR). On a re-run the kernel's working
# directory is therefore the directory about to be removed; step out of it
# first, otherwise the clone below runs from a deleted cwd and fails.
project_parent = os.path.dirname(PROJECT_DIR)
os.makedirs(project_parent, exist_ok=True)
os.chdir(project_parent)

# Always start from a fresh copy of the repo.
if os.path.exists(PROJECT_DIR):
    shutil.rmtree(PROJECT_DIR)

if USE_GIT_CLONE:
    # check=True raises CalledProcessError if the clone fails.
    subprocess.run(["git", "clone", REPO_GIT_URL, PROJECT_DIR], check=True)
else:
    shutil.copytree(DRIVE_REPO_DIR, PROJECT_DIR)

# Later cells use paths relative to the project root.
os.chdir(PROJECT_DIR)
print("Project:", os.getcwd())

In [None]:
# Install project and dependencies
!pip -q install --upgrade pip
!pip -q install -e .

In [None]:
# Report the installed PyTorch version and whether it can see a CUDA device.
import torch

print("Torch:", torch.__version__)
cuda_ready = torch.cuda.is_available()
print("CUDA available:", cuda_ready)
if cuda_ready:
    # Device index 0 is the first visible GPU.
    print("GPU:", torch.cuda.get_device_name(0))

## 2) Configure inputs/outputs

`features_normalized.csv` does not contain `lat/lon`, so the point coordinates are read from `features.csv` instead.

In [None]:
import os

# Input and output locations, relative to the project root.
NORMALIZED_CSV = "data/processed/features_normalized.csv"
POINTS_CSV = "data/processed/features.csv"
OUTPUT_CSV = "data/processed/features_normalized_bareearth_sgpt.csv"
OUTPUT_EMBEDDINGS_CSV = "data/processed/features_normalized_sgpt_embeddings.csv"

# Fail fast on the first required input that is not on disk.
required_inputs = (NORMALIZED_CSV, POINTS_CSV)
first_missing = next(
    (path for path in required_inputs if not os.path.exists(path)),
    None,
)
if first_missing is not None:
    raise FileNotFoundError(f"Missing input: {first_missing}")
print("Inputs OK")

In [None]:
# Run Bare Earth point sampling + SpectralGPT embeddings
# Adjust workers if you see timeouts or request throttling.
!python scripts/pull_bare_earth_embeddings.py   --normalized-csv {NORMALIZED_CSV}   --points-csv {POINTS_CSV}   --output-csv {OUTPUT_CSV}   --output-embeddings-csv {OUTPUT_EMBEDDINGS_CSV}   --workers 16   --timeout 120   --retries 3   --spectral-backend official_pretrained   --official-request-chunk-size 64   --spectral-dim 16   --output-official-raw-csv data/processed/features_normalized_sgpt_official_raw.csv   --seed 42

In [None]:
# Quick sanity check: load both outputs and summarise their shape and columns.
import pandas as pd

fused = pd.read_csv(OUTPUT_CSV)
emb = pd.read_csv(OUTPUT_EMBEDDINGS_CSV)

# Collect the prefixed column groups once and reuse them below.
be_columns = [name for name in fused.columns if name.startswith('be_')]
sgpt_columns = [name for name in fused.columns if name.startswith('sgpt_')]

print("Fused shape:", fused.shape)
print("Embeddings shape:", emb.shape)
# Show at most the first ten names of each group.
print("Bare Earth columns:", be_columns[:10])
print("SGPT columns:", sgpt_columns[:10])
# Per-column fraction of missing values, averaged over the be_ columns.
be_missing_rate = fused[be_columns].isna().mean().mean()
print("Missing rate (be_):", be_missing_rate)

In [None]:
# Save outputs back to Drive
OUT_DIR = "/content/drive/MyDrive/soil-resnet-outputs"
os.makedirs(OUT_DIR, exist_ok=True)

# Copy with shutil instead of `!cp`: a failing shell command does not raise,
# so the success message below would print even if nothing was copied.
# shutil.copy raises on failure and, given a directory as destination,
# keeps the source file name.
for output_path in (OUTPUT_CSV, OUTPUT_EMBEDDINGS_CSV):
    shutil.copy(output_path, OUT_DIR)

print("Saved to:", OUT_DIR)