In [None]:
from model_v3.TeacherModel import TeacherModel
from data.data_module_v2 import IrrigationDataModule
from omegaconf import OmegaConf
import torch
import numpy as np
import geopandas as gpd
from shapely import wkt
from rasterio.features import rasterize
from rasterio.transform import from_bounds
from tqdm import tqdm

# === Config ===
# Hydra config saved alongside the training run; it is loaded below and then
# overridden so the data module serves a single state.
cfg_path = '/project/biocomplexity/wyr6fx(Nibir)/NeurIPS_Irrigation_Mapping_Model/Output/cross-state/vision/kiim/result_stats/configs/hydra_config.yaml'
state = 'Arizona'
print('************************************************')
print(state, cfg_path)
print('************************************************')

cfg = OmegaConf.load(cfg_path)
# Overrides must happen before IrrigationDataModule(cfg) is constructed.
cfg.dataset.train_type = 'unsupervised'
cfg.dataset.states = [[state, 1]]

# === Device Setup ===
# Use the GPU when one is visible, otherwise fall back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# === Data + Model ===
data_module = IrrigationDataModule(cfg)
data_module.setup('fit')
data_module.setup('test')

# Restore the trained teacher weights and switch to inference mode.
ckpt_path = '/sfs/gpfs/tardis/project/bii_nssac/people/wyr6fx/NeurIPS_Irrigation_Mapping_Model/Output/cross-state/vision/kiim/result_stats/checkpoints/epoch=17-val_iou_macro_irr=0.912.ckpt'
model = TeacherModel.load_from_checkpoint(ckpt_path, **cfg).to(device)
model.eval()

# === Load Polygons ===
# Per-state reference polygons; the printed head shows the columns
# state_alpha, County, irrigated_acres, state_name and geometry.
gdf = gpd.read_file(f'/project/biocomplexity/wyr6fx(Nibir)/NeurIPS_irrigation_data/Agcensus/{state}_Irrigation.geojson')
# Reproject so that all grid arithmetic below happens in EPSG:5070 coordinates.
gdf = gdf.to_crs("EPSG:5070")
print(gdf.head())

# === Define Raster Extent ===
# The accumulation grid spans the bounding box of all polygons, with its
# origin at (xmin, ymax) and square cells of `resolution` CRS units.
xmin, ymin, xmax, ymax = gdf.total_bounds
resolution = 30
width = int(np.ceil((xmax - xmin) / resolution))
height = int(np.ceil((ymax - ymin) / resolution))
# NOTE: `transform` is not referenced again in this cell; the later index
# arithmetic uses xmin / ymax / resolution directly.
transform = from_bounds(xmin, ymin, xmax, ymax, width, height)

# === Initialize Count & Sum Rasters ===
# count_map: number of contributions per cell; sum_map: sum of the
# contributed class ids per cell. Both are filled by the inference loop.
count_map = np.zeros((height, width), dtype=np.uint32)
sum_map = np.zeros((height, width), dtype=np.uint32)

# === Inference Loop ===
# Predict every patch and paste its class map into the state-wide grid
# (origin at (xmin, ymax), cells of `resolution` CRS units).
# NOTE(review): this assumes batch['polygon'] holds WKT footprints expressed
# in the same CRS as `gdf` and that one patch pixel equals one grid cell --
# confirm against the dataset.
for batch in tqdm(data_module.train_dataloader(), desc="Aggregating predictions"):
    # Keep the footprints before the batch is reduced to its tensor entries.
    polygons = batch['polygon']
    with torch.no_grad():
        batch = {k: v.to(device) for k, v in batch.items() if isinstance(v, torch.Tensor)}
        preds = model(batch)['predictions'].argmax(dim=1).cpu().numpy()

    for patch_mask, polygon_wkt in zip(preds, polygons):
        poly = wkt.loads(polygon_wkt)
        bounds = poly.bounds

        h, w = patch_mask.shape
        patch_transform = from_bounds(*bounds, w, h)

        # Zero out predictions that fall outside the footprint polygon.
        poly_raster = rasterize([(poly, 1)], out_shape=(h, w), transform=patch_transform, fill=0, dtype=np.uint8)
        patch_mask = patch_mask * poly_raster

        # Grid offset of the patch's top-left corner. floor() (rather than
        # int()) keeps the offset correct when the patch starts left of or
        # above the grid and the offset is therefore negative.
        col_off = int(np.floor((bounds[0] - xmin) / resolution))
        row_off = int(np.floor((ymax - bounds[3]) / resolution))

        # Clip the window on all four sides. A negative start index would
        # otherwise be interpreted by NumPy as "count from the end" and select
        # the wrong cells (or a view whose shape no longer matches the mask).
        x0 = max(col_off, 0)
        y0 = max(row_off, 0)
        x1 = min(col_off + w, width)
        y1 = min(row_off + h, height)

        if y1 <= y0 or x1 <= x0:
            # Patch lies entirely outside the grid.
            continue

        # Crop the patch to the part that overlaps the grid.
        patch_mask = patch_mask[y0 - row_off:y1 - row_off, x0 - col_off:x1 - col_off]
        mask_valid = patch_mask > 0

        # Basic slices are views, so the in-place updates land in the maps.
        sum_map_view = sum_map[y0:y1, x0:x1]
        count_map_view = count_map[y0:y1, x0:x1]

        sum_map_view[mask_valid] += patch_mask[mask_valid].astype(np.uint32)
        count_map_view[mask_valid] += 1

# === Compute Majority Vote or Mean Prediction ===
# Cells never touched keep count 0; the maximum() guard avoids dividing by 0.
# NOTE(review): only cells with a positive predicted class are counted above,
# so this is the rounded mean of the non-zero class ids per cell, not a
# majority vote over every overlapping patch -- confirm this is intended.
avg_pred_map = np.round(sum_map / np.maximum(count_map, 1)).astype(np.uint8)

# === Filter Polygons by Prediction ===
# For every (geometry, County) group, count the grid cells inside the polygon
# whose aggregated prediction equals class 1. Groups with no such cell are
# dropped; the kept row positions and counts are collected in parallel lists.
valid_rows = []
irrigated_counts = []
gdf_group = gdf.groupby(['geometry', 'County'])['irrigated_acres'].sum().reset_index()

for idx, geom in tqdm(enumerate(gdf_group.geometry), total=len(gdf_group)):
    # Window of grid cells covered by the polygon's bounding box.
    bounds = geom.bounds
    x0 = int((bounds[0] - xmin) / resolution)
    y0 = int((ymax - bounds[3]) / resolution)
    x1 = int((bounds[2] - xmin) / resolution)
    y1 = int((ymax - bounds[1]) / resolution)

    h = y1 - y0
    w = x1 - x0
    if h <= 0 or w <= 0 or x1 > width or y1 > height:
        continue

    # Rasterize on the window's own grid-aligned extent instead of the
    # polygon's exact bounds: the cells of poly_mask then coincide with the
    # cells of avg_pred_map[y0:y1, x0:x1] rather than being stretched to fit.
    local_transform = from_bounds(
        xmin + x0 * resolution,   # west
        ymax - y1 * resolution,   # south
        xmin + x1 * resolution,   # east
        ymax - y0 * resolution,   # north
        w,
        h,
    )
    poly_mask = rasterize([(geom, 1)], out_shape=(h, w), transform=local_transform, fill=0, dtype=np.uint8)

    if not np.any(poly_mask):
        continue

    pred_crop = avg_pred_map[y0:y1, x0:x1]

    # poly_mask only holds 0/1, so this equals ((pred_crop == 1) * poly_mask).sum().
    irrigated_pixels = int(np.count_nonzero((pred_crop == 1) & (poly_mask == 1)))
    if irrigated_pixels == 0:
        continue

    valid_rows.append(idx)
    irrigated_counts.append(irrigated_pixels)

# === Save Output ===
# Keep only the groups that passed the filter and attach their pixel counts.
# The 0.2223945 factor turns a pixel count into acres.
# NOTE(review): that is roughly the acreage of a 30 x 30 m cell, presumably
# tied to `resolution = 30` -- confirm before changing the grid size.
kept_groups = gdf_group.loc[valid_rows]
gdf_group = kept_groups.assign(
    irrigated_pixels=irrigated_counts,
    irrigation_discovered=lambda frame: frame['irrigated_pixels'] * 0.2223945,
)

# Column totals: reported acres next to acres derived from the predictions.
acre_columns = ['irrigated_acres', 'irrigation_discovered']
print(gdf_group[acre_columns].sum())

# Root-mean-square error between reported and predicted acres per group.
true = gdf_group['irrigated_acres'].to_numpy(dtype=float)
pred = gdf_group['irrigation_discovered'].to_numpy(dtype=float)
residual = true - pred
rmse = np.sqrt(np.square(residual).mean())
print(f"RMSE: {rmse:.4f}")


************************************************
Arizona /project/biocomplexity/wyr6fx(Nibir)/NeurIPS_Irrigation_Mapping_Model/Output/cross-state/vision/kiim/result_stats/configs/hydra_config.yaml
************************************************


INFO:timm.models._builder:Loading pretrained weights from Hugging Face hub (timm/swin_base_patch4_window7_224.ms_in22k_ft_in1k)
INFO:timm.models._hub:[timm/swin_base_patch4_window7_224.ms_in22k_ft_in1k] Safe alternative available for 'pytorch_model.bin' (as 'model.safetensors'). Loading weights using safetensors.
INFO:timm.models._builder:Missing keys (head.fc.weight, head.fc.bias) discovered while loading pretrained weights. This is expected if model is being adapted.


  state_alpha    County  irrigated_acres state_name  \
0          AZ    APACHE             7729    Arizona   
1          AZ   COCHISE           108237    Arizona   
2          AZ  COCONINO              413    Arizona   
3          AZ      GILA              767    Arizona   
4          AZ    GRAHAM            32482    Arizona   

                                            geometry  
0  POLYGON ((-1157883.284 1547341.315, -1157900.1...  
1  POLYGON ((-1357944.934 1068525.433, -1357908.5...  
2  POLYGON ((-1470277.965 1628362.73, -1470267.72...  
3  POLYGON ((-1431952.572 1351760.221, -1431942.3...  
4  MULTIPOLYGON (((-1336903.01 1204027.255, -1336...  


Aggregating predictions: 100%|██████████| 505/505 [04:57<00:00,  1.70it/s]


In [2]:
import os
import torch
import numpy as np
from pathlib import Path
from PIL import Image
from tqdm import tqdm
import torch.nn.functional as F

from model_v3.TeacherModel import TeacherModel
from data.data_module_v2 import IrrigationDataModule
from omegaconf import OmegaConf

# === CONFIG ===
# State to process, the saved Hydra config / checkpoint of the trained model,
# and the folder that receives the PNG previews written by the loop below.
state = 'Arizona'
cfg_path = '/project/biocomplexity/wyr6fx(Nibir)/NeurIPS_Irrigation_Mapping_Model/Output/cross-state/vision/kiim/result_stats/configs/hydra_config.yaml'
ckpt_path = '/sfs/gpfs/tardis/project/bii_nssac/people/wyr6fx/NeurIPS_Irrigation_Mapping_Model/Output/cross-state/vision/kiim/result_stats/checkpoints/epoch=17-val_iou_macro_irr=0.912.ckpt'
save_dir = Path(f"/project/biocomplexity/wyr6fx(Nibir)/NeurIPS_irrigation_data/generated_label/{state}")
save_dir.mkdir(parents=True, exist_ok=True)

# === SETUP ===
print(f"Generating pseudo-labels for {state}")
# Use the GPU when one is visible, otherwise fall back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

cfg = OmegaConf.load(cfg_path)
# Overrides must happen before IrrigationDataModule(cfg) is constructed.
cfg.dataset.train_type = 'unsupervised'
cfg.dataset.states = [[state, 1]]

data_module = IrrigationDataModule(cfg)
data_module.setup('fit')
data_module.setup('test')

# Restore the trained teacher weights, move them to the device and switch
# to inference mode.
model = TeacherModel.load_from_checkpoint(ckpt_path, **cfg)
model.to(device)
model.eval()

# === INFERENCE LOOP ===
# For every patch, store the predicted class map and its per-pixel score as
# .npy files next to the source image, plus PNG previews under `save_dir`.
for batch_idx, batch in enumerate(tqdm(data_module.train_dataloader(), desc="Generating pseudo-labels")):
    # The device transfer below keeps tensor entries only, so the source
    # paths have to be captured first (indexing the filtered dict by sample
    # number raises KeyError).
    image_paths = batch['image_path']

    with torch.no_grad():
        batch = {k: v.to(device) for k, v in batch.items() if isinstance(v, torch.Tensor)}
        # NOTE(review): 'predictions' is used as-is. If the model returns
        # logits rather than probabilities, apply a softmax over dim=1 before
        # taking the max -- confirm against TeacherModel.
        probs = model(batch)['predictions']    # [B, C, H, W]
        conf, preds = torch.max(probs, dim=1)  # [B, H, W] each: score & class

    # One device-to-host transfer per batch instead of one per sample.
    labels = preds.cpu().numpy().astype(np.uint8)
    conf_maps = conf.cpu().numpy().astype(np.float32)

    for i, image_path in enumerate(image_paths):
        patch_id = f"{state}_{batch_idx:05}_{i:02}"

        label_path = image_path.replace('patch', 'generated_label').replace('.tif', '.npy')
        conf_path = image_path.replace('patch', 'generated_conf').replace('.tif', '.npy')

        label = labels[i]
        conf_map = conf_maps[i]

        # === Save .npy format ===
        np.save(label_path, label)
        np.save(conf_path, conf_map)

        # === Save optional PNG for visualization ===
        Image.fromarray(label).save(save_dir / f"{patch_id}_label.png")
        # Clip to [0, 1] first so out-of-range scores cannot wrap around in
        # the uint8 conversion; scores already in range are unaffected.
        conf_png = (np.clip(conf_map, 0.0, 1.0) * 255).astype(np.uint8)
        Image.fromarray(conf_png).save(save_dir / f"{patch_id}_conf.png")

print("✅ Pseudo-label generation complete.")


NameError: name 'true' is not defined

In [3]:
from data.dataset_v2 import *
import json
# Dataset options, grouped under a single "dataset" section.
config = {
    "dataset": dict(
        data_dir="/project/biocomplexity/wyr6fx(Nibir)/NeurIPS_irrigation_data/Train-Test-Split",
        # Only Arizona is active; Colorado, Utah, Georgia, Washington and
        # Florida were listed here as disabled alternatives (each with 1).
        states=[["Arizona", 1]],
        image_shape=[224, 224],
        transform=False,
        gamma_value=1.5,
        label_type="irrigation",
        vision_indices=[
            "image",
            "ndvi",
            "ndti",
            "ndwi",
            "evi",
            "gndvi",
            "savi",
            "msavi",
            "rvi",
            "cigreen",
            "pri",
            "osavi",
            "wdrvi",
        ],
        train_type="holdout",
    ),
}


In [6]:
# Build the 'train' split of the 'holdout' dataset for a single state.
state = 'Georgia'
data_dir = "/project/biocomplexity/wyr6fx(Nibir)/NeurIPS_irrigation_data/Train-Test-Split"

# Constructor arguments gathered in one place, then unpacked into the call.
_dataset_options = {
    'data_dir': data_dir,
    'states': [(state, 1.0)],
    'train_type': 'holdout',
    'split': 'train',
    'transform': False,
    'label_type': "irrigation",
    'vision_indices': [
        "image", "ndvi", "ndti", "ndwi", "evi", "gndvi", "savi",
        "msavi", "rvi", "cigreen", "pri", "osavi", "wdrvi",
    ],
}
dataset = ImageMaskDataset(**_dataset_options)



In [10]:
# Show the first sample of `dataset` as the cell output (a dict with keys such
# as 'image_path', 'split' and 'rgb', per the captured output).
dataset[0]

{'image_path': '/project/biocomplexity/wyr6fx(Nibir)/NeurIPS_irrigation_data/label/Arizona/2016/005b3993-92af-4305-a964-cb1326d44d9d/L1C_T12SUB_A005472_20160709T181305/patch_2912_10752.tif',
 'split': 'train',
 'rgb': tensor([[[0.4824, 0.4941, 0.5098,  ..., 0.4039, 0.3647, 0.3451],
          [0.4510, 0.4784, 0.5176,  ..., 0.3843, 0.3686, 0.3490],
          [0.4510, 0.4941, 0.5412,  ..., 0.3686, 0.3686, 0.3608],
          ...,
          [0.3725, 0.4627, 0.5686,  ..., 0.6510, 0.6431, 0.6235],
          [0.4784, 0.5373, 0.5529,  ..., 0.6353, 0.6314, 0.6235],
          [0.5412, 0.4902, 0.4471,  ..., 0.6392, 0.6431, 0.6392]],
 
         [[0.3843, 0.3922, 0.4000,  ..., 0.3059, 0.2706, 0.2627],
          [0.3569, 0.3725, 0.3961,  ..., 0.2980, 0.2745, 0.2627],
          [0.3490, 0.3843, 0.4039,  ..., 0.2824, 0.2745, 0.2667],
          ...,
          [0.2980, 0.3490, 0.4118,  ..., 0.4824, 0.4824, 0.4745],
          [0.3608, 0.3961, 0.4000,  ..., 0.4667, 0.4745, 0.4706],
          [0.4000, 0.364

In [None]:
import h5py
import os

# Export every sample of `dataset` to its own HDF5 file. The output tree
# mirrors the source tree: <out_root>/Data/<path below src_root>.h5
src_root = '/project/biocomplexity/wyr6fx(Nibir)/NeurIPS_irrigation_data/'
out_root = '/project/biocomplexity/wyr6fx(Nibir)/NeurIPS_irrigation_data/huggingface'

for i in range(len(dataset)):
    data = dataset[i]
    image_path = data['image_path']

    # === Construct relative HDF5 path ===
    # Refuse paths outside src_root: an absolute remainder would make
    # os.path.join() drop out_root and write the file beside the source image.
    if not image_path.startswith(src_root):
        raise ValueError(
            f"image_path {image_path!r} is not located under {src_root!r}"
        )
    # Swap only the real file extension for '.h5'.
    rel_stem, _ = os.path.splitext(image_path[len(src_root):])
    save_path = os.path.join(out_root, 'Data', rel_stem + '.h5')

    # === Create necessary directories ===
    os.makedirs(os.path.dirname(save_path), exist_ok=True)

    # === Write HDF5 file ===
    with h5py.File(save_path, 'w') as f:
        # Array payloads, one HDF5 dataset per tensor in the sample
        # (the original notes give rgb as [3, H, W], agri_index as [13, H, W]).
        for key in ("rgb", "agri_index", "land_mask", "crop_mask", "irr_mask", "subirr_mask"):
            f.create_dataset(key, data=data[key].numpy())

        # Sample metadata, stored as file-level attributes.
        f.attrs["image_path"] = data['image_path']
        f.attrs["label_type"] = "irrigation"
        f.attrs["text_prompt"] = data['text_prompt']
        f.attrs["polygon"] = data['polygon']
        f.attrs["crs"] = data['crs']
        f.attrs["split"] = data['split']


In [6]:
import h5py
import os
import torch
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor
from data.dataset_v2 import *

# === Configuration ===
state = 'Texas'
data_dir = "/project/biocomplexity/wyr6fx(Nibir)/NeurIPS_irrigation_data/Train-Test-Split"

# === Load Dataset ===
# 'train' split of the 'unsupervised' dataset for the single state above;
# the constructor arguments are gathered first and unpacked into the call.
_dataset_options = {
    'data_dir': data_dir,
    'states': [(state, 1.0)],
    'train_type': 'unsupervised',
    'split': 'train',
    'transform': False,
    'label_type': "irrigation",
    'vision_indices': [
        "image", "ndvi", "ndti", "ndwi", "evi", "gndvi", "savi",
        "msavi", "rvi", "cigreen", "pri", "osavi", "wdrvi",
    ],
}
dataset = ImageMaskDataset(**_dataset_options)

# === Save Function ===
def save_h5_by_index(idx):
    """Write sample ``idx`` of the module-level ``dataset`` to its own HDF5 file.

    The output tree mirrors the source tree:
    ``<out_root>/Data/<path below src_root>.h5``. Parent directories are
    created as needed and an existing file is overwritten.

    Args:
        idx: Index into ``dataset``.

    Returns:
        The path of the HDF5 file that was written.

    Raises:
        ValueError: If the sample's ``image_path`` is not under ``src_root``.
    """
    src_root = '/project/biocomplexity/wyr6fx(Nibir)/NeurIPS_irrigation_data/'
    out_root = '/project/biocomplexity/wyr6fx(Nibir)/NeurIPS_irrigation_data/huggingface'

    data = dataset[idx]
    image_path = data['image_path']

    # Refuse paths outside src_root: an absolute remainder would make
    # os.path.join() drop out_root and write the file beside the source image.
    if not image_path.startswith(src_root):
        raise ValueError(
            f"image_path {image_path!r} is not located under {src_root!r}"
        )

    # Swap only the real file extension for '.h5'.
    rel_stem, _ = os.path.splitext(image_path[len(src_root):])
    save_path = os.path.join(out_root, 'Data', rel_stem + '.h5')

    os.makedirs(os.path.dirname(save_path), exist_ok=True)

    with h5py.File(save_path, 'w') as f:
        # Array payloads, one HDF5 dataset per tensor in the sample.
        for key in ("rgb", "agri_index", "land_mask", "crop_mask", "irr_mask", "subirr_mask"):
            f.create_dataset(key, data=data[key].numpy())

        # Sample metadata, stored as file-level attributes.
        f.attrs["image_path"] = data['image_path']
        f.attrs["label_type"] = "irrigation"
        f.attrs["text_prompt"] = data['text_prompt']
        f.attrs["polygon"] = data['polygon']
        f.attrs["crs"] = data['crs']
        f.attrs["split"] = data['split']

    return save_path

# === Fan the export out over 40 worker threads ===
# executor.map() yields results in submission order; draining it through tqdm
# advances the progress bar once per finished sample. The returned paths are
# not needed afterwards, so they are simply consumed.
with ThreadPoolExecutor(max_workers=40) as executor:
    saved_paths = executor.map(save_h5_by_index, range(len(dataset)))
    progress = tqdm(saved_paths, total=len(dataset), desc="Saving HDF5 files")
    for _ in progress:
        pass


Saving HDF5 files: 100%|██████████| 318354/318354 [3:37:42<00:00, 24.37it/s]   
