In [1]:
import arcpy, os, pandas as pd, numpy as np
from arcpy.sa import *
from sklearn.model_selection import GroupKFold, GroupShuffleSplit, train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score, average_precision_score, classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

arcpy.CheckOutExtension("Spatial")

gdb = r"C:\Users\USER\Desktop\PROJECTS\ESRI\DATA\MPM_LIRHANDA_CORRIDOR\MPM_LIRHANDA_CORRIDOR.gdb"   # GDB that contains your TrainingPoints
points_fc = "TrainingPts"                                              # enriched points with raster values
label_field = "Label"                                                     # 1 = occurrence, 0 = background
cell_size_m = 5000                                                        # fishnet cell size (meters) → try 5,000 first
fishnet_fc  = "CV_Fishnet_5km"                                            # fishnet output name (in the same GDB)
points_with_blocks = "TrainingPoints_blocks"                               # points + block_id (output)

arcpy.env.workspace = gdb
arcpy.env.overwriteOutput = True

In [2]:
# === Build fishnet over training points (NO CLIP), add block_id, join to points ===
# Expects: points_fc, gdb, fishnet_fc, cell_size_m to be defined earlier.

import arcpy, os
arcpy.env.workspace = gdb
arcpy.env.overwriteOutput = True

# 0) CRS sanity (fishnet size is in meters)
sr = arcpy.Describe(points_fc).spatialReference
print(f"[Info] Points CRS: {sr.name} | Linear unit: {getattr(sr, 'linearUnitName', 'N/A')}")
if getattr(sr, "linearUnitName", "").lower() != "meter":
    print("⚠️  Consider projecting to a meter-based CRS; fishnet size is in meters.")

# Defaults if not set
fishnet_fc = globals().get("fishnet_fc", "CV_Fishnet_5km")
cell_size_m = float(globals().get("cell_size_m", 5000))

# 1) Build fishnet over the points' extent
ext = arcpy.Describe(points_fc).extent
origin     = f"{ext.XMin} {ext.YMin}"
y_axis     = f"{ext.XMin} {ext.YMin + 10}"    # small offset defines Y-axis direction
opp_corner = f"{ext.XMax} {ext.YMax}"

fishnet_path = os.path.join(gdb, fishnet_fc)
if arcpy.Exists(fishnet_path):
    arcpy.management.Delete(fishnet_path)

arcpy.management.CreateFishnet(
    out_feature_class=fishnet_path,
    origin_coord=origin,
    y_axis_coord=y_axis,
    cell_width=cell_size_m,
    cell_height=cell_size_m,
    number_rows=0, number_columns=0,
    corner_coord=opp_corner,
    labels="NO_LABELS",
    template="#",
    geometry_type="POLYGON"
)
print("✅ Fishnet created:", fishnet_path)

# 2) Add a stable block_id from the fishnet's OID
oid_field = arcpy.Describe(fishnet_path).OIDFieldName
if "block_id" not in [f.name for f in arcpy.ListFields(fishnet_path)]:
    arcpy.management.AddField(fishnet_path, "block_id", "LONG")
arcpy.management.CalculateField(fishnet_path, "block_id", f"!{oid_field}!", "PYTHON3")
print("✅ block_id populated from", oid_field)

# 3) Spatial join: attach block_id to your training points
points_with_blocks = os.path.join(gdb, "TrainingPoints_blocks")
if arcpy.Exists(points_with_blocks):
    arcpy.management.Delete(points_with_blocks)

arcpy.analysis.SpatialJoin(
    target_features=points_fc,
    join_features=fishnet_path,
    out_feature_class=points_with_blocks,
    join_operation="JOIN_ONE_TO_ONE",
    join_type="KEEP_ALL",
    match_option="INTERSECT"
)

# Ensure the field is exactly named 'block_id' (SJ can rename/qualify)
joined_fields = [f.name for f in arcpy.ListFields(points_with_blocks)]
if "block_id" not in joined_fields:
    cand = [n for n in joined_fields if n.lower().endswith(".block_id") or "block_id" in n.lower()]
    if cand:
        src = cand[0]
        arcpy.management.AddField(points_with_blocks, "block_id", "LONG")
        arcpy.management.CalculateField(points_with_blocks, "block_id", f"!{src}!", "PYTHON3")

print("🎯 Done. Points with blocks →", points_with_blocks)
print("   Fishnet cells:", arcpy.management.GetCount(fishnet_path)[0])


[Info] Points CRS: Arc_1960_UTM_Zone_36N | Linear unit: Meter
✅ Fishnet created: C:\Users\USER\Desktop\PROJECTS\ESRI\DATA\MPM_LIRHANDA_CORRIDOR\MPM_LIRHANDA_CORRIDOR.gdb\CV_Fishnet_5km
✅ block_id populated from OID
🎯 Done. Points with blocks → C:\Users\USER\Desktop\PROJECTS\ESRI\DATA\MPM_LIRHANDA_CORRIDOR\MPM_LIRHANDA_CORRIDOR.gdb\TrainingPoints_blocks
   Fishnet cells: 324


In [3]:
# stable block_id from OBJECTID
if "block_id" not in [f.name for f in arcpy.ListFields(fishnet_path)]:
    arcpy.management.AddField(fishnet_path, "block_id", "LONG")
arcpy.management.CalculateField(fishnet_path, "block_id", "!OID!", "PYTHON3")

In [4]:
# spatial join: write block_id to each point
pts_blocks_path = os.path.join(gdb, points_with_blocks)
if arcpy.Exists(pts_blocks_path):
    arcpy.management.Delete(pts_blocks_path)

arcpy.analysis.SpatialJoin(
    target_features=points_fc,
    join_features=fishnet_path,
    out_feature_class=pts_blocks_path,
    join_operation="JOIN_ONE_TO_ONE",
    join_type="KEEP_ALL",
    match_option="INTERSECT"
)

In [5]:
# ensure field is named block_id (field mapping sometimes renames)
joined_fields = [f.name for f in arcpy.ListFields(pts_blocks_path)]
if "block_id" not in joined_fields:
    cand = [n for n in joined_fields if n.lower().endswith(".block_id") or "block_id" in n.lower()]
    if cand:
        src = cand[0]
        arcpy.management.AddField(pts_blocks_path, "block_id", "LONG")
        arcpy.management.CalculateField(pts_blocks_path, "block_id", f"!{src}!", "PYTHON3")

print(f"✅ Fishnet created: {fishnet_path}")
print(f"✅ Points with blocks: {pts_blocks_path}")

✅ Fishnet created: C:\Users\USER\Desktop\PROJECTS\ESRI\DATA\MPM_LIRHANDA_CORRIDOR\MPM_LIRHANDA_CORRIDOR.gdb\CV_Fishnet_5km
✅ Points with blocks: C:\Users\USER\Desktop\PROJECTS\ESRI\DATA\MPM_LIRHANDA_CORRIDOR\MPM_LIRHANDA_CORRIDOR.gdb\TrainingPoints_blocks


In [6]:
# --- READ TO PANDAS & PICK PREDICTORS (cleaned: exclude IDs/admin; safety drop) ---
import arcpy, pandas as pd, numpy as np, re

# 1) Numeric predictors only, skip obvious admin fields
num_types = {"Double","Single","Integer","SmallInteger"}
all_fields = arcpy.ListFields(pts_blocks_path)

skip_names = {
    label_field.lower(),
    "block_id","objectid","oid","fid",
    "target_fid","orig_fid","join_count",
    "shape","shape_length","shape_area"
}

feature_fields = [
    f.name for f in all_fields
    if (f.type in num_types and f.name.lower() not in skip_names)
]

# 2) Pull rows
rows = []
with arcpy.da.SearchCursor(pts_blocks_path, feature_fields + [label_field,"block_id","SHAPE@XY"]) as cur:
    for *vals, lab, blk, xy in cur:
        rows.append({**dict(zip(feature_fields, vals)),
                     "label": int(lab), "block_id": int(blk),
                     "x": xy[0], "y": xy[1]})
df = pd.DataFrame(rows)
print(f"[Info] Rows: {len(df)} | Pos: {(df['label']==1).sum()} | Neg: {(df['label']==0).sum()} | Blocks: {df['block_id'].nunique()}")

# 3) SAFER: drop only integer, ~unique, and ID-like names
id_name_pat = re.compile(r"(?:^|_)(?:id|fid|oid|target|orig|join|index)(?:_|$)", re.I)
idish = [
    c for c in feature_fields
    if c in df.columns
    and pd.api.types.is_integer_dtype(df[c])
    and (df[c].nunique() / len(df) > 0.98)
    and id_name_pat.search(c) is not None
]
if idish:
    print("Dropping likely integer ID fields:", idish)
    feature_fields = [c for c in feature_fields if c not in idish]
    df = df[feature_fields + ["label","block_id","x","y"]]


[Info] Rows: 409 | Pos: 68 | Neg: 341 | Blocks: 133


In [7]:
lith_cols = [c for c in feature_fields if c.lower().startswith("lith_")]
cont_cols = [c for c in feature_fields if c not in lith_cols]

In [8]:
X = df[cont_cols + lith_cols]
y = df["label"].values.astype(int)
groups = df["block_id"].values

# preprocessors: impute both; scale only continuous for SVM
rf_prep = ColumnTransformer([
    ("cont_impute", SimpleImputer(strategy="median"), cont_cols),
    ("lith_impute", SimpleImputer(strategy="most_frequent"), lith_cols),
], remainder="drop")

svm_prep = ColumnTransformer([
    ("cont_pipe", Pipeline([
        ("impute", SimpleImputer(strategy="median")),
        ("scale",  StandardScaler(with_mean=True, with_std=True))
    ]), cont_cols),
    ("lith_impute", SimpleImputer(strategy="most_frequent"), lith_cols),
], remainder="drop")

def eval_groupcv(model_builder, X, y, groups, n_splits=5):
    """Short: run GroupKFold CV by block, print ROC-AUC & PR-AUC per fold."""
    uniq = np.unique(groups).size
    n_splits = min(n_splits, max(3, min(5, uniq)))   # keep between 3–5 and ≤ unique blocks
    gkf = GroupKFold(n_splits=n_splits)
    roc_list, pr_list = [], []

    for i, (tr, te) in enumerate(gkf.split(X, y, groups), 1):
        model = model_builder()
        model.fit(X.iloc[tr], y[tr])
        proba = model.predict_proba(X.iloc[te])[:,1]
        pred  = (proba >= 0.5).astype(int)
        roc = roc_auc_score(y[te], proba)
        pr  = average_precision_score(y[te], proba)
        roc_list.append(roc); pr_list.append(pr)
        print(f"Fold {i}: ROC-AUC={roc:.3f} | PR-AUC={pr:.3f}")
    print(f"➕ Mean ROC-AUC={np.mean(roc_list):.3f} (±{np.std(roc_list):.3f})")
    print(f"➕ Mean PR-AUC ={np.mean(pr_list):.3f} (±{np.std(pr_list):.3f})")
    return np.mean(roc_list), np.mean(pr_list)

In [9]:
def build_rf():
    """Short: RF pipeline with imputation; class_weight balances 0/1 without SMOTE."""
    return Pipeline([
        ("prep", rf_prep),
        ("clf", RandomForestClassifier(
            n_estimators=500, max_depth=None, min_samples_leaf=3,
            class_weight="balanced", n_jobs=-1, random_state=42
        ))
    ])

In [10]:
def build_svm():
    """Short: SVM(RBF) pipeline; scale continuous only; class_weight balances 0/1."""
    return Pipeline([
        ("prep", svm_prep),
        ("svc", SVC(kernel="rbf", C=5.0, gamma="scale",
                    class_weight="balanced", probability=True, random_state=42))
    ])

In [11]:
print("\n=== Random Forest : Spatial GroupKFold ===")
rf_roc, rf_pr = eval_groupcv(build_rf, X, y, groups, n_splits=5)

print("\n=== SVM (RBF) : Spatial GroupKFold ===")
svm_roc, svm_pr = eval_groupcv(build_svm, X, y, groups, n_splits=5)


=== Random Forest : Spatial GroupKFold ===
Fold 1: ROC-AUC=0.932 | PR-AUC=0.867
Fold 2: ROC-AUC=0.938 | PR-AUC=0.735
Fold 3: ROC-AUC=0.998 | PR-AUC=0.990
Fold 4: ROC-AUC=0.961 | PR-AUC=0.560
Fold 5: ROC-AUC=0.988 | PR-AUC=0.500
➕ Mean ROC-AUC=0.963 (±0.026)
➕ Mean PR-AUC =0.731 (±0.183)

=== SVM (RBF) : Spatial GroupKFold ===
Fold 1: ROC-AUC=0.907 | PR-AUC=0.747
Fold 2: ROC-AUC=0.923 | PR-AUC=0.736
Fold 3: ROC-AUC=0.993 | PR-AUC=0.961
Fold 4: ROC-AUC=0.894 | PR-AUC=0.390
Fold 5: ROC-AUC=0.950 | PR-AUC=0.200
➕ Mean ROC-AUC=0.933 (±0.035)
➕ Mean PR-AUC =0.607 (±0.274)


In [12]:
gss = GroupShuffleSplit(n_splits=1, test_size=0.25, random_state=42)
tr, te = next(gss.split(X, y, groups))

rf = build_rf()
rf.fit(X.iloc[tr], y[tr])
proba = rf.predict_proba(X.iloc[te])[:,1]
pred  = (proba >= 0.5).astype(int)

print("\n=== Random Forest : Spatial hold-out (25% blocks) ===")
print("ROC-AUC:", round(roc_auc_score(y[te], proba), 3),
      "| PR-AUC:", round(average_precision_score(y[te], proba), 3))
print(classification_report(y[te], pred, digits=3))


=== Random Forest : Spatial hold-out (25% blocks) ===
ROC-AUC: 0.965 | PR-AUC: 0.864
              precision    recall  f1-score   support

           0      0.887     0.947     0.916        75
           1      0.765     0.591     0.667        22

    accuracy                          0.866        97
   macro avg      0.826     0.769     0.791        97
weighted avg      0.860     0.866     0.860        97



In [13]:
final_feature_names = cont_cols + lith_cols
rf_clf = rf.named_steps["clf"]
imp = (pd.Series(rf_clf.feature_importances_, index=final_feature_names)
         .sort_values(ascending=False)
         .head(20))
print("\nTop 20 RF features:\n", imp)


Top 20 RF features:
 As          0.107583
FebyMn      0.082736
Mn          0.072107
Zn          0.062316
W           0.050665
Cu          0.042352
Sb          0.042158
V           0.041038
Cr          0.033784
Rb          0.032864
Bi          0.030889
Pb          0.025957
Th          0.024638
ClayAIOH    0.023655
Tl          0.021669
ferrous     0.020711
OM          0.020321
UbyK        0.019916
Na          0.019488
dem10       0.017103
dtype: float64


In [14]:
from sklearn.model_selection import GroupKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score, average_precision_score
import numpy as np

def build_rf_tuned():
    return Pipeline([
        ("prep", rf_prep),  # your existing preprocessor
        ("clf", RandomForestClassifier(
            n_estimators=800,        # a bit more trees
            min_samples_leaf=5,      # more regularization
            max_depth=None,          # or try 20 if you prefer
            class_weight="balanced",
            n_jobs=-1, random_state=42
        ))
    ])

gkf = GroupKFold(n_splits=5)
roc, pr = [], []
for tr, te in gkf.split(X, y, groups):
    m = build_rf_tuned().fit(X.iloc[tr], y[tr])
    p = m.predict_proba(X.iloc[te])[:,1]
    roc.append(roc_auc_score(y[te], p))
    pr.append(average_precision_score(y[te], p))
print("RF tuned — ROC-AUC:", np.mean(roc).round(3), "PR-AUC:", np.mean(pr).round(3))
rf = build_rf_tuned().fit(X, y)   # fit on ALL data for deployment

RF tuned — ROC-AUC: 0.966 PR-AUC: 0.759


In [15]:
# === Choose threshold the right way (OOF) and check a clean spatial hold-out ===
from sklearn.model_selection import GroupKFold, GroupShuffleSplit
from sklearn.metrics import precision_recall_curve, roc_auc_score, average_precision_score, confusion_matrix, classification_report
import numpy as np

# 1) OOF predictions under GroupKFold (no leakage)
gkf = GroupKFold(n_splits=5)
oof_proba = np.zeros(len(y), dtype=float)

for tr, te in gkf.split(X, y, groups):
    m = build_rf_tuned().fit(X.iloc[tr], y[tr])
    oof_proba[te] = m.predict_proba(X.iloc[te])[:, 1]

# Pick threshold on OOF PR curve (recall-weighted F2)
prec, rec, thr = precision_recall_curve(y, oof_proba)
beta = 2
f2 = (1+beta**2) * (prec*rec) / ((beta**2)*prec + rec + 1e-9)
k = f2.argmax()
THRESH_RF = float(thr[max(k-1, 0)])

print(f"OOF-chosen threshold (F2-opt): {THRESH_RF:.3f}  |  P={prec[k]:.3f}  R={rec[k]:.3f}")
print("OOF ROC-AUC:", round(roc_auc_score(y, oof_proba), 3),
      "| OOF PR-AUC:", round(average_precision_score(y, oof_proba), 3))

# 2) Clean spatial hold-out using that threshold
gss = GroupShuffleSplit(n_splits=1, test_size=0.25, random_state=123)
tr, te = next(gss.split(X, y, groups))

rf_hold = build_rf_tuned().fit(X.iloc[tr], y[tr])
proba_te = rf_hold.predict_proba(X.iloc[te])[:, 1]
pred_te  = (proba_te >= THRESH_RF).astype(int)

print("\n=== Random Forest (clean spatial hold-out) ===")
print("ROC-AUC:", round(roc_auc_score(y[te], proba_te), 3),
      "| PR-AUC:", round(average_precision_score(y[te], proba_te), 3))
print("Confusion matrix @th", THRESH_RF, ":\n", confusion_matrix(y[te], pred_te))
print(classification_report(y[te], pred_te, digits=3))

# 3) Final model for deployment (fit on ALL data) + keep threshold
rf = build_rf_tuned().fit(X, y)
print("\nModel ready for mapping: variable `rf` + threshold THRESH_RF =", THRESH_RF)


OOF-chosen threshold (F2-opt): 0.434  |  P=0.756  R=0.912
OOF ROC-AUC: 0.968 | OOF PR-AUC: 0.821

=== Random Forest (clean spatial hold-out) ===
ROC-AUC: 0.98 | PR-AUC: 0.9
Confusion matrix @th 0.4338329465378296 :
 [[76  5]
 [ 0 23]]
              precision    recall  f1-score   support

           0      1.000     0.938     0.968        81
           1      0.821     1.000     0.902        23

    accuracy                          0.952       104
   macro avg      0.911     0.969     0.935       104
weighted avg      0.961     0.952     0.954       104


Model ready for mapping: variable `rf` + threshold THRESH_RF = 0.4338329465378296


In [16]:
import arcpy
import os
import glob
import numpy as np
import pandas as pd

arcpy.CheckOutExtension("Spatial")

# ---- Paths (edit if needed) -------------------------------------------------
REF_RASTER = r"C:\Users\USER\Desktop\PROJECTS\ESRI\DATA\RASTERS FOR ML\dem10.tif"  # alignment reference
OUT_DIR    = r"C:\Users\USER\Desktop\PROJECTS\ESRI\DATA\OUTPUTS"
os.makedirs(OUT_DIR, exist_ok=True)
OUT_PROB = os.path.join(OUT_DIR, "Prospectivity_RF.tif")
OUT_BIN  = os.path.join(OUT_DIR, "Targets_RF_binary.tif")

# Folders
ml_folder   = r"C:\Users\USER\Desktop\PROJECTS\ESRI\DATA\RASTERS FOR ML"
lith_folder = r"C:\Users\USER\Desktop\PROJECTS\ESRI\DATA\RASTERS FOR ML\LITHOLOGY LAYERS"
geochem_folder = r"C:\Users\USER\Desktop\PROJECTS\ESRI\DATA\GEOCHEMICAL LAYERS\PATHFINDERS"
ratios_folder = r"C:\Users\USER\Desktop\PROJECTS\ESRI\DATA\GEOCHEMICAL LAYERS\key ratio rasters"
covars_folder = r"C:\Users\USER\Desktop\PROJECTS\ESRI\DATA\GEOCHEMICAL LAYERS\COVARIATES"
alter_geochem_folder = r"C:\Users\USER\Desktop\PROJECTS\ESRI\DATA\GEOCHEMICAL LAYERS\Alteration  lithology context"
gdb         = r"C:\Users\USER\Desktop\PROJECTS\ESRI\DATA\MPM_LIRHANDA_CORRIDOR\MPM_LIRHANDA_CORRIDOR.gdb"

# Candidate rasters (continuous + GDB + all lithology one-hots)
candidates = [
    os.path.join(ml_folder, "ClayAIOH.tif"),
    os.path.join(ml_folder, "dem10.tif"),
    os.path.join(ml_folder, "distfaults2.tif"),
    os.path.join(ml_folder, "ferrous.tif"),
    os.path.join(ml_folder, "ironoxide.tif"),
    os.path.join(gdb, "Curv_Plan_10"),
    os.path.join(gdb, "Curv_Profile_10"),
    os.path.join(gdb, "Curvature_Gen10"),
    os.path.join(gdb, "Slope_10"),
] + glob.glob(os.path.join(lith_folder, "*.tif"))  # Lithology rasters

# Adding geochemical rasters from the specified folders
geochemical_folders = [geochem_folder, ratios_folder, covars_folder, alter_geochem_folder]
for folder in geochemical_folders:
    candidates += glob.glob(os.path.join(folder, "*.tif"))

# Manual name overrides (helps when field names differ from file names)
manual = {
    "dem10":                os.path.join(ml_folder, "dem10.tif"),
    "distfaults2":          os.path.join(ml_folder, "distfaults2.tif"),
    "clayaloh":             os.path.join(ml_folder, "ClayAIOH.tif"),
    "ironoxide":            os.path.join(ml_folder, "ironoxide.tif"),
    "ferrous":              os.path.join(ml_folder, "ferrous.tif"),
    "curvature_gen10":      os.path.join(gdb, "Curvature_Gen10"),
    "slope_10":             os.path.join(gdb, "Slope_10"),
    "curv_plan_10":         os.path.join(gdb, "Curv_Plan_10"),
    "curv_profile_10":      os.path.join(gdb, "Curv_Profile_10"),
}

# Output final list of candidate rasters
print(f"Total candidate rasters: {len(candidates)}")
print("Candidate rasters: ", candidates)


Total candidate rasters: 63
Candidate rasters:  ['C:\\Users\\USER\\Desktop\\PROJECTS\\ESRI\\DATA\\RASTERS FOR ML\\ClayAIOH.tif', 'C:\\Users\\USER\\Desktop\\PROJECTS\\ESRI\\DATA\\RASTERS FOR ML\\dem10.tif', 'C:\\Users\\USER\\Desktop\\PROJECTS\\ESRI\\DATA\\RASTERS FOR ML\\distfaults2.tif', 'C:\\Users\\USER\\Desktop\\PROJECTS\\ESRI\\DATA\\RASTERS FOR ML\\ferrous.tif', 'C:\\Users\\USER\\Desktop\\PROJECTS\\ESRI\\DATA\\RASTERS FOR ML\\ironoxide.tif', 'C:\\Users\\USER\\Desktop\\PROJECTS\\ESRI\\DATA\\MPM_LIRHANDA_CORRIDOR\\MPM_LIRHANDA_CORRIDOR.gdb\\Curv_Plan_10', 'C:\\Users\\USER\\Desktop\\PROJECTS\\ESRI\\DATA\\MPM_LIRHANDA_CORRIDOR\\MPM_LIRHANDA_CORRIDOR.gdb\\Curv_Profile_10', 'C:\\Users\\USER\\Desktop\\PROJECTS\\ESRI\\DATA\\MPM_LIRHANDA_CORRIDOR\\MPM_LIRHANDA_CORRIDOR.gdb\\Curvature_Gen10', 'C:\\Users\\USER\\Desktop\\PROJECTS\\ESRI\\DATA\\MPM_LIRHANDA_CORRIDOR\\MPM_LIRHANDA_CORRIDOR.gdb\\Slope_10', 'C:\\Users\\USER\\Desktop\\PROJECTS\\ESRI\\DATA\\RASTERS FOR ML\\LITHOLOGY LAYERS\\Lith_And_1

In [17]:
def base(p):  # dataset name without extension
    return os.path.splitext(os.path.basename(p))[0]
def norm(s):  # simplify for loose matching
    return "".join(ch.lower() for ch in s if ch.isalnum())

name_to_path = {}
for p in candidates:
    if arcpy.Exists(p):
        b = base(p)
        name_to_path[b] = p
        name_to_path[norm(b)] = p

feature_order = cont_cols + lith_cols  # same order the model expects
feat_to_path, missing = {}, []
for f in feature_order:
    p = name_to_path.get(f) or name_to_path.get(norm(f)) or manual.get(f.lower())
    # Try stripping numeric suffix (_1,_2) if present
    if not p and "_" in f and f.split("_")[-1].isdigit():
        base_no = "_".join(f.split("_")[:-1])
        p = name_to_path.get(base_no) or name_to_path.get(norm(base_no)) or manual.get(base_no.lower())
    if p and arcpy.Exists(p):
        feat_to_path[f] = p
    else:
        missing.append(f)

if missing:
    raise RuntimeError("❌ No raster found for these model features: "
                       + ", ".join(missing)
                       + "\nAdd them to `manual` above or adjust paths.")

print("✅ Feature → raster paths resolved:", len(feat_to_path), "features")

✅ Feature → raster paths resolved: 59 features


In [18]:
ref = arcpy.Raster(REF_RASTER)
arcpy.env.snapRaster = ref
arcpy.env.extent     = ref.extent
arcpy.env.cellSize   = ref.meanCellWidth

cell_x = ref.meanCellWidth
cell_y = ref.meanCellHeight
ncols  = ref.width
nrows  = ref.height
ext    = ref.extent
ll_all = arcpy.Point(ext.XMin, ext.YMin)

out_prob = np.full((nrows, ncols), np.nan, dtype="float32")
nodata_out = -9999.0

def read_block(rpath, ll_pt, ncols, nrows):
    """Read a raster block, cast to float, set NoData → NaN."""
    ras = arcpy.Raster(rpath)
    arr = arcpy.RasterToNumPyArray(ras, ll_pt, ncols, nrows).astype("float32")
    nd = ras.noDataValue
    if nd is not None:
        arr[arr == nd] = np.nan
    return arr

In [19]:
BLOCK = 1024  # pixels per side (tweak for memory/perf)

for r0 in range(0, nrows, BLOCK):
    rh = min(BLOCK, nrows - r0)
    for c0 in range(0, ncols, BLOCK):
        cw = min(BLOCK, ncols - c0)
        ll_win = arcpy.Point(ext.XMin + c0*cell_x, ext.YMin + r0*cell_y)

        # read all feature bands for this window
        bands = []
        for f in feature_order:
            arr = read_block(feat_to_path[f], ll_win, cw, rh)
            bands.append(arr.reshape(-1))
        X_block = pd.DataFrame(np.vstack(bands).T, columns=feature_order)

        # predict probabilities (pipeline handles impute/scale)
        proba = rf.predict_proba(X_block)[:, 1].reshape(rh, cw)
        out_prob[r0:r0+rh, c0:c0+cw] = proba

        print(f"Block r{r0}:{r0+rh} c{c0}:{c0+cw} done", end="\r")

print("\n✅ All blocks predicted.")

Block r8192:8938 c8192:9031 done
✅ All blocks predicted.


In [22]:
# ---- Save probability raster ------------------------------------------------
to_save = np.where(np.isfinite(out_prob), out_prob, nodata_out)
rast_p  = arcpy.NumPyArrayToRaster(to_save, ll_all, cell_x, cell_y, nodata_out)
rast_p.save(OUT_PROB)
print("Saved:", OUT_PROB)

# ---- Save binary targets raster (using THRESH_RF) ---------------------------
bin_arr = np.zeros_like(to_save, dtype="uint8")
valid   = np.isfinite(out_prob)
bin_arr[valid] = (out_prob[valid] >= THRESH_RF).astype("uint8")
nodata_bin = 255  # nodata marker for byte raster
bin_arr[~valid] = nodata_bin
rast_b = arcpy.NumPyArrayToRaster(bin_arr, ll_all, cell_x, cell_y, nodata_bin)
rast_b.save(OUT_BIN)
print("Saved:", OUT_BIN)

Saved: C:\Users\USER\Desktop\PROJECTS\ESRI\DATA\OUTPUTS\Prospectivity_RF.tif
Saved: C:\Users\USER\Desktop\PROJECTS\ESRI\DATA\OUTPUTS\Targets_RF_binary.tif
