### Imports & Setup

In [1]:
import numpy as np
import pandas as pd
import geopandas as gpd
import os, warnings
warnings.filterwarnings("ignore", category=UserWarning)

from shapely.geometry import Point
import rioxarray as rxr
import rasterio
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV
from sklearn.metrics import r2_score
from sklearn.base import clone
from joblib import dump
from xgboost import XGBRegressor
from sklearn.ensemble import HistGradientBoostingRegressor

RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

### Feature Toggles

In [2]:
FEATURE_FLAGS = {
    "building_cov_50m": True,
    "building_cov_100m": True,
    "building_cov_200m": True,
    "distance_water": True,
    "distance_parks": True,
    "lst_value": True,      # We assume you have a valid LST raster
    "ndvi_value": True,     # We assume your S2 Indices raster has NDVI in band 1
    "ndbi_value": True,     # band 2
    "ndwi_value": True,     # band 3
    "evi_value": False,     # only if you have B02,B04,B08 raw reflectances
    "dist_manhattan_centre": True,
    "dist_bronx_centre": True,
    "location_cluster": True,
}

### Helper functions

In [3]:
def building_coverage_fraction(geom, building_gdf, radius=50):
    buffer_poly = geom.buffer(radius)
    clipped = gpd.clip(building_gdf, buffer_poly)
    area_buildings = clipped.geometry.area.sum()
    area_buf = buffer_poly.area
    return area_buildings / area_buf if area_buf > 0 else 0

def distance_to_polygons(geom, poly_gdf):
    dists = poly_gdf.geometry.distance(geom)
    return dists.min() if len(dists) > 0 else np.nan

def euclidean_distance(x1, y1, x2, y2):
    return np.sqrt((x1 - x2)**2 + (y1 - y2)**2)

def extract_raster_value(geom, raster, band_index=1, method="nearest"):
    x, y = geom.x, geom.y
    val = raster.sel(x=x, y=y, band=band_index, method=method).values
    return float(val)

def compute_evi(blue, red, nir, L=1.0, C1=6.0, C2=7.5, G=2.5):
    return G * (nir - red) / (nir + C1 * red - C2 * blue + L)

### LOAD BOROUGH BOUNDARIES, BUILDINGS, WATER, PARKS

In [4]:
print("Loading boroughs + buildings...")
gdf_boroughs = gpd.read_file("./data/nyc_boroughs.geojson").to_crs("EPSG:2263")
if "name" in gdf_boroughs.columns:
    gdf_boroughs.rename(columns={"name":"BoroName"}, inplace=True)

gdf_buildings = gpd.read_file("./data/Building_Footprint.kml").to_crs("EPSG:2263")
gdf_buildings = gpd.sjoin(gdf_buildings, gdf_boroughs, how="left", predicate="intersects")
gdf_buildings = gdf_buildings.dropna(subset=["BoroName"])

gdf_water = pd.read_csv("./data/NYC_Planimetric_Database__Hydrography_20250123.csv")
from shapely import wkt
gdf_water["geometry"] = gdf_water["the_geom"].apply(wkt.loads)
gdf_water = gpd.GeoDataFrame(gdf_water, geometry="geometry", crs="EPSG:4326").to_crs("EPSG:2263")

gdf_parks = gpd.read_file("./data/Parks_Properties_20250123.kml").to_crs("EPSG:2263")

# Find largest cluster center for Manhattan, Bronx
gdf_manhattan = gdf_buildings[gdf_buildings["BoroName"]=="Manhattan"]
gdf_bronx = gdf_buildings[gdf_buildings["BoroName"]=="Bronx"]

print(gdf_manhattan)
print(gdf_bronx)

Loading boroughs + buildings...
     Name Description                                           geometry  \
5926                   MULTIPOLYGON (((1003404.964 249576.503, 100335...   
5927                   MULTIPOLYGON (((1000278.304 241985.812, 100017...   
5928                   MULTIPOLYGON (((998336.549 242281.176, 998273....   
5929                   MULTIPOLYGON (((999175.11 243083.583, 998935.9...   
5930                   MULTIPOLYGON (((1000201.166 243560.061, 100022...   
...   ...         ...                                                ...   
9431                   MULTIPOLYGON (((997358.658 223175.61, 997394.0...   
9432                   MULTIPOLYGON (((998198.469 222047.008, 998288....   
9433                   MULTIPOLYGON (((997517.49 219375.698, 997618.2...   
9434                   MULTIPOLYGON (((997465.413 215819.05, 997419.4...   
9435                   MULTIPOLYGON (((996496.699 217310.85, 996432.4...   

      index_right   BoroName  cartodb_id               

In [5]:
from shapely.geometry import MultiPoint
from sklearn.cluster import DBSCAN

def find_largest_building_cluster_center(gdf_bld, eps_m=200, min_samples=10):
    centroids = gdf_bld.geometry.centroid
    coords = np.column_stack([centroids.x, centroids.y])
    db = DBSCAN(eps=eps_m, min_samples=min_samples).fit(coords)
    labels = db.labels_
    unique, counts = np.unique(labels, return_counts=True)
    valid_mask = (unique >= 0)
    if not np.any(valid_mask):
        mp = MultiPoint(coords)
        return mp.centroid
    valid_labels = unique[valid_mask]
    valid_counts = counts[valid_mask]
    largest_id = valid_labels[np.argmax(valid_counts)]
    in_largest = coords[labels==largest_id]
    if len(in_largest)==0:
        return MultiPoint(coords).centroid
    else:
        return MultiPoint(in_largest).centroid

man_cent = find_largest_building_cluster_center(gdf_manhattan, eps_m=200, min_samples=20)
brx_cent = find_largest_building_cluster_center(gdf_bronx, eps_m=200, min_samples=20)
manhattan_x, manhattan_y = man_cent.x, man_cent.y
bronx_x, bronx_y = brx_cent.x, brx_cent.y

print("bronx x y:")
print(bronx_x)
print(bronx_y)

print("manhattan x y:")
print(manhattan_x)
print(manhattan_y)

bronx x y:
1017485.8860241318
241760.33727313532
manhattan x y:
997790.3277237965
232244.58317053123


### LOAD + CHECK RASTERS

In [6]:
print("Loading reprojected LST + Indices...")
lst_raster = rxr.open_rasterio("Landsat_LST_v4_single_0601_0901.tiff")
lst_raster_2263 = lst_raster.rio.reproject("EPSG:2263")

indices_raster = rxr.open_rasterio("S2_indices_v4_single_0601_0901.tiff")
indices_raster_2263 = indices_raster.rio.reproject("EPSG:2263")

# Check if the rasters actually have valid data
ratio_lst = (~lst_raster_2263.isnull()).mean().values
ratio_idx = (~indices_raster_2263.isnull()).mean().values
print(f"LST valid ratio: {ratio_lst:.3f}")
print(f"Indices valid ratio: {ratio_idx:.3f}")

if ratio_lst == 0.0:
    print("WARNING: LST raster is entirely NaN. Possibly an empty mosaic or over-strict cloud mask!")
if ratio_idx == 0.0:
    print("WARNING: Indices raster is entirely NaN. Possibly an empty mosaic or over-strict cloud mask!")

print("LST raster bounds:", lst_raster_2263.rio.bounds())
print("Indices raster bounds:", indices_raster_2263.rio.bounds())

Loading reprojected LST + Indices...
LST valid ratio: 1.000
Indices valid ratio: 0.998
LST raster bounds: (981437.4489166049, 212457.54099916507, 1023088.2146540903, 260009.54459268012)
Indices raster bounds: (981462.3462672028, 212512.28574393364, 1023050.6163433938, 259944.0070645518)


### BUILD TRAINING FEATURES

In [7]:
df_train = pd.read_csv("./data/Training_data_uhi_index_UHI2025-v2.csv")
gdf_train = gpd.GeoDataFrame(
    df_train,
    geometry=[Point(lon, lat) for lon, lat in zip(df_train.Longitude, df_train.Latitude)],
    crs="EPSG:4326"
).to_crs("EPSG:2263")

cov_50, cov_100, cov_200 = [], [], []
dist_w, dist_p = [], []
lst_vals, ndvi_vals, ndbi_vals, ndwi_vals, evi_vals = [], [], [], [], []
dist_man_vals, dist_bron_vals = [], []

for i, row in gdf_train.iterrows():
    geom = row.geometry

    if FEATURE_FLAGS["building_cov_50m"]:
        cov_50.append(building_coverage_fraction(geom, gdf_buildings, 50))
    else:
        cov_50.append(0)
    if FEATURE_FLAGS["building_cov_100m"]:
        cov_100.append(building_coverage_fraction(geom, gdf_buildings, 100))
    else:
        cov_100.append(0)
    if FEATURE_FLAGS["building_cov_200m"]:
        cov_200.append(building_coverage_fraction(geom, gdf_buildings, 200))
    else:
        cov_200.append(0)

    # water / parks
    if FEATURE_FLAGS["distance_water"]:
        dist_w.append(distance_to_polygons(geom, gdf_water))
    else:
        dist_w.append(0)
    if FEATURE_FLAGS["distance_parks"]:
        dist_p.append(distance_to_polygons(geom, gdf_parks))
    else:
        dist_p.append(0)

    # LST
    lv = 0
    if FEATURE_FLAGS["lst_value"]:
        lv = extract_raster_value(geom, lst_raster_2263, band_index=1)
    lst_vals.append(lv)

    # NDVI / NDBI / NDWI
    ndv, ndb, ndw = 0,0,0
    if FEATURE_FLAGS["ndvi_value"]:
        ndv = extract_raster_value(geom, indices_raster_2263, 1)
    if FEATURE_FLAGS["ndbi_value"]:
        ndb = extract_raster_value(geom, indices_raster_2263, 2)
    if FEATURE_FLAGS["ndwi_value"]:
        ndw = extract_raster_value(geom, indices_raster_2263, 3)
    ndvi_vals.append(ndv)
    ndbi_vals.append(ndb)
    ndwi_vals.append(ndw)

    # EVI
    evi_val = 0
    if FEATURE_FLAGS["evi_value"]:
        # e.g. blue=band1, red=band3, nir=band4 in another raster
        # evi_val = compute_evi(blue, red, nir)
        pass
    evi_vals.append(evi_val)

    # Dist manhattan / bronx
    distm = 0
    if FEATURE_FLAGS["dist_manhattan_centre"]:
        distm = euclidean_distance(geom.x, geom.y, manhattan_x, manhattan_y)
    distb = 0
    if FEATURE_FLAGS["dist_bronx_centre"]:
        distb = euclidean_distance(geom.x, geom.y, bronx_x, bronx_y)

    dist_man_vals.append(distm)
    dist_bron_vals.append(distb)

gdf_train["bld_cov_50"] = cov_50
gdf_train["bld_cov_100"] = cov_100
gdf_train["bld_cov_200"] = cov_200
gdf_train["dist_water"] = dist_w
gdf_train["dist_parks"] = dist_p
gdf_train["lst_value"] = lst_vals
gdf_train["ndvi_value"] = ndvi_vals
gdf_train["ndbi_value"] = ndbi_vals
gdf_train["ndwi_value"] = ndwi_vals
# gdf_train["evi_value"] = evi_vals
gdf_train["dist_manh"] = dist_man_vals
gdf_train["dist_bron"] = dist_bron_vals

# optional KMeans
from sklearn.cluster import KMeans
if FEATURE_FLAGS["location_cluster"]:
    coords_train = np.column_stack([gdf_train.geometry.x, gdf_train.geometry.y])
    kmeans = KMeans(n_clusters=8, random_state=RANDOM_SEED, n_init=10).fit(coords_train)
    gdf_train["location_cluster"] = kmeans.labels_
else:
    kmeans = None
    gdf_train["location_cluster"] = 0

In [8]:
all_features = [
    "bld_cov_50", "bld_cov_100", "bld_cov_200",
    "dist_water", "dist_parks",
    "lst_value", "ndvi_value", "ndbi_value", "ndwi_value", 
    "dist_manh", "dist_bron",
    "location_cluster"
]

df_train_feat = gdf_train[all_features].fillna(0.0)
X = df_train_feat.values
y = gdf_train["UHI Index"].values
print("Train shape:", X.shape)
print("Number of NaN in X?", np.isnan(X).sum())

Train shape: (11229, 12)
Number of NaN in X? 0


In [9]:
gdf_train

Unnamed: 0,Longitude,Latitude,datetime,UHI Index,geometry,bld_cov_50,bld_cov_100,bld_cov_200,dist_water,dist_parks,lst_value,ndvi_value,ndbi_value,ndwi_value,dist_manh,dist_bron,location_cluster
0,-73.909167,40.813107,24-07-2021 15:53,1.030289,POINT (1009393.606 235526.824),0.019848,0.119473,0.248509,3536.717021,371.277131,41.442815,-0.032849,-0.027607,0.044681,12058.572295,10214.777740,2
1,-73.909187,40.813045,24-07-2021 15:53,1.030289,POINT (1009388.093 235504.35),0.039795,0.159338,0.260397,3520.867715,358.889116,41.442815,-0.017685,-0.025670,0.039056,12047.167026,10232.870157,2
2,-73.909215,40.812978,24-07-2021 15:53,1.023798,POINT (1009380.276 235480.052),0.026731,0.194538,0.278762,3504.846309,344.200944,41.442815,-0.023479,-0.024851,0.039874,12033.085822,10253.921329,2
3,-73.909242,40.812908,24-07-2021 15:53,1.023798,POINT (1009372.92 235454.541),0.020690,0.227561,0.312600,3487.673871,330.119247,41.152283,-0.024157,-0.027820,0.041210,12019.162378,10275.373165,2
4,-73.909257,40.812845,24-07-2021 15:53,1.021634,POINT (1009368.791 235431.463),0.049321,0.263256,0.351619,3470.826136,320.290749,41.152283,-0.024157,-0.027820,0.041210,12009.039312,10292.806926,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11224,-73.957050,40.790333,24-07-2021 15:57,0.972470,POINT (996143.074 227219.577),0.000000,0.000000,0.000000,569.780991,0.000000,34.890471,0.028827,-0.023243,-0.000817,5288.111794,25825.361761,0
11225,-73.957063,40.790308,24-07-2021 15:57,0.972470,POINT (996139.388 227210.467),0.000000,0.000000,0.000000,559.973656,0.000000,34.890471,0.051500,-0.028651,-0.016411,5297.917140,25833.538520,0
11226,-73.957093,40.790270,24-07-2021 15:57,0.981124,POINT (996131.087 227196.498),0.000000,0.000000,0.000000,543.787136,0.000000,34.890471,0.051500,-0.028651,-0.016411,5313.778307,25848.265634,0
11227,-73.957112,40.790253,24-07-2021 15:59,0.981245,POINT (996126.012 227190.422),0.000000,0.000000,0.000000,536.117477,0.000000,34.890471,0.048808,-0.048659,-0.018743,5321.136077,25855.882277,0


In [10]:
gdf_train.describe()

Unnamed: 0,Longitude,Latitude,UHI Index,bld_cov_50,bld_cov_100,bld_cov_200,dist_water,dist_parks,lst_value,ndvi_value,ndbi_value,ndwi_value,dist_manh,dist_bron,location_cluster
count,11229.0,11229.0,11229.0,11229.0,11229.0,11229.0,11229.0,11229.0,11229.0,11229.0,11229.0,11229.0,11229.0,11229.0,11229.0
mean,-73.933927,40.8088,1.000001,0.12519,0.228201,0.30353,1891.184531,362.798049,40.654181,-0.006387,-0.002206,0.023604,11321.153684,18163.91362,3.224686
std,0.028253,0.023171,0.016238,0.17508,0.185421,0.188678,1298.246512,360.343364,2.714054,0.062528,0.065959,0.047172,5453.390526,9320.532698,2.338326
min,-73.994457,40.758792,0.956122,0.0,0.0,0.0,20.933619,0.0,32.484185,-0.231795,-0.353383,-0.499395,146.773827,2863.788821,0.0
25%,-73.955703,40.790905,0.988577,0.0,0.058977,0.166868,930.568027,76.262961,39.227938,-0.037236,-0.03632,0.015558,6642.556117,9804.072416,1.0
50%,-73.932968,40.810688,1.000237,0.048059,0.21574,0.311991,1617.923389,284.983308,40.868588,-0.01968,-0.00443,0.032591,11533.755497,17965.861358,3.0
75%,-73.909647,40.824515,1.011176,0.198378,0.35074,0.43566,2415.031074,505.1988,42.341754,0.006772,0.030177,0.045274,15730.919401,25630.341184,5.0
max,-73.879458,40.859497,1.046036,1.0,0.999675,0.824851,6041.065986,1998.456627,54.564594,0.562697,0.440744,0.165486,21732.239259,38257.009798,7.0


### Final Feature Table

In [11]:
feature_cols = [
    "bld_cov_50", "bld_cov_100", "bld_cov_200",
    "dist_water", "dist_parks",
    "lst_value",
    "ndvi_value", "ndbi_value", "ndwi_value", 
    "dist_manh", "dist_bron",
    "location_cluster"
]
X = gdf_train[feature_cols].fillna(0.0).values
y = gdf_train["UHI Index"].values
print("Final train feature shape:", X.shape)

Final train feature shape: (11229, 12)


### Model Definitions

In [12]:
def make_stratified_bins(target, n_bins=10):
    """Bin the continuous target into discrete intervals for use in 'StratifiedKFold'."""
    # pd.qcut => quantile-based discretization
    bins = pd.qcut(target, q=n_bins, duplicates="drop")  # If duplicates occur, drop them
    return bins.astype(str)  # Convert to string labels

K_FOLDS = 10

y_bins = make_stratified_bins(y, n_bins=10)
skf = StratifiedKFold(n_splits=K_FOLDS, shuffle=True, random_state=RANDOM_SEED)
# kf = KFold(n_splits=K_FOLDS, shuffle=True, random_state=RANDOM_SEED)

In [13]:
# Model definitions and parameter grids
models_and_params = {
    "RandomForest": (
        RandomForestRegressor(random_state=RANDOM_SEED),
        {
            "n_estimators": [100, 200, 500],
            "max_depth": [None],
            "min_samples_leaf": [1, 2, 3],
            "min_samples_split": [4, 5, 6],
        }
    ),
    "ExtraTrees": (
        ExtraTreesRegressor(random_state=RANDOM_SEED),
        {
            "n_estimators": [500, 1000, 2000],
            "max_depth": [None, 10, 20, 40, 50, 60],
            "min_samples_leaf": [1, 2, 3],
        }
    ),
    "DecisionTree": (
        DecisionTreeRegressor(random_state=RANDOM_SEED),
        {
            "max_depth": [10, 20, 40],
            "min_samples_leaf": [1, 2],
        }
    ),
    "KNeighbors": (
        KNeighborsRegressor(),
        {
            "n_neighbors": [2, 3, 4, 5, 10],
            "weights": ["uniform", "distance"],
            "p": [1, 2, 3, 4, 5],  # Manhattan or Euclidean distance
            "algorithm": ['auto', 'ball_tree', 'kd_tree', 'brute'],
            "leaf_size": [10, 20, 30, 40],
            "metric": ['minkowski', 'euclidean', 'manhattan', 'chebyshev'],
        }
    ),
    "XGBoost": (
        XGBRegressor(random_state=RANDOM_SEED, eval_metric="rmse", use_label_encoder=False),
        {
            "n_estimators": [500, 1000, 2000],
            "max_depth": [15, 20, 25],
            "learning_rate": [0.005, 0.01, 0.02],
            "subsample": [0.5, 0.6, 0.7],
            "colsample_bytree": [0.7, 0.8, 0.9],
        }
    ),
    "LightGBM": (
        lgb.LGBMRegressor(random_state=RANDOM_SEED),
        {
            "n_estimators": [100, 200, 500],
            "max_depth": [10, 20, 50],
            "learning_rate": [0.05, 0.1, 0.2],
            "subsample": [0.6, 0.8, 1.0],
            "colsample_bytree": [0.7, 0.8, 0.9],
        }
    ),
    "CatBoost": (
        CatBoostRegressor(silent=True, random_state=RANDOM_SEED),
        {
            "iterations": [100, 200, 500],
            "depth": [2, 5, 10],
            "learning_rate": [0.05, 0.1, 0.2],
            "random_strength": [1, 2, 3, 5],
        }
    ),
    "HistGradientBoosting": (
        HistGradientBoostingRegressor(random_state=RANDOM_SEED),
        {
            "max_iter": [100, 200, 500],
            "learning_rate": [0.001, 0.01, 0.05, 0.1],
            "max_depth": [None, 5, 10, 20],
            "max_leaf_nodes": [15, 31, 63],
            "min_samples_leaf": [10, 20, 50],
            "l2_regularization": [0.0, 0.1, 1.0]
        }
    ),
}

In [14]:
N_ITER = 20
results = []

for model_name, (model, param_grid) in models_and_params.items():
    print(f"\n=== Searching {model_name} ===")
    search = RandomizedSearchCV(
        estimator=model,
        param_distributions=param_grid,
        n_iter=N_ITER,
        cv=list(skf.split(X, y_bins)),
        scoring="r2",
        random_state=RANDOM_SEED,
        n_jobs=-1,
        verbose=1
    )
    search.fit(X, y)
    best_estimator = search.best_estimator_
    best_score = search.best_score_
    best_params = search.best_params_

    results.append({
        "Model": model_name,
        "Best Estimator": best_estimator,
        "Best Score (CV)": best_score,
        "Best Params": best_params
    })

results_df = pd.DataFrame(results).sort_values(by="Best Score (CV)", ascending=False)
print("\nFinal Cross-Val Results:\n", results_df)


=== Searching RandomForest ===
Fitting 10 folds for each of 20 candidates, totalling 200 fits

=== Searching ExtraTrees ===
Fitting 10 folds for each of 20 candidates, totalling 200 fits

=== Searching DecisionTree ===
Fitting 10 folds for each of 6 candidates, totalling 60 fits

=== Searching KNeighbors ===
Fitting 10 folds for each of 20 candidates, totalling 200 fits


20 fits failed out of a total of 200.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
20 fits failed with the following error:
Traceback (most recent call last):
  File "C:\workspace\python\ai-challenge-2025\.venv\lib\site-packages\sklearn\model_selection\_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\workspace\python\ai-challenge-2025\.venv\lib\site-packages\sklearn\base.py", line 1151, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "C:\workspace\python\ai-challenge-2025\.venv\lib\site-packages\sklearn\neighbors\_regression.py", line 218, in fit
    return self._fit(X, y)
  File "C:\workspace\python\ai-challenge-2025\.venv\lib\site-packages\sklearn\neighbors


=== Searching XGBoost ===
Fitting 10 folds for each of 20 candidates, totalling 200 fits

=== Searching LightGBM ===
Fitting 10 folds for each of 20 candidates, totalling 200 fits
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000604 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2813
[LightGBM] [Info] Number of data points in the train set: 11229, number of used features: 12
[LightGBM] [Info] Start training from score 1.000001

=== Searching CatBoost ===
Fitting 10 folds for each of 20 candidates, totalling 200 fits

=== Searching HistGradientBoosting ===
Fitting 10 folds for each of 20 candidates, totalling 200 fits

Final Cross-Val Results:
                   Model                                     Best Estimator  \
3            KNeighbors  KNeighborsRegressor(algorithm='brute', n_neigh...   
1            ExtraTrees  (ExtraTreeRegressor(max_depth=50, random_state...   
4               XGBoos

In [15]:
results_df.to_csv('results_df.csv', index=False)
results_df

Unnamed: 0,Model,Best Estimator,Best Score (CV),Best Params
3,KNeighbors,"KNeighborsRegressor(algorithm='brute', n_neigh...",0.978279,"{'weights': 'distance', 'p': 4, 'n_neighbors':..."
1,ExtraTrees,"(ExtraTreeRegressor(max_depth=50, random_state...",0.974335,"{'n_estimators': 1000, 'min_samples_leaf': 1, ..."
4,XGBoost,"XGBRegressor(base_score=None, booster=None, ca...",0.96635,"{'subsample': 0.7, 'n_estimators': 2000, 'max_..."
0,RandomForest,"(DecisionTreeRegressor(max_features=1.0, min_s...",0.960758,"{'n_estimators': 500, 'min_samples_split': 4, ..."
5,LightGBM,"LGBMRegressor(colsample_bytree=0.9, learning_r...",0.95316,"{'subsample': 0.8, 'n_estimators': 500, 'max_d..."
6,CatBoost,<catboost.core.CatBoostRegressor object at 0x0...,0.950086,"{'random_strength': 3, 'learning_rate': 0.2, '..."
7,HistGradientBoosting,HistGradientBoostingRegressor(l2_regularizatio...,0.937012,"{'min_samples_leaf': 10, 'max_leaf_nodes': 63,..."
2,DecisionTree,"DecisionTreeRegressor(max_depth=20, random_sta...",0.91374,"{'min_samples_leaf': 1, 'max_depth': 20}"


### Validation Data

In [16]:
# 4) Build Validation Data
df_val = pd.read_csv("./data/Submission_template_UHI2025-v2.csv")
gdf_val = gpd.GeoDataFrame(
    df_val,
    geometry=[Point(lon, lat) for lon, lat in zip(df_val.Longitude, df_val.Latitude)],
    crs="EPSG:4326"
).to_crs("EPSG:2263")

# We'll produce the same set of columns for validation:
cov_50_vals, cov_100_vals, cov_200_vals = [], [], []
dist_w_vals, dist_p_vals = [], []
lst_vals, ndvi_vals, ndbi_vals, ndwi_vals, evi_vals = [], [], [], [], []
dist_manh_vals, dist_bron_vals = [], []

for i, row in gdf_val.iterrows():
    geom = row.geometry

    # coverage
    if FEATURE_FLAGS["building_cov_50m"]:
        cov_50_vals.append(building_coverage_fraction(geom, gdf_buildings, 50))
    else:
        cov_50_vals.append(0)

    if FEATURE_FLAGS["building_cov_100m"]:
        cov_100_vals.append(building_coverage_fraction(geom, gdf_buildings, 100))
    else:
        cov_100_vals.append(0)

    if FEATURE_FLAGS["building_cov_200m"]:
        cov_200_vals.append(building_coverage_fraction(geom, gdf_buildings, 200))
    else:
        cov_200_vals.append(0)

    # dist water, parks
    dist_w_vals.append(distance_to_polygons(geom, gdf_water) if FEATURE_FLAGS["distance_water"] else 0)
    dist_p_vals.append(distance_to_polygons(geom, gdf_parks) if FEATURE_FLAGS["distance_parks"] else 0)

    # LST
    lv = 0
    if FEATURE_FLAGS["lst_value"]:
        lv = extract_raster_value(geom, lst_raster_2263, 1)
    lst_vals.append(lv)

    # NDVI/NDBI/NDWI
    ndv, ndb, ndw = 0,0,0
    if FEATURE_FLAGS["ndvi_value"]:
        ndv = extract_raster_value(geom, indices_raster_2263, 1)
    if FEATURE_FLAGS["ndbi_value"]:
        ndb = extract_raster_value(geom, indices_raster_2263, 2)
    if FEATURE_FLAGS["ndwi_value"]:
        ndw = extract_raster_value(geom, indices_raster_2263, 3)
    ndvi_vals.append(ndv)
    ndbi_vals.append(ndb)
    ndwi_vals.append(ndw)

    # EVI (if needed)
    evi_val = 0
    if FEATURE_FLAGS["evi_value"]:
        # same logic as training
        pass
    evi_vals.append(evi_val)

    # dist manhattan, bronx
    distm = 0
    if FEATURE_FLAGS["dist_manhattan_centre"]:
        distm = euclidean_distance(geom.x, geom.y, manhattan_x, manhattan_y)
    distb = 0
    if FEATURE_FLAGS["dist_bronx_centre"]:
        distb = euclidean_distance(geom.x, geom.y, bronx_x, bronx_y)
    dist_manh_vals.append(distm)
    dist_bron_vals.append(distb)

df_val["bld_cov_50"]  = cov_50_vals
df_val["bld_cov_100"] = cov_100_vals
df_val["bld_cov_200"] = cov_200_vals
df_val["dist_water"]  = dist_w_vals
df_val["dist_parks"]  = dist_p_vals
df_val["lst_value"]   = lst_vals
df_val["ndvi_value"]  = ndvi_vals
df_val["ndbi_value"]  = ndbi_vals
df_val["ndwi_value"]  = ndwi_vals
# df_val["evi_value"]   = evi_vals
df_val["dist_manh"]   = dist_manh_vals
df_val["dist_bron"]   = dist_bron_vals

# location clusters if needed
if FEATURE_FLAGS["location_cluster"] and ("kmeans" in globals()) and kmeans is not None:
    coords_val = np.column_stack([gdf_val.geometry.x, gdf_val.geometry.y])
    df_val["location_cluster"] = kmeans.predict(coords_val)
else:
    df_val["location_cluster"] = 0

df_val_feat = df_val[feature_cols].fillna(0.0)
X_val = df_val_feat.values
print("Validation shape:", X_val.shape)

Validation shape: (1040, 12)


In [29]:
# Let's pick top 4 models from the results
top3 = results_df.head(3).reset_index(drop=True)
best_model_1 = top3.iloc[0]["Best Estimator"]
best_model_2 = top3.iloc[1]["Best Estimator"]
best_model_3 = top3.iloc[2]["Best Estimator"]

base_models = []
for i in range(len(top3)):
    model_name = top3.loc[i, "Model"]
    estimator = top3.loc[i, "Best Estimator"]
    base_models.append((model_name, estimator))

print("\nTop 3 Models:\n", base_models)


Top 3 Models:
 [('KNeighbors', KNeighborsRegressor(algorithm='brute', n_neighbors=2, p=4, weights='distance')), ('ExtraTrees', ExtraTreesRegressor(max_depth=50, n_estimators=1000, random_state=42)), ('XGBoost', XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=0.9, device=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric='rmse', feature_types=None,
             gamma=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=0.01, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=25, max_leaves=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             multi_strategy=None, n_estimators=2000, n_jobs=None,
             num_parallel_tree=None, random_state=42, ...))]


In [22]:
def search_ensemble_weights_2(modelA, modelB, X, y, skf, increments=0.01):
    """
    For 2-model ensemble:
      preds = w1 * predsA + (1-w1) * predsB
    We'll pick w1 in [0,1] in steps of increments.
    Evaluate R2 in each fold, pick best average R2.
    Returns best_w1, best_r2
    """
    # We'll generate out-of-fold predictions for each model:
    #  then combine them in each fold for each candidate weight.
    predsA_oof = np.zeros(len(X))
    predsB_oof = np.zeros(len(X))

    for train_idx, valid_idx in skf.split(X, y_bins):
        XA, XV = X[train_idx], X[valid_idx]
        ya, yv = y[train_idx], y[valid_idx]
        mA = clone(modelA)
        mB = clone(modelB)

        mA.fit(XA, ya)
        mB.fit(XA, ya)

        predsA_oof[valid_idx] = mA.predict(XV)
        predsB_oof[valid_idx] = mB.predict(XV)

    # Now we have out-of-fold preds for A and B => evaluate all w1
    best_w = 0
    best_r2 = -999
    w_candidates = np.arange(0, 1.0 + increments, increments)
    for w1 in w_candidates:
        ensemble_oof = w1 * predsA_oof + (1 - w1) * predsB_oof
        r2_ens = r2_score(y, ensemble_oof)
        if r2_ens > best_r2:
            best_r2 = r2_ens
            best_w = w1

    return best_w, best_r2

# Similarly for top 3:
def search_ensemble_weights_3(modelA, modelB, modelC, X, y, skf, increments=0.01):
    """
    For 3-model ensemble:
      preds = w1*predA + w2*predB + w3*predC
    subject to w1 + w2 + w3 = 1, w_i >= 0
    We'll brute-force a small grid in increments.
    """
    predsA_oof = np.zeros(len(X))
    predsB_oof = np.zeros(len(X))
    predsC_oof = np.zeros(len(X))

    for train_idx, valid_idx in skf.split(X, y_bins):
        XA, XV = X[train_idx], X[valid_idx]
        ya, yv = y[train_idx], y[valid_idx]
        mA = clone(modelA)
        mB = clone(modelB)
        mC = clone(modelC)

        mA.fit(XA, ya)
        mB.fit(XA, ya)
        mC.fit(XA, ya)

        predsA_oof[valid_idx] = mA.predict(XV)
        predsB_oof[valid_idx] = mB.predict(XV)
        predsC_oof[valid_idx] = mC.predict(XV)

    w_candidates = np.arange(0, 1.0 + increments, increments)
    best_combo = (0,0,0)
    best_r2 = -999
    for w1 in w_candidates:
        for w2 in w_candidates:
            w3 = 1 - w1 - w2
            if w3 < 0: 
                # skip invalid combos
                continue
            # Compute ensemble
            ensemble_oof = w1*predsA_oof + w2*predsB_oof + w3*predsC_oof
            r2_ens = r2_score(y, ensemble_oof)
            if r2_ens > best_r2:
                best_r2 = r2_ens
                best_combo = (w1, w2, w3)

    return best_combo, best_r2

# Example usage after we find top 2 or 3 in results_df:
modelA = results_df.iloc[0]["Best Estimator"]
modelB = results_df.iloc[1]["Best Estimator"]
modelC = results_df.iloc[2]["Best Estimator"]
# best_w1, best_r2 = search_ensemble_weights_2(modelA, modelB, X, y, skf, increments=0.01)
# # Then we refit modelA, modelB on the entire dataset -> final ensemble weights => predict X_val.

# # -------------------------------------------------------------------------------------
# # 4. Refit best models + apply best weights
# # -------------------------------------------------------------------------------------
# # If we found best w1 for top 2:
# finalA = clone(modelA).fit(X, y)
# finalB = clone(modelB).fit(X, y)
# predsA_val = finalA.predict(X_val)
# predsB_val = finalB.predict(X_val)
# final_ensemble_val = best_w1*predsA_val + (1 - best_w1)*predsB_val

# # Then create submission.
# df_val["UHI Index"] = final_ensemble_val

# # Save the submission
# os.makedirs("output", exist_ok=True)
# df_val.to_csv("output/submission_v12e.csv", index=False)
# print("Saved submission_12e.csv")

# 1) Find best 3-model weights with smaller increments
best_combo, best_r2 = search_ensemble_weights_3(modelA, modelB, modelC, X, y, skf, increments=0.01)
w1, w2, w3 = best_combo
print(f"Best weights = (w1={w1:.2f}, w2={w2:.2f}, w3={w3:.2f}); OOF R^2={best_r2:.5f}")

# 2) Refit all 3 base models on full data
finalA = clone(modelA).fit(X, y)
finalB = clone(modelB).fit(X, y)
finalC = clone(modelC).fit(X, y)

# 3) Predict on X_val from each model
predA_val = finalA.predict(X_val)
predB_val = finalB.predict(X_val)
predC_val = finalC.predict(X_val)

# 4) Combine using the found weights
final_ensemble_val = w1*predA_val + w2*predB_val + w3*predC_val

# 5) Create & save submission
df_val["UHI Index"] = final_ensemble_val
os.makedirs("output", exist_ok=True)
df_val.to_csv("output/submission_v12_3model_weighted.csv", index=False)
print("Saved submission_v12_3model_weighted.csv")

Best weights = (w1=0.72, w2=0.28, w3=0.00); OOF R^2=0.97897
Saved submission_v12_3model_weighted.csv


In [20]:
# just a simple ensemble with top N models and average

val_preds1 = best_model_1.predict(X_val)
val_preds2 = best_model_2.predict(X_val)
val_preds3 = best_model_3.predict(X_val)
ensemble_preds = (val_preds1 + val_preds2 + val_preds3) / 3

# Add predictions to the validation dataframe
df_val["UHI Index"] = ensemble_preds

# Save the submission
os.makedirs("output", exist_ok=True)
df_val.to_csv("output/submission_v12.csv", index=False)
print("Saved submission_12.csv")

Saved submission_12.csv


In [23]:
# Let's do a custom out-of-fold prediction approach for these top 3

# A) Generate OOF predictions for each base model
#    We'll create arrays of shape [n_samples, n_base_models]
oof_preds = np.zeros((len(X), len(base_models)))

for idx, (mname, base_model) in enumerate(base_models):
    print(f"{idx}: {mname}; {base_model}")
    # We'll do a new copy of the model so we don't re-fit the original
    # or we can clone it
    model_clone = clone(base_model)

    # out-of-fold predictions
    fold_idx = 0
    for train_idx, valid_idx in skf.split(X, y_bins):
        print(f"\t{train_idx}, {valid_idx}")
        X_trainF, X_validF = X[train_idx], X[valid_idx]
        y_trainF, y_validF = y[train_idx], y[valid_idx]

        model_clone.fit(X_trainF, y_trainF)
        preds_validF = model_clone.predict(X_validF)
        oof_preds[valid_idx, idx] = preds_validF

oof_preds

0: KNeighbors; KNeighborsRegressor(algorithm='brute', n_neighbors=2, p=4, weights='distance')
	[    0     1     3 ... 11226 11227 11228], [    2     7     9 ... 11179 11191 11203]
	[    0     1     2 ... 11226 11227 11228], [   12    24    38 ... 11205 11210 11215]
	[    0     1     2 ... 11226 11227 11228], [    6     8    22 ... 11204 11209 11221]
	[    0     1     2 ... 11225 11226 11227], [   10    14    17 ... 11134 11213 11228]
	[    0     1     2 ... 11226 11227 11228], [   16    18    20 ... 11161 11173 11220]
	[    0     1     2 ... 11226 11227 11228], [   28    39    51 ... 11217 11219 11222]
	[    0     1     2 ... 11225 11226 11228], [    3    15    30 ... 11201 11208 11227]
	[    0     2     3 ... 11222 11227 11228], [    1     5    32 ... 11224 11225 11226]
	[    1     2     3 ... 11226 11227 11228], [    0    13    25 ... 11184 11199 11218]
	[    0     1     2 ... 11226 11227 11228], [    4    19    21 ... 11198 11202 11207]
1: ExtraTrees; ExtraTreesRegressor(max_depth=5

array([[1.02821517, 1.02549267, 1.02039576],
       [1.02724674, 1.02445553, 1.02353883],
       [1.02707417, 1.02672238, 1.02468228],
       ...,
       [0.98197398, 0.98066278, 0.98051852],
       [0.98228557, 0.98282085, 0.9828307 ],
       [0.98120508, 0.98117871, 0.98096257]])

In [38]:
from sklearn.linear_model import Ridge, Lasso, LinearRegression
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_scaled = scaler.fit_transform(oof_preds)

meta_learners = {
    "Linear": LinearRegression(),
    "Ridge": Ridge(alpha=1.0, random_state=42),
    "Lasso": Lasso(alpha=1e-5, random_state=42),
    "XGB": xgb.XGBRegressor(n_estimators=2000, learning_rate=0.01,
                            max_depth=25, random_state=42, 
                            subsample=0.7, colsample_bytree=0.9,
                            eval_metric="rmse", use_label_encoder=False),
    # "LightGBM": lgb.LGBMRegressor(n_estimators=2000, learning_rate=0.1, 
    #                               min_child_samples=1, num_leaves=31,
    #                               max_depth=None, random_state=42),
    "RF": RandomForestRegressor(n_estimators=400, max_depth=10,
                                random_state=42),
    "MLP": MLPRegressor(hidden_layer_sizes=(256,128), activation="relu",
                        solver="adam", max_iter=2000, random_state=42)
}

# We'll store results in a dict
meta_results = {}

for mname, meta_model in meta_learners.items():
    # Fit on the entire training set (oof_preds => y)
    meta_model.fit(oof_preds, y)
    
    # Evaluate OOF R² on the *same* data used for training
    # (some risk of overfitting, but a quick comparison is fine)
    preds_oof = meta_model.predict(oof_preds)
    r2_val = r2_score(y, preds_oof)
    
    meta_results[mname] = r2_val

# Print each meta-learner’s OOF R²
print("=== Meta-Learner Comparison ===")
for mname, score in sorted(meta_results.items(), key=lambda x: x[1], reverse=True):
    print(f"{mname} => OOF R2: {score:.5f}")

=== Meta-Learner Comparison ===
XGB => OOF R2: 0.99405
RF => OOF R2: 0.98691
Linear => OOF R2: 0.97899
Lasso => OOF R2: 0.97738
MLP => OOF R2: 0.97456
Ridge => OOF R2: 0.96587


In [39]:
from copy import deepcopy
from sklearn.base import clone
import numpy as np

# Suppose you have:
# base_models = [(name1, model1), (name2, model2), ...]
# meta_learners = { "LightGBM": best_LGB, "XGB": best_XGB, ... } from your previous cell
# meta_results = { "LightGBM": 0.99822, "XGB": 0.98399, ... } mapping each meta-learner to an OOF R²
# scaler = StandardScaler() # previously fit on oof_preds
# X, y => full training data
# oof_preds => shape (n_samples, len(base_models))

##############################################
# 1) Identify the best meta-learner from OOF
##############################################
best_meta_name = max(meta_results, key=meta_results.get)
best_meta_model = meta_learners[best_meta_name]
print(f"Best meta-learner: {best_meta_name}, OOF R2 => {meta_results[best_meta_name]:.5f}")

##############################################
# 2) Refit each base model on the FULL data
##############################################
base_models_fitted = []
full_preds_stack = np.zeros((len(X), len(base_models)))  # same shape logic as OOF, but now for entire data

for idx, (mname, base_model) in enumerate(base_models):
    print(f"{idx}: {mname}, {base_model}")
    # clone to avoid reusing partial state
    fm = clone(base_model)
    fm.fit(X, y)
    base_models_fitted.append((mname, fm))
    # store predictions
    full_preds_stack[:, idx] = fm.predict(X)

##############################################
# 3) Scale the stacked predictions
##############################################
full_preds_stack_scaled = scaler.transform(full_preds_stack)

##############################################
# 4) Refit the chosen meta-learner on FULL stack
##############################################
final_meta_learner = clone(best_meta_model)
final_meta_learner.fit(full_preds_stack_scaled, y)

print(f"Refitted best meta-learner ({best_meta_name}) on entire dataset.")

Best meta-learner: XGB, OOF R2 => 0.99405
0: KNeighbors, KNeighborsRegressor(algorithm='brute', n_neighbors=2, p=4, weights='distance')
1: ExtraTrees, ExtraTreesRegressor(max_depth=50, n_estimators=1000, random_state=42)
2: XGBoost, XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=0.9, device=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric='rmse', feature_types=None,
             gamma=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=0.01, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=25, max_leaves=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             multi_strategy=None, n_estimators=2000, n_jobs=None,
             num_parallel_tree=None, random_state=42, ...)
Refitted best meta-lear

In [40]:
# -------------------------------------------
# MAKE VALIDATION PREDICTIONS via meta-ensemble
# -------------------------------------------

# 1) Stack predictions from each base model
val_stack = np.zeros((len(X_val), len(base_models_fitted)))
for idx, (mname, fm) in enumerate(base_models_fitted):
    val_stack[:, idx] = fm.predict(X_val)

# 2) Scale the stacked predictions, using the same scaler fit on OOF
val_stack_scaled = scaler.transform(val_stack)

# 3) Meta-learner final predictions
final_val_preds = final_meta_learner.predict(val_stack_scaled)
final_val_preds

array([0.96401143, 0.96337295, 0.96401817, ..., 1.0396885 , 1.0373894 ,
       1.0355791 ], dtype=float32)

In [41]:
# SAVE SUBMISSION
df_val = pd.read_csv("./data/Submission_template_UHI2025-v2.csv")
df_val["UHI Index"] = final_val_preds

os.makedirs("output", exist_ok=True)
submission_path = "output/submission_v12_meta.csv"
df_val.to_csv(submission_path, index=False)
print(f"Saved {submission_path}")

Saved output/submission_v12_meta.csv


### Save Top Models

In [42]:
# Save each base model
os.makedirs("models", exist_ok=True)
for i, (mname, fm) in enumerate(base_models_fitted):
    output_path = f"models/base_{mname}_model_{i}_v12.pkl"
    dump(fm, output_path)
    print(f"Saved base model: {output_path}")

# Save final meta-learner
import datetime
timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M")
meta_output_path = f"models/final_meta_learner_{best_meta_name}_{timestamp}.pkl"
dump(final_meta_learner, meta_output_path)
print(f"Saved final meta-learner ({best_meta_name}) => {meta_output_path}")

Saved base model: models/base_KNeighbors_model_0_v12.pkl
Saved base model: models/base_ExtraTrees_model_1_v12.pkl
Saved base model: models/base_XGBoost_model_2_v12.pkl
Saved final meta-learner (XGB) => models/final_meta_learner_XGB_20250131_2008.pkl
