### Imports & Setup

In [28]:
import numpy as np
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point, MultiPoint
import rioxarray as rxr
import rasterio
import warnings
warnings.filterwarnings("ignore", category=UserWarning)

# Models
from sklearn.model_selection import KFold, RandomizedSearchCV, StratifiedKFold
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, StackingRegressor, HistGradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.base import clone
from xgboost import XGBRegressor
import tensorflow as tf
from tensorflow.keras.wrappers.scikit_learn import KerasRegressor

from sklearn.cluster import DBSCAN, KMeans

import os
from joblib import dump

RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
tf.random.set_seed(RANDOM_SEED)

### Feature Toggles

In [2]:
FEATURE_FLAGS = {
    "building_coverage_50m": True,
    "building_coverage_100m": True,  # new
    "building_coverage_200m": True,  # new
    "distance_water": True,
    "distance_parks": True,
    "lst_value": True,
    "ndvi_value": True,
    "ndbi_value": True,
    "ndwi_value": True,
    "evi_value": True,              # new EVI
    "kmeans_location_cluster": True,
    "dist_manhattan_centre": True,
    "dist_bronx_centre": True,
}

### Load Borough Boundaries & Building Footprints

In [3]:
# Load NYC borough boundaries from geojson
gdf_boroughs = gpd.read_file("./data/nyc_boroughs.geojson").to_crs("EPSG:2263")

# Extract borough names from the "name" column
if "name" in gdf_boroughs.columns:
    gdf_boroughs = gdf_boroughs.rename(columns={"name": "BoroName"})
else:
    raise ValueError("Expected a 'name' column in borough shapefile for borough names.")

print("Boroughs loaded:", gdf_boroughs["BoroName"].unique())

# building footprints
gdf_buildings = gpd.read_file("./data/Building_Footprint.kml").to_crs("EPSG:2263")
print(f"gdf buildings count before: {gdf_buildings.count()}")

# Spatially join buildings with borough boundaries
gdf_buildings = gpd.sjoin(gdf_buildings, gdf_boroughs, how="left", predicate="intersects")

# Check resulting columns
print("Building columns after join:", gdf_buildings.columns)

Boroughs loaded: ['Staten Island' 'Queens' 'Brooklyn' 'Manhattan' 'Bronx']
gdf buildings count before: Name           9436
Description    9436
geometry       9436
dtype: int64
Building columns after join: Index(['Name', 'Description', 'geometry', 'index_right', 'BoroName',
       'cartodb_id', 'created_at', 'updated_at'],
      dtype='object')


In [4]:
# Ensure there are no missing borough names
if gdf_buildings["BoroName"].isnull().any():
    print("Some buildings are not assigned to any borough. They will be dropped.")
    gdf_buildings = gdf_buildings.dropna(subset=["BoroName"])
    print(f"gdf buildings count after dropna: {gdf_buildings.count()}")

Some buildings are not assigned to any borough. They will be dropped.
gdf buildings count after dropna: Name           9422
Description    9422
geometry       9422
index_right    9422
BoroName       9422
cartodb_id     9422
created_at     9422
updated_at     9422
dtype: int64


In [5]:
gdf_buildings

Unnamed: 0,Name,Description,geometry,index_right,BoroName,cartodb_id,created_at,updated_at
0,,,"MULTIPOLYGON (((1006651.793 248309.569, 100656...",4.0,Bronx,5.0,2013-03-09 02:42:03.692000+00:00,2013-03-09 02:42:03.989000+00:00
1,,,"MULTIPOLYGON (((1005842.639 248829.838, 100585...",4.0,Bronx,5.0,2013-03-09 02:42:03.692000+00:00,2013-03-09 02:42:03.989000+00:00
2,,,"MULTIPOLYGON (((1006243.634 249006.538, 100625...",4.0,Bronx,5.0,2013-03-09 02:42:03.692000+00:00,2013-03-09 02:42:03.989000+00:00
3,,,"MULTIPOLYGON (((1006227.161 249476.52, 1006235...",4.0,Bronx,5.0,2013-03-09 02:42:03.692000+00:00,2013-03-09 02:42:03.989000+00:00
4,,,"MULTIPOLYGON (((1008500.118 249763.236, 100846...",4.0,Bronx,5.0,2013-03-09 02:42:03.692000+00:00,2013-03-09 02:42:03.989000+00:00
...,...,...,...,...,...,...,...,...
9431,,,"MULTIPOLYGON (((997358.658 223175.61, 997394.0...",3.0,Manhattan,4.0,2013-03-09 02:42:03.692000+00:00,2013-03-09 02:42:03.989000+00:00
9432,,,"MULTIPOLYGON (((998198.469 222047.008, 998288....",3.0,Manhattan,4.0,2013-03-09 02:42:03.692000+00:00,2013-03-09 02:42:03.989000+00:00
9433,,,"MULTIPOLYGON (((997517.49 219375.698, 997618.2...",3.0,Manhattan,4.0,2013-03-09 02:42:03.692000+00:00,2013-03-09 02:42:03.989000+00:00
9434,,,"MULTIPOLYGON (((997465.413 215819.05, 997419.4...",3.0,Manhattan,4.0,2013-03-09 02:42:03.692000+00:00,2013-03-09 02:42:03.989000+00:00


### Subset Footprints for Manhattan & Bronx

In [6]:
# Subset by borough name
gdf_manhattan = gdf_buildings[gdf_buildings["BoroName"] == "Manhattan"].copy()
gdf_bronx = gdf_buildings[gdf_buildings["BoroName"] == "Bronx"].copy()

print(f"Manhattan building footprints: {len(gdf_manhattan)}")
print(f"Bronx building footprints: {len(gdf_bronx)}")

Manhattan building footprints: 3495
Bronx building footprints: 5927


### Find Largest Cluster Centers

In [7]:
def find_largest_building_cluster_center(gdf_buildings_boro, eps_m=200, min_samples=10):
    # 1. compute footprint centroids
    building_centroids = gdf_buildings_boro.geometry.centroid
    coords = np.column_stack([building_centroids.x, building_centroids.y])

    # 2. cluster with DBSCAN
    db = DBSCAN(eps=eps_m, min_samples=min_samples).fit(coords)
    labels = db.labels_  # -1 => outliers

    # 3. find largest cluster (excluding -1)
    unique, counts = np.unique(labels, return_counts=True)
    # ignore outliers
    valid_mask = (unique >= 0)
    if not np.any(valid_mask):
        # no valid cluster => pick entire set's centroid?
        mp = MultiPoint(coords)
        return mp.centroid
    # among the valid clusters, pick the largest
    valid_clusters = unique[valid_mask]
    valid_counts   = counts[valid_mask]
    largest_id = valid_clusters[np.argmax(valid_counts)]
    
    # 4. gather coords in largest cluster
    in_largest = coords[labels==largest_id]
    if len(in_largest)==0:
        # fallback
        mp = MultiPoint(coords)
        return mp.centroid
    else:
        return MultiPoint(in_largest).centroid

manhattan_center_geom = find_largest_building_cluster_center(gdf_manhattan, eps_m=200, min_samples=20)
bronx_center_geom = find_largest_building_cluster_center(gdf_bronx, eps_m=200, min_samples=20)

manhattan_x, manhattan_y = manhattan_center_geom.x, manhattan_center_geom.y
bronx_x, bronx_y = bronx_center_geom.x, bronx_center_geom.y

print(f"Manhattan cluster center: {manhattan_x:.1f}, {manhattan_y:.1f}")
print(f"Bronx cluster center: {bronx_x:.1f}, {bronx_y:.1f}")

Manhattan cluster center: 997790.3, 232244.6
Bronx cluster center: 1017485.9, 241760.3


### Load Water, Parks, LST, S2 Indices

In [8]:
import csv
import sys
from shapely import wkt

df_water = pd.read_csv("./data/NYC_Planimetric_Database__Hydrography_20250123.csv")
df_water["geometry"] = df_water["the_geom"].apply(wkt.loads)
gdf_water = gpd.GeoDataFrame(df_water, geometry="geometry", crs="EPSG:4326").to_crs("EPSG:2263")

gdf_parks = gpd.read_file("./data/Parks_Properties_20250123.kml").to_crs("EPSG:2263")

lst_raster = rxr.open_rasterio("Landsat_LST_v4_single_0601_0901.tiff")
lst_raster_2263 = lst_raster.rio.reproject("EPSG:2263")

indices_raster = rxr.open_rasterio("S2_indices_v4_single_0601_0901.tiff")
indices_raster_2263 = indices_raster.rio.reproject("EPSG:2263")

### Load Training Data & Reproject

In [11]:
df_train = pd.read_csv("./data/Training_data_uhi_index_UHI2025-v2.csv")
print("Train data shape:", df_train.shape)

gdf_train = gpd.GeoDataFrame(
    df_train,
    geometry=[Point(lon, lat) for lon, lat in zip(df_train.Longitude, df_train.Latitude)],
    crs="EPSG:4326"
).to_crs("EPSG:2263")

Train data shape: (11229, 4)


### Helper Functions

In [12]:
# Suppose your "indices_raster_2263" is a 3-band stack: band=1 => NDVI, 2 => NDBI, 3 => NDWI
# For EVI, we need actual reflectances from Blue, Red, NIR. If you have them in a separate 4-band or 5-band raster, great.
# If not, you need to produce them. For demonstration, let's assume we have a sentinel raster with B02=blue, B04=red, B08=NIR, etc.
# We'll show a function to compute EVI on the fly below.

def compute_evi(blue, red, nir, L=1.0, C1=6.0, C2=7.5, G=2.5):
    """
    Compute the Enhanced Vegetation Index (EVI).
    EVI = G * (NIR - Red) / (NIR + C1*Red - C2*Blue + L)
    Typically G=2.5, C1=6, C2=7.5, L=1
    """
    evi = G * (nir - red) / (nir + C1 * red - C2 * blue + L)
    return evi

def building_coverage_fraction(geom, building_gdf, radius=50):
    buffer_poly = geom.buffer(radius)
    clipped = gpd.clip(building_gdf, buffer_poly)
    area_buildings = clipped.geometry.area.sum()
    area_buf = buffer_poly.area
    return area_buildings / area_buf if area_buf > 0 else 0

def distance_to_polygons(geom, poly_gdf):
    dists = poly_gdf.geometry.distance(geom)
    return dists.min() if len(dists) > 0 else np.nan

def euclidean_distance(x1, y1, x2, y2):
    return np.sqrt((x1 - x2)**2 + (y1 - y2)**2)

def extract_raster_value(geom, raster, band_index=1, method="nearest"):
    x, y = geom.x, geom.y
    val = raster.sel(x=x, y=y, band=band_index, method=method).values
    return float(val)

### Build Training Features

In [15]:
# BUILD Features:
bld_cov_50 = []
bld_cov_100 = []
bld_cov_200 = []
dist_water_vals = []
dist_parks_vals = []
lst_vals = []
ndvi_vals = []
ndbi_vals = []
ndwi_vals = []
evi_vals = []
dist_man_vals = []
dist_bron_vals = []

for i, row in gdf_train.iterrows():
    geom = row.geometry

    # coverage 50, 100, 200
    if FEATURE_FLAGS["building_coverage_50m"]:
        bld_cov_50.append(building_coverage_fraction(geom, gdf_buildings, radius=50))
    else:
        bld_cov_50.append(0)

    if FEATURE_FLAGS["building_coverage_100m"]:
        bld_cov_100.append(building_coverage_fraction(geom, gdf_buildings, radius=100))
    else:
        bld_cov_100.append(0)

    if FEATURE_FLAGS["building_coverage_200m"]:
        bld_cov_200.append(building_coverage_fraction(geom, gdf_buildings, radius=200))
    else:
        bld_cov_200.append(0)

    # distance water
    dist_water_vals.append(distance_to_polygons(geom, gdf_water) if FEATURE_FLAGS["distance_water"] else 0)
    # distance parks
    dist_parks_vals.append(distance_to_polygons(geom, gdf_parks) if FEATURE_FLAGS["distance_parks"] else 0)
    # LST
    lst_val = extract_raster_value(geom, lst_raster_2263, band_index=1) if FEATURE_FLAGS["lst_value"] else 0
    lst_vals.append(lst_val)
    # NDVI / NDBI / NDWI
    ndv = extract_raster_value(geom, indices_raster_2263, 1) if FEATURE_FLAGS["ndvi_value"] else 0
    ndb = extract_raster_value(geom, indices_raster_2263, 2) if FEATURE_FLAGS["ndbi_value"] else 0
    ndw = extract_raster_value(geom, indices_raster_2263, 3) if FEATURE_FLAGS["ndwi_value"] else 0
    ndvi_vals.append(ndv)
    ndbi_vals.append(ndb)
    ndwi_vals.append(ndw)

    # EVI (requires Blue/Red/NIR). If you only have NDVI, NDBI, NDWI in the raster, you might not have B02, B04, B08.
    # If you do have them in some "s2_bands_raster_2263" with band indices: 1=Blue, 2=Green, 3=Red, 4=NIR, ...
    # Then do something like:
    if FEATURE_FLAGS["evi_value"]:
        # Example: let's pretend we have a 4-band sentinel raster with B02=1, B04=3, B08=4 for demonstration:
        # For actual data, adapt indices accordingly.
        # Or if you created a separate TIF with them, reference that:
        try:
            blue = extract_raster_value(geom, s2_bands_raster_2263, band_index=1)
            red  = extract_raster_value(geom, s2_bands_raster_2263, band_index=3)
            nir  = extract_raster_value(geom, s2_bands_raster_2263, band_index=4)
            evi_val = compute_evi(blue, red, nir)
        except:
            evi_val = 0
    else:
        evi_val = 0
    evi_vals.append(evi_val)

    # Distance to Manhattan, Bronx centers
    dist_man = euclidean_distance(geom.x, geom.y, manhattan_x, manhattan_y) if FEATURE_FLAGS["dist_manhattan_centre"] else 0
    dist_bro = euclidean_distance(geom.x, geom.y, bronx_x, bronx_y) if FEATURE_FLAGS["dist_bronx_centre"] else 0
    dist_man_vals.append(dist_man)
    dist_bron_vals.append(dist_bro)

gdf_train["bld_cov_50"]   = bld_cov_50
gdf_train["bld_cov_100"]  = bld_cov_100
gdf_train["bld_cov_200"]  = bld_cov_200
gdf_train["dist_water"]   = dist_water_vals
gdf_train["dist_parks"]   = dist_parks_vals
gdf_train["lst_value"]    = lst_vals
gdf_train["ndvi_value"]   = ndvi_vals
gdf_train["ndbi_value"]   = ndbi_vals
gdf_train["ndwi_value"]   = ndwi_vals
gdf_train["evi_value"]    = evi_vals
gdf_train["dist_manh"]    = dist_man_vals
gdf_train["dist_bron"]    = dist_bron_vals

### K-Means for location-based cluster

In [16]:
N_CLUSTERS = 20

if FEATURE_FLAGS["kmeans_location_cluster"]:
    coords = np.column_stack([gdf_train.geometry.x, gdf_train.geometry.y])
    kmeans = KMeans(n_clusters=N_CLUSTERS, random_state=RANDOM_SEED, n_init=10).fit(coords)
    gdf_train["location_cluster"] = kmeans.labels_
else:
    kmeans = None
    gdf_train["location_cluster"] = np.nan

In [17]:
gdf_train

Unnamed: 0,Longitude,Latitude,datetime,UHI Index,geometry,bld_cov_50,bld_cov_100,bld_cov_200,dist_water,dist_parks,lst_value,ndvi_value,ndbi_value,ndwi_value,evi_value,dist_manh,dist_bron,location_cluster
0,-73.909167,40.813107,24-07-2021 15:53,1.030289,POINT (1009393.606 235526.824),0.019848,0.119473,0.248509,3536.717021,371.277131,,,,,0,12058.572295,10214.777740,15
1,-73.909187,40.813045,24-07-2021 15:53,1.030289,POINT (1009388.093 235504.35),0.039795,0.159338,0.260397,3520.867715,358.889116,,,,,0,12047.167026,10232.870157,15
2,-73.909215,40.812978,24-07-2021 15:53,1.023798,POINT (1009380.276 235480.052),0.026731,0.194538,0.278762,3504.846309,344.200944,,,,,0,12033.085822,10253.921329,15
3,-73.909242,40.812908,24-07-2021 15:53,1.023798,POINT (1009372.92 235454.541),0.020690,0.227561,0.312600,3487.673871,330.119247,,,,,0,12019.162378,10275.373165,15
4,-73.909257,40.812845,24-07-2021 15:53,1.021634,POINT (1009368.791 235431.463),0.049321,0.263256,0.351619,3470.826136,320.290749,,,,,0,12009.039312,10292.806926,15
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11224,-73.957050,40.790333,24-07-2021 15:57,0.972470,POINT (996143.074 227219.577),0.000000,0.000000,0.000000,569.780991,0.000000,,,,,0,5288.111794,25825.361761,3
11225,-73.957063,40.790308,24-07-2021 15:57,0.972470,POINT (996139.388 227210.467),0.000000,0.000000,0.000000,559.973656,0.000000,,,,,0,5297.917140,25833.538520,3
11226,-73.957093,40.790270,24-07-2021 15:57,0.981124,POINT (996131.087 227196.498),0.000000,0.000000,0.000000,543.787136,0.000000,,,,,0,5313.778307,25848.265634,3
11227,-73.957112,40.790253,24-07-2021 15:59,0.981245,POINT (996126.012 227190.422),0.000000,0.000000,0.000000,536.117477,0.000000,,,,,0,5321.136077,25855.882277,3


### Final Feature Table

In [21]:
feature_cols = [
    "bld_cov_50", "bld_cov_100", "bld_cov_200",
    "dist_water", "dist_parks",
    "lst_value",
    "ndvi_value", "ndbi_value", "ndwi_value", "evi_value",
    "dist_manh", "dist_bron",
    "location_cluster"
]
X = gdf_train[feature_cols].fillna(0.0).values
y = gdf_train["UHI Index"].values
print("Final train feature shape:", X.shape)

Final train feature shape: (11229, 13)


### Model Definitions

In [33]:
def make_stratified_bins(target, n_bins=10):
    """Bin the continuous target into discrete intervals for use in 'StratifiedKFold'."""
    # pd.qcut => quantile-based discretization
    bins = pd.qcut(target, q=n_bins, duplicates="drop")  # If duplicates occur, drop them
    return bins.astype(str)  # Convert to string labels

K_FOLDS = 5

y_bins = make_stratified_bins(y, n_bins=10)
skf = StratifiedKFold(n_splits=K_FOLDS, shuffle=True, random_state=RANDOM_SEED)
# kf = KFold(n_splits=K_FOLDS, shuffle=True, random_state=RANDOM_SEED)

In [34]:
# Model definitions and parameter grids
models_and_params = {
    "RandomForest": (
        RandomForestRegressor(random_state=RANDOM_SEED),
        {
            "n_estimators": [3000, 4000, 5000],
            "max_depth": [None, 2, 4],
            "min_samples_leaf": [1, 2, 3],
            "min_samples_split": [4, 5, 6],
        }
    ),
    "ExtraTrees": (
        ExtraTreesRegressor(random_state=RANDOM_SEED),
        {
            "n_estimators": [3000, 4000, 5000],
            "max_depth": [None, 10, 20, 40, 50, 60],
            "min_samples_leaf": [1, 2, 3],
        }
    ),
    "DecisionTree": (
        DecisionTreeRegressor(random_state=RANDOM_SEED),
        {
            "max_depth": [60, 100, 120],
            "min_samples_leaf": [1, 2],
        }
    ),
    "KNeighbors": (
        KNeighborsRegressor(),
        {
            "n_neighbors": [2, 3, 4, 5, 7, 10],
            "weights": ["uniform", "distance"],
            "p": [1, 2],  # Manhattan or Euclidean distance
        }
    ),
    "XGBoost": (
        XGBRegressor(random_state=RANDOM_SEED, eval_metric="rmse", use_label_encoder=False),
        {
            "n_estimators": [1500, 2000, 2500, 3000],
            "max_depth": [15, 20, 25],
            "learning_rate": [0.005, 0.01, 0.02],
            "subsample": [0.5, 0.6, 0.7],
            "colsample_bytree": [0.7, 0.8, 0.9],
        }
    ),
    "LightGBM": (
        lgb.LGBMRegressor(random_state=RANDOM_SEED),
        {
            "n_estimators": [500, 1000],
            "max_depth": [50, 80, 100],
            "learning_rate": [0.05, 0.1, 0.2],
            "subsample": [0.6, 0.8, 1.0],
            "colsample_bytree": [0.7, 0.8, 0.9],
        }
    ),
    "CatBoost": (
        CatBoostRegressor(silent=True, random_state=RANDOM_SEED),
        {
            "iterations": [500, 1000],
            "depth": [10, 12, 14],
            "learning_rate": [0.05, 0.1, 0.2],
            "random_strength": [1, 2, 3, 5],
        }
    ),
    "HistGradientBoosting": (
        HistGradientBoostingRegressor(random_state=RANDOM_SEED),
        {
            "max_iter": [500, 1000, 2000],
            "learning_rate": [0.001, 0.01, 0.05, 0.1],
            "max_depth": [None, 5, 10, 20],
            "max_leaf_nodes": [15, 31, 63],
            "min_samples_leaf": [10, 20, 50],
            "l2_regularization": [0.0, 0.1, 1.0]
        }
    ),
}

In [35]:
N_ITER = 20
results = []

for model_name, (model, param_grid) in models_and_params.items():
    print(f"\n=== Searching {model_name} ===")
    search = RandomizedSearchCV(
        estimator=model,
        param_distributions=param_grid,
        n_iter=N_ITER,
        cv=list(skf.split(X, y_bins)),
        scoring="r2",
        random_state=RANDOM_SEED,
        n_jobs=-1,
        verbose=1
    )
    search.fit(X, y)
    best_estimator = search.best_estimator_
    best_score = search.best_score_
    best_params = search.best_params_

    results.append({
        "Model": model_name,
        "Best Estimator": best_estimator,
        "Best Score (CV)": best_score,
        "Best Params": best_params
    })

results_df = pd.DataFrame(results).sort_values(by="Best Score (CV)", ascending=False)
print("\nFinal Cross-Val Results:\n", results_df)


=== Searching RandomForest ===
Fitting 5 folds for each of 20 candidates, totalling 100 fits


KeyboardInterrupt: 

In [None]:
results_df.to_csv('results_df.csv', index=False)
results_df

### Validation Data

In [None]:
df_val = pd.read_csv("./data/Submission_template_UHI2025-v2.csv")
gdf_val = gpd.GeoDataFrame(
    df_val,
    geometry=[Point(lon, lat) for lon, lat in zip(df_val.Longitude, df_val.Latitude)],
    crs="EPSG:4326"
).to_crs("EPSG:2263")

cv_val = []
dw_val = []
dp_val = []
lstv_val=[]
ndv_val=[]
ndbi_val=[]
ndwi_val=[]
distm_val=[]
distb_val=[]

for idx, row in gdf_val.iterrows():
    geom = row.geometry
    if FEATURE_FLAGS["building_coverage_50m"]:
        cfrac = building_coverage_fraction(geom,gdf_buildings,50)
    else:
        cfrac=0
    cv_val.append(cfrac)

    if FEATURE_FLAGS["distance_water"]:
        w=distance_to_polygons(geom,gdf_water)
    else:
        w=0
    dw_val.append(w)

    if FEATURE_FLAGS["distance_parks"]:
        p=distance_to_polygons(geom,gdf_parks)
    else:
        p=0
    dp_val.append(p)

    if FEATURE_FLAGS["lst_value"]:
        lv=extract_raster_value(geom, lst_raster_2263,1)
    else:
        lv=0
    lstv_val.append(lv)

    # NDVI, NDBI, NDWI
    ndv=0
    nb=0
    nw=0
    if FEATURE_FLAGS["ndvi_value"]:
        ndv=extract_raster_value(geom, indices_raster_2263,1)
    if FEATURE_FLAGS["ndbi_value"]:
        nb=extract_raster_value(geom, indices_raster_2263,2)
    if FEATURE_FLAGS["ndwi_value"]:
        nw=extract_raster_value(geom, indices_raster_2263,3)
    ndv_val.append(ndv)
    ndbi_val.append(nb)
    ndwi_val.append(nw)

    # dist manhattan
    if FEATURE_FLAGS["dist_manhattan_centre"]:
        dm=euclidean_distance(geom.x, geom.y, manhattan_x, manhattan_y)
    else:
        dm=0
    distm_val.append(dm)

    # dist bronx
    if FEATURE_FLAGS["dist_bronx_centre"]:
        db=euclidean_distance(geom.x, geom.y, bronx_x, bronx_y)
    else:
        db=0
    distb_val.append(db)

df_val["bld_cover_50m"] = cv_val
df_val["dist_water"] = dw_val
df_val["dist_parks"] = dp_val
df_val["lst_value"] = lstv_val
df_val["ndvi_value"] = ndv_val
df_val["ndbi_value"] = ndbi_val
df_val["ndwi_value"] = ndwi_val
df_val["dist_manhattan_centre"] = distm_val  # Ensure column name matches
df_val["dist_bronx_centre"] = distb_val  # Ensure column name matches

# Handle location clusters for validation
if FEATURE_FLAGS["kmeans_location_cluster"] and kmeans is not None:
    coords_val = np.column_stack([gdf_val.geometry.x, gdf_val.geometry.y])
    df_val["location_cluster"] = kmeans.predict(coords_val)
else:
    df_val["location_cluster"] = 0  # Default to cluster 0 if not using kmeans

# Select features for validation
df_val_feat = df_val[all_possible_features].fillna(0.0)
X_val = df_val_feat.values
X_val

In [None]:
# Let's pick top 4 models from the results
top4 = results_df.head(4).reset_index(drop=True)
best_model_1 = top4.iloc[0]["Best Estimator"]
best_model_2 = top4.iloc[1]["Best Estimator"]
best_model_3 = top4.iloc[2]["Best Estimator"]

base_models = []
for i in range(len(top4)):
    model_name = top4.loc[i, "Model"]
    estimator = top4.loc[i, "Best Estimator"]
    base_models.append((model_name, estimator))

print("\nTop 4 Models:\n", base_models)

In [None]:
def search_ensemble_weights_2(modelA, modelB, X, y, skf, increments=0.05):
    """
    For 2-model ensemble:
      preds = w1 * predsA + (1-w1) * predsB
    We'll pick w1 in [0,1] in steps of increments.
    Evaluate R2 in each fold, pick best average R2.
    Returns best_w1, best_r2
    """
    # We'll generate out-of-fold predictions for each model:
    #  then combine them in each fold for each candidate weight.
    predsA_oof = np.zeros(len(X))
    predsB_oof = np.zeros(len(X))

    for train_idx, valid_idx in skf.split(X, y_bins):
        XA, XV = X[train_idx], X[valid_idx]
        ya, yv = y[train_idx], y[valid_idx]
        mA = clone(modelA)
        mB = clone(modelB)

        mA.fit(XA, ya)
        mB.fit(XA, ya)

        predsA_oof[valid_idx] = mA.predict(XV)
        predsB_oof[valid_idx] = mB.predict(XV)

    # Now we have out-of-fold preds for A and B => evaluate all w1
    best_w = 0
    best_r2 = -999
    w_candidates = np.arange(0, 1.0 + increments, increments)
    for w1 in w_candidates:
        ensemble_oof = w1 * predsA_oof + (1 - w1) * predsB_oof
        r2_ens = r2_score(y, ensemble_oof)
        if r2_ens > best_r2:
            best_r2 = r2_ens
            best_w = w1

    return best_w, best_r2

# Similarly for top 3:
def search_ensemble_weights_3(modelA, modelB, modelC, X, y, skf, increments=0.05):
    """
    For 3-model ensemble:
      preds = w1*predA + w2*predB + w3*predC
    subject to w1 + w2 + w3 = 1, w_i >= 0
    We'll brute-force a small grid in increments.
    """
    predsA_oof = np.zeros(len(X))
    predsB_oof = np.zeros(len(X))
    predsC_oof = np.zeros(len(X))

    for train_idx, valid_idx in skf.split(X, y_bins):
        XA, XV = X[train_idx], X[valid_idx]
        ya, yv = y[train_idx], y[valid_idx]
        mA = clone(modelA)
        mB = clone(modelB)
        mC = clone(modelC)

        mA.fit(XA, ya)
        mB.fit(XA, ya)
        mC.fit(XA, ya)

        predsA_oof[valid_idx] = mA.predict(XV)
        predsB_oof[valid_idx] = mB.predict(XV)
        predsC_oof[valid_idx] = mC.predict(XV)

    w_candidates = np.arange(0, 1.0 + increments, increments)
    best_combo = (0,0,0)
    best_r2 = -999
    for w1 in w_candidates:
        for w2 in w_candidates:
            w3 = 1 - w1 - w2
            if w3 < 0: 
                # skip invalid combos
                continue
            # Compute ensemble
            ensemble_oof = w1*predsA_oof + w2*predsB_oof + w3*predsC_oof
            r2_ens = r2_score(y, ensemble_oof)
            if r2_ens > best_r2:
                best_r2 = r2_ens
                best_combo = (w1, w2, w3)

    return best_combo, best_r2

# Example usage after we find top 2 or 3 in results_df:
# modelA = results_df.iloc[0]["Best Estimator"]
# modelB = results_df.iloc[1]["Best Estimator"]
# best_w1, best_r2 = search_ensemble_weights_2(modelA, modelB, X, y, skf)
# Then we refit modelA, modelB on the entire dataset -> final ensemble weights => predict X_val.

# -------------------------------------------------------------------------------------
# 4. Refit best models + apply best weights
# -------------------------------------------------------------------------------------
# If we found best w1 for top 2:
# finalA = clone(modelA).fit(X, y)
# finalB = clone(modelB).fit(X, y)
# predsA_val = finalA.predict(X_val)
# predsB_val = finalB.predict(X_val)
# final_ensemble_val = best_w1*predsA_val + (1 - best_w1)*predsB_val

# Then create submission.

In [35]:
val_preds1 = best_model_1.predict(X_val)
val_preds2 = best_model_2.predict(X_val) if best_model_1 != best_model_2 else val_preds1
val_preds3 = best_model_3.predict(X_val) if best_model_1 != best_model_3 else val_preds1
ensemble_preds = (val_preds1 + val_preds2) / 2

# Add predictions to the validation dataframe
df_val["UHI Index"] = ensemble_preds

# Save the submission
os.makedirs("output", exist_ok=True)
df_val.to_csv("output/submission_v11.csv", index=False)
print("Saved submission_11.csv")

Saved submission_11.csv


In [36]:
# Let's do a custom out-of-fold prediction approach for these top 4

# A) Generate OOF predictions for each base model
#    We'll create arrays of shape [n_samples, n_base_models]
oof_preds = np.zeros((len(X), len(base_models)))

for idx, (mname, base_model) in enumerate(base_models):
    print(f"{idx}: {mname}; {base_model}")
    # We'll do a new copy of the model so we don't re-fit the original
    # or we can clone it
    model_clone = clone(base_model)

    # out-of-fold predictions
    fold_idx = 0
    for train_idx, valid_idx in skf.split(X, y_bins):
        print(f"\t{train_idx}, {valid_idx}")
        X_trainF, X_validF = X[train_idx], X[valid_idx]
        y_trainF, y_validF = y[train_idx], y[valid_idx]

        model_clone.fit(X_trainF, y_trainF)
        preds_validF = model_clone.predict(X_validF)
        oof_preds[valid_idx, idx] = preds_validF

oof_preds

0: ExtraTrees; ExtraTreesRegressor(max_depth=50, n_estimators=3000, random_state=42)
	[    0     1     3 ... 11226 11227 11228], [    2     7     9 ... 11179 11191 11203]
	[    0     1     2 ... 11226 11227 11228], [   12    24    38 ... 11205 11210 11215]
	[    0     1     2 ... 11226 11227 11228], [    6     8    22 ... 11204 11209 11221]
	[    0     1     2 ... 11225 11226 11227], [   10    14    17 ... 11134 11213 11228]
	[    0     1     2 ... 11226 11227 11228], [   16    18    20 ... 11161 11173 11220]
	[    0     1     2 ... 11226 11227 11228], [   28    39    51 ... 11217 11219 11222]
	[    0     1     2 ... 11225 11226 11228], [    3    15    30 ... 11201 11208 11227]
	[    0     2     3 ... 11222 11227 11228], [    1     5    32 ... 11224 11225 11226]
	[    1     2     3 ... 11226 11227 11228], [    0    13    25 ... 11184 11199 11218]
	[    0     1     2 ... 11226 11227 11228], [    4    19    21 ... 11198 11202 11207]
1: KNeighbors; KNeighborsRegressor(weights='distance')


NameError: name 'LinearRegression' is not defined

In [40]:
# Evaluate how these base model OOF stacks do if we just pick the best column
# But let's do a meta-learner approach:
# B) Train a small meta-learner on oof_preds => y
meta_learner = LinearRegression()  # could also try Ridge, XGBoost, etc.
meta_learner.fit(oof_preds, y)

# Evaluate the R2 on the same OOF predictions (some overfitting risk, but let's see)
meta_preds_oof = meta_learner.predict(oof_preds)
r2_meta = r2_score(y, meta_preds_oof)
print(f"Meta-learner OOF R2: {r2_meta:.5f}")
# If that looks good, we proceed

Meta-learner OOF R2: 0.97723


In [49]:
# Else we can also do a nested CV approach to measure meta-learner performance more robustly.

# C) Refit each base model on the FULL training set, then produce meta features for the FULL set
base_models_fitted = []
full_preds_stack = np.zeros((len(X), len(base_models)))
for idx, (mname, base_model) in enumerate(base_models):
    print(f"{idx}: {mname}; {base_model}")
    # clone to avoid reusing the partially-fitted model
    fm = clone(base_model)
    fm.fit(X, y)
    base_models_fitted.append((mname, fm))
    full_preds_stack[:, idx] = fm.predict(X)

# Refit meta-learner on the full stacked predictions
meta_learner_full = clone(meta_learner)
meta_learner_full.fit(full_preds_stack, y)

0: ExtraTrees; ExtraTreesRegressor(max_depth=50, n_estimators=3000, random_state=42)
1: KNeighbors; KNeighborsRegressor(weights='distance')
2: XGBoost; XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=0.8, device=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric='rmse', feature_types=None,
             gamma=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=0.01, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=20, max_leaves=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             multi_strategy=None, n_estimators=2000, n_jobs=None,
             num_parallel_tree=None, random_state=42, ...)
3: RandomForest; RandomForestRegressor(min_samples_split=5, n_estimators=4000, random_state=42)


In [53]:
# -------------------------------------------------------------------
# 5. MAKE VALIDATION PREDICTIONS
# We'll also create stacked features for X_val
val_stack = np.zeros((len(X_val), len(base_models)))
for idx, (mname, fm) in enumerate(base_models_fitted):
    val_stack[:, idx] = fm.predict(X_val)

val_preds_meta = meta_learner_full.predict(val_stack)
# val_preds_meta = meta_learner.predict(val_stack)

# Alternatively, you can do the simpler top2 average if that historically gave the best result:
# e.g. top2 = base_models_fitted[:2]
# val_preds_1 = top2[0][1].predict(X_val)
# val_preds_2 = top2[1][1].predict(X_val)
# val_preds_simple_avg = (val_preds_1 + val_preds_2) / 2

# If you want to see if meta is better or top2 is better, you can do both and compare.
# Let's define final predictions as the meta's:
final_val_preds = val_preds_meta
final_val_preds

array([0.96319713, 0.96366751, 0.96178878, ..., 1.03934582, 1.0375195 ,
       1.03582612])

In [54]:
# 6. SAVE SUBMISSION
df_val = pd.read_csv("./data/Submission_template_UHI2025-v2.csv")
df_val["UHI Index"] = final_val_preds
os.makedirs("output", exist_ok=True)
submission_path = "output/submission_v11-meta-preds.csv"
df_val.to_csv(submission_path, index=False)
print(f"Saved {submission_path}")

Saved output/submission_v11-meta-preds.csv


In [48]:
val_preds1 = best_model_1.predict(X_val)
val_preds2 = best_model_2.predict(X_val) if best_model_1 != best_model_2 else val_preds1
ensemble_preds = (val_preds1 + val_preds2) / 2

# Add predictions to the validation dataframe
df_val["UHI Index"] = ensemble_preds

# Save the submission
os.makedirs("output", exist_ok=True)
submission_path = "output/submission_v11.csv"
df_val.to_csv(submission_path, index=False)
print(f"Saved {submission_path}")

Saved output/submission_v11.csv


### Save Top Models

In [60]:
os.makedirs("models", exist_ok=True)

# Save base models
for i, (mname, fm) in enumerate(base_models_fitted):
    outpath = f"models/base_{mname}_model_{i}_v11.pkl"
    dump(fm, outpath)
    print(f"Saved base model: {outpath}")

# Save meta-learner
dump(meta_learner_full, "models/meta_learner_linear_v11.pkl")
print("Saved meta-learner.")

Saved base model: models/base_ExtraTrees_model_0_v11.pkl
Saved base model: models/base_KNeighbors_model_1_v11.pkl
Saved base model: models/base_XGBoost_model_2_v11.pkl
Saved base model: models/base_RandomForest_model_3_v11.pkl
Saved meta-learner.
