### IMPORTS AND SETUP

In [1]:
import numpy as np
import pandas as pd
import geopandas as gpd
import shapely.wkt
import os, warnings
# warnings.filterwarnings("ignore", category=UserWarning)
import logging

logging.basicConfig(filename='warnings.log', level=logging.DEBUG)

def custom_warning_handler(message, category, filename, lineno, file=None, line=None):
    logging.debug(f'{category.__name__}: {message} in {filename}:{lineno}')

warnings.showwarning = custom_warning_handler

from shapely.geometry import Point, Polygon, MultiPolygon, MultiPoint
import rioxarray as rxr
import rasterio
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, HistGradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV
from sklearn.metrics import r2_score
from sklearn.base import clone
from sklearn.cluster import DBSCAN
from joblib import dump
from shapely import wkt
import matplotlib.pyplot as plt

RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
SHOW_PLOTS = True

### FEATURE TOGGLES

In [2]:
FEATURE_FLAGS = {
    # building coverage
    "building_cov_100m": True,
    "building_cov_200m": True,
    "building_cov_500m": True,

    # parks coverage
    "park_cov_1000m": True,

    # new: water coverage
    "water_cov_1000m": True,

    # new: street trees coverage => we can do a "point->buffer" approach
    "street_tree_cov_500m": True,

    # distance to city center
    "dist_to_closest_calculated_city_centroid": False,

    # mathematically calculated neighbourhood clustering
    "location_cluster": True,

    # official cluster approach (point-in-polygon with official NTA or neighborhoods)
    "official_cluster": False,

    # census data for population density
    "population_density": False,

    # subway stations data
    "dist_to_closest_subway_station": False,

    # raster-based features
    "lst_value": True,
    "ndvi_value": True,
    "ndbi_value": True,
    "ndwi_value": True,
    "evi_value": False
}

### HELPER FUNCTIONS

In [3]:
def coverage_fraction(geom, polygon_gdf, radius=50):
    """
    coverage_fraction:
      - We do buffer around 'geom' by 'radius'.
      - Clip polygon_gdf with that buffer.
      - Sum area of clipped polygons / area of buffer => coverage fraction.
    """
    buffer_poly = geom.buffer(radius)
    clipped = gpd.clip(polygon_gdf, buffer_poly)
    area_polygons = clipped.geometry.area.sum()
    area_buffer = buffer_poly.area
    return area_polygons / area_buffer if area_buffer > 0 else 0

# def building_coverage_fraction(geom, building_gdf, radius=50):
#     buffer_poly = geom.buffer(radius)
#     clipped = gpd.clip(building_gdf, buffer_poly)
#     area_buildings = clipped.geometry.area.sum()
#     area_buf = buffer_poly.area
#     return area_buildings / area_buf if area_buf > 0 else 0

def distance_to_polygons(geom, poly_gdf):
    dists = poly_gdf.geometry.distance(geom)
    return dists.min() if len(dists)>0 else np.nan

def euclidean_distance(x1, y1, x2, y2):
    return np.sqrt((x1 - x2)**2 + (y1 - y2)**2)

def extract_raster_value(geom, raster, band_index=1, method="nearest"):
    """Return the pixel value from raster at geom's location."""
    x, y = geom.x, geom.y
    val = raster.sel(x=x, y=y, band=band_index, method=method).values
    return float(val)

def point_in_official_cluster(geom, cluster_gdf, id_column="NTACode"):
    """
    If official_cluster is True, we do a point-in-polygon sjoin
    with an official neighborhoods or NTA shapefile that has an 'id_column'
    representing the area ID.
    """
    pt_gdf = gpd.GeoDataFrame([1], geometry=[geom], crs=cluster_gdf.crs)
    joined = gpd.sjoin(pt_gdf, cluster_gdf, how="left", predicate="within")
    if len(joined)==0:
        return -1
    else:
        return joined[id_column].iloc[0]  # or name field

def compute_evi(blue, red, nir, L=1.0, C1=6.0, C2=7.5, G=2.5):
    return G * (nir - red) / (nir + C1*red - C2*blue + L)

### LOAD BOROUGH BOUNDARIES, BUILDINGS, WATER, PARKS, TREES

In [4]:
print("Loading building footprints, parks, water coverage, street tree CSV...")

# 3.0 NYC boroughs
# gdf_boroughs = gpd.read_file("./data/nyc_boroughs.geojson").to_crs("EPSG:2263")
# if "name" in gdf_boroughs.columns:
#     gdf_boroughs.rename(columns={"name":"BoroName"}, inplace=True)

# 3.1 Building footprints
gdf_buildings = gpd.read_file("./data/Building_Footprint.kml").to_crs("EPSG:2263")
# gdf_buildings = gpd.sjoin(gdf_buildings, gdf_boroughs, how="left", predicate="intersects")
# gdf_buildings = gdf_buildings.dropna(subset=["BoroName"])

# 3.2 Parks
gdf_parks = gpd.read_file("./data/Parks_Properties_20250123.kml").to_crs("EPSG:2263")

# 3.3 Water
gdf_water = pd.read_csv("./data/NYC_Planimetric_Database__Hydrography_20250123.csv")
gdf_water["geometry"] = gdf_water["the_geom"].apply(wkt.loads)
gdf_water = gpd.GeoDataFrame(gdf_water, geometry="geometry", crs="EPSG:4326").to_crs("EPSG:2263")

# 3.4 Street Trees from 2015 Census (CSV with lat/lon)
df_trees = pd.read_csv("./data/2015_Street_Tree_Census_-_Tree_Data_20250205.csv")
# columns: [tree_id, block_id, created_at, tree_dbh, lat, lon, ...] => check real col names
# Actually from your snippet: "latitude","longitude"
df_trees.rename(columns={"latitude":"lat","longitude":"lon"}, inplace=True)
# convert to geodf in EPSG:2263
gdf_trees_pts = gpd.GeoDataFrame(
    df_trees,
    geometry=[Point(lon, lat) for lon, lat in zip(df_trees["lon"], df_trees["lat"])],
    crs="EPSG:4326"
).to_crs("EPSG:2263")

# We can create a small buffer around each tree => approximate canopy => polygons:
# Maybe scale buffer by DBH? For simplicity, do 2m buffer:
gdf_trees_poly = gdf_trees_pts.copy()
gdf_trees_poly["geometry"] = gdf_trees_poly.geometry.buffer(2.0)  # 2m radius
    
# 3.6 Official Cluster Polygons (NTAs or Neighborhoods)
# gdf_official_clusters = gpd.read_file("./data/2020_NTAs.shp").to_crs("EPSG:2263")

# 3.7 Population Census for population density
# gdf_census = gpd.read_file("./data/2020 Census Tracts_20250206.geojson").to_crs("EPSG:2263")

# 3.8 Subway stations
gdf_stations = gpd.read_file("./data/MTA Subway Stations_20250206.geojson").to_crs("EPSG:2263")

print("Done")

Loading building footprints, parks, water coverage, street tree CSV...
Done


In [5]:
# Load training data
df_train = pd.read_csv("./data/Training_data_uhi_index 2025-02-04.csv")
gdf_train = gpd.GeoDataFrame(
    df_train,
    geometry=[Point(lon, lat) for lon, lat in zip(df_train.Longitude, df_train.Latitude)],
    crs="EPSG:4326"
).to_crs("EPSG:2263")

In [6]:
# import geopandas as gpd
# from shapely.geometry import box

# # 1. Extract the total bounds of gdf_train
# minx_train, miny_train, maxx_train, maxy_train = gdf_train.total_bounds

# # 2. Define a buffer in the same units as your CRS (here, meters)
# buffer = 1000  # adjust this value if you need a slightly larger or smaller margin

# # 3. Create an expanded bounding box
# expanded_bbox = box(minx_train - buffer, 
#                     miny_train - buffer, 
#                     maxx_train + buffer, 
#                     maxy_train + buffer)

# # Optional: If you want to see the coordinates of the expanded bounding box:
# print("Expanded Bounding Box:", expanded_bbox.bounds)

# # 4. Create a sub-dataframe from gdf_buildings 1GB that only contains features intersecting the expanded bbox
# gdf_buildings = gdf_buildings[gdf_buildings.geometry.intersects(expanded_bbox)]
# gdf_buildings['centroid'] = gdf_buildings.geometry.centroid

# # Check the total bounds of the new subset
# print("Subset Total Bounds:", gdf_buildings.total_bounds)
# print("Train Total Bounds:", gdf_train.total_bounds)

In [7]:
# # Find the centroid of the largest cluster of building points.
# coords = np.column_stack([gdf_buildings.centroid.x, gdf_buildings.centroid.y]) # Extract coordinates from centroids

# # Cluster building points using DBSCAN
# # These parameters are chosen to merge clusters (e.g., Manhattan and Bronx) in NYC.
# eps_m = 400   # maximum distance (in meters) to consider two points as neighbors
# min_samples = 10  # minimum number of points required to form a cluster
# db = DBSCAN(eps=eps_m, min_samples=min_samples).fit(coords)
# labels = db.labels_
# gdf_buildings['cluster'] = labels

# # Determine the largest cluster (ignoring noise, which is labeled -1)
# valid_mask = (labels >= 0)
# if np.any(valid_mask):
#     unique_labels, counts = np.unique(labels[valid_mask], return_counts=True)
#     largest_label = unique_labels[np.argmax(counts)]
#     in_largest = coords[labels == largest_label]
#     if len(in_largest) > 0:
#         city_centroid = MultiPoint(in_largest).centroid
#     else:
#         city_centroid = MultiPoint(coords).centroid
# else:
#     city_centroid = MultiPoint(coords).centroid

# city_x, city_y = city_centroid.x, city_centroid.y
# print(f"Centroid of the largest cluster of buildings in the city x/y: {city_x}, {city_y}")

# if SHOW_PLOTS:
#     # Plot all building centroids colored by their cluster labels, and overlay the city centroid
#     fig, ax = plt.subplots(figsize=(10, 8))
    
#     # Scatter plot: Use a colormap (e.g., 'tab20') to distinguish different clusters.
#     # Noise points (label -1) will get their own color.
#     scatter = ax.scatter(
#         coords[:, 0], 
#         coords[:, 1], 
#         c=labels, 
#         cmap='tab20', 
#         s=10, 
#         alpha=0.6,
#         label="Building Centroids"
#     )
    
#     # Overlay the city centroid as a large red star
#     ax.scatter(city_centroid.x, city_centroid.y, color='red', marker='*', s=200, label='City Centroid')
    
#     ax.set_title("Building Clusters and City Centroid")
#     ax.set_xlabel("X coordinate")
#     ax.set_ylabel("Y coordinate")
#     ax.legend()
#     plt.colorbar(scatter, ax=ax, label="Cluster Label")
#     plt.show()

### LOAD + CHECK RASTERS

In [8]:
print("Loading reprojected LST + Indices...")
lst_raster = rxr.open_rasterio("Landsat_LST_v4_single_0601_0901.tiff")
lst_raster_2263 = lst_raster.rio.reproject("EPSG:2263")

indices_raster = rxr.open_rasterio("S2_indices_v4_single_0601_0901.tiff")
indices_raster_2263 = indices_raster.rio.reproject("EPSG:2263")

# Check if the rasters actually have valid data
ratio_lst = (~lst_raster_2263.isnull()).mean().values
ratio_idx = (~indices_raster_2263.isnull()).mean().values
print(f"LST valid ratio: {ratio_lst:.3f}")
print(f"Indices valid ratio: {ratio_idx:.3f}")

if ratio_lst == 0.0:
    print("WARNING: LST raster is entirely NaN. Possibly an empty mosaic or over-strict cloud mask!")
if ratio_idx == 0.0:
    print("WARNING: Indices raster is entirely NaN. Possibly an empty mosaic or over-strict cloud mask!")

print("LST raster bounds:", lst_raster_2263.rio.bounds())
print("Indices raster bounds:", indices_raster_2263.rio.bounds())

Loading reprojected LST + Indices...
LST valid ratio: 1.000
Indices valid ratio: 0.998
LST raster bounds: (981437.4489166049, 212457.5409991683, 1023088.2146540834, 260009.54459267546)
Indices raster bounds: (981462.3462672028, 212512.2857439316, 1023050.6163433915, 259944.0070645472)


### BUILD TRAINING FEATURES

In [9]:
# Prepare empty arrays for each feature
arr_cov_bld_100  = []
arr_cov_bld_200  = []
arr_cov_bld_500  = []

arr_cov_park_1000 = []
arr_cov_tree_500  = []
arr_cov_water_1000 = []

arr_dist_city_centroid = []
arr_dist_subway_station = []

arr_lst_vals  = []
arr_ndvi_vals = []
arr_ndbi_vals = []
arr_ndwi_vals = []

# If you eventually compute location_cluster, official_cluster, population_density,
# you'd prepare arrays for them as well. For now we skip them or store 0.

total_rows = len(gdf_train)
print(f"total rows: {total_rows}")

for i, row in gdf_train.iterrows():
    if i % 1000 == 0:
        percent_done = (i / total_rows) * 100
        print(f"{percent_done:.2f}%")
        
    geom = row.geometry
    
    # building coverage
    if FEATURE_FLAGS["building_cov_100m"]:
        b100 = coverage_fraction(geom, gdf_buildings, 100)
    else:
        b100 = 0
    arr_cov_bld_100.append(b100)

    if FEATURE_FLAGS["building_cov_200m"]:
        b200 = coverage_fraction(geom, gdf_buildings, 200)
    else:
        b200 = 0
    arr_cov_bld_200.append(b200)

    if FEATURE_FLAGS["building_cov_500m"]:
        b500 = coverage_fraction(geom, gdf_buildings, 500)
    else:
        b500 = 0
    arr_cov_bld_500.append(b500)

    # park coverage
    if FEATURE_FLAGS["park_cov_1000m"]:
        p1000 = coverage_fraction(geom, gdf_parks, 1000)
    else:
        p1000 = 0
    arr_cov_park_1000.append(p1000)

    # street trees coverage
    if FEATURE_FLAGS["street_tree_cov_500m"]:
        t500 = coverage_fraction(geom, gdf_trees_poly, 500)
    else:
        t500 = 0
    arr_cov_tree_500.append(t500)

    # water coverage
    if FEATURE_FLAGS["water_cov_1000m"]:
        w1000 = coverage_fraction(geom, gdf_water, 1000)
    else:
        w1000 = 0
    arr_cov_water_1000.append(w1000)

    # distance to city centroid (calculated)
    if FEATURE_FLAGS["dist_to_closest_calculated_city_centroid"]:
        d_centroid = euclidean_distance(geom.x, geom.y, city_x, city_y)
    else:
        d_centroid = None
    arr_dist_city_centroid.append(d_centroid)

    # distance to nearest subway station
    if FEATURE_FLAGS["dist_to_closest_subway_station"]:
        dists = gdf_stations.geometry.distance(geom)
        dist_min = dists.min() if len(dists) > 0 else 0
    else:
        dist_min = None
    arr_dist_subway_station.append(dist_min)

    # LST / NDVI / NDBI / NDWI
    if FEATURE_FLAGS["lst_value"]:
        lv = extract_raster_value(geom, lst_raster_2263, band_index=1)
    else:
        lv = 0
    arr_lst_vals.append(lv)

    if FEATURE_FLAGS["ndvi_value"]:
        ndv = extract_raster_value(geom, indices_raster_2263, band_index=1)
    else:
        ndv = 0
    arr_ndvi_vals.append(ndv)

    if FEATURE_FLAGS["ndbi_value"]:
        ndb = extract_raster_value(geom, indices_raster_2263, band_index=2)
    else:
        ndb = 0
    arr_ndbi_vals.append(ndb)

    if FEATURE_FLAGS["ndwi_value"]:
        ndw = extract_raster_value(geom, indices_raster_2263, band_index=3)
    else:
        ndw = 0
    arr_ndwi_vals.append(ndw)

print("100.00% Done with feature computations.")

# Attach columns to gdf_train if the flag is True
if FEATURE_FLAGS["building_cov_100m"]:
    gdf_train["building_cov_100m"] = arr_cov_bld_100

if FEATURE_FLAGS["building_cov_200m"]:
    gdf_train["building_cov_200m"] = arr_cov_bld_200

if FEATURE_FLAGS["building_cov_500m"]:
    gdf_train["building_cov_500m"] = arr_cov_bld_500

if FEATURE_FLAGS["park_cov_1000m"]:
    gdf_train["park_cov_1000m"] = arr_cov_park_1000

if FEATURE_FLAGS["street_tree_cov_500m"]:
    gdf_train["street_tree_cov_500m"] = arr_cov_tree_500

if FEATURE_FLAGS["water_cov_1000m"]:
    gdf_train["water_cov_1000m"] = arr_cov_water_1000

if FEATURE_FLAGS["dist_to_closest_calculated_city_centroid"]:
    gdf_train["dist_to_closest_calculated_city_centroid"] = arr_dist_city_centroid

if FEATURE_FLAGS["dist_to_closest_subway_station"]:
    gdf_train["dist_to_closest_subway_station"] = arr_dist_subway_station

if FEATURE_FLAGS["lst_value"]:
    gdf_train["lst_value"] = arr_lst_vals

if FEATURE_FLAGS["ndvi_value"]:
    gdf_train["ndvi_value"] = arr_ndvi_vals

if FEATURE_FLAGS["ndbi_value"]:
    gdf_train["ndbi_value"] = arr_ndbi_vals

if FEATURE_FLAGS["ndwi_value"]:
    gdf_train["ndwi_value"] = arr_ndwi_vals

# optional KMeans
if FEATURE_FLAGS["location_cluster"]:
    from sklearn.cluster import KMeans
    N_CLUSTERS = 10
    coords_train = np.column_stack([gdf_train.geometry.x, gdf_train.geometry.y])
    kmeans = KMeans(n_clusters=N_CLUSTERS, random_state=RANDOM_SEED, n_init=10).fit(coords_train)
    gdf_train["location_cluster"] = kmeans.labels_

print("Done attaching columns.")

total rows: 11269
0.00%
8.87%
17.75%
26.62%
35.50%
44.37%
53.24%
62.12%
70.99%
79.87%
88.74%
97.61%
100.00% Done with feature computations.
Done attaching columns.


### OUTLIER REMOVAL

In [10]:
# E.g. cap y above 99th percentile
cap_quantile= 0.99
cap_val= gdf_train["UHI Index"].quantile(cap_quantile)
mask_out= (gdf_train["UHI Index"]> cap_val)
if mask_out.sum()>0:
    print(f"Capping {mask_out.sum()} outliers above Q={cap_quantile} at {cap_val:.3f}")
    gdf_train.loc[mask_out, "UHI Index"]= cap_val

Capping 112 outliers above Q=0.99 at 1.037


### FINAL FEATURE TABLE

In [11]:
feature_cols = []

if FEATURE_FLAGS["building_cov_100m"]:
    feature_cols.append("building_cov_100m")
if FEATURE_FLAGS["building_cov_200m"]:
    feature_cols.append("building_cov_200m")
if FEATURE_FLAGS["building_cov_500m"]:
    feature_cols.append("building_cov_500m")

if FEATURE_FLAGS["park_cov_1000m"]:
    feature_cols.append("park_cov_1000m")

if FEATURE_FLAGS["street_tree_cov_500m"]:
    feature_cols.append("street_tree_cov_500m")

if FEATURE_FLAGS["water_cov_1000m"]:
    feature_cols.append("water_cov_1000m")

if FEATURE_FLAGS["dist_to_closest_calculated_city_centroid"]:
    feature_cols.append("dist_to_closest_calculated_city_centroid")

if FEATURE_FLAGS["dist_to_closest_subway_station"]:
    feature_cols.append("dist_to_closest_subway_station")

if FEATURE_FLAGS["lst_value"]:
    feature_cols.append("lst_value")

if FEATURE_FLAGS["ndvi_value"]:
    feature_cols.append("ndvi_value")

if FEATURE_FLAGS["ndbi_value"]:
    feature_cols.append("ndbi_value")

if FEATURE_FLAGS["ndwi_value"]:
    feature_cols.append("ndwi_value")

if FEATURE_FLAGS["location_cluster"]:
    feature_cols.append("location_cluster")
if FEATURE_FLAGS["official_cluster"]:
    feature_cols.append("official_cluster")
if FEATURE_FLAGS["population_density"]:
    feature_cols.append("population_density")

df_train_feat = gdf_train[feature_cols].fillna(0.0)
X = df_train_feat.values
y = gdf_train["UHI Index"].values

print("Train shape:", X.shape)
print("Feature columns:", feature_cols)

Train shape: (11269, 11)
Feature columns: ['building_cov_100m', 'building_cov_200m', 'building_cov_500m', 'park_cov_1000m', 'street_tree_cov_500m', 'water_cov_1000m', 'lst_value', 'ndvi_value', 'ndbi_value', 'ndwi_value', 'location_cluster']


In [12]:
df_train_feat.describe()

Unnamed: 0,building_cov_100m,building_cov_200m,building_cov_500m,park_cov_1000m,street_tree_cov_500m,water_cov_1000m,lst_value,ndvi_value,ndbi_value,ndwi_value,location_cluster
count,11269.0,11269.0,11269.0,11269.0,11269.0,11269.0,11269.0,11269.0,11269.0,11269.0,11269.0
mean,0.231138,0.307259,0.328751,0.14104,0.001614,0.026786,40.441024,-0.005381,-0.001932,0.02277,4.277842
std,0.187019,0.18982,0.164384,0.212185,0.000779,0.073054,2.666938,0.063932,0.067918,0.048364,2.770942
min,0.0,0.0,0.0,0.0,0.0,0.0,32.484185,-0.231795,-0.353383,-0.499395,0.0
25%,0.061817,0.172796,0.236984,0.020243,0.001113,0.0,39.067291,-0.037037,-0.037608,0.014394,2.0
50%,0.218239,0.315302,0.331127,0.052138,0.001661,0.0,40.663506,-0.019201,-0.003988,0.032028,4.0
75%,0.355174,0.439667,0.433307,0.160069,0.002149,0.001418,42.167435,0.008161,0.031743,0.045154,7.0
max,0.999675,0.824851,0.679903,1.0,0.003626,0.502626,54.564594,0.562697,0.440744,0.165486,9.0


In [13]:
df_train_feat

Unnamed: 0,building_cov_100m,building_cov_200m,building_cov_500m,park_cov_1000m,street_tree_cov_500m,water_cov_1000m,lst_value,ndvi_value,ndbi_value,ndwi_value,location_cluster
0,0.295823,0.460293,0.442002,0.022438,0.001908,0.000000,44.091780,-0.023052,0.013941,0.031556,3
1,0.323297,0.429166,0.432448,0.022526,0.001918,0.000000,43.203095,-0.036626,0.026451,0.045455,3
2,0.313570,0.403163,0.428255,0.022573,0.001901,0.000000,43.203095,-0.040126,0.057551,0.035192,3
3,0.267607,0.397447,0.425799,0.022469,0.001871,0.000000,43.203095,-0.022312,0.063714,0.036385,3
4,0.215741,0.415295,0.426592,0.022220,0.001791,0.000000,43.203095,-0.038647,0.055567,0.034371,3
...,...,...,...,...,...,...,...,...,...,...,...
11264,0.000000,0.000000,0.000000,0.901596,0.000000,0.057350,34.890471,0.028827,-0.023243,-0.000817,2
11265,0.000000,0.000000,0.000000,0.901045,0.000000,0.059504,34.890471,0.051500,-0.028651,-0.016411,2
11266,0.000000,0.000000,0.000000,0.901269,0.000000,0.063221,34.890471,0.051500,-0.028651,-0.016411,2
11267,0.000000,0.000000,0.000000,0.901956,0.000000,0.065068,34.890471,0.048808,-0.048659,-0.018743,2


In [17]:
gdf_train

Unnamed: 0,Longitude,Latitude,datetime,UHI Index,geometry,building_cov_100m,building_cov_200m,building_cov_500m,park_cov_1000m,street_tree_cov_500m,water_cov_1000m,lst_value,ndvi_value,ndbi_value,ndwi_value,location_cluster
0,-73.919037,40.814292,24-07-2021 15:53,1.034616,POINT (1006661.089 235955.883),0.295823,0.460293,0.442002,0.022438,0.001908,0.000000,44.091780,-0.023052,0.013941,0.031556,3
1,-73.918978,40.814365,24-07-2021 15:53,1.028125,POINT (1006677.213 235982.615),0.323297,0.429166,0.432448,0.022526,0.001918,0.000000,43.203095,-0.036626,0.026451,0.045455,3
2,-73.918927,40.814433,24-07-2021 15:53,1.028125,POINT (1006691.49 236007.523),0.313570,0.403163,0.428255,0.022573,0.001901,0.000000,43.203095,-0.040126,0.057551,0.035192,3
3,-73.918875,40.814500,24-07-2021 15:53,1.025961,POINT (1006705.77 236031.826),0.267607,0.397447,0.425799,0.022469,0.001871,0.000000,43.203095,-0.022312,0.063714,0.036385,3
4,-73.918827,40.814560,24-07-2021 15:53,1.025961,POINT (1006719.128 236053.699),0.215741,0.415295,0.426592,0.022220,0.001791,0.000000,43.203095,-0.038647,0.055567,0.034371,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11264,-73.957050,40.790333,24-07-2021 15:59,0.972470,POINT (996143.074 227219.577),0.000000,0.000000,0.000000,0.901596,0.000000,0.057350,34.890471,0.028827,-0.023243,-0.000817,2
11265,-73.957063,40.790308,24-07-2021 15:59,0.972470,POINT (996139.388 227210.467),0.000000,0.000000,0.000000,0.901045,0.000000,0.059504,34.890471,0.051500,-0.028651,-0.016411,2
11266,-73.957093,40.790270,24-07-2021 15:59,0.981124,POINT (996131.087 227196.498),0.000000,0.000000,0.000000,0.901269,0.000000,0.063221,34.890471,0.051500,-0.028651,-0.016411,2
11267,-73.957112,40.790253,24-07-2021 15:59,0.981245,POINT (996126.012 227190.422),0.000000,0.000000,0.000000,0.901956,0.000000,0.065068,34.890471,0.048808,-0.048659,-0.018743,2


### DEFINE STRATIFIED CV

In [14]:
def make_stratified_bins(target, n_bins=10):
    """Bin the continuous target into discrete intervals for use in 'StratifiedKFold'."""
    # pd.qcut => quantile-based discretization
    bins = pd.qcut(target, q=n_bins, duplicates="drop")  # If duplicates occur, drop them
    return bins.astype(str)  # Convert to string labels

K_FOLDS = 10

y_bins = make_stratified_bins(y, n_bins=10)
skf = StratifiedKFold(n_splits=K_FOLDS, shuffle=True, random_state=RANDOM_SEED)
# kf = KFold(n_splits=K_FOLDS, shuffle=True, random_state=RANDOM_SEED)

In [20]:
# Model definitions and parameter grids
models_and_params = {
    "KNeighbors": (
        KNeighborsRegressor(),
        {
            "n_neighbors": [2, 3, 4, 5, 6],
            "weights": ["uniform", "distance"],
            "p": [1, 2, 3, 4],
            "algorithm": ['auto', 'ball_tree', 'kd_tree', 'brute'],
            "leaf_size": [10, 20, 30, 40],
            "metric": ['minkowski', 'euclidean', 'manhattan', 'chebyshev'],
        }
    ),
    "LightGBM": (
        lgb.LGBMRegressor(random_state=RANDOM_SEED, device='gpu', verbose=-1),
        {
            "n_estimators": [200, 300, 400, 500],
            "max_depth": [-1, 10, 20, 30],
            "learning_rate": [0.01, 0.05, 0.1, 0.15, 0.2],
            "subsample": [0.5, 0.6, 0.7, 0.8],
            "colsample_bytree": [0.6, 0.7, 0.8, 0.9],
            "boosting_type": ["gbdt", "dart", "rf"],
            "num_leaves": [15, 31, 63],
            "reg_alpha": [0, 0.1, 1],
            "reg_lambda": [1.0, 2.0, 0.5],
        }
    ),
    "CatBoost": (
        CatBoostRegressor(silent=True, random_state=RANDOM_SEED, task_type="GPU", devices='0'),
        {
            "iterations": [200, 300, 400, 500],
            "max_depth": [None, 5, 10, 15, 20],
            "learning_rate": [0.1, 0.15, 0.2, 0.25],
            "random_strength": [1, 2, 3, 4],
        }
    ),
    "HistGradientBoosting": (
        HistGradientBoostingRegressor(random_state=RANDOM_SEED),
        {
            "max_iter": [200, 300, 400, 500],
            "learning_rate": [0.001, 0.01, 0.05, 0.1],
            "max_depth": [None, 10, 20, 30],
            "max_leaf_nodes": [15, 31, 63],
            "l2_regularization": [0.0, 0.1, 1.0],
            # "loss": ["squared_error", "absolute_error", "gamma", "poisson", "quantile"],
            "quantile": [0.1, 0.5, 0.8, 0.9, 0.95],
            "min_samples_leaf": [10, 20, 40],
            "warm_start": [True, False],
        }
    ),
    "DecisionTree": (
        DecisionTreeRegressor(random_state=RANDOM_SEED),
        {
            # "criterion": ["squared_error", "friedman_mse", "absolute_error", "poisson"],
            "splitter": ["best", "random"],
            "max_depth": [None, 10, 20, 30],
            "min_samples_leaf": [1, 2, 3],
        }
    ),
    "RandomForest": (
        RandomForestRegressor(random_state=RANDOM_SEED),
        {
            "n_estimators": [100, 400, 500, 600],
            # "criterion": ["squared_error", "friedman_mse", "absolute_error", "poisson"],
            "max_depth": [None, 10, 20, 30],
            "min_samples_leaf": [1, 2, 3],
            "min_samples_split": [2, 3, 4, 5],
            "max_features": [1.0, "sqrt", "log2", 0.5],
            "bootstrap": [False, True],
            "oob_score": [False, True],   # only valid if bootstrap=True
        }
    ),
    "ExtraTrees":(
        ExtraTreesRegressor(random_state=RANDOM_SEED),
        {
            "n_estimators": [100, 200, 300, 400],
            # "criterion": ["squared_error", "friedman_mse", "absolute_error", "poisson"],
            "max_depth": [None, 20, 25, 30, 35],
            "min_samples_split": [4, 6, 8, 10],
            "min_samples_leaf": [1, 2, 3],
            "max_features": [1.0, "sqrt", "log2", 0.5],
            "ccp_alpha": [0.0, 0.001, 0.01],
            "bootstrap": [False, True],
            "oob_score": [False, True],   # only valid if bootstrap=True
        }
    ),
    "XGBoost":(
        xgb.XGBRegressor(random_state=RANDOM_SEED, use_label_encoder=False, eval_metric="rmse", tree_method="gpu_hist", predictor="gpu_predictor"),
        {
            "n_estimators": [80, 100, 150, 200, 300],
            "learning_rate": [0.01, 0.05, 0.1],
            "max_depth": [None, 5, 10, 15, 20],
            "booster": ["gbtree", "dart"],
            "gamma": [0, 0.1, 1],
            "reg_alpha": [0, 0.1, 1],
            "reg_lambda": [0.5, 1.0, 2.0, 5.0],
            "subsample": [0.6, 0.7, 0.8, 0.9],
            "colsample_bytree": [0.6, 0.7, 0.8, 0.9],
            # "tree_method": ["auto", "hist", "approx"],
            "tree_method": ["gpu_hist"],
        }
    ),
}

In [21]:
N_ITER = 5
results = []

for model_name, (model, param_grid) in models_and_params.items():
    print(f"\n=== Searching {model_name} ===")
    search = RandomizedSearchCV(
        estimator=model,
        param_distributions=param_grid,
        n_iter=N_ITER,
        cv=list(skf.split(X, y_bins)),
        scoring="r2",
        random_state=RANDOM_SEED,
        n_jobs=1,
        verbose=3
    )
    search.fit(X, y)
    best_estimator = search.best_estimator_
    best_score = search.best_score_
    best_params = search.best_params_

    results.append({
        "Model": model_name,
        "Best Estimator": best_estimator,
        "Best Score (CV)": best_score,
        "Best Params": best_params
    })

results_df = pd.DataFrame(results).sort_values(by="Best Score (CV)", ascending=False)
print("\nFinal Cross-Val Results:\n", results_df)


=== Searching KNeighbors ===
Fitting 10 folds for each of 5 candidates, totalling 50 fits
[CV 1/10] END algorithm=ball_tree, leaf_size=20, metric=euclidean, n_neighbors=4, p=3, weights=uniform;, score=0.842 total time=   0.0s
[CV 2/10] END algorithm=ball_tree, leaf_size=20, metric=euclidean, n_neighbors=4, p=3, weights=uniform;, score=0.827 total time=   0.0s
[CV 3/10] END algorithm=ball_tree, leaf_size=20, metric=euclidean, n_neighbors=4, p=3, weights=uniform;, score=0.840 total time=   0.0s
[CV 4/10] END algorithm=ball_tree, leaf_size=20, metric=euclidean, n_neighbors=4, p=3, weights=uniform;, score=0.830 total time=   0.0s
[CV 5/10] END algorithm=ball_tree, leaf_size=20, metric=euclidean, n_neighbors=4, p=3, weights=uniform;, score=0.838 total time=   0.0s
[CV 6/10] END algorithm=ball_tree, leaf_size=20, metric=euclidean, n_neighbors=4, p=3, weights=uniform;, score=0.850 total time=   0.0s
[CV 7/10] END algorithm=ball_tree, leaf_size=20, metric=euclidean, n_neighbors=4, p=3, weight

In [22]:
results_df.to_csv('results_df.csv', index=False, sep='|')
results_df

Unnamed: 0,Model,Best Estimator,Best Score (CV),Best Params
5,RandomForest,"(DecisionTreeRegressor(max_depth=30, max_featu...",0.937135,"{'oob_score': False, 'n_estimators': 600, 'min..."
6,ExtraTrees,"(ExtraTreeRegressor(max_depth=25, min_samples_...",0.93014,"{'oob_score': False, 'n_estimators': 300, 'min..."
2,CatBoost,<catboost.core.CatBoostRegressor object at 0x7...,0.922999,"{'random_strength': 3, 'max_depth': 10, 'learn..."
3,HistGradientBoosting,"HistGradientBoostingRegressor(max_depth=20, ma...",0.903688,"{'warm_start': True, 'quantile': 0.1, 'min_sam..."
0,KNeighbors,"KNeighborsRegressor(algorithm='ball_tree', met...",0.872765,"{'weights': 'distance', 'p': 4, 'n_neighbors':..."
1,LightGBM,"LGBMRegressor(colsample_bytree=0.6, device='gp...",0.86653,"{'subsample': 0.6, 'reg_lambda': 1.0, 'reg_alp..."
4,DecisionTree,"DecisionTreeRegressor(max_depth=30, random_sta...",0.827073,"{'splitter': 'best', 'min_samples_leaf': 1, 'm..."
7,XGBoost,"XGBRegressor(base_score=None, booster='dart', ...",0.752503,"{'tree_method': 'gpu_hist', 'subsample': 0.8, ..."


### VALIDATION DATA

In [23]:
df_val = pd.read_csv("./data/Submission_template_UHI2025-v2.csv")
gdf_val = gpd.GeoDataFrame(
    df_val,
    geometry=[Point(lon, lat) for lon, lat in zip(df_val.Longitude, df_val.Latitude)],
    crs="EPSG:4326"
).to_crs("EPSG:2263")

# We'll create a dictionary to hold computed values for each feature flag
computed_feature_values = {key: [] for key in FEATURE_FLAGS if FEATURE_FLAGS[key]}

total_rows = len(gdf_val)
print(f"total rows: {total_rows}")

for idx, row in gdf_val.iterrows():
    if idx % 100 == 0:
        percent_done = (idx / total_rows) * 100
        print(f"{percent_done:.2f}%")
    
    geom = row.geometry
    
    # 1) building_cov_100m
    if FEATURE_FLAGS["building_cov_100m"]:
        val_bld_100 = coverage_fraction(geom, gdf_buildings, 100)
        computed_feature_values["building_cov_100m"].append(val_bld_100)
    
    # 2) building_cov_200m
    if FEATURE_FLAGS["building_cov_200m"]:
        val_bld_200 = coverage_fraction(geom, gdf_buildings, 200)
        computed_feature_values["building_cov_200m"].append(val_bld_200)
    
    # 3) building_cov_500m
    if FEATURE_FLAGS["building_cov_500m"]:
        val_bld_500 = coverage_fraction(geom, gdf_buildings, 500)
        computed_feature_values["building_cov_500m"].append(val_bld_500)
    
    # 4) park_cov_1000m
    if FEATURE_FLAGS["park_cov_1000m"]:
        val_park_1000 = coverage_fraction(geom, gdf_parks, 1000)
        computed_feature_values["park_cov_1000m"].append(val_park_1000)
    
    # 5) water_cov_1000m
    if FEATURE_FLAGS["water_cov_1000m"]:
        val_water_1000 = coverage_fraction(geom, gdf_water, 1000)
        computed_feature_values["water_cov_1000m"].append(val_water_1000)
    
    # 6) street_tree_cov_500m
    if FEATURE_FLAGS["street_tree_cov_500m"]:
        val_tree_500 = coverage_fraction(geom, gdf_trees_poly, 500)
        computed_feature_values["street_tree_cov_500m"].append(val_tree_500)
    
    # 7) dist_to_closest_calculated_city_centroid
    if FEATURE_FLAGS["dist_to_closest_calculated_city_centroid"]:
        dcc = euclidean_distance(geom.x, geom.y, city_x, city_y)
        computed_feature_values["dist_to_closest_calculated_city_centroid"].append(dcc)
    
    # 8) dist_to_closest_subway_station
    if FEATURE_FLAGS["dist_to_closest_subway_station"]:
        dists = gdf_stations.geometry.distance(geom)
        dist_min = dists.min() if len(dists) > 0 else 0
        computed_feature_values["dist_to_closest_subway_station"].append(dist_min)
    
    # 9) location_cluster
    if FEATURE_FLAGS["location_cluster"]:
        # Suppose you have a kmeans object for location
        # If not, default to 0
        if kmeans is not None:
            arr = np.array([[geom.x, geom.y]])
            cluster_label = kmeans.predict(arr)[0]
        else:
            cluster_label = 0
        computed_feature_values["location_cluster"].append(cluster_label)
    
    # 10) lst_value
    if FEATURE_FLAGS["lst_value"]:
        lv = extract_raster_value(geom, lst_raster_2263, band_index=1)
        computed_feature_values["lst_value"].append(lv)
    
    # 11) ndvi_value
    if FEATURE_FLAGS["ndvi_value"]:
        ndv = extract_raster_value(geom, indices_raster_2263, band_index=1)
        computed_feature_values["ndvi_value"].append(ndv)
    
    # 12) ndbi_value
    if FEATURE_FLAGS["ndbi_value"]:
        ndb = extract_raster_value(geom, indices_raster_2263, band_index=2)
        computed_feature_values["ndbi_value"].append(ndb)
    
    # 13) ndwi_value
    if FEATURE_FLAGS["ndwi_value"]:
        ndw = extract_raster_value(geom, indices_raster_2263, band_index=3)
        computed_feature_values["ndwi_value"].append(ndw)
    
    # 14) evi_value -- if you had raw bands. This is turned off, so we skip unless it's True.

print("100.00% Done with feature computations.")

# Now attach these columns to df_val
for feat_key, feat_values in computed_feature_values.items():
    # We create a column named exactly as feat_key
    df_val[feat_key] = feat_values

# Create a list of feature columns from FEATURE_FLAGS (only those == True)
feature_cols = [key for key, val in FEATURE_FLAGS.items() if val]

# Build final X_val
df_val_feat = df_val[feature_cols].fillna(0.0)
X_val = df_val_feat.values
print("Validation shape:", X_val.shape)
print("feature_cols:", feature_cols)

total rows: 1040
0.00%
9.62%
19.23%
28.85%
38.46%
48.08%
57.69%
67.31%
76.92%
86.54%
96.15%
100.00% Done with feature computations.
Validation shape: (1040, 11)
feature_cols: ['building_cov_100m', 'building_cov_200m', 'building_cov_500m', 'park_cov_1000m', 'water_cov_1000m', 'street_tree_cov_500m', 'location_cluster', 'lst_value', 'ndvi_value', 'ndbi_value', 'ndwi_value']


In [27]:
df_val.describe()

Unnamed: 0,Longitude,Latitude,UHI Index,building_cov_100m,building_cov_200m,building_cov_500m,park_cov_1000m,water_cov_1000m,street_tree_cov_500m,location_cluster,lst_value,ndvi_value,ndbi_value,ndwi_value
count,1040.0,1040.0,0.0,1040.0,1040.0,1040.0,1040.0,1040.0,1040.0,1040.0,1040.0,1040.0,1040.0,1040.0
mean,-73.934816,40.807991,,0.228639,0.306988,0.326797,0.131451,0.027108,0.00158,4.245192,40.578069,-0.010139,-0.002524,0.026572
std,0.028661,0.0232,,0.183847,0.187521,0.163913,0.199796,0.068922,0.000805,2.591998,2.662403,0.056343,0.066481,0.041292
min,-73.993163,40.758877,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,33.171207,-0.193109,-0.352174,-0.413428
25%,-73.95703,40.790802,,0.062545,0.175813,0.215039,0.020485,0.0,0.001006,2.0,39.280917,-0.037819,-0.036624,0.017195
50%,-73.934618,40.809553,,0.214779,0.31739,0.331559,0.050155,0.0,0.0016,4.0,40.776301,-0.021421,-0.00511,0.032663
75%,-73.910655,40.823054,,0.357297,0.436838,0.434537,0.151146,0.003989,0.002139,7.0,42.246904,0.003323,0.029413,0.045315
max,-73.879537,40.859243,,0.998447,0.822876,0.674815,1.0,0.502626,0.003548,9.0,54.564594,0.524535,0.301264,0.143845


In [26]:
df_val

Unnamed: 0,Longitude,Latitude,UHI Index,building_cov_100m,building_cov_200m,building_cov_500m,park_cov_1000m,water_cov_1000m,street_tree_cov_500m,location_cluster,lst_value,ndvi_value,ndbi_value,ndwi_value
0,-73.971665,40.788763,,0.248892,0.417782,0.434283,0.022609,0.000000,0.002833,5,40.178148,-0.051394,-0.025287,0.058137
1,-73.971928,40.788875,,0.167185,0.401631,0.434841,0.023426,0.000000,0.002616,5,40.178148,-0.039764,-0.039936,0.046170
2,-73.967080,40.789080,,0.685125,0.469204,0.393609,0.402937,0.094989,0.001872,5,38.168352,-0.044698,0.024026,0.049971
3,-73.972550,40.789082,,0.505082,0.524747,0.419828,0.018978,0.000000,0.002479,5,40.311450,-0.012213,-0.059256,0.040509
4,-73.969697,40.787953,,0.576894,0.712504,0.645928,0.161423,0.000054,0.002761,5,38.130754,-0.025760,-0.075313,0.039932
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1035,-73.919388,40.813803,,0.374859,0.371426,0.425470,0.017420,0.000000,0.001819,3,44.901851,-0.076566,0.301264,0.040039
1036,-73.931033,40.833178,,0.119606,0.250616,0.235188,0.024506,0.110873,0.001466,6,39.904706,-0.018511,-0.066043,0.035125
1037,-73.934647,40.854542,,0.215837,0.468919,0.400809,0.026099,0.000000,0.001841,9,41.237734,0.072678,-0.054582,-0.068365
1038,-73.917223,40.815413,,0.423393,0.377644,0.422928,0.000000,0.000000,0.001072,3,43.435521,-0.021245,0.067223,0.025241


In [28]:
# 1) Identify top N from results_df
topN = results_df.head(3).reset_index(drop=True)  # e.g. top 3
base_models = []
for i in range(len(topN)):
    model_name = topN.loc[i, "Model"]
    estimator = topN.loc[i, "Best Estimator"]
    base_models.append((model_name, estimator))

print("\nTop Models:\n", base_models)
modelA, modelB, modelC = base_models[0][1], base_models[1][1], base_models[2][1]


Top Models:
 [('RandomForest', RandomForestRegressor(bootstrap=False, max_depth=30, max_features='sqrt',
                      min_samples_leaf=2, min_samples_split=3, n_estimators=600,
                      random_state=42)), ('ExtraTrees', ExtraTreesRegressor(max_depth=25, min_samples_leaf=3, min_samples_split=10,
                    n_estimators=300, random_state=42)), ('CatBoost', <catboost.core.CatBoostRegressor object at 0x7f90990b3d90>)]


### WEIGHTED SEARCH

In [29]:
def search_ensemble_weights_2(modelA, modelB, X, y, skf, increments=0.01):
    predsA_oof = np.zeros(len(X))
    predsB_oof = np.zeros(len(X))

    for train_idx, valid_idx in skf.split(X, y_bins):
        XA, XV = X[train_idx], X[valid_idx]
        ya, yv = y[train_idx], y[valid_idx]
        mA = clone(modelA); mB = clone(modelB)
        mA.fit(XA, ya); mB.fit(XA, ya)
        predsA_oof[valid_idx] = mA.predict(XV)
        predsB_oof[valid_idx] = mB.predict(XV)

    best_w, best_r2 = 0, -999
    for w1 in np.arange(0, 1.0 + increments, increments):
        blend = w1 * predsA_oof + (1-w1) * predsB_oof
        r2_ens = r2_score(y, blend)
        if r2_ens > best_r2:
            best_r2 = r2_ens
            best_w = w1
    return best_w, best_r2

def search_ensemble_weights_3(modelA, modelB, modelC, X, y, skf, increments=0.01):
    predsA_oof = np.zeros(len(X))
    predsB_oof = np.zeros(len(X))
    predsC_oof = np.zeros(len(X))

    for train_idx, valid_idx in skf.split(X, y_bins):
        XA, XV = X[train_idx], X[valid_idx]
        ya, yv = y[train_idx], y[valid_idx]
        clone(modelA).fit(XA, ya)
        clone(modelB).fit(XA, ya)
        clone(modelC).fit(XA, ya)

        predsA_oof[valid_idx] = modelA.predict(XV)
        predsB_oof[valid_idx] = modelB.predict(XV)
        predsC_oof[valid_idx] = modelC.predict(XV)

    best_combo, best_r2 = (0,0,0), -999
    for w1 in np.arange(0,1+increments,increments):
        for w2 in np.arange(0,1+increments,increments):
            w3 = 1 - w1 - w2
            if w3 < 0: continue
            blend = w1*predsA_oof + w2*predsB_oof + w3*predsC_oof
            r2_ens = r2_score(y, blend)
            if r2_ens > best_r2:
                best_r2 = r2_ens
                best_combo = (w1,w2,w3)
    return best_combo, best_r2

In [30]:
# Weighted 3-model ensemble with finer increments
best_combo, best_r2 = search_ensemble_weights_3(modelA, modelB, modelC, X, y, skf, increments=0.01)
w1, w2, w3 = best_combo
print(f"Best 3-model weights => w1={w1:.3f}, w2={w2:.3f}, w3={w3:.3f}; OOF R^2={best_r2:.5f}")

# Refit all on full data
finalA = clone(modelA).fit(X, y)
finalB = clone(modelB).fit(X, y)
finalC = clone(modelC).fit(X, y)

predA_val = finalA.predict(X_val)
predB_val = finalB.predict(X_val)
predC_val = finalC.predict(X_val)

final_ensemble_val = w1 * predA_val + w2 * predB_val + w3 * predC_val
df_val["UHI Index"] = final_ensemble_val

# Create a submission DataFrame with only the required columns
submission = df_val[["Longitude", "Latitude", "UHI Index"]]

os.makedirs("output", exist_ok=True)
submission.to_csv("output/submission_v14_3model_weighted.csv", index=False)
print("Saved 3-model weighted ensemble.")

Best 3-model weights => w1=1.000, w2=0.000, w3=0.000; OOF R^2=0.99721
Saved 3-model weighted ensemble.


### SAVE THE WEIGHTED ENSEMBLE

In [31]:
import pickle

final_ensemble_dict = {
    "model_names": ["ModelA","ModelB","ModelC"],
    "models": [finalA, finalB, finalC],
    "weights": (w1,w2,w3)
}
os.makedirs("models", exist_ok=True)
with open("models/3model_weighted_ensemble_v14.pkl","wb") as f:
    pickle.dump(final_ensemble_dict, f)
print("Saved 3-model weighted ensemble with discovered weights.")

Saved 3-model weighted ensemble with discovered weights.


### SIMPLE ENSEMBLE WITH AVERAGE

In [32]:
val_predsA = modelA.predict(X_val)
val_predsB = modelB.predict(X_val)
val_predsC = modelC.predict(X_val)
ensemble_preds_3 = (val_predsA + val_predsB + val_predsC) / 3
ensemble_preds_2 = (val_predsA + val_predsB) / 2

# Add predictions to the validation dataframe
df_val["UHI Index"] = ensemble_preds_3
submission = df_val[["Longitude", "Latitude", "UHI Index"]]

# Save the submission
os.makedirs("output", exist_ok=True)
submission.to_csv("output/submission_v14-avg3.csv", index=False)
print("Saved submission for average ensemble")

df_val["UHI Index"] = ensemble_preds_2
submission = df_val[["Longitude", "Latitude", "UHI Index"]]

# Save the submission
os.makedirs("output", exist_ok=True)
submission.to_csv("output/submission_v14-avg2.csv", index=False)
print("Saved submission for average ensemble")

df_val["UHI Index"] = val_predsA
submission = df_val[["Longitude", "Latitude", "UHI Index"]]

# Save the submission
os.makedirs("output", exist_ok=True)
submission.to_csv("output/submission_v14.csv", index=False)
print("Saved submission for average ensemble")

Saved submission for average ensemble
Saved submission for average ensemble
Saved submission for average ensemble


### OUT-OF-FOLD PREDICTION

In [33]:
# Let's do a custom out-of-fold prediction approach for these top 3

# A) Generate OOF predictions for each base model
#    We'll create arrays of shape [n_samples, n_base_models]
oof_preds = np.zeros((len(X), len(base_models)))

for idx, (mname, base_model) in enumerate(base_models):
    print(f"{idx}: {mname}; {base_model}")
    # We'll do a new copy of the model so we don't re-fit the original
    # or we can clone it
    model_clone = clone(base_model)

    # out-of-fold predictions
    fold_idx = 0
    for train_idx, valid_idx in skf.split(X, y_bins):
        print(f"\t{train_idx}, {valid_idx}")
        X_trainF, X_validF = X[train_idx], X[valid_idx]
        y_trainF, y_validF = y[train_idx], y[valid_idx]

        model_clone.fit(X_trainF, y_trainF)
        preds_validF = model_clone.predict(X_validF)
        oof_preds[valid_idx, idx] = preds_validF

oof_preds

0: RandomForest; RandomForestRegressor(bootstrap=False, max_depth=30, max_features='sqrt',
                      min_samples_leaf=2, min_samples_split=3, n_estimators=600,
                      random_state=42)
	[    0     2     3 ... 11265 11266 11267], [    1    11    17 ... 11252 11263 11268]
	[    0     1     2 ... 11266 11267 11268], [    8    21    22 ... 11254 11259 11260]
	[    0     1     2 ... 11266 11267 11268], [   19    25    28 ... 11231 11233 11261]
	[    0     1     2 ... 11266 11267 11268], [   33    59    72 ... 11209 11248 11264]
	[    0     1     3 ... 11266 11267 11268], [    2     6    13 ... 11228 11235 11257]
	[    0     1     2 ... 11266 11267 11268], [   10    24    49 ... 11210 11223 11241]
	[    0     1     2 ... 11265 11267 11268], [    3     4     7 ... 11238 11249 11266]
	[    0     1     2 ... 11264 11266 11268], [    5    18    34 ... 11253 11265 11267]
	[    1     2     3 ... 11266 11267 11268], [    0     9    38 ... 11242 11246 11258]
	[    0     1  

array([[1.01642201, 1.01684682, 1.01318253],
       [1.02025508, 1.02117015, 1.01917202],
       [1.02408301, 1.02241886, 1.02439831],
       ...,
       [0.97753032, 0.97830715, 0.97576247],
       [0.98239024, 0.98038406, 0.98117287],
       [0.98140884, 0.97933213, 0.98036391]])

### META-LEARNERS

In [34]:
from sklearn.linear_model import Ridge, Lasso, LinearRegression
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import StackingRegressor

scaler = StandardScaler()
X_scaled = scaler.fit_transform(oof_preds)
X_stacked= np.hstack([oof_preds, X])

meta_learners = {
    "Linear": LinearRegression(),
    "Ridge": Ridge(alpha=1.0, random_state=RANDOM_SEED),
    "Lasso": Lasso(alpha=1e-5, random_state=RANDOM_SEED),
    "XGB": xgb.XGBRegressor(n_estimators=100, learning_rate=0.05,
                            max_depth=15, random_state=RANDOM_SEED, 
                            subsample=0.9, colsample_bytree=0.9,
                            eval_metric="rmse", use_label_encoder=False,
                            tree_method="gpu_hist", predictor="gpu_predictor"),
    "LightGBM": lgb.LGBMRegressor(n_estimators=400, learning_rate=0.05, 
                                  min_child_samples=1, num_leaves=31,
                                  max_depth=None, random_state=RANDOM_SEED,
                                  device='gpu'),
    "RF": RandomForestRegressor(n_estimators=500, max_depth=None,
                                min_samples_split=4, min_samples_leaf=2, 
                                max_features=0.5, random_state=RANDOM_SEED),
    "MLP": MLPRegressor(hidden_layer_sizes=(256,128), activation="relu",
                        solver="adam", max_iter=500, random_state=RANDOM_SEED)
}

# We'll store results in a dict
meta_results = {}

for mname, meta_model in meta_learners.items():
    # Fit on the entire training set (oof_preds => y)
    meta_model.fit(X_scaled, y)
    
    # Evaluate OOF R² on the *same* data used for training
    # (some risk of overfitting, but a quick comparison is fine)
    preds_oof = meta_model.predict(X_scaled)
    r2_val = r2_score(y, preds_oof)
    
    meta_results[mname] = r2_val

# Print each meta-learner’s OOF R²
print("=== Meta-Learner Comparison ===")
for mname, score in sorted(meta_results.items(), key=lambda x: x[1], reverse=True):
    print(f"{mname} => OOF R2: {score:.5f}")

=== Meta-Learner Comparison ===
XGB => OOF R2: 0.98220
RF => OOF R2: 0.98018
LightGBM => OOF R2: 0.96387
Linear => OOF R2: 0.94269
Ridge => OOF R2: 0.94269
Lasso => OOF R2: 0.94269
MLP => OOF R2: 0.92740


In [35]:
from copy import deepcopy
from sklearn.base import clone
import numpy as np

# Suppose you have:
# base_models = [(name1, model1), (name2, model2), ...]
# meta_learners = { "LightGBM": best_LGB, "XGB": best_XGB, ... } from your previous cell
# meta_results = { "LightGBM": 0.99822, "XGB": 0.98399, ... } mapping each meta-learner to an OOF R²
# scaler = StandardScaler() # previously fit on oof_preds
# X, y => full training data
# oof_preds => shape (n_samples, len(base_models))

##############################################
# 1) Identify the best meta-learner from OOF
##############################################
best_meta_name = max(meta_results, key=meta_results.get)
best_meta_model = meta_learners[best_meta_name]
print(f"Best meta-learner: {best_meta_name}, OOF R2 => {meta_results[best_meta_name]:.5f}")

##############################################
# 2) Refit each base model on the FULL data
##############################################
base_models_fitted = []
full_preds_stack = np.zeros((len(X), len(base_models)))  # same shape logic as OOF, but now for entire data

for idx, (mname, base_model) in enumerate(base_models):
    print(f"{idx}: {mname}, {base_model}")
    # clone to avoid reusing partial state
    fm = clone(base_model)
    fm.fit(X, y)
    base_models_fitted.append((mname, fm))
    # store predictions
    full_preds_stack[:, idx] = fm.predict(X)

##############################################
# 3) Scale the stacked predictions
##############################################
full_preds_stack_scaled = scaler.transform(full_preds_stack)

##############################################
# 4) Refit the chosen meta-learner on FULL stack
##############################################
final_meta_learner = clone(best_meta_model)
final_meta_learner.fit(full_preds_stack_scaled, y)

print(f"Refitted best meta-learner ({best_meta_name}) on entire dataset.")

Best meta-learner: XGB, OOF R2 => 0.98220
0: RandomForest, RandomForestRegressor(bootstrap=False, max_depth=30, max_features='sqrt',
                      min_samples_leaf=2, min_samples_split=3, n_estimators=600,
                      random_state=42)
1: ExtraTrees, ExtraTreesRegressor(max_depth=25, min_samples_leaf=3, min_samples_split=10,
                    n_estimators=300, random_state=42)
2: CatBoost, <catboost.core.CatBoostRegressor object at 0x7f90990b3d90>
Refitted best meta-learner (XGB) on entire dataset.


In [36]:
# MAKE VALIDATION PREDICTIONS via meta-ensemble

# 1) Stack predictions from each base model
val_stack = np.zeros((len(X_val), len(base_models_fitted)))
for idx, (mname, fm) in enumerate(base_models_fitted):
    val_stack[:, idx] = fm.predict(X_val)

# 2) Scale the stacked predictions, using the same scaler fit on OOF
val_stack_scaled = scaler.transform(val_stack)

# 3) Meta-learner final predictions
final_val_preds = final_meta_learner.predict(val_stack_scaled)
final_val_preds

array([0.9913063 , 0.9921562 , 0.98269475, ..., 0.9902654 , 0.998148  ,
       0.9912577 ], dtype=float32)

In [37]:
# SAVE SUBMISSION
# df_val = pd.read_csv("./data/Submission_template_UHI2025-v2.csv")
df_val["UHI Index"] = final_val_preds
submission = df_val[["Longitude", "Latitude", "UHI Index"]]

os.makedirs("output", exist_ok=True)
submission_path = "output/submission_v14_meta.csv"
submission.to_csv(submission_path, index=False)
print(f"Saved {submission_path}")

Saved output/submission_v14_meta.csv


### TWEAK MODEL APPOACH

In [38]:
# We'll do a 3-model OOF to confirm final ensemble => then tweak
predsA_oof = np.zeros(len(X))
predsB_oof = np.zeros(len(X))
predsC_oof = np.zeros(len(X))

for train_idx, valid_idx in skf.split(X,y_bins):
    XA, XV = X[train_idx], X[valid_idx]
    ya, yv = y[train_idx], y[valid_idx]
    
    foldA = clone(modelA).fit(XA, ya)
    foldB = clone(modelB).fit(XA, ya)
    foldC = clone(modelC).fit(XA, ya)

    predsA_oof[valid_idx] = foldA.predict(XV)
    predsB_oof[valid_idx] = foldB.predict(XV)
    predsC_oof[valid_idx] = foldC.predict(XV)

# wA, wB, wC = (0.3,0.4,0.3)  # or from search_ensemble_weights_3
ensemble_oof = w1*predsA_oof + w2*predsB_oof + w3*predsC_oof
residual_oof = y - ensemble_oof

# Tweak model:
tweak_model = DecisionTreeRegressor(max_depth=4).fit(X, residual_oof)

# Full-data refit
finalA = clone(modelA).fit(X,y)
finalB = clone(modelB).fit(X,y)
finalC = clone(modelC).fit(X,y)

predA_val = finalA.predict(X_val)
predB_val = finalB.predict(X_val)
predC_val = finalC.predict(X_val)

ensemble_val = w1*predA_val + w2*predB_val + w3*predC_val
residual_val = tweak_model.predict(X_val)
final_pred = ensemble_val + residual_val

df_val["UHI Index"] = final_pred
submission = df_val[["Longitude", "Latitude", "UHI Index"]]
submission_path = "output/submission_v14_tweak_3model.csv"
submission.to_csv(submission_path, index=False)
print(f"Saved {submission_path}")

Saved output/submission_v14_tweak_3model.csv


### SAVE TOP MODELS

In [39]:
# Save each base model
os.makedirs("models", exist_ok=True)
for i, (mname, fm) in enumerate(base_models_fitted):
    output_path = f"models/base_{mname}_model_{i}_v14.pkl"
    dump(fm, output_path)
    print(f"Saved base model: {output_path}")

# Save final meta-learner
import datetime
timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M")
meta_output_path = f"models/final_meta_learner_{best_meta_name}_{timestamp}.pkl"SS
dump(final_meta_learner, meta_output_path)
print(f"Saved final meta-learner ({best_meta_name}) => {meta_output_path}")

Saved base model: models/base_RandomForest_model_0_v14.pkl
Saved base model: models/base_ExtraTrees_model_1_v14.pkl
Saved base model: models/base_CatBoost_model_2_v14.pkl
Saved final meta-learner (XGB) => models/final_meta_learner_XGB_20250208_1757.pkl
