In [None]:
import os
import numpy as np
import xarray as xr
import pandas as pd
import yaml
import glob
import geopandas as gpd
from shapely.geometry import box
from shapely.ops import unary_union
from shapely.geometry import Point


## Create a bounding-box shapefile for each city

In [None]:
# Load the city catalogue from YAML into `cities`.
# NOTE(review): a later cell iterates `cities.items()` and reads info["name"],
# so this is expected to be a mapping of city key -> description dict — confirm
# against the YAML file.
# The encoding is given explicitly so that non-ASCII city names are decoded the
# same way on every platform instead of depending on the locale default.
with open("selected_cities_description.yaml", "r", encoding="utf-8") as yaml_file:
    cities = yaml.safe_load(yaml_file)

# Ensure that the output folder 'box/' exists before any shapefile is written
os.makedirs("box", exist_ok=True)

In [None]:
def find_masks(city_name):
    """
    Collect the paths of every `urmask*.nc` file belonging to a city.

    Sub-directories of `results/` and `results_CERRA/` whose name contains
    `city_name` are searched (relative to the current working directory).
    Matches from `results/` come first, followed by those from
    `results_CERRA/`. An empty list is returned when nothing matches.
    """
    patterns = (
        f"results/*{city_name}*/urmask*.nc",
        f"results_CERRA/*{city_name}*/urmask*.nc",
    )
    return [path for pattern in patterns for path in glob.glob(pattern)]

In [None]:
def get_expanded_bounding_box(masks, var_name="urmask", padding=0.125):
    """
    Compute one padded bounding box that encloses the selected cells of all masks.

    A cell is selected when its value in `var_name` equals 0; NaN cells and
    cells holding any other value are ignored.
    NOTE(review): this function was previously described as covering all
    "non-NaN values", while the test has always been `== 0` — confirm which
    selection is intended.

    Parameters
    ----------
    masks : iterable
        Dataset-like objects supporting `mask[var_name].values`,
        `mask.lat.values` and `mask.lon.values`. The variable must be 2-D,
        with its first axis indexed by `lat` and its second by `lon`
        (assumes 1-D `lat`/`lon` coordinates — TODO confirm for all inputs).
    var_name : str
        Name of the mask variable to inspect.
    padding : float
        Margin, in coordinate units, added on every side of the box.
        Presumably half a grid cell of a 0.25-degree grid — verify.

    Returns
    -------
    tuple or None
        `(lat_min, lat_max, lon_min, lon_max)` over all masks, or `None` when
        no mask contains a selected cell.
    """
    lat_min_list, lat_max_list = [], []
    lon_min_list, lon_max_list = [], []

    for mask in masks:
        selected = mask[var_name].values == 0
        rows, cols = np.where(selected)

        if rows.size == 0:
            continue  # nothing selected in this mask

        # Take min/max over the coordinate *values* of the selected cells
        # rather than the values at the min/max *indices*: the latter is only
        # correct for ascending axes and would shrink or invert the box on a
        # descending latitude axis.
        lat_selected = np.asarray(mask.lat.values)[rows]
        lon_selected = np.asarray(mask.lon.values)[cols]

        lat_min_list.append(lat_selected.min() - padding)
        lat_max_list.append(lat_selected.max() + padding)
        lon_min_list.append(lon_selected.min() - padding)
        lon_max_list.append(lon_selected.max() + padding)

    if not lat_min_list:
        return None  # No valid data

    return min(lat_min_list), max(lat_max_list), min(lon_min_list), max(lon_max_list)

In [None]:
# For every configured city: locate its mask files, compute the bounding box
# that covers all of them, and write that box as a one-feature shapefile.
for city, info in cities.items():
    city_name = info["name"]
    # Mask folders are matched on the YAML key, not on the display name
    mask_paths = find_masks(city)

    if not mask_paths:
        print(f"No masks found for {city_name}")
        continue

    # Open the datasets only for as long as the bounding box is being computed
    # and always close them, so file handles are not leaked across iterations.
    masks = []
    try:
        for path in mask_paths:
            masks.append(xr.open_dataset(path))

        # Obtain the largest bounding box
        bbox = get_expanded_bounding_box(masks)
    finally:
        for dataset in masks:
            dataset.close()

    if bbox is None:
        print(f"No valid bounding box for {city_name}")
        continue

    lat_min, lat_max, lon_min, lon_max = bbox
    geom = box(lon_min, lat_min, lon_max, lat_max)  # Create rectangle (x = lon, y = lat)

    # Create a GeoDataFrame with the rectangle
    gdf = gpd.GeoDataFrame({"city": [city_name]}, geometry=[geom], crs="EPSG:4326")

    # Save the shapefile in the folder 'box/'; the file is named after the YAML key
    shp_path = f"box/{city}_bbox.shp"
    gdf.to_file(shp_path)

    # Report the path that was actually written (key-based, not name-based)
    print(f"Saved {city_name} bounding box to '{shp_path}'")

## Combine the per-city shapefiles into a single shapefile

In [None]:
# Directory containing the per-city shapefiles
box_dir = "box/"

# Path of the combined result. It lives in the same folder as the inputs, so it
# is defined up front and excluded from the input listing below.
output_path = "box/all_cities_bbox.shp"
output_name = os.path.basename(output_path)

# List all per-city .shp files in the box directory. The combined file from a
# previous run is skipped (otherwise it would be re-ingested as a city called
# "all"), and the list is sorted because os.listdir order is arbitrary.
shp_files = sorted(
    os.path.join(box_dir, f)
    for f in os.listdir(box_dir)
    if f.endswith(".shp") and f != output_name
)

gdfs = []

for shp in shp_files:
    # Recover the city key from "<city>_bbox.shp" by stripping the known suffix.
    # Splitting at the first "_" would truncate keys that contain underscores.
    stem = os.path.splitext(os.path.basename(shp))[0]
    city_name = stem[: -len("_bbox")] if stem.endswith("_bbox") else stem
    gdf = gpd.read_file(shp)

    # Remove 'FID' column if it exists
    if "FID" in gdf.columns:
        gdf = gdf.drop(columns=["FID"])

    # Overwrite the 'city' column with the key derived from the file name
    gdf["city"] = city_name
    gdfs.append(gdf)

if gdfs:
    # Concatenate all GeoDataFrames into a single one
    gdf_all = gpd.GeoDataFrame(pd.concat(gdfs, ignore_index=True))

    # Dissolve geometries by city so that each city ends up as one feature
    gdf_final = gdf_all.dissolve(by="city")

    # Save the final shapefile
    gdf_final.to_file(output_path)

    print(f"Saved combined multipolygon to {output_path}")
else:
    print("No shapefiles found in the 'box/' directory.")


In [None]:
# Reload the combined bounding-box shapefile from disk
root = "box/"
# os.path.join avoids the doubled separator ("box//...") that plain string
# concatenation produced when `root` already ends with a slash.
all_cities = gpd.read_file(os.path.join(root, "all_cities_bbox.shp"))

In [None]:
# Quick visual check of the combined bounding boxes. The figure is labelled so
# it can be read on its own; the trailing ';' suppresses the repr echo.
# NOTE(review): axis labels assume geographic lon/lat coordinates (the boxes
# are written with crs="EPSG:4326") — confirm if the CRS ever changes.
ax = all_cities.plot()
ax.set(title="City bounding boxes", xlabel="Longitude", ylabel="Latitude");