# Polygonization by rivers & railways - get raw data; filter by tag; polygonize

In this notebook, for each city, we
* load the city boundary polygon
* download and save raw OSM data (features) on rivers/railways for this polygon
* load the information on which tags to keep (derived from **manual exploration** of each data set, cf. `03_naturalbarriers_explore.ipynb`)
* process the data set (dropping unwanted tags), merge, and polygonize
* save polygons as output ("fragmentation by rivers/railways/borders" in each city)

In [None]:
# import warnings
# warnings.filterwarnings("ignore")
import os
# import sys
# import argparse
# import random
# from time import time
# import numpy as np
import pandas as pd
# import pickle
import matplotlib.pyplot as plt
import geopandas as gpd
# from shapely.geometry import LineString, Point, MultiPoint, MultiLineString, Polygon
# from tqdm import tqdm
# from geopy.distance import distance, geodesic, great_circle
import osmnx as ox
# import networkx as nx
# import scipy.stats
# from scipy.stats import ks_2samp
# import sklearn
# import igraph
# from igraph import Graph

# from random import choice
# from bisect import bisect_left
# import copy

# from functools import partial
# import pyproj
# from pyproj import Geod
# from pyproj.crs import ProjectedCRS
# from pyproj.crs.coordinate_operation import AzimuthalEquidistantConversion
# from shapely.ops import transform

import yaml
# from utils import *
import shapely
# ox.__version__
import momepy
import contextily as cx
import folium

**Def tiny functions**
(plots for sanity check)

In [None]:
def plot_polygons(polygons, natbar, folder_plots, city_name):
    """Plot the polygons together with their barriers and save the figure as PNG.

    Parameters
    ----------
    polygons : GeoDataFrame
        Polygons to draw (light fill plus dotted black outlines).
    natbar : GeoDataFrame
        Barrier lines; must contain a "barrier_type" column. Rows whose
        barrier_type is "city_border" define the extent of the plot.
    folder_plots : str
        Folder the figure is written to.
    city_name : str
        Used as the plot title and as the name of the PNG file.

    Returns
    -------
    None
        The figure is saved to ``<folder_plots>/<city_name>.png`` and closed.
    """
    fig, ax = plt.subplots(1, 1, figsize=(20, 20))
    polygons.plot(ax=ax, color="#F7F7F7", alpha=.5)
    polygons.boundary.plot(ax=ax, color="black", lw=1, linestyle="dotted")
    natbar.plot(ax=ax, column="barrier_type", legend=True, lw=10, alpha=.2)

    # zoom to the extent of the city border; if there is no border row,
    # keep the automatic limits instead of failing with an IndexError
    border = natbar[natbar["barrier_type"] == "city_border"]
    if not border.empty:
        minx, miny, maxx, maxy = border.total_bounds
        ax.set_xlim([minx, maxx])
        ax.set_ylim([miny, maxy])

    ax.set_axis_off()
    ax.set_title(city_name, fontsize=20)
    fig.tight_layout()
    # os.path.join also works when folder_plots has no trailing separator
    fig.savefig(os.path.join(folder_plots, f"{city_name}.png"), dpi=300, bbox_inches="tight")
    # close this specific figure rather than whatever happens to be "current"
    plt.close(fig)
    return None

In [None]:
# projected CRS that the cells below reproject to before computing lengths and areas
proj_crs = "epsg:9311"

In [None]:
# define folder paths (single source of truth for every output location)
folder_natbar = "../data/natural_barriers/"
folder_osm = "../data/natural_barriers/raw/"
folder_poly = "../data/natural_barriers/polygonized/"
folder_plots = "../data/natural_barriers/plots/"

# create subfolders for output data
# (the list is built from the variables above so that a forgotten comma can no
# longer glue two path literals together and leave the plots folder uncreated)
folders = [
    folder_natbar,
    folder_osm,
    folder_poly,
    folder_plots,
]
for folder in folders:
    os.makedirs(folder, exist_ok=True)

# OSM keys that are downloaded and treated as barriers
barrier_types = ["railway", "waterway"]

In [None]:
# Load the CBSA table (one row per city) that the rest of the notebook loops over
cbsacode_file = '../data/cbsacode.csv'
df_cbsacodes = pd.read_csv(cbsacode_file)
# keep only the columns that are needed
df_cbsacodes = df_cbsacodes[
    ["cbsacode", "name", "full_name", "geometry", "west", "south", "east", "north"]
]
# the geometry column is stored as WKT text: parse it into shapely geometries
df_cbsacodes["geometry"] = df_cbsacodes["geometry"].map(shapely.from_wkt)
# wrap in a GeoDataFrame, declaring the coordinates as EPSG:4326
gdf_cbsacodes = gpd.GeoDataFrame(df_cbsacodes, crs="EPSG:4326")
gdf_cbsacodes.head()

In [None]:
# display the CBSA table that the download loop below iterates over
gdf_cbsacodes

Download and save data sets

In [None]:
### Download and save OSM data

# nested dict filled below: city name -> barrier type -> GeoDataFrame of OSM features
data_dict = {}

for _, row in gdf_cbsacodes.iterrows():

    cbsacode, city_name, geom = row.cbsacode, row["name"], row.geometry

    print(f"{city_name}:")

    city_data = {}
    data_dict[city_name] = city_data

    for barrier_type in barrier_types:

        # raw features are cached on disk, one file per city and barrier type
        barrier_path = folder_osm + f'{cbsacode}_{barrier_type}.gpkg'

        if not os.path.exists(barrier_path):
            print(f"\t file not found, downloading OSM data: {barrier_type}")
            # fetch every feature inside the city polygon that carries this key
            features = ox.features_from_polygon(polygon=geom, tags={barrier_type: True})
            # split multi-part geometries into single parts, keep only geometry + tag value
            features = features.explode(ignore_index=True)
            features = features[["geometry", barrier_type]]
            features.to_file(barrier_path)
        else:
            print(f"\t file found, loading OSM data: {barrier_type}")
            features = gpd.read_file(barrier_path)

        city_data[barrier_type] = features
        del features

        print("\t done")

*******
*******

# Manual verification

(cf. `03_naturalbarriers_explore.ipynb`)

*******
*******


**Load tag dict** (derived manually)

In [None]:
# Read the manually derived tag selection from barriertags.yml.
# NOTE(review): yaml.FullLoader is acceptable for a trusted, hand-written file;
# consider yaml.safe_load if this file could ever come from an external source.
with open("barriertags.yml", "r") as tag_file:
    tags = yaml.load(tag_file, Loader=yaml.FullLoader)
tags

**reduce data sets** accordingly

In [None]:
# For every city in the tag dict: keep only the selected line features, combine
# them with the city border, polygonize, and save barriers, polygons and a plot.
for city_name, tag_dict in tags.items():

    # look the city up once and reuse the row for both code and border
    city_row = gdf_cbsacodes[gdf_cbsacodes["name"] == city_name]
    cbsacode = city_row.cbsacode.values[0]

    natbar_path = folder_poly + f"{cbsacode}_natbar.gpkg"
    polygon_path = folder_poly + f"{cbsacode}_polygons.gpkg"

    # do this only if not done yet
    if os.path.exists(natbar_path) and os.path.exists(polygon_path):
        continue

    print("Runing for", city_name)
    city_border = gpd.GeoDataFrame(
        {
            "geometry": [city_row.boundary.values[0]],
            "barrier_type": "city_border"
        },
        crs=gdf_cbsacodes.crs
    )

    gdfs = [city_border]

    for barrier_type in barrier_types:
        gdf = data_dict[city_name][barrier_type]
        # keep LineStrings whose tag value was selected for this city
        keep = (gdf.geom_type == "LineString") & gdf[barrier_type].isin(tag_dict[barrier_type])
        # rename instead of assign-then-delete: this yields a new frame, so neither
        # the cached raw data nor a filtered slice is modified in place
        gdf = gdf.loc[keep].rename(columns={barrier_type: "barrier_type"})
        gdfs.append(gdf)

    # now we have a list of 3 gdfs: city border, railway, waterway. concat them all
    # and save to file - these are the polygon outlines
    natbar = pd.concat(gdfs).reset_index(drop=True)
    natbar.to_file(natbar_path, index=False)

    # and polygonize & save
    polygons = momepy.FaceArtifacts(natbar).polygons[["geometry"]].set_crs(natbar.crs)
    polygons.to_file(polygon_path, index=False)

    plot_polygons(polygons, natbar, folder_plots, city_name)

    del natbar, polygons

## postprocess charlotte - remove short rivers!!

In [None]:
# Re-run the reduce/polygonize step for Charlotte, additionally dropping short
# waterway pieces. Existing output files for this city are overwritten.
city_name = "Charlotte"
tag_dict = tags[city_name].copy()

# waterway pieces not longer than this are dropped; the value is compared with
# lengths measured in proj_crs (presumably metres - confirm)
min_waterway_length = 1000

city_row = gdf_cbsacodes[gdf_cbsacodes["name"] == city_name]
cbsacode = city_row.cbsacode.values[0]
print(cbsacode)

natbar_path = folder_poly + f"{cbsacode}_natbar.gpkg"
polygon_path = folder_poly + f"{cbsacode}_polygons.gpkg"

city_border = gpd.GeoDataFrame(
    {
        "geometry": [city_row.boundary.values[0]],
        "barrier_type": "city_border"
    },
    crs=gdf_cbsacodes.crs
)

gdfs = [city_border]

for barrier_type in barrier_types:
    gdf = data_dict[city_name][barrier_type]
    # keep LineStrings whose tag value was selected for this city
    keep = (gdf.geom_type == "LineString") & gdf[barrier_type].isin(tag_dict[barrier_type])
    gdf = gdf.loc[keep].rename(columns={barrier_type: "barrier_type"})
    # for waterways, drop short pieces:
    if barrier_type == "waterway":
        print(len(gdf))
        # measure lengths in the projected CRS but filter the original rows, so the
        # kept geometries are not altered by a round trip through proj_crs
        gdf = gdf[gdf.to_crs(proj_crs).length > min_waterway_length]
        print(len(gdf))
    # data_dict is deliberately NOT overwritten with the filtered frame: the filtered
    # frame no longer has the raw tag column, so re-running this cell would fail
    gdfs.append(gdf)

# The steps below run once, after BOTH barrier types were collected
# (previously they sat inside the loop and ran after railways alone as well).

# now we have a list of 3 gdfs: city border, railway, waterway. concat them all
# and save to file - these are the polygon outlines
natbar = pd.concat(gdfs).reset_index(drop=True)
natbar.to_file(natbar_path, index=False)

# and polygonize & save
polygons = momepy.FaceArtifacts(natbar).polygons[["geometry"]].set_crs(natbar.crs)
polygons.to_file(polygon_path, index=False)

plot_polygons(polygons, natbar, folder_plots, city_name)

del natbar, polygons

## reprocess orlando and miami - removing short canals

**orlando / miami** (the cells below currently run with `city_name = "Miami"`)

In [None]:
# select the city and barrier type to inspect
city_name = "Miami"
barrier_type = "waterway"

# look up the CBSA code of the selected city
is_city = gdf_cbsacodes["name"] == city_name
cbsacode = gdf_cbsacodes.loc[is_city, "cbsacode"].values[0]

# read the raw OSM features that were saved for this city and barrier type
folder_osm = "../data/natural_barriers/raw/"
barrier_path = f"{folder_osm}{cbsacode}_{barrier_type}.gpkg"
gdf = gpd.read_file(barrier_path)

# sanity check: only these geometry types are expected in the raw data
expected_geom_types = {"Point", "LineString", "Polygon"}
assert set(gdf.geom_type.unique()) <= expected_geom_types, "Unexpected geom type, double check"

In [None]:
# keep LineString features carrying one of the listed tag values, with a fresh index
is_line = gdf.geom_type == "LineString"
has_kept_tag = gdf[barrier_type].isin(["canal", "river", "stream", "drain"])
gdf = gdf.loc[is_line & has_kept_tag].copy().reset_index(drop=True)

In [None]:
# reproject to proj_crs before measuring lengths
gdf = gdf.to_crs(proj_crs)

In [None]:
# store each feature's length, measured in the CRS the frame is currently in
gdf["lengths"] = gdf.length

In [None]:
# interactive map of the filtered features, coloured by the barrier_type column
m = gdf.explore(tiles = "cartodb positron", column = barrier_type, name = "all")
# add a layer control to the map
folium.LayerControl().add_to(m)
# display the map as the cell output
m

In [None]:
# check which CRS the frame is currently in
gdf.crs

try merging small polygons to their (also small) neighbours?


In [None]:
# load the polygons saved earlier
# NOTE(review): the CBSA code 33100 is hard-coded here; presumably it should match
# `cbsacode` of the city selected above - confirm
poly = gpd.read_file("../data/natural_barriers/polygonized/33100_polygons.gpkg")

* add area column
* find reasonable threshold for merging adjacent polygons (contiguous! cf simp) that are too small

In [None]:
from esda.shape import diameter_ratio
from libpysal import graph

In [None]:
# project
poly = poly.to_crs(proj_crs)
poly["sqkm"] = poly.area / 10**6
poly["i"] = poly.geometry.apply(lambda x: diameter_ratio(x)[0])

In [None]:
# poly[poly.i<0.12].explore(column = "i", tiles = "cartodb positron")

In [None]:
# candidates for merging: diameter ratio above 0.12 AND area below 7
above_ratio = poly["i"] > 0.12
below_area = poly["sqkm"] < 7
poly_merge = poly.loc[above_ratio & below_area].copy()

In [None]:
#poly_merge.explore(tiles="cartodb positron", column="sqkm")

In [None]:
# build a contiguity graph over the merge candidates (rook=True)
rook = graph.Graph.build_contiguity(poly_merge, rook=True)

In [None]:
# copy the index into a column so it can be used for row-wise lookups
poly_merge["label"] = poly_merge.index

In [None]:
# for each candidate, look up its neighbours in the contiguity graph by label
poly_merge["neighbours"] = poly_merge.label.apply(lambda x: rook.neighbors[x])

In [None]:
# store the component label that the contiguity graph reports for each candidate
poly_merge["comp"] = rook.component_labels

In [None]:
# poly_merge.explore(tiles="cartodb positron", column="comp", cmap = "Dark2")

In [None]:
# manually untangle: 642 & 643 are their own comp; also 701

In [None]:
# manual overrides: move the listed polygons into new components of their own,
# numbered after the highest existing component label
last_comp = int(poly_merge["comp"].max())
manual_components = {
    last_comp + 1: [642, 643],
    last_comp + 2: [701],
}
for comp_id, members in manual_components.items():
    poly_merge.loc[members, "comp"] = comp_id

In [None]:
# use info to merge rest of polygons

In [None]:
# initialise the merge-label column; None marks polygons without a merge label
poly["merge"] = None

In [None]:
# copy the component label of every merge candidate into poly["merge"] with one
# index-aligned assignment instead of one .loc write per row via iterrows
poly.loc[poly_merge.index, "merge"] = poly_merge["comp"]

In [None]:
# inspect the rows of poly that correspond to the merge candidates
poly.loc[poly_merge.index]

In [None]:
# polygons without a merge label are not merged with anything - keep them as they are
poly_keep = poly[poly["merge"].isna()].copy()

In [None]:
# polygons that share a component label are dissolved into one geometry per component
geoms = [
    members.geometry.union_all()
    for _, members in poly_merge.groupby("comp")
]

In [None]:
# untouched polygons first, followed by the merged geometries
combined_geoms = list(poly_keep["geometry"]) + geoms
new_gdf = gpd.GeoDataFrame({"geometry": combined_geoms}, crs=poly.crs)
# convert back to EPSG:4326
new_gdf = new_gdf.to_crs("EPSG:4326")

In [None]:
# interactive comparison: merged result as the base layer ...
m = new_gdf.explore(tiles="cartodb positron", name = "new")
# ... with the original polygons drawn on the same map in red
poly.explore(m=m, name = "old", color = "red")
# add a layer control to the map
folium.LayerControl().add_to(m)
# display the map as the cell output
m

redo this but without dropping indices -- then we can track which polygons to merge!

In [None]:
# interactive map of poly for visual inspection
poly.explore()