# Polygonization by rivers & railways - manual exploration (tags)

In this notebook, for a given city, we
* load the raw OSM data on waterways/railways for the polygon of that city
* visualize the data to explore it
* verify that we can drop Points and Polygons (i.e. keep only LineStrings)
* annotate all tag values to keep in `barriertags.yml`


In [None]:
# import warnings
# warnings.filterwarnings("ignore")
import os
# import sys
# import argparse
# import random
# from time import time
# import numpy as np
import pandas as pd
# import pickle
import matplotlib.pyplot as plt
import geopandas as gpd
# from shapely.geometry import LineString, Point, MultiPoint, MultiLineString, Polygon
# from tqdm import tqdm
# from geopy.distance import distance, geodesic, great_circle
import osmnx as ox
# import networkx as nx
# import scipy.stats
# from scipy.stats import ks_2samp
# import sklearn
# import igraph
# from igraph import Graph

# from random import choice
# from bisect import bisect_left
# import copy

# from functools import partial
# import pyproj
# from pyproj import Geod
# from pyproj.crs import ProjectedCRS
# from pyproj.crs.coordinate_operation import AzimuthalEquidistantConversion
# from shapely.ops import transform

import yaml
# from utils import *
import shapely
# ox.__version__
import momepy
import folium

In [None]:
# Read the CBSA table; it is used below to look up the code of the chosen city.
cbsacode_file = '../data/cbsacode.csv'
cbsa_columns = ["cbsacode", "name", "full_name", "geometry", "west", "south", "east", "north"]

# `.copy()` makes the column subset an independent frame, so that overwriting
# the geometry column below does not trigger a SettingWithCopyWarning.
df_cbsacodes = pd.read_csv(cbsacode_file)[cbsa_columns].copy()

# The geometry column is stored as WKT text; parse the whole column in one
# vectorized call instead of a per-row lambda.
df_cbsacodes["geometry"] = shapely.from_wkt(df_cbsacodes["geometry"].to_numpy())

# Convert to a GeoDataFrame, naming the geometry column explicitly.
gdf_cbsacodes = gpd.GeoDataFrame(df_cbsacodes, geometry="geometry", crs="EPSG:4326")
gdf_cbsacodes

# Choose city and barrier type to explore

In [None]:
# Parameters of this exploration run.
city_name = "Milwaukee"  # must match a value in the `name` column of the CBSA table
barrier_type = "waterway"  # OSM key to explore: "waterway" or "railway"

# Fail early on a typo: otherwise the mistake only surfaces later, when the
# raw file for this barrier type is read.
valid_barrier_types = ("railway", "waterway")
if barrier_type not in valid_barrier_types:
    raise ValueError(
        f"barrier_type must be one of {valid_barrier_types}, got {barrier_type!r}"
    )

**load data**

In [None]:
# Look up the CBSA code of the chosen city. An unknown city name would otherwise
# end in a bare IndexError, so raise a readable error instead.
city_matches = gdf_cbsacodes.loc[gdf_cbsacodes["name"] == city_name, "cbsacode"]
if city_matches.empty:
    raise ValueError(f"City {city_name!r} not found in the 'name' column of the CBSA table")
cbsacode = city_matches.values[0]  # first match, if the name occurs more than once

# Raw OSM extract for this city and barrier type.
folder_osm = "../data/natural_barriers/raw/"
barrier_path = folder_osm + f'{cbsacode}_{barrier_type}.gpkg'
gdf = gpd.read_file(barrier_path)

# The steps below only handle these three geometry types; anything else
# needs a manual look, so report which types were found.
expected_geom_types = {"Point", "LineString", "Polygon"}
unexpected_geom_types = set(gdf.geom_type.unique()) - expected_geom_types
assert not unexpected_geom_types, (
    f"Unexpected geom type, double check: {sorted(map(str, unexpected_geom_types))}"
)

**Can we drop Points and Polygons?** (if yes - no need to do anything further)

In [None]:
# One toggleable map layer per geometry type. The first layer creates the map
# (m=None, default colour); the others are drawn onto it in their own colour.
layer_colors = {"LineString": None, "Point": "red", "Polygon": "green"}

m = None
for geom_type, layer_color in layer_colors.items():
    m = gdf[gdf.geom_type == geom_type].explore(
        m=m, tiles = "CartoDB.Positron", name = geom_type, color = layer_color
    )

folium.LayerControl().add_to(m)
m

**Which tags should we keep?**

In [None]:
# From here on only line geometries are considered.
is_linestring = gdf.geom_type == "LineString"
gdf = gdf[is_linestring]

# Distinct tag values present for the chosen barrier type.
gdf[barrier_type].unique()

In [None]:
# Map all line geometries, coloured by their tag value.
# To inspect only some tag values, filter before calling `explore`, e.g.
#   gdf[gdf[barrier_type].isin(["rail", "abandoned", "disused"])]
#   gdf[gdf.waterway.isin(["river", "dam"])]
explore_kwargs = {
    "tiles": "CartoDB.Positron",
    "column": barrier_type,
    "cmap": "Set2",
    "opacity": .9,
    "style_kwds": {"weight": 5},
}
m = gdf.explore(**explore_kwargs)
folium.LayerControl().add_to(m)
m

*******
*******

# Manual verification:

*This has currently only been done for one city (Austin); all other cities would need to be checked manually, too.*

* Can we drop Points?
* Can we drop Polygons?
* For LineStrings, which tag values for `waterway` and `railway` should we be keeping?

*******
*******


Geometry type Point: for now, this is a manual step. Verify whether the Point geometries can be dropped.

In [None]:
# fig, ax = plt.subplots(1,1,figsize = (10,10))
# gpd.GeoSeries([city_boundaries], crs=gdf_cbsacodes.crs).plot(ax=ax, alpha = .1, color = "grey")
# railways[railways.geom_type =="Polygon"].plot(ax=ax, color = "red")
# waterways[waterways.geom_type =="Polygon"].plot(ax=ax, color = "blue")
# ax.set_axis_off()

Geometry type Polygon: for now, this is a manual step. Check whether the Polygon geometries can be ignored.

In [None]:
# fig, ax = plt.subplots(1,1,figsize = (10,10))
# gpd.GeoSeries([city_boundaries], crs=gdf_cbsacodes.crs).plot(ax=ax, alpha = .1, color = "grey")
# railways[railways.geom_type =="Polygon"].plot(ax=ax, color = "red")
# waterways[waterways.geom_type =="Polygon"].plot(ax=ax, color = "blue")
# ax.set_axis_off()

For linestrings, identify relevant tag values:

In [None]:
# m = gpd.GeoSeries([city_boundaries],crs=waterways.crs).explore(
#     opacity =.1, name = "city", tiles="CartoDB.Positron")
# waterways[["waterway","geometry"]].explore(
#     m=m,column = "waterway",name="waterway"
#     )
# folium.LayerControl().add_to(m)
# m

Executive decision: for this case, we keep only the LineStrings with the tag value `river`.

In [None]:
# m = gpd.GeoSeries([city_boundaries],crs=railways.crs).explore(
#     opacity =.1, name = "city", tiles="CartoDB.Positron")
# railways[["railway","geometry"]].explore(
#     m=m,column = "railway",name="railway"
#     )
# folium.LayerControl().add_to(m)
# m

Executive decision: we keep only the LineStrings with `railway=rail`.

***
***

# Inserting results from manual verification:

In [None]:
# Results of the manual verification, keyed by city and then by barrier type.
# For each barrier type:
#   drop_points / drop_polygons: whether those geometry types may be discarded
#   keep_tags: list of strings (tag values to keep)
# The values below have currently only been verified for Austin.
process_dict = {
    city_name: {
        "railway": {
            "drop_points": True,  # hopefully so!
            "drop_polygons": True,  # hopefully so!
            "keep_tags": ["rail"],
        },
        "waterway": {
            "drop_points": True,  # hopefully so!
            "drop_polygons": True,  # hopefully so!
            "keep_tags": ["river"],
        },
    }
}


***
***

In [None]:
# Further data preprocessing according to the manual verification.
#
# Everything this cell needs is derived from the CBSA table and the raw files
# on disk, so it does not depend on variables left over from an earlier
# session and still works after "Restart Kernel -> Run All".

# NOTE(review): assumes the CBSA polygon of the chosen city is the intended
# city boundary -- confirm.
city_boundaries = gdf_cbsacodes.loc[
    gdf_cbsacodes["name"] == city_name, "geometry"
].iloc[0]

barrier_layers = []

# Loop names are kept separate from `barrier_type` / `gdf`, so the selection
# made in the exploration cells above is not overwritten.
for barrier_kind in ["railway", "waterway"]:

    rules = process_dict[city_name][barrier_kind]

    # NOTE(review): assumes the raw per-city file (same path pattern as in the
    # exploration above) is the intended input for this step -- confirm.
    gdf_raw = gpd.read_file(folder_osm + f'{cbsacode}_{barrier_kind}.gpkg')

    # Geometry types that the manual verification allows us to discard.
    geom_types_to_drop = []
    if rules["drop_points"]:
        geom_types_to_drop.append("Point")
    if rules["drop_polygons"]:
        geom_types_to_drop.append("Polygon")
    gdf_kept_geoms = gdf_raw[~gdf_raw.geom_type.isin(geom_types_to_drop)]

    # Keep only the features whose tag value was marked as relevant.
    gdf_tokeep = gdf_kept_geoms[
        gdf_kept_geoms[barrier_kind].isin(rules["keep_tags"])
    ].copy().reset_index(drop=True)

    # Store the tag value under a common column name, so that railways and
    # waterways can be stacked into one frame.
    gdf_tokeep["barrier_type"] = gdf_tokeep[barrier_kind]
    gdf_tokeep = gdf_tokeep.drop(columns=barrier_kind)

    barrier_layers.append(gdf_tokeep)

# The city border is added as a barrier of its own type.
barrier_layers.append(
    gpd.GeoDataFrame(
        {
            "geometry": [city_boundaries.boundary],
            "barrier_type": ["city_border"],
        },
        crs = gdf_cbsacodes.crs,
    )
)

# ignore_index avoids duplicate index labels in the combined frame.
natbar = pd.concat(barrier_layers, ignore_index=True)

In [None]:
# Static overview of all barriers, coloured by barrier type.
fig, ax = plt.subplots()
natbar.plot(column="barrier_type", legend=True, ax=ax)
ax.set_axis_off()

polygonize the lines

In [None]:
# Only the `polygons` attribute of the FaceArtifacts result is used here;
# of that frame, only the geometry column is kept.
face_artifacts = momepy.FaceArtifacts(natbar)
polygons = face_artifacts.polygons[["geometry"]]

drop artifacts (small polygons)

In [None]:
# Areas are only meaningful in a projected CRS, so project before filtering.
# `proj_crs` and `area_threshold` are not defined in the visible part of this
# notebook and have to be set before this cell is run.
polygons_projected = polygons.set_crs(natbar.crs).to_crs(proj_crs)

# Drop artifacts: keep only polygons above the area threshold.
is_large_enough = polygons_projected.area > area_threshold
polygons = polygons_projected[is_large_enough].copy().reset_index(drop=True)

save to file

In [None]:
# `folder_poly` is not defined in the visible part of this notebook and has to
# be set before this cell is run. Create the output folder if needed, so that
# the write does not fail on a fresh checkout.
if folder_poly:
    os.makedirs(folder_poly, exist_ok=True)

# One GeoPackage per city, named by its CBSA code.
polygons.to_file(folder_poly + f"{cbsacode}.gpkg", index = False)

Count the Twitter users in each polygon (based on RANDOMLY generated data!)

In [None]:
# Load the user nodes (RANDOMLY generated ones!), keep those of the current
# CBSA, and bring them into the CRS of the polygons.
nodes_gpkg = '../data/twitter_dummy/twitter_users_tract_random.gpkg'
nodes_all = gpd.read_file(nodes_gpkg)
nodes = nodes_all[nodes_all.cbsacode==cbsacode].to_crs(polygons.crs)


def count_nodes_in(polygon):
    """Return the number of nodes matched by a "contains" spatial-index query for `polygon`."""
    return len(nodes.sindex.query(polygon, predicate="contains"))


# Count the nodes in each polygon.
polygons["user_count"] = polygons.geometry.apply(count_nodes_in)

In [None]:
# Base map: the city outline, drawn faintly.
city_outline = gpd.GeoSeries([city_boundaries], crs=natbar.crs)
m = city_outline.explore(tiles="CartoDB.Positron", name = "city", opacity =.1)

# Overlays, added in drawing order; each becomes a toggleable layer.
overlays = [
    (natbar, {"name": "natural barriers", "color": "black"}),
    (polygons, {"name": "polygons inbetween", "opacity": 0.4, "column": "user_count", "cmap": "Set2"}),
    (nodes, {"name": "random users"}),
]
for layer, layer_kwargs in overlays:
    layer.explore(m=m, **layer_kwargs)

folium.LayerControl().add_to(m)
m