In [1]:
import pandas as pd
import numpy as np
import copy
import json
import psycopg2
import geopandas as gpd
import shapely
from shapely.geometry import box
from shapely.ops import split
from arcgis.gis import GIS
from arcgis import geocode
from arcgis.geometry import BaseGeometry, Geometry
import openai
from keybert.llm import OpenAI
from keybert import KeyLLM


In [2]:
%%time


@classmethod
def from_shapely(cls, shapely_geometry):
    """Alternate constructor: build a ``cls`` instance from a geo-interface object.

    Parameters
    ----------
    shapely_geometry
        Any object exposing the ``__geo_interface__`` attribute.

    Returns
    -------
    The result of calling ``cls`` with that geo-interface value.
    """
    geo_mapping = shapely_geometry.__geo_interface__
    return cls(geo_mapping)

# Attach the converter defined above to arcgis' BaseGeometry as an
# alternate constructor.
BaseGeometry.from_shapely = from_shapely

# init db connection
# The key file must contain exactly five lines, in this order:
# database, host, user, password, port (the unpacking fails otherwise).
with open("awesome_db_key", "r") as f:
    database, host, user, password, port = f.read().splitlines()

conn = psycopg2.connect(database=database,
                        host=host,
                        user=user,
                        password=password,
                        port=port)
cursor = conn.cursor()


# init GIS connection
# The account file must contain exactly three lines: url, username, password.
# NOTE: this rebinds `password`; the database password read above is no
# longer available under that name after this point.
with open("agol_account_info.txt", "r") as f:
    url, username, password = f.read().splitlines()

gis = GIS(url, username=username, password=password)

# init llm connection
with open("openai_api_key", "r") as f:
    # readline() keeps the trailing newline; strip it so the key handed to
    # the client does not carry stray whitespace.
    MY_API_KEY = f.readline().strip()


# Create your LLM
client = openai.OpenAI(api_key=MY_API_KEY)
llm = OpenAI(client)

# Load it in KeyLLM
kw_model = KeyLLM(llm)


# Load the community/neighborhood lookup table used to validate keywords.
cursor.execute("""
SELECT id, city, community, zipcodes FROM public.community_neighborhoods
ORDER BY id ASC 
""")
db_out = cursor.fetchall()
community_neighborhoods_df = pd.DataFrame(db_out, columns =["id", "city", "community", "zipcodes"])

# Keywords that are dropped before geocoding (compared lower-cased).
stop_words = ['neighborhood', 'region', 'location', 'geographic', 'geographical location', 'street', 'streets', 'landmarks', 'area', 'part', 'specific location']
# Direction words looked for inside extracted keywords.
directional_predictates = ['north', 'south', 'west', 'east']

# Community boundaries: fetched from the ArcGIS item, then converted to a
# GeoDataFrame through GeoJSON.
arcgis_community_boundaries_lyr = gis.content.get("23a806fb906e428cb75d123cf2ab580c").layers[0]
# NOTE(review): community_boundaries_sdf is not used by the cells visible in
# this notebook; kept in case later cells rely on it.
community_boundaries_sdf = pd.DataFrame.spatial.from_layer(arcgis_community_boundaries_lyr)
fset = arcgis_community_boundaries_lyr.query()
gjson_string = fset.to_geojson
# set_crs(..., allow_override=True) relabels the CRS as EPSG:2230 without
# reprojecting the coordinates.
community_boundaries_gdf = gpd.read_file(gjson_string, driver='GeoJSON').set_crs(2230, allow_override=True)
sd_roads_gdf = gpd.read_file('sd_roads.json', driver='GeoJSON').set_crs(2230, allow_override=True)
# The block-group shapefile, by contrast, is reprojected into EPSG:2230.
bgs_gdf = gpd.read_file("bgs_sd_imp/bgs_sd_imp.shp").to_crs(2230)

Wall time: 4min 7s


In [61]:
def get_extent(geom_id, geom_txt):
    """Dispatch a free-text place description to the matching resolver.

    Parameters
    ----------
    geom_id
        Selects the resolver: 0 -> ``get_community``, 1 -> ``get_poi``,
        2 -> ``get_road``.
    geom_txt
        Text passed unchanged to the selected resolver.

    Returns
    -------
    Whatever the selected resolver returns.

    Raises
    ------
    Exception
        With message "Invalid geom_id" when ``geom_id`` is not 0, 1 or 2.
    """
    # Reject unknown ids up front so the happy path reads as plain dispatch.
    if geom_id not in (0, 1, 2):
        raise Exception("Invalid geom_id")

    if geom_id == 0:
        return get_community(geom_txt)
    if geom_id == 1:
        return get_poi(geom_txt)
    return get_road(geom_txt)


def get_community(geom_txt, max_attempts=5):
    """Resolve a community description, optionally with a direction, to a geometry.

    Keywords are extracted from ``geom_txt`` with ``kw_model``. A keyword that
    contains any entry of ``directional_predictates`` is recorded as a
    direction and dropped. Each remaining keyword is kept when it occurs
    (case-insensitive substring) in the ``community`` column of
    ``community_neighborhoods_df``. The kept keywords are geocoded in order and
    the first candidate lying inside a polygon of ``community_boundaries_gdf``
    selects the community.

    Parameters
    ----------
    geom_txt : str
        Free-text description, e.g. ``"south Del Mar Heights"``.
    max_attempts : int, default 5
        Maximum number of keyword-extraction calls made while waiting for at
        least one keyword to match a known community. Extraction is repeated
        because successive calls may return different keywords.

    Returns
    -------
    shapely geometry
        The whole community shape when no direction was found; otherwise the
        part of the shape on the requested side of its centroid, using the
        first direction that was detected.

    Raises
    ------
    ValueError
        If no keyword matches a community within ``max_attempts`` extractions,
        or if no geocoded candidate falls inside a community boundary.
    """
    directions = []
    matched_keywords = []
    # Lower-case once; missing names become NaN and are treated as non-matches.
    community_names = community_neighborhoods_df['community'].str.lower()

    for _ in range(max_attempts):
        extracted = list(kw_model.extract_keywords(geom_txt)[0])

        # Separate direction-bearing keywords from the rest. Building a new
        # list (instead of removing while scanning) keeps a keyword holding
        # two direction words, such as "northwest", from being removed twice.
        keywords = []
        for kw in extracted:
            found = [dw for dw in directional_predictates if dw in kw.lower()]
            if found:
                directions.extend(found)
            else:
                keywords.append(kw)
        print(keywords)

        for kw in keywords:
            needle = kw.lower()
            if community_names.str.contains(needle, regex=False, na=False).any():
                matched_keywords.append(kw)

        if matched_keywords:
            break
    else:
        raise ValueError(
            f"No known community found in {geom_txt!r} after {max_attempts} keyword extraction attempts"
        )

    # Geocode keyword by keyword and stop at the first candidate that lies in
    # a known boundary. Candidates are tried in the order returned by the
    # geocoder (per the original note, ranked by confidence score).
    community_shape = None
    for kw in matched_keywords:
        for candidate in geocode(address=kw, max_locations=10):
            loc = candidate['location']
            pt = gpd.GeoSeries(shapely.Point(loc['x'], loc['y'])).set_crs(4326).to_crs(2230).iloc[0]
            hits = community_boundaries_gdf[community_boundaries_gdf.geometry.contains(pt)]
            if len(hits) > 0:
                community_shape = hits.geometry.iloc[0]
                break
        if community_shape is not None:
            break

    if community_shape is None:
        raise ValueError(
            f"No geocoded location for {matched_keywords!r} falls inside a community boundary"
        )

    if not directions:
        return community_shape

    # Clip the shape to the half of its bounding box on the requested side of
    # the centroid. Unlike splitting by a line and unpacking two parts, this
    # also works when the cut produces more than two pieces.
    cx, cy = community_shape.centroid.x, community_shape.centroid.y
    minx, miny, maxx, maxy = community_shape.bounds
    half_boxes = {
        'north': box(minx, cy, maxx, maxy),
        'south': box(minx, miny, maxx, cy),
        'west': box(minx, miny, cx, maxy),
        'east': box(cx, miny, maxx, maxy),
    }
    return community_shape.intersection(half_boxes[directions[0]])
    

def get_poi(geom_txt):
    """Resolve a point-of-interest description to a buffered point geometry.

    Keywords are extracted from ``geom_txt`` with ``kw_model``; those whose
    lower-cased form is in ``stop_words`` are dropped. The rest are joined
    with spaces, suffixed with ", San Diego" and geocoded (top result only).

    Returns
    -------
    The geocoded point, reprojected from EPSG:4326 to EPSG:2230 and buffered
    by 500 (in the units of EPSG:2230).
    """
    extracted = kw_model.extract_keywords(geom_txt)[0]
    kept = [kw for kw in extracted if kw.lower() not in stop_words]

    query = " ".join(kept) + ", San Diego"
    top_hit = geocode(address = query, max_locations = 1)[0]
    loc = top_hit['location']

    point_wgs84 = gpd.GeoSeries(shapely.Point(loc['x'],  loc['y'])).set_crs(4326)
    return point_wgs84.to_crs(2230).buffer(500)[0]


def _axis_aligned_centerline(geom):
    """Approximate ``geom`` by a straight line through its centroid along the longer side of its bounding box."""
    minx, miny, maxx, maxy = geom.bounds
    if (maxy - miny) > (maxx - minx):
        x = geom.centroid.x
        return shapely.LineString([(x, miny), (x, maxy)])
    y = geom.centroid.y
    return shapely.LineString([(minx, y), (maxx, y)])


def get_road(geom_txt):
    """Resolve "road X between road Y and road Z" style text to a buffered road section.

    Steps:

    1. Extract keywords with ``kw_model`` and drop those in ``stop_words``.
    2. Geocode each keyword (suffixed with ", San Diego") and pick the first
       row of ``sd_roads_gdf`` crossing a 100-unit buffer around the location.
    3. Union all segments sharing that row's ``RD20FULL`` name into one
       geometry per keyword.
    4. Approximate each road by an axis-aligned centre line and find the first
       road whose centre line crosses exactly two of the others.
    5. Clip that road to the span between the two crossing points and buffer
       the result by 100.

    Distances are in the units of EPSG:2230.

    Parameters
    ----------
    geom_txt : str
        Free-text description naming the road and its two cross streets.

    Returns
    -------
    shapely geometry
        The clipped road section, buffered by 100.

    Raises
    ------
    ValueError
        If a keyword cannot be geocoded, if no road is found near a geocoded
        location, or if no road crosses exactly two of the other roads.
    """
    keywords = [
        kw for kw in kw_model.extract_keywords(geom_txt)[0]
        if kw.lower() not in stop_words
    ]

    # specify SD to be disambiguous for geocode
    locations = []
    for kw in keywords:
        candidates = geocode(address = kw + ", San Diego", max_locations = 1)
        if not candidates:
            raise ValueError(f"Could not geocode road keyword {kw!r}")
        locations.append(candidates[0]['location'])

    # One merged geometry per keyword: every segment sharing the name of the
    # first road found next to the geocoded location.
    full_roads = []
    for kw, loc in zip(keywords, locations):
        probe = gpd.GeoSeries(shapely.Point(loc['x'],  loc['y'])).set_crs(4326).to_crs(2230).buffer(100)
        nearby = sd_roads_gdf[sd_roads_gdf.geometry.crosses(probe.iloc[0])]
        if len(nearby) == 0:
            raise ValueError(f"No road found near the geocoded location of {kw!r}")
        road_name = nearby.RD20FULL.iloc[0]
        segments = sd_roads_gdf[sd_roads_gdf['RD20FULL'] == road_name]['geometry'].tolist()
        merged = segments[0]
        for segment in segments[1:]:
            merged = merged.union(segment)
        full_roads.append(merged)

    simple_roads = [_axis_aligned_centerline(full_road) for full_road in full_roads]

    # The target road is the first one whose centre line crosses exactly two
    # of the others; those two crossings delimit the requested section. The
    # road is taken by its own index so unrelated roads cannot replace it.
    road = None
    crossing_points = []
    for i, candidate in enumerate(simple_roads):
        crossed = [other for other in simple_roads if candidate.crosses(other)]
        if len(crossed) != 2:
            continue
        road = full_roads[i]
        crossing_points = [candidate.intersection(other) for other in crossed]
        break

    if road is None:
        raise ValueError(
            f"Could not find a road crossed by exactly two other roads in {geom_txt!r}"
        )

    pt_a, pt_b = crossing_points
    minx, miny, maxx, maxy = road.bounds
    # Narrow the road's bounding box along the axis on which the two crossing
    # points are farthest apart (x wins ties); the other axis keeps the road's
    # full extent.
    if abs(pt_a.x - pt_b.x) >= abs(pt_a.y - pt_b.y):
        minx, maxx = min(pt_a.x, pt_b.x), max(pt_a.x, pt_b.x)
    else:
        miny, maxy = min(pt_a.y, pt_b.y), max(pt_a.y, pt_b.y)

    road_section = box(minx, miny, maxx, maxy).intersection(road)

    return road_section.buffer(100)


def get_enrich_data(study_area, precision='block group'):
    """Select the rows of ``bgs_gdf`` whose geometry intersects ``study_area``.

    Parameters
    ----------
    study_area
        Geometry passed to ``bgs_gdf.geometry.intersects``.
    precision
        Accepted for interface compatibility; not used by this implementation.

    Returns
    -------
    The subset of ``bgs_gdf`` selected by the intersection mask.
    """
    intersects_mask = bgs_gdf.geometry.intersects(study_area)
    return bgs_gdf[intersects_mask]

In [66]:
%%time
# geom_id 0 routes the text to the community resolver.
study_area = get_extent(geom_id=0, geom_txt="south Del Mar Heights")

['Neighborhood', 'Location']
['Location']
['Del Mar Heights']
Wall time: 1.4 s


In [67]:
import folium

In [68]:
# Base map centred on (32.7, -117.2).
m = folium.Map(location=[32.7, -117.2], zoom_start=10, tiles="cartodbpositron")

#folium.GeoJson(community_boundaries_gdf).add_to(m)

# Wrap the study area in a GeoSeries, simplify it and label it as EPSG:2230.
study_area_series = gpd.GeoSeries(study_area)
study_area_gjson = study_area_series.simplify(tolerance=0.001).set_crs(2230)

# Draw the study area with a red fill.
study_area_layer = folium.GeoJson(
    study_area_gjson,
    style_function=lambda feature: {"fillColor": "red"},
)
study_area_layer.add_to(m)
m

In [69]:
# Block-group rows whose geometry intersects the study area.
enrich_out = get_enrich_data(study_area=study_area)
enrich_out

Unnamed: 0,statefp,countyfp,tractce,blkgrpce,source_cou,aggregatio,population,apportionm,has_data,x2001_a,...,val400k_fy,val500k_fy,val750k_fy,val1m_fy,medval_fy,avgval_fy,valbase_fy,wlthindxcy,sei_cy,geometry
328,6,73,8346,2,USA,BlockApportionment:US.BlockGroups;PointsLayer:...,2.191,2.576,1,1220.44,...,1.0,36.0,96.0,50.0,902344.0,970942.0,191.0,207.0,81.6,"POLYGON ((6268179.678 1905175.574, 6268148.304..."
592,6,73,8339,1,USA,BlockApportionment:US.BlockGroups;PointsLayer:...,2.191,2.576,1,762.27,...,18.0,137.0,7.0,2.0,617701.0,659857.0,175.0,66.0,51.2,"POLYGON ((6265666.898 1898628.089, 6265668.484..."
895,6,73,8324,3,USA,BlockApportionment:US.BlockGroups;PointsLayer:...,2.191,2.576,1,1395.5,...,2.0,4.0,68.0,296.0,1279561.0,1324948.0,479.0,245.0,67.9,"POLYGON ((6256273.177 1919791.663, 6256203.228..."
1067,6,73,8312,1,USA,BlockApportionment:US.BlockGroups;PointsLayer:...,2.191,2.576,1,2242.76,...,0.0,0.0,15.0,35.0,2000001.0,1923611.0,198.0,464.0,72.0,"POLYGON ((6249166.007 1890094.391, 6249143.637..."
1118,6,73,8350,1,USA,BlockApportionment:US.BlockGroups;PointsLayer:...,2.191,2.576,1,1145.33,...,41.0,493.0,356.0,195.0,790028.0,937555.0,1360.0,185.0,49.6,"POLYGON ((6270156.589 1899534.387, 6270145.749..."
1587,6,73,8371,2,USA,BlockApportionment:US.BlockGroups;PointsLayer:...,2.191,2.576,1,1183.14,...,0.0,0.0,24.0,164.0,1416159.0,1472741.0,321.0,175.0,82.3,"POLYGON ((6262997.953 1910270.649, 6262935.496..."
1618,6,73,8373,2,USA,BlockApportionment:US.BlockGroups;PointsLayer:...,2.191,2.576,1,1378.49,...,0.0,4.0,145.0,256.0,1327148.0,1350513.0,633.0,213.0,86.4,"POLYGON ((6259479.844 1915152.519, 6259214.433..."
2018,6,73,8324,4,USA,BlockApportionment:US.BlockGroups;PointsLayer:...,2.191,2.576,1,1713.84,...,0.0,0.0,10.0,126.0,1558824.0,1625856.0,292.0,368.0,85.1,"POLYGON ((6255670.837 1919953.153, 6255501.334..."
2019,6,73,8324,5,USA,BlockApportionment:US.BlockGroups;PointsLayer:...,2.191,2.576,1,1717.6,...,0.0,0.0,6.0,101.0,1956522.0,1856722.0,424.0,366.0,89.7,"POLYGON ((6255783.234 1920508.062, 6255639.520..."
2078,6,73,8369,2,USA,BlockApportionment:US.BlockGroups;PointsLayer:...,2.191,2.576,1,866.3,...,10.0,53.0,67.0,286.0,1122378.0,1053359.0,454.0,80.0,66.8,"POLYGON ((6257544.574 1920093.566, 6257536.061..."


In [70]:
# Fresh base map centred on (32.7, -117.2).
m = folium.Map(location=[32.7, -117.2], zoom_start=10, tiles="cartodbpositron")

# Layer 1: simplified outlines of the intersecting block groups.
enrich_outlines = enrich_out.geometry.simplify(tolerance=0.001)
folium.GeoJson(enrich_outlines).add_to(m)

# Layer 2: the study area, drawn in orange on top.
study_area_overlay = folium.GeoJson(
    study_area_gjson,
    style_function=lambda feature: {"color": "orange", "fillColor": "orange"},
)
study_area_overlay.add_to(m)
m