In [3]:
import pandas as pd
import geopandas as gpd
import numpy as np
import docx
from arcgis.gis import GIS

In [4]:
from arcgis.geoenrichment import enrich, Country

In [5]:
# init GIS connection
# The account file holds three values, one per line: url, username, password.
# Lines are stripped and blank lines skipped so that a trailing empty line or
# stray whitespace neither breaks the three-way unpacking nor ends up inside
# a credential. Any other line count still raises ValueError on unpacking.
with open("agol_account_info.txt", "r") as f:
    url, username, password = [line.strip() for line in f if line.strip()]

In [6]:
# Open the GIS connection with the url / username / password read from the account file.
gis = GIS(
    url,
    username=username,
    password=password,
)

In [11]:
def get_enrich_vals(gis, enrich_val_doc_path):
    """
    return dataframe: geoenrich-able variables in ESRI acceptable format

    Rows are picked from Country('usa').enrich_variables when they match any of:
      * the text of a paragraph of the Word document (case-insensitive substring
        of the variable description),
      * a 2024 demographic / housing / income keyword,
      * the food data collections (2024 values and 2029 projections),
      * the 'crime' data collection,
      * a description containing "cultur" but not "agricultur".

    gis: not used inside this function; kept so existing callers keep working
         (NOTE(review): presumably an active connection is needed for
         Country('usa') to resolve -- confirm)
    enrich_val_doc_path: path of the .docx file whose paragraphs are used as
         search phrases

    The result keeps the row order of the enrich-variable table and holds one
    row per variable `name` (the first occurrence wins).
    """
    usa = Country('usa')
    ev_df = usa.enrich_variables

    # grab the long document by Ilya
    doc = docx.Document(enrich_val_doc_path)
    # the first three and the last two paragraphs are skipped
    # (presumably heading / closing text of the document -- TODO confirm)
    doc_lst = [p.text for p in doc.paragraphs][3:-2]

    out_idx_lst = []

    def get_val(df, txt, vintage_year=None):
        """
        helper to get list of index with description text

        A row matches when its description is a string that contains `txt`
        (compared in lower case) and, if `vintage_year` is given, also contains
        `vintage_year` as written.
        """
        needle = txt.lower()

        def is_match(desc):
            # descriptions may be missing (None / NaN): only real strings can match
            if not isinstance(desc, str):
                return False
            if needle not in desc.lower():
                return False
            return not vintage_year or vintage_year in desc

        # astype(bool) keeps the mask boolean even when `df` has no rows
        mask = df['description'].map(is_match).astype(bool)
        return df.index[mask.to_numpy()].tolist()

    for corpus in doc_lst:
        phrase = corpus.strip()
        if not phrase:
            # '' is a substring of every description, so a blank paragraph
            # would otherwise select every single variable
            continue
        out_idx_lst += get_val(ev_df, phrase)

    # pop age, pop ethnicity, pop tapestry
    # total housing units, housing price average, affordability index, housing while attending school, etc.
    # unemployment rate and by ethnicity group
    # average household income, by group
    # food stamp receiving/not-receiving
    description_kws = ["population", "tapestry", "housing", "unemployment", "average household income", "food stamp"]
    for dkw in description_kws:
        out_idx_lst += get_val(ev_df, dkw, "2024")

    # food related data collection
    food_collections = ['food', 'PsychographicsFood', 'GroceryAlcoholicBeverages']
    ev_df_food = ev_df[ev_df['data_collection'].isin(food_collections)]

    def food_collection_idx(vintage):
        """
        helper to get list of index of the 'food' collection for one vintage
        """
        mask = (ev_df_food['data_collection'] == 'food') & (ev_df_food['vintage'] == vintage)
        return ev_df_food.index[mask.to_numpy()].tolist()

    # hh used, bread in past 30 days, etc.
    # food store
    # buy many units if food item on sale
    # restaurant related
    description_kws = ["hh used", "food store", "food item on sale", "restaurant"]

    # current values (2024) and trends in food, projected 2029
    for vintage in ("2024", "2029"):
        out_idx_lst += food_collection_idx(vintage)
        for dkw in description_kws:
            out_idx_lst += get_val(ev_df_food, dkw, vintage)

    # crime
    out_idx_lst += ev_df[ev_df['data_collection'] == 'crime'].index.tolist()

    # only interesting ones are "2024 Like Eating Foods from Different Cultures"
    def mentions_culture(desc):
        if not isinstance(desc, str):
            return False
        lowered = desc.lower()
        return "cultur" in lowered and 'agricultur' not in lowered

    culture_mask = ev_df['description'].map(mentions_culture).astype(bool)
    out_idx_lst += ev_df.index[culture_mask.to_numpy()].tolist()

    ########################### NEED OUTSIDE LAYER, NOT FOUND ###########################
    # null, drove alone to work
    # null, rental property price, the below finds only home/vacation home property price
    # null, foreign-born
    # 'Zoning laws and other regulations ',
    # 'zoning map from SANDAG',
    # 'Access to suppliers',
    # 'proximity to highways and distribution centers (in urban areas); proximity to farms (in rural areas) - need to find relevant layers',
    # Availability of infrastructure (water, electricity, internet, etc.) including maps of utilities'
    #####################################################################################

    # Select with a boolean mask instead of .loc[set(...)]: pandas 2.x refuses
    # set indexers, and the mask also removes repeated labels while keeping the
    # table's own row order, so drop_duplicates keeps a deterministic row.
    kv_df = ev_df.loc[ev_df.index.isin(out_idx_lst)].drop_duplicates(subset=['name'])

    return kv_df

In [12]:
# Build the enrich-variable table; the search phrases come from the paragraphs of this Word document.
kv_df = get_enrich_vals(gis, "Opportunities Map Process.docx")

In [2]:
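# TODO(review): enrichment call kept for reference but not runnable as written --
# `usa` only exists inside get_enrich_vals, and `imp_sd_bg_sedf` is not defined
# in the cells above.
#enrich_out = usa.enrich(imp_sd_bg_sedf, kv_df)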

In [18]:
### read geojson from url
# San Diego Public Law Beats Data
# A dedicated name is used here: `url` already holds the address read from the
# account file for the GIS connection, and rebinding it would break re-running
# the connection cell after this one.
law_beats_url = "https://opendata.sandag.org/resource/7p9e-ppq2.geojson"
gdf = gpd.read_file(law_beats_url)

In [20]:
# Check which coordinate reference system the downloaded layer uses
# (the recorded output below reports EPSG:4326, WGS 84).
gdf.crs

<Geographic 2D CRS: EPSG:4326>
Name: WGS 84
Axis Info [ellipsoidal]:
- Lat[north]: Geodetic latitude (degree)
- Lon[east]: Geodetic longitude (degree)
Area of Use:
- name: World.
- bounds: (-180.0, -90.0, 180.0, 90.0)
Datum: World Geodetic System 1984 ensemble
- Ellipsoid: WGS 84
- Prime Meridian: Greenwich