# Enrich seed locations

Enrich the seed locations: calculate each parameter from the DO table for every seed location and store the results in a test ws_enrichment table.


In [22]:
# Step 1: Set environment
# The project modules can only be imported once the project's `api` folder is
# on sys.path, so the path is configured before any project import below.
import sys, os

# PROJECT_ROOT must point at the root folder of the project checkout; the
# modules are placed in the `api` folder inside it. Set the
# BAIN_VANT_PROJECT_ROOT environment variable to use a different checkout
# without editing this cell (the default keeps the previous hard-coded value).
PROJECT_ROOT = os.environ.get('BAIN_VANT_PROJECT_ROOT', '/home/jacrisol/github/bain-vantage')
MODULE_FULL_PATH = os.path.join(PROJECT_ROOT, 'api')
sys.path.insert(1, MODULE_FULL_PATH)


from api.settings import env, config

# A sample config file can be found at PROJECT_ROOT/api/config.env.sample. All keys and passwords are placed there.
# In this step we load the notebooks env file to get all the CARTO API keys needed.
# The file path is derived from MODULE_FULL_PATH so it always follows PROJECT_ROOT.
env.read_env(os.path.join(MODULE_FULL_PATH, 'notebooks.env'))
with env.prefixed("BAIN_VANT_API_"):
    config['carto'] = {
        'base_url': env.str('CARTO_BASE_URL', ''),
        'user': env.str('CARTO_ADMIN_USER', 'xxx'),
        'api_key': env.str('CARTO_ADMIN_API_KEY', 'xxx'),
    }

# Step 2: create carto variable
# Create the carto object to be able to use its methods. We also import all the classes needed by the samples.

from etl.cf_model import CartoFramesModel, GeometryType
from etl.constants.global_constants import meters_in_mile
from cartoframes import to_carto, create_table_from_query, read_carto
from etl.constants.global_constants import enrichment_features

carto = CartoFramesModel()

## Append the prediction model folder to the path

In [23]:
import joblib
# Put the directory four levels up at the front of sys.path. The entry is a
# relative path, so what it resolves to depends on the kernel's working
# directory. Per the section heading this is meant to expose the prediction
# model folder -- NOTE(review): confirm the notebook is run from its own folder.
sys.path.insert(0,'../../../../')

## Enrich with CARTO datasets (zip code information)

We could use the CARTO enrichment libraries directly, but `CartoFramesModel` already provides a method for enriching any dataframe: `carto.enrichment_variables`. By default its third parameter is POLYGONS, so if we pass a geodataframe with points we need to change it to the `GeometryType.POINTS` value.

In [24]:
def enrich_with_carto(seed_locations):
    """Enrich the seed locations with the CARTO variable 'geoid_c90eb55a'.

    The seeds are enriched as points (GeometryType.POINTS is passed explicitly).
    The resulting 'geoid' column has its missing values replaced by 0, is cast
    to int and is renamed to 'zipcode'.

    Args:
        seed_locations: dataframe of seed locations handed to
            carto.enrichment_variables.

    Returns:
        The enriched dataframe, with a 'zipcode' column instead of 'geoid'.
    """
    variable_ids = ['geoid_c90eb55a']
    enriched = carto.enrichment_variables(seed_locations, variable_ids, GeometryType.POINTS)

    # Seeds without a geoid get 0 so the column can be stored as an integer.
    geoid_as_int = enriched['geoid'].fillna(0).astype('int')
    enriched['geoid'] = geoid_as_int
    enriched.rename(columns={'geoid': 'zipcode'}, inplace=True)

    return enriched

## Enrich with zip_urbanicity table

In this method we need to enrich against a table in the database. So we upload a temporary table with the calculations done in SQL and then get a dataframe with read_carto. Since this table can be used by different analyses, we need to use a unique name, and we will delete this table after the enrichment.

In [25]:
# Map in territory information
def enrich_with_urbanicity_table():
    """Create the "vtg_test_ws_enrichment" table from the uploaded seeds.

    The query reads vtg_test_ws_seed_clusters, adds lat/long taken from
    the_geom, left-pads zipcode with zeros to 5 characters and initialises
    target_var to 0.0. It then INNER JOINs btunnell9.zip_urbanicity on zipcode
    to bring in the city, CBSA, DMA, state, region and urbanicity columns.
    Because the join is an INNER JOIN, seeds whose zipcode has no match in
    zip_urbanicity are not present in the resulting table.

    The table is created with if_exists="replace", so an existing
    "vtg_test_ws_enrichment" table is replaced. Nothing is returned.
    """
    sql = """
        WITH updated_seeds AS(
            SELECT
                cartodb_id
                , seed_id
                , the_geom
                , ST_Y(the_geom) lat
                , ST_X(the_geom) long
                , LPAD(CAST(zipcode as VARCHAR), 5, '0') zipcode,
                0.0 as target_var
            FROM
                vtg_test_ws_seed_clusters)
        SELECT
            a.*
            , b.city_name
            , b.cbsa_code
            , b.cbsa_name
            , b.dma_code
            , b.dma_name
            , b.state_fip
            , b.state_abb
            , b.region
            , b.urbanicity
        FROM
            updated_seeds a
        INNER JOIN
            btunnell9.zip_urbanicity b
        ON
            a.zipcode = b.zipcode

        """
    create_table_from_query(sql, "vtg_test_ws_enrichment", if_exists="replace")

## Enrich with process defined in 2_DAP_AUV_Predict notebook

In [26]:
# Buffer designation per urbanicity category. It is passed to the carto helpers
# together with the 'urbanicity' category column.
# NOTE(review): the unit of these values is not stated here (presumably miles) -- confirm.
urb_buffers = {
    'Urban': 1,
    'Sub-urban': 3,
    'Rural': 5,
    'Other': 5,
}

In [27]:
def fill_missing(df, features):
    """Add every feature column that `df` lacks, filled with 0.

    The dataframe is modified in place and the same object is returned.

    Args:
        df: dataframe to complete.
        features: iterable of column names that must exist in `df`.

    Returns:
        `df` itself, now containing all the columns named in `features`.
    """
    present = list(df)

    # Collect, in the order given, the features that are not columns yet.
    absent = []
    for name in features:
        if name not in present:
            absent.append(name)

    df[absent] = 0

    return df

In [28]:
def enrich_process():
    """Run the enrichment steps on "vtg_test_ws_enrichment" and merge the results.

    The steps run in this order: read the base table, create buffers keyed by
    the 'urbanicity' column, enrich with the 'demo' standard variables, compute
    POI densities by 'poi_class' and by 'brand', and find the nearest customer
    location, POI, 'chicken' POI and 'Hamburger/Roast Beef' POI. Every result
    is then left-joined onto the base rows on 'cartodb_id'.

    Returns:
        The merged dataframe. It is NOT written back to CARTO (see the note
        before the return statement).
    """
    table_name = "vtg_test_ws_enrichment"

    # Base rows of the seed table; every enrichment result is joined onto them.
    print("reading base table...")
    seeds_base = read_carto(table_name)

    print("creating buffers...")
    buffered = carto.make_buffer(table_name, urb_buffers, 'urbanicity')

    print("enriching with demographics...")
    demographics = carto.std_var_enrich(
        buffered,
        'demo',
        incl_perc=True,
        incl_catchment=True,
    )

    # Both POI density runs share everything except the category column.
    print("enriching with poi...")
    poi_common = dict(
        carto_tbl=table_name,
        buffers=urb_buffers,
        cat_col='urbanicity',
        poi_tbl='vtg_pois',
        incl_catchment=True,
    )
    poi_by_class = carto.poi_density(poi_cat='poi_class', **poi_common)

    # Brand densities. The priority_competitors lookup table is not accessible
    # in the bainandco schema, so priority_pois='priority_competitors' cannot
    # be passed here.
    poi_by_brand = carto.poi_density(poi_cat='brand', **poi_common)

    print("finding nearest...")
    nearest_self = carto.find_nearest(table_name, 'vtg_customer_locations', self_flag=True)
    nearest_poi = carto.find_nearest(table_name, 'vtg_pois')
    nearest_chicken = carto.find_nearest(
        table_name, 'vtg_pois', cat_col='poi_class', cat_val='chicken')
    nearest_burger = carto.find_nearest(
        table_name, 'vtg_pois', cat_col='poi_class', cat_val='Hamburger/Roast Beef')

    # Left-join every enrichment result onto the base rows, one after another.
    print("merging tables...")
    frames_to_join = (
        demographics,
        poi_by_class.drop(columns='area'),
        poi_by_brand.drop(columns='area'),
        nearest_self,
        nearest_poi,
        nearest_chicken,
        nearest_burger,
    )
    merged = seeds_base.drop(columns='the_geom')
    for right in frames_to_join:
        merged = merged.merge(right, how='left', on='cartodb_id')

    # BE CAREFUL: the merged frame has 431 columns, so saving it to CARTO as-is
    # will FAIL. Reduce it to the necessary columns before any upload.
    return merged


In [29]:
def predict(dataframe, model_path='../../../../models/mod_pipeline.pkl'):
    """Generate predictions for the enriched seeds with the stored model pipeline.

    Args:
        dataframe: enriched dataframe. Columns listed in `enrichment_features`
            that it lacks are added with value 0; this happens IN PLACE because
            `fill_missing` modifies the frame it receives.
        model_path: path of the joblib-serialised pipeline to load. The default
            is the location used so far, relative to the working directory.

    Returns:
        Whatever the loaded pipeline's `predict` returns for the
        `enrichment_features` columns of `dataframe`.
    """
    # Fill missing columns so every model feature exists.
    print("filling missing columns")
    carto_df = fill_missing(dataframe, enrichment_features)

    print("generating prediction")
    pred_data = carto_df[enrichment_features]

    # NOTE(review): joblib.load unpickles the file, which can execute arbitrary
    # code -- only point model_path at model files from a trusted source.
    mod = joblib.load(model_path)

    # Generate prediction using pipeline object
    return mod.predict(pred_data)

In [32]:
def bain_method():
    """Run the whole flow: enrich the seeds, predict, and store the predictions.

    Steps:
        1. Read 'vtg_test_ws_seed_locations', enrich it with the CARTO dataset
           and upload the result as "vtg_test_ws_seed_clusters" (replacing it).
        2. Build the "vtg_test_ws_enrichment" table with the urbanicity join.
        3. Run the enrichment process taken from the 2_DAP_AUV_Predict notebook.
        4. Predict, then update the 'target_var' column of
           'vtg_test_ws_enrichment', matching rows on 'cartodb_id'.

    Nothing is returned; progress and the predictions are printed.
    """
    # Step 1: seeds from CARTO, enriched with CARTO datasets, uploaded again.
    raw_seeds = read_carto('vtg_test_ws_seed_locations')
    seeds_with_zip = enrich_with_carto(raw_seeds)
    to_carto(seeds_with_zip, "vtg_test_ws_seed_clusters", if_exists='replace')

    # Step 2: relational enrichment inside the database.
    enrich_with_urbanicity_table()

    # Step 3: enrichment process from the 2_DAP_AUV_Predict notebook.
    enriched = enrich_process()
    print('Enrichment finished!')

    # Step 4: predict and write the predictions back.
    predictions = predict(enriched)
    print(predictions)

    scored = enriched.copy()
    scored['target_var'] = predictions
    carto.update_rows(
        'vtg_test_ws_enrichment',
        'cartodb_id',
        ['target_var'],
        scored[['cartodb_id', 'target_var']],
    )
    print('Predictions finished!')

In [33]:
# Run the full flow. This uploads/replaces the "vtg_test_ws_seed_clusters" and
# "vtg_test_ws_enrichment" tables and updates target_var with the predictions.
bain_method()

[2021-02-05T18:34:26Z] (267892) {carto.py:130} INFO - Success! Data uploaded to table "vtg_test_ws_seed_clusters" correctly
[2021-02-05T18:34:29Z] (267892) {carto.py:294} INFO - Success! Table "vtg_test_ws_enrichment" created correctly


reading base table...
creating buffers...
enriching with demographics...
Downloading standard variables from Carto...complete
Aggregating data and calculating percentages...complete
enriching with poi...


[2021-02-05T18:35:04Z] (267892) {carto.py:294} INFO - Success! Table "dgn_7957fb24_67d8_11eb_af5c_33bb25bc92ec_catchment" created correctly


0 null rows found in poi_class column, representing 0.0% of total rows...dropped 0 rows


[2021-02-05T18:35:11Z] (267892) {carto.py:294} INFO - Success! Table "dgn_7d75b0d4_67d8_11eb_af5c_33bb25bc92ec_catchment" created correctly


0 null rows found in brand column, representing 0.0% of total rows...dropped 0 rows
finding nearest...


[2021-02-05T18:35:18Z] (267892) {cf_model.py:162} INFO - Processing data


merging tables...
Enrichment finished!
filling missing columns
generating prediction
[1287039.06433875 1321349.57124732 1266872.4579099  1348474.92130404
 1333264.41282701 1254634.08781928 1030793.36003273 1222579.02955045
 1145939.91291806 1299041.63382231 1299501.10851287 1048278.75419508
 1008698.18759394 1191995.90090721 1209040.14275016 1249330.82863171
 1255058.87070891 1124059.82689849  902970.97736465 1342129.99350813
 1246862.98594246 1588212.55243989 1401002.15243734 1157654.89496663
 1516682.57192666 1213115.8249331  1593747.46523303 1416948.47976215
 1181643.70361171 1537162.46028142 1193272.26161265 1219559.05734503
 1383125.09383292 1456859.78130221 1881683.64019455 1603414.60710964
 1033692.85387054 1441028.08003791 1454125.12809755 1271044.39925208
 1803241.13482001 1052863.3990834  1160793.19712871 1616460.92073788
 1443285.95837335 1178043.13767165 1517230.40714287 1387304.80291032
 1117192.12947933 1375491.5957171 ]


[2021-02-05T18:35:22Z] (267892) {cf_model.py:178} INFO - Updating data ...
[2021-02-05T18:35:26Z] (267892) {cf_model.py:181} INFO - Updated !


Predictions finished!
