In [2]:
import gspread
import sys
from oauth2client.service_account import ServiceAccountCredentials
import pandas as pd
import numpy as np
import urllib
import sqlalchemy
from gspread_dataframe import set_with_dataframe
from gspread_dataframe import get_as_dataframe

In [3]:
from column_map import column_map

In [49]:
sys.path.append('../..')
from IPM_Shared_Code_public.Python.google_creds_functions import create_assertion_session
from IPM_Shared_Code_public.Python.utils import get_config
from IPM_Shared_Code_public.Python.delta_functions import *
from IPM_Shared_Code_public.Python.sql_functions import sql_update

### Use the config file to setup connections

In [5]:
config = get_config('c:\Projects\config.ini')

driver = config['srv']['driver']
server = config['srv']['server']
dwh = config['db']['crowdsdb']
cred_file = config['google']['path_to_file']

### Create the dictionary to rename the columns

In [24]:
col_rename = {'PROPERTY_I': 'site_id',
               'DESCRIPTIO': 'site_desc', 
               'DISTRICT': 'park_district', 
               'DESC_LOCAT': 'desc_location', 
               'Latitiude': 'latitude', 
               'Longitude': 'longitude'}

In [25]:
cols = list(col_rename.values())

### Read the current data from SQL

In [29]:
con_string = 'Driver={' + driver + '};Server=' + server +';Database=' + dwh + ';Trusted_Connection=Yes;'
params = urllib.parse.quote_plus(con_string)
engine = sqlalchemy.create_engine("mssql+pyodbc:///?odbc_connect=%s" % params)

In [30]:
sql = 'select * from crowdsdb.dbo.tbl_ref_sites'

In [31]:
sites_sql = (pd.read_sql(con = engine, sql = sql)
             .fillna(value = np.nan, axis = 1))[cols]

In [32]:
hash_rows(sites_sql, exclude_cols = ['site_id'], hash_name = 'row_hash')

### Read the latest data from Google Sheets

In [10]:
scope = ['https://spreadsheets.google.com/feeds',
         'https://www.googleapis.com/auth/drive']
creds = ServiceAccountCredentials.from_json_keyfile_name(cred_file, scope)
client = gspread.authorize(creds)

In [11]:
sheet = client.open('DailyTasks_WebMerc_Centroids')

In [13]:
ws = sheet.worksheet('Sheet1')

In [26]:
sites = (get_as_dataframe(ws, evaluate_formulas = True, header= 0)
         .rename(columns = col_rename)
         .fillna(value = np.nan, axis = 1))[cols]

In [39]:
#Exclude any row with a null site_id
sites = sites[sites['site_id'].notna()]

In [None]:
sites['dups'] = sites['site_id'].duplicated()

In [59]:
sites[sites['dups'] == True]

Unnamed: 0,site_id,site_desc,park_district,desc_location,latitude,longitude,row_hash,dups
568,Q052-01,Captain Tilly Park-Captain Tilly Playground,Q-08,Q052-01 | Captain Tilly Park-Captain Tilly Pla...,40.7126,-73.7992,44464b0a962ecd6a29bc2fc32a3eb570bf693c80f723b4...,True
4836,M198,Martin Luther King Jr. Playground,M-10,M198 | Martin Luther King Jr. Playground,40.8002,-73.9504,3395d42a55ced6eed402b1b9665bf0d245d7d391c729c2...,True
4837,M404,NYC AIDS Memorial Park at St. Vincent’s Triangle,M-02,M404 | NYC AIDS Memorial Park at St. Vincent’s...,40.7372,-74.0012,a6daf06321df6730bd8aadbe90724004f8b9706bc724aa...,True
4841,X039-ZN19,Orchard Beach,,X039-ZN19 | Orchard Beach,40.86926,-73.79017,47c9e632848013b3a2b049167aa70ed03bfaca517b97d1...,True
4844,M042-ZN04,Inwood Hill Park - Dyckman Ballfield,,M042-ZN04 | Inwood Hill Park Zone 4 (Dyckman F...,40.87202,-73.93082,4f6fc4b2122cba93cffb6ea5ada1a11b84ce0316effd42...,True
4845,M071-37,Riverside Park-Tot Lot One Hundred And Twelve,,M071-37 | Riverside Park-Tot Lot One Hundred A...,40.80625,-73.96805,ccfbcc7cac06ac8a4b665524a58cde4c96458f6bd64093...,True
4846,M071-38,Riverside Park-Tot Lot One Hundred And Sixteen,,M071-38 | Riverside Park-Tot Lot One Hundred A...,40.8098,-73.96535,fcff21907385e420e055ba422f0d8bf1392c755e2100e3...,True
4848,X044,St. James Park,,,40.86516,-73.89779,b12c32bed66e5ccb2e032e29599df6f1580b3376d60418...,True


In [34]:
hash_rows(sites, exclude_cols = ['site_id'], hash_name = 'row_hash')

### Perform the delta check

In [35]:
sites_deltas = (check_deltas(new_df = sites, old_df = sites_sql, on = 'site_id', 
                              hash_name = 'row_hash', dml_col = 'dml_verb'))

In [37]:
sites_deltas.head()

Unnamed: 0,site_id,site_desc,park_district,desc_location,latitude,longitude,row_hash,site_desc_old,park_district_old,desc_location_old,latitude_old,longitude_old,row_hash_old,dml_verb
0,Q102,Juniper Valley Park,Q-05,Q102 | Juniper Valley Park,40.7202,-73.8804,d322002ab6bc23a6a3e67b72fb0aa7005da8c5212380dc...,,,,,,,I
1,Q024,Kissena Park,Q-07,Q024 | Kissena Park,40.7454,-73.8048,6b3148652d96466cf59804d9f6121c6005211be0033f6e...,,,,,,,I
2,Q347,Colden Playground,Q-07,Q347 | Colden Playground,40.7702,-73.8272,f8b1b82cc366fdf7d2a30573cefd225e0b53ff020b38f5...,,,,,,,I
3,Q121,Detective Keith L Williams Park,Q-12,Q121 | Detective Keith L Williams Park,40.7029,-73.7844,fbe4b102c854dc470881fbe0c2bfdd1a10f7a830f1dfda...,,,,,,,I
4,Q375,Willets Point Playground,Q-07,Q375 | Willets Point Playground,40.7838,-73.7964,1cdfd2182f58db2aa5ca2c81a2ba3c814fcdf1df9dcb04...,,,,,,,I


### Slice the inserts and push them to SQL

In [42]:
sites_inserts = sites_deltas[sites_deltas['dml_verb'] == 'I'][cols]

In [52]:
sites_inserts.shape

(4849, 6)

In [54]:
len(pd.unique(sites_inserts['site_id']))

4841

In [44]:
sites_inserts.head()

Unnamed: 0,site_id,site_desc,park_district,desc_location,latitude,longitude
0,Q102,Juniper Valley Park,Q-05,Q102 | Juniper Valley Park,40.7202,-73.8804
1,Q024,Kissena Park,Q-07,Q024 | Kissena Park,40.7454,-73.8048
2,Q347,Colden Playground,Q-07,Q347 | Colden Playground,40.7702,-73.8272
3,Q121,Detective Keith L Williams Park,Q-12,Q121 | Detective Keith L Williams Park,40.7029,-73.7844
4,Q375,Willets Point Playground,Q-07,Q375 | Willets Point Playground,40.7838,-73.7964


In [None]:
sites_inserts.to_sql('tbl_ref_sites', engine, index = False, if_exists = 'append')

### Slice the updates and push them to SQL

In [43]:
sites_updates = sites_deltas[sites_deltas['dml_verb'] == 'U'][cols]

In [45]:
sites_updates.head()

Unnamed: 0,site_id,site_desc,park_district,desc_location,latitude,longitude


In [50]:
sql_update(df = sites_updates, sql_table = 'tbl_ref_sites', engine = engine, where_col = 'site_id')