### Import dependencies

In [25]:
import pandas as pd
from sodapy import Socrata
import os
from scipy.spatial import cKDTree
import endpoints
import geopandas as gpd
import numpy as np 

### Extract environment variables

In [None]:
# Read the Socrata credentials and the export directory from the environment so that
# nothing secret is stored in the notebook itself.
APP_TOKEN = os.getenv('APP_TOKEN')
# NOTE(review): some operating systems predefine USERNAME as the logged-in account name;
# confirm the value picked up here is the intended Socrata login.
USERNAME = os.getenv('USERNAME')
PASSWORD = os.getenv('PASSWORD')
DATA_DIR = os.getenv('DATA_DUMP_DIR')

# Export paths are built as f"{DATA_DIR}/<file>.csv"; with the variable unset that would
# silently become the literal directory "None". Stop here with a clear message instead.
if not DATA_DIR:
    raise EnvironmentError(
        "DATA_DUMP_DIR is not set; it must name the directory the CSV exports are written to."
    )

### Extract squirrel and tree data from datasets 

In [15]:
# A single `get` call returns one page only (1,000 rows unless `limit` is given); the app
# token relaxes request throttling but does not lift that page size. `get_all` walks every
# page, with `limit` acting as the page size and `order=':id'` keeping paging stable.
PAGE_SIZE = 50000
REQUEST_TIMEOUT_SECONDS = 60  # larger pages need more time than the client's default timeout

client = Socrata(
    endpoints.CLIENT_IDENTIFIER,
    APP_TOKEN,
    username=USERNAME,
    password=PASSWORD,
    timeout=REQUEST_TIMEOUT_SECONDS,
)

# `get_all` yields records lazily; materialise them so both results are plain lists of records.
squirrel_census_results = list(
    client.get_all(endpoints.SQUIRREL_DATASET_IDENTIFIER, limit=PAGE_SIZE, order=':id')
)
tree_census_results = list(
    client.get_all(endpoints.TREE_CENSUS_DATASET_IDENTIFIER, limit=PAGE_SIZE, order=':id')
)

### Convert data to Pandas dataframe

In [16]:
# Build one DataFrame per dataset from the raw API records.
squirrel_census_results_df = pd.DataFrame.from_records(squirrel_census_results)
tree_census_results_df = pd.DataFrame.from_records(tree_census_results)

# Keep each frame's column labels as a plain Python list.
squirrel_column_names = list(squirrel_census_results_df.columns)
tree_column_names = list(tree_census_results_df.columns)

### Calculate the number of squirrels per hectare and write it to CSV

In [18]:
# Tally the non-null squirrel IDs recorded in each hectare, then export the table.
squirrels_per_hectare = (
    squirrel_census_results_df
    .groupby('hectare')['unique_squirrel_id']
    .count()
    .reset_index(name="squirrel_count")
)
squirrels_per_hectare.to_csv(f"{DATA_DIR}/squirrels_per_hectare.csv", index=False)

### Calculate number of squirrels eating at each observed elevation and load to CSV

In [19]:
# Count the eating squirrels for each distinct `above_ground_sighter` value and export the result.
is_eating = squirrel_census_results_df['eating'].eq(True)
eating_counts = (
    squirrel_census_results_df.loc[is_eating]
    .groupby('above_ground_sighter')['eating']
    .count()
    .reset_index(name='eating_squirrel_count')
)
eating_counts.to_csv(f"{DATA_DIR}/num_squirrels_eating_per_elevation_aboveground.csv", index=False)

### Count the activities each squirrel was recorded doing at once and assign each squirrel an "activity score"

In [20]:
# Activity score: row-wise sum over the five activity columns.
activity_columns = ['running', 'chasing', 'climbing', 'eating', 'foraging']
activity_totals = squirrel_census_results_df[activity_columns].sum(axis=1)
squirrel_census_results_df['activity_score'] = activity_totals

### Parse date and display year, month, day, and weekday in their own columns

In [21]:
# Parse the MMDDYYYY date values once, then expose the calendar parts as their own columns.
parsed_dates = pd.to_datetime(squirrel_census_results_df['date'], format='%m%d%Y')

squirrel_census_results_df['date'] = parsed_dates
squirrel_census_results_df['year'] = parsed_dates.dt.year
squirrel_census_results_df['month'] = parsed_dates.dt.month
squirrel_census_results_df['day'] = parsed_dates.dt.day
squirrel_census_results_df['weekday'] = parsed_dates.dt.day_name()

### Calculate distance from each squirrel to nearest recorded tree. 

Note that the tree census dataset does not appear to include trees inside Central Park, and I could not find such data on the NYC Open Data website. This section is therefore a proof-of-concept pipeline for when that data is recorded and/or becomes available. It would be useful for figuring out which trees squirrels mostly use for nests or cover, and perhaps for identifying commonalities between the trees squirrels live in.

In [None]:
# Restrict the inputs: squirrels recorded above ground and trees recorded in Manhattan.
above_ground_df = squirrel_census_results_df[squirrel_census_results_df['location'] == 'Above Ground'].copy()
manhattan_trees_df = tree_census_results_df[tree_census_results_df['boroname'] == 'Manhattan'].copy()

# Coerce coordinates to floats and drop rows without a usable position; the KD-tree
# cannot be built from, or queried with, non-finite values.
above_ground_df[['x', 'y']] = above_ground_df[['x', 'y']].apply(pd.to_numeric, errors='coerce')
manhattan_trees_df[['longitude', 'latitude']] = (
    manhattan_trees_df[['longitude', 'latitude']].apply(pd.to_numeric, errors='coerce')
)
above_ground_df = above_ground_df.dropna(subset=['x', 'y'])
manhattan_trees_df = manhattan_trees_df.dropna(subset=['longitude', 'latitude'])

# Build GeoDataFrames in geographic coordinates (x = longitude, y = latitude).
squirrels_gdf = gpd.GeoDataFrame(
    above_ground_df,
    geometry=gpd.points_from_xy(above_ground_df['x'], above_ground_df['y']),
    crs="EPSG:4326"
)
trees_gdf = gpd.GeoDataFrame(
    manhattan_trees_df,
    geometry=gpd.points_from_xy(manhattan_trees_df['longitude'], manhattan_trees_df['latitude']),
    crs="EPSG:4326"
)

# Project to UTM zone 18N (EPSG:32618) so planar distances are ground metres.
# Web Mercator (EPSG:3857) stretches lengths by roughly 1/cos(latitude), which is
# about 32% too long at New York's latitude.
METRIC_EPSG = 32618
squirrels_gdf = squirrels_gdf.to_crs(epsg=METRIC_EPSG)
trees_gdf = trees_gdf.to_crs(epsg=METRIC_EPSG)

# Projected coordinates as (n, 2) arrays, read straight from the geometry columns.
tree_coords = np.column_stack([trees_gdf.geometry.x, trees_gdf.geometry.y])
squirrel_coords = np.column_stack([squirrels_gdf.geometry.x, squirrels_gdf.geometry.y])

# Nearest-tree lookup. With no trees (or no squirrels) there is nothing to match,
# so the result columns are left empty instead of indexing past the end of the tree table.
if len(tree_coords) and len(squirrel_coords):
    tree_kdtree = cKDTree(tree_coords)
    distances, indices = tree_kdtree.query(squirrel_coords)
    squirrels_gdf['nearest_tree'] = trees_gdf['tree_id'].to_numpy()[indices]
    squirrels_gdf['distance_meters'] = distances
else:
    squirrels_gdf['nearest_tree'] = np.nan
    squirrels_gdf['distance_meters'] = np.nan

# Attach the results by row index rather than by `unique_squirrel_id`: the above-ground
# subset keeps the index of the frame it was sliced from, so an index join can neither
# duplicate rows when an ID repeats nor create `_x`/`_y` columns when this cell is re-run.
nearest_columns = ['nearest_tree', 'distance_meters']
squirrel_census_results_df = (
    squirrel_census_results_df
    .drop(columns=nearest_columns, errors='ignore')
    .join(squirrels_gdf[nearest_columns])
)

squirrel_census_results_df.to_csv(f"{DATA_DIR}/squirrel_census_transformed.csv", index=False)