# Analyzing New York City taxi data using big data tools

## The NYC taxi data

In [1]:
import arcgis
from arcgis.gis import GIS

# Anonymous connection to ArcGIS Online (GIS() is called without credentials).
ago_gis = GIS()

# Search for Feature Layer items matching "NYC_taxi_subset" and keep the first hit.
search_subset = ago_gis.content.search(
    "NYC_taxi_subset",
    item_type="Feature Layer",
)
subset_item = search_subset[0]

# A bare name on the last line makes the notebook display the item.
subset_item

In [2]:
# Create a map for "New York, NY" at zoom level 11 from the anonymous connection.
subset_map = ago_gis.map("New York, NY", zoomlevel=11)
# Bare name on the last line displays the map (a MapView in the recorded output).
subset_map

MapView(layout=Layout(height='400px', width='100%'), zoom=11.0)

In [3]:
# Add the taxi subset item found by the search to the map created above.
subset_map.add_layer(subset_item)

In [4]:
# The item exposes its layers as a list; the first one is queried here.
subset_feature_layer = subset_item.layers[0]

# Query attribute information only (no geometry) for the features whose
# OBJECTID is below 5 -- four rows in the recorded output, not five.
# `return_geometry` is the named parameter of FeatureLayer.query; the previous
# camelCase `returnGeometry` was not one of the method's named parameters.
query_result = subset_feature_layer.query(
    where="OBJECTID < 5",
    out_fields="*",
    return_geometry=False,
)

# Convert the query result to a pandas DataFrame and display it.
att_data_frame = query_result.sdf
att_data_frame

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,pickup_longitude,pickup_latitude,RateCodeID,store_and_fwd_flag,dropoff_longitude,dropoff_latitude,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,ObjectId
0,2,2015-01-02 19:06:46,2015-01-02 19:11:14,1,0.82,-74.005997,40.735241,1,N,-74.00972,40.72599,1,5.0,1,0.5,1.2,0,0.3,8.0,1
1,2,2015-01-25 13:39:51,2015-01-26 13:24:58,6,5.62,-73.984688,40.748192,1,N,-74.016289,40.704849,2,19.5,0,0.5,0.0,0,0.3,20.3,2
2,2,2015-01-13 09:10:38,2015-01-13 09:19:40,1,1.99,-73.967827,40.801315,1,N,-73.959618,40.782703,2,9.0,0,0.5,0.0,0,0.3,9.8,3
3,1,2015-01-06 10:55:08,2015-01-06 11:05:48,1,1.6,-73.96756,40.801228,1,N,-73.95298,40.819118,2,9.0,0,0.5,0.0,0,0.3,9.8,4


## Searching for big data file shares

In [5]:
import os
from getpass import getpass

# Connect to the portal used for the rest of the notebook.
# The password is no longer stored in the notebook: it is read from the
# ARCGIS_PASSWORD environment variable, with an interactive prompt as fallback.
# URL and user name can be overridden the same way and otherwise keep the
# values this notebook used before.
portal_url = os.environ.get(
    "ARCGIS_PORTAL_URL", "https://pythonapi.playground.esri.com/portal"
)
portal_user = os.environ.get("ARCGIS_USERNAME", "arcgis_python")
portal_password = os.environ.get("ARCGIS_PASSWORD") or getpass(
    f"Password for {portal_user}: "
)

gis = GIS(portal_url, portal_user, portal_password)

In [6]:
# Check whether GeoAnalytics is supported (recorded output: True).
# NOTE(review): presumably this checks the portal connected in the previous cell -- confirm.
arcgis.geoanalytics.is_supported()

True

In [7]:
# Get the datastores object; it is used below to search for and register big data file shares.
datastores = arcgis.geoanalytics.get_datastores()

In [8]:
# Search with no arguments; the recorded output lists only bigDataFileShare datastores.
bigdata_fileshares = datastores.search()
# Bare name on the last line displays the list.
bigdata_fileshares

[<Datastore title:"/bigDataFileShares/NYC_taxi_data15" type:"bigDataFileShare">,
 <Datastore title:"/bigDataFileShares/all_hurricanes" type:"bigDataFileShare">,
 <Datastore title:"/bigDataFileShares/NYCdata" type:"bigDataFileShare">,
 <Datastore title:"/bigDataFileShares/hurricanes_1848_1900" type:"bigDataFileShare">,
 <Datastore title:"/bigDataFileShares/ServiceCallsOrleans" type:"bigDataFileShare">,
 <Datastore title:"/bigDataFileShares/hurricanes_dask_csv" type:"bigDataFileShare">,
 <Datastore title:"/bigDataFileShares/hurricanes_dask_shp" type:"bigDataFileShare">]

In [9]:
# Take the first datastore in the list; in the recorded output that is
# "/bigDataFileShares/NYC_taxi_data15". The index depends on the search order.
data_item = bigdata_fileshares[0]

## Registering big data file shares

In [None]:
# Register the path r"\data" as a big data file share named "NYCdata".
# This cell has no recorded execution count (In [None]); running it rebinds
# data_item to whatever add_bigdata returns.
# NOTE(review): presumably only needed when the share is not registered yet -- confirm.
data_item = datastores.add_bigdata("NYCdata", r"\data")

In [10]:
# Show the manifest; the recorded output describes each dataset's file format and field schema.
data_item.manifest

{'datasets': [{'name': '2015',
   'format': {'quoteChar': '"',
    'fieldDelimiter': ',',
    'hasHeaderRow': True,
    'encoding': 'UTF-8',
    'recordTerminator': '\n',
    'type': 'delimited',
    'extension': 'csv'},
   'schema': {'fields': [{'name': 'VendorID',
      'type': 'esriFieldTypeBigInteger'},
     {'name': 'tpep_pickup_datetime', 'type': 'esriFieldTypeString'},
     {'name': 'tpep_dropoff_datetime', 'type': 'esriFieldTypeString'},
     {'name': 'passenger_count', 'type': 'esriFieldTypeBigInteger'},
     {'name': 'trip_distance', 'type': 'esriFieldTypeDouble'},
     {'name': 'pickup_longitude', 'type': 'esriFieldTypeDouble'},
     {'name': 'pickup_latitude', 'type': 'esriFieldTypeDouble'},
     {'name': 'RateCodeID', 'type': 'esriFieldTypeBigInteger'},
     {'name': 'store_and_fwd_flag', 'type': 'esriFieldTypeString'},
     {'name': 'dropoff_longitude', 'type': 'esriFieldTypeDouble'},
     {'name': 'dropoff_latitude', 'type': 'esriFieldTypeDouble'},
     {'name': 'payment

## Performing data aggregation

In [11]:
# Search the portal connection `gis` with an empty query string, restricted
# to items of type "big data file share".
search_result = gis.content.search(
    "",
    item_type="big data file share",
)

# Bare name on the last line displays the list of matching items.
search_result

[<Item title:"bigDataFileShares_all_hurricanes" type:Big Data File Share owner:api_data_owner>,
 <Item title:"bigDataFileShares_hurricanes_dask_csv" type:Big Data File Share owner:atma.mani>,
 <Item title:"bigDataFileShares_hurricanes_dask_shp" type:Big Data File Share owner:atma.mani>,
 <Item title:"bigDataFileShares_NYC_taxi_data15" type:Big Data File Share owner:api_data_owner>]

In [12]:
# Select the NYC taxi file share by its title instead of by list position:
# the order of search results is not fixed, so a hardcoded index can silently
# pick a different item when content is added to or removed from the portal.
taxi_share_title = "bigDataFileShares_NYC_taxi_data15"
data_item = next(
    (item for item in search_result if item.title == taxi_share_title),
    None,
)
if data_item is None:
    raise LookupError(
        f"No big data file share titled {taxi_share_title!r} in the search results"
    )

# Bare name on the last line displays the selected item.
data_item

In [13]:
# List the item's layers; the recorded output shows a single layer whose URL ends in "/2015".
data_item.layers

[<Layer url:"https://pythonapi.playground.esri.com/ga/rest/services/DataStoreCatalogs/bigDataFileShares_NYC_taxi_data15/BigDataCatalogServer/2015">]

In [14]:
# Keep the first (and, in the recorded output, only) layer for the analysis below.
year_2015 = data_item.layers[0]
# Bare name on the last line displays the layer.
year_2015

<Layer url:"https://pythonapi.playground.esri.com/ga/rest/services/DataStoreCatalogs/bigDataFileShares_NYC_taxi_data15/BigDataCatalogServer/2015">

## Aggregate points tool

In [15]:
from arcgis.geoanalytics.summarize_data import aggregate_points

In [16]:
# Inspect the layer's spatial reference (recorded output: wkid 4326).
year_2015.properties['spatialReference']

{'wkid': 4326}

In [17]:
# Set the processing spatial reference to WKID 3857. The recorded job log for the
# aggregation shows processSR wkid 3857 and the point layer being projected into it.
arcgis.env.process_spatial_reference = 3857

In [18]:
# Turn on verbose mode.
# NOTE(review): presumably this is what makes the tool run below print its job messages -- confirm.
arcgis.env.verbose = True

In [19]:
# Aggregate the 2015 layer into square bins with a bin size of 1 kilometer.
agg_result = aggregate_points(
    year_2015,
    bin_type="square",
    bin_size=1,
    bin_size_unit="Kilometers",
)

Submitted.
Executing...
Executing (AggregatePoints): AggregatePoints "Feature Set" Square 1 Kilometers # # # # # # # "{"serviceProperties": {"name": "Aggregate_Points_Analysis_95O6MR", "serviceUrl": "https://pythonapi.playground.esri.com/server/rest/services/Hosted/Aggregate_Points_Analysis_95O6MR/FeatureServer"}, "itemProperties": {"itemId": "e30dbe3d177149ab9b06195ce8317807"}}" "{"defaultAggregationStyles": false, "processSR": {"wkid": 3857}}"
Start Time: Wed Oct  9 12:21:02 2019
Using URL based GPRecordSet param: https://pythonapi.playground.esri.com/ga/rest/services/DataStoreCatalogs/bigDataFileShares_NYC_taxi_data15/BigDataCatalogServer/2015
{"messageCode":"BD_101033","message":"'pointLayer' will be projected into the processing spatial reference.","params":{"paramName":"pointLayer"}}
{"messageCode":"BD_101028","message":"Starting new distributed job with 236 tasks.","params":{"totalTasks":"236"}}
{"messageCode":"BD_101029","message":"0/236 distributed tasks completed.","params":{

## Inspect the results

In [20]:
# Create a map for 'New York, NY' at zoom level 11 from the portal connection.
processed_map = gis.map('New York, NY', zoomlevel=11)
# Bare name on the last line displays the map (a MapView in the recorded output).
processed_map

MapView(layout=Layout(height='400px', width='100%'), zoom=11.0)

In [21]:
# Add the aggregation result to the map; no rendering options are passed here.
processed_map.add_layer(agg_result)

In [22]:
# Share the result item with the organization (org=True); the recorded response reports success.
agg_result.share(org=True)

{'results': [{'itemId': 'e30dbe3d177149ab9b06195ce8317807',
   'success': True,
   'notSharedWith': []}]}

In [23]:
# Create a second map for "New York, NY" at zoom level 11; the styled layer is added to it in the next cell.
map2 = gis.map("New York, NY", zoomlevel=11)
# Bare name on the last line displays the map (a MapView in the recorded output).
map2

MapView(layout=Layout(height='400px', width='100%'), zoom=11.0)

In [24]:
# Add the aggregation result with explicit rendering options: a classed color
# renderer on MAX_tip_amount, normalized by MAX_trip_distance, classified with
# natural breaks, drawn at 75% opacity.
map2.add_layer(
    agg_result,
    {
        "renderer": "ClassedColorRenderer",
        "field_name": "MAX_tip_amount",
        "normalizationField": "MAX_trip_distance",
        "classificationMethod": "natural-breaks",
        "opacity": 0.75,
    },
)