# Descriptive (Spatial) Analytics

Analyze taxi demand patterns for the relevant one-year period and 
city (please check carefully which year your team has been allocated). 

Specifically, show how these
patterns (start time, trip length, start and end location, price, average idle time between trips, and so 
on) for the given sample vary across different spatio-temporal resolutions (i.e., census tract vs. varying
hexagon diameter and/or temporal bin sizes). 

Give possible reasons for the observed patterns.

Notes:

- Histogram for distribution
- trip length on census tract -> districts
- trip length on varying hexagon diameter
- trip length on temporal bin sizes

Features: 
- for both trip minutes and trip miles

In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from h3 import h3

import descriptive as desc

  exploded_city_bounding_poly = gdf.explode()


In [2]:
# Load the zipped Chicago trip data set (df_chicago) and print a column/dtype summary
chicago_csv_path = "./data/datasets/df_chicago.csv.zip"
df = pd.read_csv(chicago_csv_path)
df.info()

BadZipFile: File is not a zip file

In [None]:
# Rides per pickup census tract (non-null Trip_Miles entries are counted),
# sorted ascending so that tail() shows the five busiest tracts.
grouped = (
    df.groupby("Pickup_Census_Tract")[["Trip_Miles"]]
    .count()
    .rename(columns={"Trip_Miles": "Count"})
    .sort_values(by="Count")
)
grouped.tail()

Unnamed: 0_level_0,Trip_Miles
Pickup_Census_Tract,Unnamed: 1_level_1
17031280000.0,280527
17031080000.0,363515
17031080000.0,382609
17031320000.0,485957
17031840000.0,814282


In [None]:
# Rides per dropoff census tract (non-null Trip_Miles entries are counted),
# sorted ascending so that tail() shows the five busiest tracts.
grouped = (
    df.groupby("Dropoff_Census_Tract")[["Trip_Miles"]]
    .count()
    .rename(columns={"Trip_Miles": "Count"})
    .sort_values(by="Count")
)
grouped.tail()

Unnamed: 0_level_0,Trip_Miles
Dropoff_Census_Tract,Unnamed: 1_level_1
17031280000.0,283609
17031080000.0,286058
17031080000.0,309451
17031320000.0,410715
17031840000.0,741563


In [None]:
# Total Trip_Miles per pickup census tract, sorted ascending so that
# tail() shows the five tracts with the largest totals.
grouped = (
    df.groupby("Pickup_Census_Tract")[["Trip_Miles"]]
    .sum()
    .rename(columns={"Trip_Miles": "Sum"})
    .sort_values(by="Sum")
)
grouped.tail()

Unnamed: 0_level_0,Trip_Miles
Pickup_Census_Tract,Unnamed: 1_level_1
17031980000.0,542587.71
17031080000.0,589789.58
17031080000.0,656860.28
17031320000.0,836171.04
17031840000.0,1323167.61


In [None]:
# Total Trip_Miles per dropoff census tract, sorted ascending so that
# tail() shows the five tracts with the largest totals.
grouped = (
    df.groupby("Dropoff_Census_Tract")[["Trip_Miles"]]
    .sum()
    .rename(columns={"Trip_Miles": "Sum"})
    .sort_values(by="Sum")
)
grouped.tail()

Unnamed: 0_level_0,Trip_Miles
Dropoff_Census_Tract,Unnamed: 1_level_1
17031080000.0,457166.13
17031330000.0,472786.75
17031980000.0,644929.81
17031320000.0,651340.93
17031840000.0,1090535.94


In [None]:
# Mean Trip_Miles per pickup census tract, sorted ascending so that
# tail() shows the five tracts with the highest averages.
grouped = (
    df.groupby("Pickup_Census_Tract")[["Trip_Miles"]]
    .mean()
    .rename(columns={"Trip_Miles": "Average"})
    .sort_values(by="Average")
)
grouped.tail()

Unnamed: 0_level_0,Trip_Miles
Pickup_Census_Tract,Unnamed: 1_level_1
17031190000.0,11.4
17031430000.0,11.433333
17031450000.0,11.8
17031210000.0,12.3
17031020000.0,12.7


In [None]:
# Mean Trip_Miles per dropoff census tract, sorted ascending so that
# tail() shows the five tracts with the highest averages.
grouped = (
    df.groupby("Dropoff_Census_Tract")[["Trip_Miles"]]
    .mean()
    .rename(columns={"Trip_Miles": "Average"})
    .sort_values(by="Average")
)
grouped.tail()

Unnamed: 0_level_0,Trip_Miles
Dropoff_Census_Tract,Unnamed: 1_level_1
17031400000.0,9.75
17031250000.0,10.3
17031580000.0,10.4
17031640000.0,11.3
17031770000.0,11.4


### Different Hexagon Resolutions

In [None]:
# Rides per low-resolution H3 hexagon (non-null Trip_Miles entries are counted),
# sorted ascending so that tail() shows the five busiest hexagons.
grouped = (
    df.groupby("h3_hex_id_low_res")[["Trip_Miles"]]
    .count()
    .rename(columns={"Trip_Miles": "Count"})
    .sort_values(by="Count")
)
grouped.tail()

Unnamed: 0_level_0,Trip_Miles
h3_hex_id_low_res,Unnamed: 1_level_1
862664ca7ffffff,298214
862664d8fffffff,425809
862664cafffffff,765438
862664c17ffffff,1951115
862664c1fffffff,5220199


In [None]:
# Total Trip_Miles per low-resolution H3 hexagon, sorted ascending so that
# tail() shows the five hexagons with the largest totals.
grouped = (
    df.groupby("h3_hex_id_low_res")[["Trip_Miles"]]
    .sum()
    .rename(columns={"Trip_Miles": "Sum"})
    .sort_values(by="Sum")
)
grouped.tail()

Unnamed: 0_level_0,Trip_Miles
h3_hex_id_low_res,Unnamed: 1_level_1
862664ca7ffffff,841351.16
862664d8fffffff,1235793.03
862664cafffffff,1807736.78
862664c17ffffff,4355317.81
862664c1fffffff,10184633.47


In [None]:
# Mean Trip_Miles per low-resolution H3 hexagon, sorted ascending so that
# tail() shows the five hexagons with the highest averages.
grouped = (
    df.groupby("h3_hex_id_low_res")[["Trip_Miles"]]
    .mean()
    .rename(columns={"Trip_Miles": "Average"})
    .sort_values(by="Average")
)
grouped.tail()

Unnamed: 0_level_0,Trip_Miles
h3_hex_id_low_res,Unnamed: 1_level_1
86266452fffffff,4.573803
86266456fffffff,4.675
862664d87ffffff,6.016
862759347ffffff,7.327827
862664527ffffff,7.544964


In [None]:
# Rides per medium-resolution H3 hexagon (non-null Trip_Miles entries are counted),
# sorted ascending so that tail() shows the five busiest hexagons.
grouped = (
    df.groupby("h3_hex_id_medium_res")[["Trip_Miles"]]
    .count()
    .rename(columns={"Trip_Miles": "Count"})
    .sort_values(by="Count")
)
grouped.tail()

Unnamed: 0_level_0,Trip_Miles
h3_hex_id_medium_res,Unnamed: 1_level_1
872664cacffffff,383544
872664c13ffffff,517422
872664c16ffffff,702683
872664c1affffff,1654640
872664c1effffff,3277479


In [None]:
# Total Trip_Miles per medium-resolution H3 hexagon, sorted ascending so that
# tail() shows the five hexagons with the largest totals.
grouped = (
    df.groupby("h3_hex_id_medium_res")[["Trip_Miles"]]
    .sum()
    .rename(columns={"Trip_Miles": "Sum"})
    .sort_values(by="Sum")
)
grouped.tail()

Unnamed: 0_level_0,Trip_Miles
h3_hex_id_medium_res,Unnamed: 1_level_1
872664cacffffff,877021.45
872664c13ffffff,1066992.94
872664c16ffffff,1776840.4
872664c1affffff,3321996.97
872664c1effffff,6264143.47


In [None]:
# Mean Trip_Miles per medium-resolution H3 hexagon, sorted ascending so that
# tail() shows the five hexagons with the highest averages.
grouped = (
    df.groupby("h3_hex_id_medium_res")[["Trip_Miles"]]
    .mean()
    .rename(columns={"Trip_Miles": "Average"})
    .sort_values(by="Average")
)
grouped.tail()

Unnamed: 0_level_0,Trip_Miles
h3_hex_id_medium_res,Unnamed: 1_level_1
872759343ffffff,7.782336
872664521ffffff,7.939956
872664c83ffffff,9.35
872664c8bffffff,10.25
872664cebffffff,11.433333


In [None]:
# Rides per high-resolution H3 hexagon (non-null Trip_Miles entries are counted),
# sorted ascending so that tail() shows the five busiest hexagons.
grouped = (
    df.groupby("h3_hex_id_high_res")[["Trip_Miles"]]
    .count()
    .rename(columns={"Trip_Miles": "Count"})
    .sort_values(by="Count")
)
grouped.tail()

Unnamed: 0_level_0,Trip_Miles
h3_hex_id_high_res,Unnamed: 1_level_1
882664c163fffff,517108
882664c1e3fffff,726653
882664c1edfffff,749389
882664c1e1fffff,798286
882664c1a9fffff,1070522


In [None]:
# Total Trip_Miles per high-resolution H3 hexagon, sorted ascending so that
# tail() shows the five hexagons with the largest totals.
grouped = (
    df.groupby("h3_hex_id_high_res")[["Trip_Miles"]]
    .sum()
    .rename(columns={"Trip_Miles": "Sum"})
    .sort_values(by="Sum")
)
grouped.tail()

Unnamed: 0_level_0,Trip_Miles
h3_hex_id_high_res,Unnamed: 1_level_1
882664c1e3fffff,1264562.66
882664c163fffff,1290254.5
882664c1e1fffff,1357848.39
882664c1edfffff,1985703.86
882664c1a9fffff,2292197.85


In [None]:
# Mean Trip_Miles per high-resolution H3 hexagon, sorted ascending so that
# tail() shows the five hexagons with the highest averages.
grouped = (
    df.groupby("h3_hex_id_high_res")[["Trip_Miles"]]
    .mean()
    .rename(columns={"Trip_Miles": "Average"})
    .sort_values(by="Average")
)
grouped.tail()

Unnamed: 0_level_0,Trip_Miles
h3_hex_id_high_res,Unnamed: 1_level_1
882664cce1fffff,10.6
88275936bbfffff,10.9
882664ca25fffff,11.4
882664ceb1fffff,11.433333
882664d8edfffff,12.7


### Different time bins

## Census tract

In [2]:
import geopandas as gpd
import folium
from IPython.display import display

In [4]:
# H3 resolution levels for the hexagon grids (passed as `res` to h3.polyfill in the map loop).
H3_HEXAGON_HIGH_RESOLUTION = 8
H3_HEXAGON_MEDIUM_RESOLUTION = 7
H3_HEXAGON_LOW_RESOLUTION = 6
# Temporal bin labels; they are substituted into the data set file names.
time_bins = ["h", "2h", "6h", "24h"]
# Resolution labels as they appear in the hexagon data set file names.
resolution = ["low", "med", "hig"]

In [5]:
# Hexagon maps: for every H3 resolution and temporal bin size, draw the full
# hexagon grid (dark blue) and highlight the hexagons with the highest
# (dark green) and lowest (dark red) "Trip_Miles_x" - once for the per-hexagon
# sum and once for the per-hexagon average.

# File-name label -> (H3 resolution, suffix of the "h3_hex_id_<suffix>_res" column).
hex_settings = {
    "low": (H3_HEXAGON_LOW_RESOLUTION, "low"),
    "med": (H3_HEXAGON_MEDIUM_RESOLUTION, "medium"),
    "hig": (H3_HEXAGON_HIGH_RESOLUTION, "high"),
}

for j in resolution:
    # Unknown labels fall back to the low-resolution settings.
    res, column = hex_settings.get(j, hex_settings["low"])
    prop = 'h3_hex_id_{res}_res'.format(res=column)

    # The base grid depends only on the resolution, so it is built once per
    # resolution instead of once (or twice) per time bin.
    # NOTE(review): assumes desc.visualize_hexagons does not modify the
    # hexagon collection it is given - confirm in descriptive.py.
    base = h3.polyfill(desc.cityBoundingPolygonBig, res = res, geo_json_conformant = True)

    for i in time_bins:
        df = pd.read_csv('./data/datasets/df_{time}_hex{res}.csv'.format(time=i,res=j))

        # ---- summed Trip_Miles_x per hexagon ----
        # descr_stat returns a tuple; its first element is the grouped frame.
        # It is asked to sort by Trip_Miles_x, so the first row is taken as the
        # lowest and the last row as the highest (presumably ascending order).
        grouped_sum = desc.descr_stat(df, columns=[prop, "Trip_Miles_x"], group_by=[prop], sort=True, sort_by=["Trip_Miles_x"], as_index=True, agg_mode="sum", plot=False, plot_map=True)[0]

        low_val = grouped_sum.iloc[0]["Sum"]
        low_elems = grouped_sum[grouped_sum["Sum"] == low_val]
        if len(low_elems) < 5: # fewer than 5 hexagons tie for the lowest value -> show the 5 lowest instead
            low_elems = grouped_sum.head()

        high_val = grouped_sum.iloc[-1]["Sum"]
        high_elems = grouped_sum[grouped_sum["Sum"] == high_val]
        if len(high_elems) < 5: # fewer than 5 hexagons tie for the highest value -> show the 5 highest instead
            high_elems = grouped_sum.tail()

        sum_map = desc.visualize_hexagons(base, color="darkblue")
        sum_map = desc.visualize_hexagons(high_elems.index, color="darkgreen", folium_map=sum_map)
        sum_map = desc.visualize_hexagons(low_elems.index, color="darkred", folium_map=sum_map)

        print("Sum map: \n")
        display(sum_map)

        # ---- average Trip_Miles_x per hexagon ----
        grouped_avg = desc.descr_stat(df, columns=[prop, "Trip_Miles_x"], group_by=[prop], sort=True, sort_by=["Trip_Miles_x"], as_index=True, agg_mode="mean", plot=False, plot_map=True)[0]

        low_val = grouped_avg.iloc[0]["Average"]
        low_elems = grouped_avg[grouped_avg["Average"] == low_val]
        if len(low_elems) < 5: # fewer than 5 hexagons tie for the lowest value -> show the 5 lowest instead
            low_elems = grouped_avg.head()

        high_val = grouped_avg.iloc[-1]["Average"]
        high_elems = grouped_avg[grouped_avg["Average"] == high_val]
        if len(high_elems) < 5: # fewer than 5 hexagons tie for the highest value -> show the 5 highest instead
            high_elems = grouped_avg.tail()

        avg_map = desc.visualize_hexagons(base, color="darkblue")
        avg_map = desc.visualize_hexagons(high_elems.index, color="darkgreen", folium_map=avg_map)
        avg_map = desc.visualize_hexagons(low_elems.index, color="darkred", folium_map=avg_map)

        print("Avg map: \n")
        display(avg_map)

Sum map: 



Avg map: 



Sum map: 



Avg map: 



Sum map: 



Avg map: 



Sum map: 



Avg map: 



Sum map: 



Avg map: 



Sum map: 



Avg map: 



Sum map: 



Avg map: 



Sum map: 



Avg map: 



Sum map: 



Avg map: 



Sum map: 



Avg map: 



Sum map: 



Avg map: 



Sum map: 



Avg map: 



In [None]:
# Census-tract maps: for every temporal bin size and for pickup ("p") as well
# as dropoff ("d") tracts, draw all tracts (green) and highlight the tracts
# with the highest (blue) and lowest (red) "Trip_Miles_x" - once for the
# per-tract sum and once for the per-tract average.
time_bins = ["h", "2h", "6h", "24h"]
opts = ["p", "d"]
CHICAGO_COORD = [41.86364, -87.72645]
geo_json = gpd.read_file("data/census_tract.geojson")
# The grouped frames are matched against "geoid10" by equality below, so the
# ids are converted to float once here instead of on every loop iteration.
geo_json["geoid10"] = geo_json["geoid10"].astype(float)

map_list = []


def add_tract_layer(tract_map, tracts, fill_color, weight):
    """Add ``tracts`` to ``tract_map`` with a black outline, the given fill colour and line weight."""
    folium.GeoJson(
        data=tracts,
        popup=folium.GeoJsonPopup(fields=["geoid10","commarea_n","name10"]),
        style_function=lambda x: {"fillColor": fill_color, "color": "black", "weight": weight},
    ).add_to(tract_map)


def build_tract_map(high_elems, low_elems):
    """Return a folium map of all census tracts with the extreme tracts highlighted.

    ``high_elems`` and ``low_elems`` are grouped frames whose index holds the
    tract ids; tracts in ``high_elems`` are drawn in blue, tracts in
    ``low_elems`` in red, on top of a green layer containing every tract.
    """
    tract_map = folium.Map(location=CHICAGO_COORD, tiles="cartodbpositron")
    add_tract_layer(tract_map, geo_json, "green", 1)

    for high in high_elems.index:
        add_tract_layer(tract_map, geo_json[geo_json["geoid10"] == high], "blue", 3)

    for low in low_elems.index:
        add_tract_layer(tract_map, geo_json[geo_json["geoid10"] == low], "red", 2)

    return tract_map


for i in time_bins:
    for j in opts:

        df = pd.read_csv('./data/datasets/df_{time}_cens_{opt}.csv'.format(time=i,opt=j))

        # "d" files are grouped by dropoff tract, everything else by pickup tract
        prop = 'Dropoff_Census_Tract' if j == "d" else 'Pickup_Census_Tract'

        # ---- summed Trip_Miles_x per tract ----
        # descr_stat returns a tuple; its first element is the grouped frame.
        # It is asked to sort by Trip_Miles_x, so the first row is taken as the
        # lowest and the last row as the highest (presumably ascending order).
        grouped_sum = desc.descr_stat(df, columns=[prop, "Trip_Miles_x"], group_by=[prop], sort=True, sort_by=["Trip_Miles_x"], as_index=True, agg_mode="sum", plot=False, plot_map=True)[0]

        low_val = grouped_sum.iloc[0]["Sum"]
        low_elems = grouped_sum[grouped_sum["Sum"] == low_val]
        if len(low_elems) < 5: # fewer than 5 tracts tie for the lowest value -> show the 5 lowest instead
            low_elems = grouped_sum.head()

        high_val = grouped_sum.iloc[-1]["Sum"]
        high_elems = grouped_sum[grouped_sum["Sum"] == high_val]
        if len(high_elems) < 5: # fewer than 5 tracts tie for the highest value -> show the 5 highest instead
            high_elems = grouped_sum.tail()

        base_map = build_tract_map(high_elems, low_elems)

        print("Sum map: \n")
        display(base_map)

        # ---- average Trip_Miles_x per tract ----
        grouped_avg = desc.descr_stat(df, columns=[prop, "Trip_Miles_x"], group_by=[prop], sort=True, sort_by=["Trip_Miles_x"], as_index=True, agg_mode="mean", plot=False, plot_map=True)[0]

        low_val = grouped_avg.iloc[0]["Average"]
        low_elems = grouped_avg[grouped_avg["Average"] == low_val]
        if len(low_elems) < 5: # fewer than 5 tracts tie for the lowest value -> show the 5 lowest instead
            low_elems = grouped_avg.head()

        high_val = grouped_avg.iloc[-1]["Average"]
        high_elems = grouped_avg[grouped_avg["Average"] == high_val]
        if len(high_elems) < 5: # fewer than 5 tracts tie for the highest value -> show the 5 highest instead
            high_elems = grouped_avg.tail()

        base_map = build_tract_map(high_elems, low_elems)

        print("Average map: \n")
        display(base_map)
        