## Scenario/Stakeholder-Based Analysis of NYC Taxi Ride Data
##### Authors: Panini Mokrala, Dmitrii Danilov

In [1]:
from google.colab import drive
from os.path import join

ROOT = '/content/drive'
PROJ = 'MyDrive/Milestones/Milestone1'

drive.mount(ROOT)
PROJECT_PATH = join(ROOT, PROJ)
%cd "$PROJECT_PATH"
%pwd

Mounted at /content/drive
/content/drive/MyDrive/Milestones/Milestone1


'/content/drive/MyDrive/Milestones/Milestone1'

In [4]:
!pip install geopandas

import datetime as dt
import io
import json

import altair as alt
import geopandas as gpd
import numpy as np
import pandas as pd
from google.cloud import bigquery
from google.oauth2 import service_account
from shapely.ops import cascaded_union, unary_union

# Select Altair's registered 'json' data transformer for the rest of the
# notebook. As the last expression of the cell, the returned registry object
# is echoed as the cell output.
alt.data_transformers.enable('json')



DataTransformerRegistry.enable('json')

In [5]:
# Load the TLC taxi-zone polygons and derive lon/lat label anchor points for
# every zone (`taxi_zones`) and for every borough (`taxi_zones_b`).
#
# Centroids are computed in the shapefile's native CRS and only afterwards
# reprojected to EPSG:4326. Calling `.centroid` on geometries that are already
# in a geographic CRS makes geopandas warn that the result is likely incorrect,
# because planar centroid maths is applied to degrees.
# NOTE(review): this assumes the shapefile's native CRS is a projected one,
# which the explicit reprojection step suggests -- confirm.
taxi_zones_native = gpd.read_file('https://s3.amazonaws.com/nyc-tlc/misc/taxi_zones.zip')

zone_centroids = taxi_zones_native['geometry'].centroid.to_crs(epsg=4326)
taxi_zones = taxi_zones_native.to_crs(epsg=4326)
taxi_zones['centroid_lon'] = zone_centroids.x
taxi_zones['centroid_lat'] = zone_centroids.y

# One point per borough: the centroid of the union of that borough's zones.
# `unary_union` supersedes the deprecated `cascaded_union`.
borough_points = (
    taxi_zones_native.groupby('borough')['geometry']
    .agg(lambda zones: unary_union(list(zones)).centroid)
)

# Rebuild an explicit GeoDataFrame so the CRS is attached regardless of what
# container type the groupby aggregation returned, then reproject the points.
taxi_zones_b = gpd.GeoDataFrame(
    {'borough': list(borough_points.index)},
    geometry=list(borough_points),
    crs=taxi_zones_native.crs,
).to_crs(epsg=4326)
taxi_zones_b['centroid_lon'] = taxi_zones_b['geometry'].x
taxi_zones_b['centroid_lat'] = taxi_zones_b['geometry'].y



  This is separate from the ipykernel package so we can avoid doing imports until

  after removing the cwd from sys.path.


In [6]:
# Reference map: every taxi zone coloured by borough, with the zone IDs and
# the borough names layered on top as text marks.

# Altair receives the geometries as a plain list of GeoJSON features.
tz_geo = json.loads(taxi_zones.to_json())['features']
tz_geo_b = json.loads(taxi_zones_b.to_json())['features']

alt.themes.enable('opaque')

# Zone polygons; the colour only separates boroughs, so no legend is shown.
base = alt.Chart(alt.Data(values=tz_geo)).mark_geoshape(
        stroke='black',
        strokeWidth=1
    ).encode(
        color=alt.Color('properties.borough:N', legend=None)
    ).properties(
        width=800,
        height=800
    )

# Zone ID printed at each zone's centroid.
labels = alt.Chart(alt.Data(values=tz_geo)).mark_text(
        baseline='top'
    ).encode(
        longitude='properties.centroid_lon:Q',
        latitude='properties.centroid_lat:Q',
        text='properties.LocationID:O',
        size=alt.value(8),
        opacity=alt.value(1)
    ).properties(
        width=800,
        height=800
    )

# Borough name printed at each borough's centroid. The title is configured
# with alt.TitleParams, the class meant for title properties such as fontSize;
# alt.Text is the *text encoding channel* and only happened to serialise to a
# compatible dictionary.
boroughs = alt.Chart(alt.Data(values=tz_geo_b)).mark_text(
        color='white',
        stroke='black',
        fontWeight='bold',
        strokeWidth=0.7,
        baseline='top'
    ).encode(
        longitude='properties.centroid_lon:Q',
        latitude='properties.centroid_lat:Q',
        text='properties.borough:N',
        size=alt.value(26),
        opacity=alt.value(1)
    ).properties(
        width=800,
        height=800,
        title=alt.TitleParams(text="NYC boroughs and taxi zones", fontSize=22)
    )


base + labels + boroughs

In [7]:
# Authenticate against BigQuery with the service-account key file found in the
# current working directory and create the client used by the query cells.
key_path = 'auth.json'
credentials = service_account.Credentials.from_service_account_file(key_path)
client = bigquery.Client(credentials=credentials, project=credentials.project_id)

# Number of drop-offs per taxi zone during 2017.
# The date filter is a half-open range [2017-01-01, 2018-01-01): BETWEEN is
# inclusive on both ends and would also count a drop-off stamped exactly
# 2018-01-01 00:00:00. The table id is back-quoted because the project name
# contains hyphens.
sql = '''SELECT dropoff_location_id, COUNT(*) AS count
FROM `bigquery-public-data.new_york_taxi_trips.tlc_yellow_trips_2017`
WHERE dropoff_datetime >= '2017-01-01' AND dropoff_datetime < '2018-01-01'
GROUP BY dropoff_location_id;'''
dropoff_2017_df = client.query(sql).to_dataframe()

In [None]:
# Choropleth of 2017 drop-off counts per taxi zone, with zone IDs as labels.

# Give the query result the shapefile's key name and an integer key so that it
# can be merged on LocationID. Written as a reassignment instead of in-place
# edits; re-running the cell leaves the frame unchanged.
# NOTE(review): int64 is presumably the dtype of taxi_zones['LocationID'] -- confirm.
dropoff_2017_df = (
    dropoff_2017_df
    .rename(columns={'dropoff_location_id': 'LocationID'})
    .astype({'LocationID': 'int64'})
)

# Attach the counts to the zone polygons (zones without a matching row are
# dropped by the inner merge) and convert to a list of GeoJSON features.
dropoff_2017_zones = taxi_zones.merge(dropoff_2017_df, on='LocationID')
dropoff_2017 = json.loads(dropoff_2017_zones.to_json())['features']

# Zone polygons coloured by drop-off count on a logarithmic colour scale.
# The title uses alt.TitleParams, the class meant for title properties;
# alt.Text is the text encoding channel.
base = alt.Chart(alt.Data(values=dropoff_2017)).mark_geoshape(
        stroke='black',
        strokeWidth=1
    ).encode(
        color=alt.Color('properties.count:Q', scale=alt.Scale(type='log'), legend=alt.Legend(title="Drop-off count"))
    ).properties(
        title=alt.TitleParams(text="NYC taxi drop-off zones popularity", fontSize=22),
        width=800,
        height=800
    )

# Zone ID printed at each zone's centroid (taken from the full zone list).
labels = alt.Chart(alt.Data(values=tz_geo)).mark_text(
        baseline='top'
    ).encode(
        longitude='properties.centroid_lon:Q',
        latitude='properties.centroid_lat:Q',
        text='properties.LocationID:O',
        size=alt.value(8),
        opacity=alt.value(1)
    ).properties(
        width=800,
        height=800
    )

base + labels

In [None]:
# Number of pickups per taxi zone during 2017, fetched with the BigQuery
# `client` created in the earlier authentication cell. The service-account
# credentials are not reloaded here: the reloaded object was never used by
# this cell.
#
# The date filter is a half-open range [2017-01-01, 2018-01-01): BETWEEN is
# inclusive on both ends and would also count a pickup stamped exactly
# 2018-01-01 00:00:00. The table id is back-quoted because the project name
# contains hyphens.
sql = '''SELECT pickup_location_id, COUNT(*) AS count
FROM `bigquery-public-data.new_york_taxi_trips.tlc_yellow_trips_2017`
WHERE pickup_datetime >= '2017-01-01' AND pickup_datetime < '2018-01-01'
GROUP BY pickup_location_id;'''
pickup_2017_df = client.query(sql).to_dataframe()

In [None]:
# Choropleth of 2017 pickup counts per taxi zone, with zone IDs as labels.

# Give the query result the shapefile's key name and an integer key so that it
# can be merged on LocationID. Written as a reassignment instead of in-place
# edits; re-running the cell leaves the frame unchanged.
# NOTE(review): int64 is presumably the dtype of taxi_zones['LocationID'] -- confirm.
pickup_2017_df = (
    pickup_2017_df
    .rename(columns={'pickup_location_id': 'LocationID'})
    .astype({'LocationID': 'int64'})
)

# Attach the counts to the zone polygons (zones without a matching row are
# dropped by the inner merge) and convert to a list of GeoJSON features.
pickup_2017_zones = taxi_zones.merge(pickup_2017_df, on='LocationID')
pickup_2017 = json.loads(pickup_2017_zones.to_json())['features']

# Zone polygons coloured by pickup count on a logarithmic colour scale.
# The title uses alt.TitleParams, the class meant for title properties;
# alt.Text is the text encoding channel.
base = alt.Chart(alt.Data(values=pickup_2017)).mark_geoshape(
        stroke='black',
        strokeWidth=1
    ).encode(
        color=alt.Color('properties.count:Q', scale=alt.Scale(type='log'), legend=alt.Legend(title="Pickup count"))
    ).properties(
        title=alt.TitleParams(text="NYC taxi pickup zones popularity", fontSize=22),
        width=800,
        height=800
    )

# Zone ID printed at each zone's centroid (taken from the full zone list).
labels = alt.Chart(alt.Data(values=tz_geo)).mark_text(
        baseline='top'
    ).encode(
        longitude='properties.centroid_lon:Q',
        latitude='properties.centroid_lat:Q',
        text='properties.LocationID:O',
        size=alt.value(8),
        opacity=alt.value(1)
    ).properties(
        width=800,
        height=800
    )

base + labels

In [None]:
# Average fare per drop-off zone during 2018, fetched with the BigQuery
# `client` created in the earlier authentication cell.
#
# The date filter is a half-open range [2018-01-01, 2019-01-01): the lower
# bound is inclusive so that a drop-off stamped exactly 2018-01-01 00:00:00 is
# not lost. Fares outside (0, 1000) are excluded before averaging. The table id
# is back-quoted because the project name contains hyphens.
sql = '''
SELECT
dropoff_location_id, AVG(fare_amount) AS avg_fare
FROM
`bigquery-public-data.new_york_taxi_trips.tlc_yellow_trips_2018`
WHERE dropoff_datetime >= '2018-01-01' AND dropoff_datetime < '2019-01-01'
AND fare_amount > 0 AND fare_amount < 1000
GROUP BY dropoff_location_id;
'''
avg_fare_2018_df = client.query(sql).to_dataframe()

In [None]:
# Choropleth of the 2018 average fare per drop-off zone, with zone IDs as labels.

# Give the query result the shapefile's key name, an integer key and a float
# fare column. Written as a reassignment instead of in-place edits; re-running
# the cell leaves the frame unchanged.
# NOTE(review): int64 is presumably the dtype of taxi_zones['LocationID'] -- confirm.
avg_fare_2018_df = (
    avg_fare_2018_df
    .rename(columns={'dropoff_location_id': 'LocationID'})
    .astype({'LocationID': 'int64', 'avg_fare': 'float64'})
)

# Attach the fares to the zone polygons (zones without a matching row are
# dropped by the inner merge) and convert to a list of GeoJSON features.
avg_fare_2018_zones = taxi_zones.merge(avg_fare_2018_df, on='LocationID')
avg_fare_2018 = json.loads(avg_fare_2018_zones.to_json())['features']

# Zone polygons coloured by average fare (default colour scale).
# The title uses alt.TitleParams, the class meant for title properties;
# alt.Text is the text encoding channel.
base = alt.Chart(alt.Data(values=avg_fare_2018)).mark_geoshape(
        stroke='black',
        strokeWidth=1
    ).encode(
        color=alt.Color('properties.avg_fare:Q', legend=alt.Legend(title="Avg. fare"))
    ).properties(
        title=alt.TitleParams(text="NYC average fare by taxi zone", fontSize=22),
        width=800,
        height=800
    )

# Zone ID printed at each zone's centroid (taken from the full zone list).
labels = alt.Chart(alt.Data(values=tz_geo)).mark_text(
        baseline='top'
    ).encode(
        longitude='properties.centroid_lon:Q',
        latitude='properties.centroid_lat:Q',
        text='properties.LocationID:O',
        size=alt.value(8),
        opacity=alt.value(1)
    ).properties(
        width=800,
        height=800
    )

base + labels