## Scenario/Stakeholder-Based Analysis of NYC Taxi Ride Data
##### Authors: Panini Mokrala, Dmitrii Danilov

In [None]:
from google.colab import drive
from os.path import join

# Mount point for Google Drive inside the Colab VM and the project folder
# (relative to that mount point) that holds this notebook's files.
ROOT = '/content/drive'
PROJ = 'MyDrive/Milestones/Milestone1'

# Mount Drive, then make the project folder the working directory so the
# relative paths used later in this notebook resolve against it.
drive.mount(ROOT)
PROJECT_PATH = join(ROOT, PROJ)
# IPython magics: `$PROJECT_PATH` is expanded from the Python variable above.
%cd "$PROJECT_PATH"
%pwd

Mounted at /content/drive
/content/drive/MyDrive/Milestones/Milestone1


'/content/drive/MyDrive/Milestones/Milestone1'

In [None]:
# Install the extra packages this notebook needs (shell escapes run pip in the
# notebook's environment).
!pip install geopandas
!pip install altair_data_server
!pip install sodapy

import io
import json
import pandas as pd
import numpy as np
import geopandas as gpd
from shapely.ops import cascaded_union
from google.cloud import bigquery
from google.oauth2 import service_account
from sodapy import Socrata
from ipywidgets import interact, interactive, fixed, interact_manual, Layout
import ipywidgets as widgets
import datetime as dt
import altair as alt
import urllib



# BigQuery client authenticated with a service-account key file; the path is
# relative, so it is resolved against the current working directory.
key_path = 'auth.json'
credentials = service_account.Credentials.from_service_account_file(key_path)
bq_client = bigquery.Client(credentials=credentials, project=credentials.project_id)
# Socrata client for the NYC open-data portal.
# NOTE(review): the app token is hard-coded in source — consider loading it
# from configuration instead. `timeout` is presumably in seconds; confirm
# against the sodapy documentation.
soc_client = Socrata('data.cityofnewyork.us', 'erkBtGgCm1QXwrGaILeRCD1Xw', timeout=500)



In [None]:
# `cascaded_union` is deprecated in Shapely; `unary_union` is its documented
# replacement. Imported here so this cell stands on its own.
from shapely.ops import unary_union

# Load the taxi-zone polygons and reproject them to lon/lat (EPSG:4326) so the
# centroid coordinates can be used as Altair longitude/latitude channels.
taxi_zones = gpd.read_file('https://s3.amazonaws.com/nyc-tlc/misc/taxi_zones.zip')
taxi_zones.to_crs(epsg=4326, inplace=True)
zone_centroids = taxi_zones['geometry'].centroid
taxi_zones['centroid_lon'] = zone_centroids.x
taxi_zones['centroid_lat'] = zone_centroids.y

# One label anchor per borough: the centroid of the union of its zones.
borough_points = taxi_zones.groupby('borough')['geometry'].agg(
    lambda zones: unary_union(list(zones)).centroid
)
# Build an explicit GeoDataFrame so `.to_json()` below always produces GeoJSON
# (a plain pandas DataFrame would not have a 'features' key).
taxi_zones_b = gpd.GeoDataFrame(
    borough_points.rename('geometry').to_frame().reset_index(),
    geometry='geometry',
    crs=taxi_zones.crs,
)
# The geometries are already points, so their coordinates are the centroids.
taxi_zones_b['centroid_lon'] = taxi_zones_b['geometry'].x
taxi_zones_b['centroid_lat'] = taxi_zones_b['geometry'].y

# GeoJSON feature lists consumed by the Altair charts in later cells.
tz_geo = json.loads(taxi_zones.to_json())['features']
tz_geo_b = json.loads(taxi_zones_b.to_json())['features']


  This is separate from the ipykernel package so we can avoid doing imports until

  after removing the cwd from sys.path.


In [None]:
alt.themes.enable('opaque')

# Zone polygons, coloured by borough.
base = alt.Chart(alt.Data(values=tz_geo)).mark_geoshape(
    stroke='black',
    strokeWidth=1
).encode(
    color=alt.Color('properties.borough:N', legend=None)
).properties(
    width=800,
    height=800
)

# Zone LocationID labels placed at each zone centroid.
labels = alt.Chart(alt.Data(values=tz_geo)).mark_text(
    baseline='top',
).encode(
    longitude='properties.centroid_lon:Q',
    latitude='properties.centroid_lat:Q',
    text='properties.LocationID:O',
    size=alt.value(8),
    opacity=alt.value(1)
).properties(
    width=800,
    height=800
)

# Borough names placed at each borough centroid. The chart title is built with
# `alt.TitleParams` (the title-configuration class) rather than the `alt.Text`
# encoding channel, which is meant for text marks, not titles.
boroughs = alt.Chart(alt.Data(values=tz_geo_b)).mark_text(
    color='white',
    stroke='black',
    fontWeight='bold',
    strokeWidth=0.7,
    baseline='top'
).encode(
    longitude='properties.centroid_lon:Q',
    latitude='properties.centroid_lat:Q',
    text='properties.borough:N',
    size=alt.value(26),
    opacity=alt.value(1)
).properties(
    width=800,
    height=800,
    title=alt.TitleParams(text="NYC boroughs and taxi zones", fontSize=22)
)


# Layer polygons, zone ids and borough names; the last expression is displayed.
base + labels + boroughs

Output hidden; open in https://colab.research.google.com to view.

In [None]:
# Count 2017 yellow-taxi drop-offs per taxi zone.
# A half-open range is used instead of BETWEEN: BETWEEN includes its upper
# bound, which would also count a drop-off at exactly 2018-01-01 00:00:00.
sql = '''SELECT dropoff_location_id, count(*) as count
FROM bigquery-public-data.new_york_taxi_trips.tlc_yellow_trips_2017
WHERE dropoff_datetime >= '2017-01-01' AND dropoff_datetime < '2018-01-01'
GROUP BY dropoff_location_id;'''
dropoff_2017_df = bq_client.query(sql).to_dataframe()

In [None]:
# Rename and cast the zone id so the query result can be merged with
# `taxi_zones` on 'LocationID'.
dropoff_2017_df.rename(columns={'dropoff_location_id': 'LocationID'}, inplace=True)
dropoff_2017_df['LocationID'] = dropoff_2017_df['LocationID'].astype('int64')

# Attach the counts to the zone geometries and convert to GeoJSON features.
dropoff_2017 = taxi_zones.merge(dropoff_2017_df, on='LocationID')
dropoff_2017 = json.loads(dropoff_2017.to_json())['features']

# Choropleth of drop-off counts on a log colour scale. The title uses
# `alt.TitleParams` (the title-configuration class) instead of the `alt.Text`
# encoding channel.
base = alt.Chart(alt.Data(values=dropoff_2017)).mark_geoshape(
    stroke='black',
    strokeWidth=1
).encode(
    color=alt.Color(
        'properties.count:Q',
        scale=alt.Scale(type='log'),
        legend=alt.Legend(title="Drop-off count")
    )
).properties(
    title=alt.TitleParams(text="NYC taxi drop-off zones popularity", fontSize=22),
    width=800,
    height=800
)

# Zone LocationID labels placed at each zone centroid.
labels = alt.Chart(alt.Data(values=tz_geo)).mark_text(
    baseline='top',
).encode(
    longitude='properties.centroid_lon:Q',
    latitude='properties.centroid_lat:Q',
    text='properties.LocationID:O',
    size=alt.value(8),
    opacity=alt.value(1)
).properties(
    width=800,
    height=800
)

base + labels

In [None]:
# Count 2017 yellow-taxi pickups per taxi zone.
# A half-open range is used instead of BETWEEN: BETWEEN includes its upper
# bound, which would also count a pickup at exactly 2018-01-01 00:00:00.
sql = '''SELECT pickup_location_id, count(*) as count
FROM bigquery-public-data.new_york_taxi_trips.tlc_yellow_trips_2017
WHERE pickup_datetime >= '2017-01-01' AND pickup_datetime < '2018-01-01'
GROUP BY pickup_location_id;'''
pickup_2017_df = bq_client.query(sql).to_dataframe()

In [None]:
# Rename and cast the zone id so the query result can be merged with
# `taxi_zones` on 'LocationID'.
pickup_2017_df.rename(columns={'pickup_location_id': 'LocationID'}, inplace=True)
pickup_2017_df['LocationID'] = pickup_2017_df['LocationID'].astype('int64')

# Attach the counts to the zone geometries and convert to GeoJSON features.
pickup_2017 = taxi_zones.merge(pickup_2017_df, on='LocationID')
pickup_2017 = json.loads(pickup_2017.to_json())['features']

# Choropleth of pickup counts on a log colour scale. The title uses
# `alt.TitleParams` (the title-configuration class) instead of the `alt.Text`
# encoding channel.
base = alt.Chart(alt.Data(values=pickup_2017)).mark_geoshape(
    stroke='black',
    strokeWidth=1
).encode(
    color=alt.Color(
        'properties.count:Q',
        scale=alt.Scale(type='log'),
        legend=alt.Legend(title="Pickup count")
    )
).properties(
    title=alt.TitleParams(text="NYC taxi pickup zones popularity", fontSize=22),
    width=800,
    height=800
)

# Zone LocationID labels placed at each zone centroid.
labels = alt.Chart(alt.Data(values=tz_geo)).mark_text(
    baseline='top',
).encode(
    longitude='properties.centroid_lon:Q',
    latitude='properties.centroid_lat:Q',
    text='properties.LocationID:O',
    size=alt.value(8),
    opacity=alt.value(1)
).properties(
    width=800,
    height=800
)

base + labels

In [None]:
# Average 2018 fare per drop-off zone, ignoring non-positive fares and fares of
# 1000 or more.
sql = '''
SELECT 
dropoff_location_id, avg(fare_amount) as avg_fare
FROM 
bigquery-public-data.new_york_taxi_trips.tlc_yellow_trips_2018
WHERE dropoff_datetime > '2018-01-01' and dropoff_datetime < '2019-01-01'
AND fare_amount > 0 and fare_amount < 1000
GROUP BY dropoff_location_id;
'''
# The BigQuery client in this notebook is named `bq_client`; the previous
# `client` was never defined and raised NameError.
avg_fare_2018_df = bq_client.query(sql).to_dataframe()

In [None]:
# Rename and cast the columns so the query result can be merged with
# `taxi_zones` on 'LocationID' and serialised to JSON.
avg_fare_2018_df.rename(columns={'dropoff_location_id': 'LocationID'}, inplace=True)
avg_fare_2018_df['LocationID'] = avg_fare_2018_df['LocationID'].astype('int64')
avg_fare_2018_df['avg_fare'] = avg_fare_2018_df['avg_fare'].astype('float64')

# Attach the averages to the zone geometries and convert to GeoJSON features.
avg_fare_2018 = taxi_zones.merge(avg_fare_2018_df, on='LocationID')
avg_fare_2018 = json.loads(avg_fare_2018.to_json())['features']

# Choropleth of the average fare. The title uses `alt.TitleParams` (the
# title-configuration class) instead of the `alt.Text` encoding channel.
base = alt.Chart(alt.Data(values=avg_fare_2018)).mark_geoshape(
    stroke='black',
    strokeWidth=1
).encode(
    color=alt.Color('properties.avg_fare:Q', legend=alt.Legend(title="Avg. fare"))
).properties(
    title=alt.TitleParams(text="NYC average fare by taxi zone", fontSize=22),
    width=800,
    height=800
)

# Zone LocationID labels placed at each zone centroid.
labels = alt.Chart(alt.Data(values=tz_geo)).mark_text(
    baseline='top',
).encode(
    longitude='properties.centroid_lon:Q',
    latitude='properties.centroid_lat:Q',
    text='properties.LocationID:O',
    size=alt.value(8),
    opacity=alt.value(1)
).properties(
    width=800,
    height=800
)

base + labels

In [None]:
# Raw weather observations; the file mixes hourly and daily-summary rows,
# distinguished by REPORT_TYPE.
weather = pd.read_csv('2398761.csv')

# ---- Hourly observations -------------------------------------------------
weather_h = weather[weather['REPORT_TYPE'].isin(['FM-15', 'SY-MT'])]
weather_hdf = weather_h.filter(
    items=['HourlyDryBulbTemperature', 'HourlyPrecipitation', 'HourlyPresentWeatherType']
).rename(columns={'HourlyDryBulbTemperature': 'avg_temp',
                  'HourlyPrecipitation': 'precip_depth',
                  'HourlyPresentWeatherType': 'precip_type'})
weather_hdf['datetime'] = pd.to_datetime(weather_h['DATE'], format="%Y-%m-%dT%H:%M:%S")

# Columns are cleaned with explicit assignment instead of chained
# `df[col].replace(..., inplace=True)`, which may act on a temporary copy and
# is a no-op under pandas copy-on-write.
# 'T' is replaced by 0.0 and any remaining letters are stripped before the
# numeric cast.
weather_hdf['precip_depth'] = (
    weather_hdf['precip_depth']
    .replace(to_replace='T', value=0.0)
    .replace(to_replace='[a-zA-Z]', value='', regex=True)
    .astype('double')
)
weather_hdf['avg_temp'] = (
    weather_hdf['avg_temp']
    .replace(to_replace='[a-zA-Z]', value='', regex=True)
    .astype('double')
)

# Collapse the present-weather code string into 'rain', 'snow' or NaN.
# 'RA' wins over the snow-type codes when both are present.
weather_hdf['precip_type'] = [
    'rain' if 'RA' in code
    else 'snow' if ('SN' in code or 'SG' in code or 'IC' in code or 'PL' in code)
    else np.nan
    for code in weather_hdf['precip_type'].astype('str')
]


# ---- Daily summaries -----------------------------------------------------
# The report-type value carries two trailing spaces in the source file.
weather_d = weather[weather['REPORT_TYPE'] == 'SOD  ']
weather_ddf = weather_d.filter(
    items=['DailyAverageDryBulbTemperature', 'DailyPrecipitation', 'DailySnowDepth', 'DailySnowfall']
).rename(columns={'DailyAverageDryBulbTemperature': 'avg_temp',
                  'DailyPrecipitation': 'precip_depth',
                  'DailySnowDepth': 'snow_depth',
                  'DailySnowfall': 'snow_fall'})
weather_ddf['datetime'] = pd.to_datetime(weather_d['DATE'], format="%Y-%m-%dT%H:%M:%S")

# 'T' is replaced by 0.0 before the numeric cast; `avg_temp` is left as read.
for column in ('precip_depth', 'snow_depth', 'snow_fall'):
    weather_ddf[column] = (
        weather_ddf[column].replace(to_replace='T', value=0.0).astype('double')
    )


def create_and_populate_weather_tables():
  """Create the BigQuery weather tables and load the cleaned weather frames.

  If the daily table already exists the function prints a message and returns
  without touching anything. Otherwise it creates the `weather` dataset (if
  needed), creates the daily and hourly tables, and loads the module-level
  `weather_ddf` and `weather_hdf` DataFrames into them, waiting for both load
  jobs to finish.
  """
  # Imported locally: `NotFound` was referenced but never imported, so the
  # `except` clause below raised NameError instead of handling a missing table.
  from google.cloud.exceptions import NotFound

  daily_table_id = 'mads-milestone-1.weather.daily'
  hourly_table_id = 'mads-milestone-1.weather.hourly'

  # Only the daily table is probed; its presence is taken to mean both exist.
  try:
    bq_client.get_table(daily_table_id)
  except NotFound:
    print('Weather tables not found, creating...')
  else:
    print('Weather tables already exist.')
    return

  # `exists_ok=True` keeps this from failing when the dataset is already there
  # but the tables are not.
  bq_client.create_dataset('mads-milestone-1.weather', exists_ok=True)

  daily_schema = [
      bigquery.SchemaField("datetime", "TIMESTAMP", mode="REQUIRED"),
      bigquery.SchemaField("avg_temp", "FLOAT"),
      bigquery.SchemaField("precip_depth", "FLOAT"),
      bigquery.SchemaField("snow_depth", "FLOAT"),
      bigquery.SchemaField("snow_fall", "FLOAT")
  ]
  hourly_schema = [
      bigquery.SchemaField("datetime", "TIMESTAMP", mode="REQUIRED"),
      bigquery.SchemaField("avg_temp", "FLOAT"),
      bigquery.SchemaField("precip_type", "STRING"),
      bigquery.SchemaField("precip_depth", "FLOAT")
  ]

  daily_table = bq_client.create_table(bigquery.Table(daily_table_id, schema=daily_schema))
  hourly_table = bq_client.create_table(bigquery.Table(hourly_table_id, schema=hourly_schema))

  job_config_d = bigquery.LoadJobConfig(
      schema=daily_schema, source_format=bigquery.SourceFormat.CSV
  )
  job_config_h = bigquery.LoadJobConfig(
      schema=hourly_schema, source_format=bigquery.SourceFormat.CSV
  )

  # `.result()` blocks until each load job completes (and raises on failure).
  bg_daily_job = bq_client.load_table_from_dataframe(weather_ddf, daily_table, job_config=job_config_d)
  bg_daily_job.result()

  bg_hourly_job = bq_client.load_table_from_dataframe(weather_hdf, hourly_table, job_config=job_config_h)
  bg_hourly_job.result()

  interactivity=interactivity, compiler=compiler, result=result)


### Create and populate weather data tables

In [None]:
# Create and load the BigQuery weather tables (returns early if the daily table exists).
create_and_populate_weather_tables()

Weather tables already exist.


## Prepare SQL queries

In [None]:
# When True, run_cached_bq / run_cached_soc read query results from ./cache/
# and only query the backend on a cache miss; when False they always query.
CACHE_SQL_RESULTS = True

def get_location_ids(loc, str_output=False):
  """Return the taxi-zone LocationIDs for a place as a comma-separated string.

  Args:
    loc: One of 'JFK', 'LGA', 'EWR' (matched on the `zone` column of
      `taxi_zones`) or 'Bronx', 'Brooklyn', 'Queens', 'Staten_Island',
      'Manhattan' (matched on the `borough` column).
    str_output: When True, each id is wrapped in single quotes.

  Returns:
    The matching ids joined by commas, e.g. "132,138" or "'132','138'".
    An empty string when no zone matches.

  Raises:
    KeyError: If `loc` is not one of the supported keys.
  """
  # Place key -> (taxi_zones column, value to match). Only the requested
  # selector is evaluated, instead of filtering the frame for every place.
  selectors = {
      'JFK': ('zone', 'JFK Airport'),
      'LGA': ('zone', 'LaGuardia Airport'),
      'EWR': ('zone', 'Newark Airport'),
      'Bronx': ('borough', 'Bronx'),
      'Brooklyn': ('borough', 'Brooklyn'),
      'Queens': ('borough', 'Queens'),
      'Staten_Island': ('borough', 'Staten Island'),
      'Manhattan': ('borough', 'Manhattan'),
  }
  column, value = selectors[loc]
  ids = taxi_zones.loc[taxi_zones[column] == value, 'LocationID'].astype(str).tolist()
  if str_output:
    return ','.join(f"'{location_id}'" for location_id in ids)
  return ','.join(ids)


def build_query_soc(year, source, dest):
  """Build the Socrata query for daily trip statistics between two places.

  The query returns, per drop-off day, the average fare, average distance and
  trip count for trips picked up in `source` and dropped off in `dest`,
  restricted to fares in (5, 100) and distances in (0, 100).

  Args:
    year: '2017' or '2018'; any other value selects the 2019 date range.
    source: Pickup place key understood by `get_location_ids`.
    dest: Drop-off place key understood by `get_location_ids`.

  Returns:
    The query text as a string.
  """
  date_ranges = {
      '2017': ('2017-01-01', '2018-01-01'),
      '2018': ('2018-01-01', '2019-01-01'),
  }
  # Anything that is not 2017 or 2018 falls back to the 2019 range.
  start, end = date_ranges.get(year, ('2019-01-01', '2020-01-01'))

  pickup_ids = get_location_ids(source)
  dropoff_ids = get_location_ids(dest)

  return f'''select date_trunc_ymd(tpep_dropoff_datetime) as day, 
                  avg(fare_amount) as avg_fare,
                  avg(trip_distance) as avg_dist,
                  --stddev_samp(fare_amount) as std_fare,
                  --stddev_samp(trip_distance) as std_dist,
                  count(*) as count
                where tpep_dropoff_datetime > '{start}'
                  and tpep_dropoff_datetime < '{end}'
                  and fare_amount > 5 and fare_amount < 100
                  and trip_distance > 0 and trip_distance < 100
                  and pulocationid in ({pickup_ids})
                  and dolocationid in ({dropoff_ids})
                group by day 
                order by day'''


def build_query_bq(year, source, dest):
  """Build the BigQuery query for daily trip statistics between two places.

  The query returns, per drop-off day, the average fare, average distance,
  trip count and average trip duration in minutes for trips picked up in
  `source` and dropped off in `dest`. The table name is derived from `year`.

  Args:
    year: '2017' or '2018'; any other value selects the 2019 date range.
    source: Pickup place key understood by `get_location_ids`.
    dest: Drop-off place key understood by `get_location_ids`.

  Returns:
    The query text as a string.
  """
  date_ranges = {
      '2017': ('2017-01-01', '2018-01-01'),
      '2018': ('2018-01-01', '2019-01-01'),
  }
  # Anything that is not 2017 or 2018 falls back to the 2019 range.
  start, end = date_ranges.get(year, ('2019-01-01', '2020-01-01'))

  # Location ids are emitted as quoted strings for this backend.
  pickup_ids = get_location_ids(source, str_output=True)
  dropoff_ids = get_location_ids(dest, str_output=True)

  return f'''SELECT
                  datetime_trunc(dropoff_datetime, day) as day,
                  AVG(fare_amount) as avg_fare,
                  AVG(trip_distance) as avg_dist,
                  --stddev(fare_amount) as std_fare,
                  --stddev(trip_distance) as std_dist,
                  COUNT(*) as count,
                  AVG(datetime_diff(dropoff_datetime, pickup_datetime, minute)) as avg_duration,
                  --stddev(datetime_diff(dropoff_datetime, pickup_datetime, minute)) as std_duration
                FROM
                  bigquery-public-data.new_york_taxi_trips.tlc_yellow_trips_{year}
                WHERE
                  dropoff_datetime > '{start}'
                  AND dropoff_datetime < '{end}'
                  AND pickup_location_id in ({pickup_ids})
                  AND dropoff_location_id in ({dropoff_ids})
                GROUP BY day
                ORDER BY day'''


# Named Socrata queries; starts empty and is filled by populate_soc_sql_dict().
sql_dict_soc = {
    
}

# Named BigQuery queries; populate_bq_sql_dict() adds the per-route queries.
# Both entries below count 2018 drop-offs per time bucket and average the
# temperature of the joined weather rows.
# NOTE(review): count(*) is taken after the LEFT JOIN, so a trip is counted
# once per matching weather row — confirm the weather tables hold at most one
# row per hour/day, otherwise `taxi_avail` is inflated.
sql_dict_bq = {
    # Hourly buckets, joined to the hourly weather table.
    'dropoff_count_avg_temp_by_hour': '''SELECT datetime_trunc(t.dropoff_datetime, HOUR) as dropoff_hour, count(*) as taxi_avail, avg(w.avg_temp) as avg_temp
                                    FROM bigquery-public-data.new_york_taxi_trips.tlc_yellow_trips_2018 t
                                    LEFT JOIN mads-milestone-1.weather.hourly w
                                    ON CAST(TIMESTAMP_TRUNC(w.datetime, HOUR) as DATETIME) = DATETIME_TRUNC(t.dropoff_datetime, HOUR)
                                    WHERE t.dropoff_datetime > '2018-01-01' and t.dropoff_datetime < '2019-01-01'
                                    GROUP BY dropoff_hour;''',
            
  # Daily buckets, joined to the daily weather table.
  'dropoff_count_avg_temp_by_day': '''SELECT datetime_trunc(t.dropoff_datetime, DAY) as dropoff_day, count(*) as taxi_avail, avg(w.avg_temp) as avg_temp
                                    FROM bigquery-public-data.new_york_taxi_trips.tlc_yellow_trips_2018 t
                                    LEFT JOIN mads-milestone-1.weather.daily w
                                    ON CAST(TIMESTAMP_TRUNC(w.datetime, DAY) as DATETIME) = DATETIME_TRUNC(t.dropoff_datetime, DAY)
                                    WHERE t.dropoff_datetime > '2018-01-01' and t.dropoff_datetime < '2019-01-01'
                                    GROUP BY dropoff_day;'''
}

def populate_soc_sql_dict():
  """Register one Socrata query per (year, origin, destination) combination.

  Entries are stored in `sql_dict_soc` under keys of the form
  'fare_dist_dura_avail_soc_<year>_<origin>_<destination>', with spaces in
  place names replaced by underscores. Existing entries are overwritten, not
  cleared.
  """
  place_names = ['JFK', 'LGA', 'EWR', 'Bronx', 'Brooklyn', 'Manhattan', 'Staten Island', 'Queens']
  # Keys and query builders both use the underscore form of the place name.
  place_keys = [name.replace(' ', '_') for name in place_names]

  for year in ['2017', '2018', '2019']:
    prefix = f'fare_dist_dura_avail_soc_{year}'
    for source_key in place_keys:
      for dest_key in place_keys:
        key = '_'.join([prefix, source_key, dest_key])
        sql_dict_soc[key] = build_query_soc(year, source_key, dest_key)


def populate_bq_sql_dict():
  """Register one BigQuery query per (year, origin, destination) combination.

  Entries are stored in `sql_dict_bq` under keys of the form
  'fare_dist_dura_avail_bq_<year>_<origin>_<destination>', with spaces in
  place names replaced by underscores. Only 2017 and 2018 are registered.
  Existing entries are overwritten, not cleared.
  """
  place_names = ['JFK', 'LGA', 'EWR', 'Bronx', 'Brooklyn', 'Manhattan', 'Staten Island', 'Queens']
  # Keys and query builders both use the underscore form of the place name.
  place_keys = [name.replace(' ', '_') for name in place_names]

  for year in ['2017', '2018']:
    prefix = f'fare_dist_dura_avail_bq_{year}'
    for source_key in place_keys:
      for dest_key in place_keys:
        key = '_'.join([prefix, source_key, dest_key])
        sql_dict_bq[key] = build_query_bq(year, source_key, dest_key)

# Fill both query dictionaries with the per-route queries at import/run time.
populate_soc_sql_dict()
populate_bq_sql_dict()

def run_cached_bq(sql_name):
  """Run a named BigQuery query, using the on-disk pickle cache when enabled.

  Args:
    sql_name: Key into `sql_dict_bq`; also used as the cache file name
      ('./cache/<sql_name>.gz').

  Returns:
    The query result as a pandas DataFrame, read from the cache when
    `CACHE_SQL_RESULTS` is true and the cache file exists, otherwise freshly
    queried (and then written to the cache when caching is enabled).

  Raises:
    KeyError: If the query has to be run and `sql_name` is not registered.
  """
  import os

  def query_database():
    sql_query = sql_dict_bq[sql_name]
    return bq_client.query(sql_query).to_dataframe()

  if not CACHE_SQL_RESULTS:
    return query_database()

  cache_path = ''.join(['./cache/', sql_name, '.gz'])
  try:
    # The cache only holds pickles written by this notebook; do not point it
    # at files from untrusted sources.
    return pd.read_pickle(cache_path)
  except FileNotFoundError:
    # Cache miss: fall through and query the database.
    pass

  df = query_database()
  # Create the cache directory on first use so the (already paid for) query
  # result is not lost to a missing folder.
  os.makedirs(os.path.dirname(cache_path), exist_ok=True)
  df.to_pickle(cache_path)
  return df

def run_cached_soc(sql_name, year):
  """Run a named Socrata query, using the on-disk pickle cache when enabled.

  Args:
    sql_name: Key into `sql_dict_soc`; also used as the cache file name
      ('./cache/<sql_name>.gz').
    year: '2017', '2018' or '2019'; selects the Socrata dataset to query.

  Returns:
    The query result as a pandas DataFrame, read from the cache when
    `CACHE_SQL_RESULTS` is true and the cache file exists, otherwise freshly
    queried (and then written to the cache when caching is enabled).

  Raises:
    KeyError: If the query has to be run and `sql_name` or `year` is unknown.
  """
  import os

  # Socrata dataset identifier per year.
  soc_client_dict = {'2017': 'biws-g3hs', '2018': 't29m-gskq', '2019': '2upf-qytp'}

  def query_database():
    sql_query = sql_dict_soc[sql_name]
    results = soc_client.get(soc_client_dict[year], query=sql_query)
    return pd.DataFrame.from_records(results)

  if not CACHE_SQL_RESULTS:
    return query_database()

  cache_path = ''.join(['./cache/', sql_name, '.gz'])
  try:
    # The cache only holds pickles written by this notebook; do not point it
    # at files from untrusted sources.
    return pd.read_pickle(cache_path)
  except FileNotFoundError:
    # Cache miss: fall through and query the database.
    pass

  df = query_database()
  # Create the cache directory on first use so the query result is not lost
  # to a missing folder.
  os.makedirs(os.path.dirname(cache_path), exist_ok=True)
  df.to_pickle(cache_path)
  return df


In [None]:
alt.data_transformers.enable('data_server')

# Hourly 2018 drop-off counts with the average temperature for each hour.
dropoff_cnt_by_hour = run_cached_bq('dropoff_count_avg_temp_by_hour')
# Counts for March are halved.
# NOTE(review): presumably compensating for duplicated March data — confirm.
dropoff_cnt_by_hour['taxi_avail'] = dropoff_cnt_by_hour.apply(
    lambda row: row['taxi_avail'] / 2 if row['dropoff_hour'].month == 3 else row['taxi_avail'],
    axis=1,
)

# Scatter of hourly counts over time, coloured by whether the hour's average
# temperature was below 32.
by_hour = (
    alt.Chart(dropoff_cnt_by_hour)
    .mark_point(opacity=0.5, size=14, filled=True)
    .transform_calculate(freezing='datum.avg_temp < 32')
    .encode(
        x=alt.X('dropoff_hour:T'),
        y=alt.Y('taxi_avail:Q', title='Available taxi count'),
        color=alt.Color('freezing:N'),
    )
    .properties(width=1000, height=300)
)

by_hour

In [None]:
# Daily 2018 drop-off counts with the average temperature for each day.
dropoff_cnt_by_day = run_cached_bq('dropoff_count_avg_temp_by_day')
# Counts for March are halved.
# NOTE(review): presumably compensating for duplicated March data — confirm.
dropoff_cnt_by_day['taxi_avail'] = dropoff_cnt_by_day.apply(
    lambda row: row['taxi_avail'] / 2 if row['dropoff_day'].month == 3 else row['taxi_avail'],
    axis=1,
)

# Scatter of daily counts over time, coloured by whether the day's average
# temperature was below 32; the y axis is not forced to start at zero.
by_day = (
    alt.Chart(dropoff_cnt_by_day)
    .mark_point(opacity=0.5, size=20, filled=True)
    .transform_calculate(freezing='datum.avg_temp < 32')
    .encode(
        x=alt.X('dropoff_day:T'),
        y=alt.Y('taxi_avail:Q', title='Available taxi count', scale=alt.Scale(zero=False)),
        color=alt.Color('freezing:N'),
    )
    .properties(width=1000, height=300)
)

by_day

In [None]:
def draw_chart(df, var):
  """Plot one trip metric against daily average temperature.

  Args:
    df: Data with a 'day' column, an 'avg_temp' column and the column named
      by `var`.
    var: Metric to plot: 'avg_fare', 'avg_dist', 'avg_duration' or 'count'.

  Returns:
    A layered Altair chart: points for the metric and a line for the
    temperature, each with its own y scale.

  Raises:
    KeyError: If `var` is not one of the supported metrics.
  """
  palette = {'avg_fare': '#57A44C', 'avg_dist': '#26d1b2', 'avg_duration': '#d1c526', 'count': '#a44c71'}
  y_titles = {'avg_fare': 'Avg. Fare ($)', 'avg_dist': 'Avg. Distance (mi)', 'avg_duration': 'Avg. Duration (mins)', 'count': 'Trip Count'}
  temperature_color = '#5276A7'

  # Shared x encoding for both layers.
  timeline = alt.Chart(df).encode(
    alt.X('day:T', axis=alt.Axis(title=None))
  )

  metric_color = f'{palette[var]}'
  metric_axis = alt.Axis(title=f'{y_titles[var]}', titleColor=metric_color)
  metric_points = timeline.mark_point(color=metric_color, opacity=0.6, filled=True).encode(
      alt.Y(f'{var}:Q', axis=metric_axis)
  )

  temperature_axis = alt.Axis(title='Avg. Temperature (°F)', titleColor=temperature_color)
  temperature_line = timeline.mark_line(stroke=temperature_color, opacity=0.5, interpolate='monotone').encode(
      alt.Y('avg_temp:Q', axis=temperature_axis)
  )

  # Independent y scales give each layer its own axis.
  layered = alt.layer(metric_points, temperature_line).resolve_scale(y='independent')
  return layered.properties(width=1000, height=300)

# Input widgets for the interactive route explorer.
place_options = ['JFK Airport', 'LaGuardia Airport', 'Newark Airport', 'Bronx', 'Brooklyn', 'Manhattan', 'Staten Island', 'Queens']
year = widgets.Dropdown(options=['2017', '2018', '2019'], description='Year: ')
source = widgets.Dropdown(options=list(place_options), description='Origin: ')
dest = widgets.Dropdown(options=list(place_options), description='Destination: ')

# One output area per tab; charts are rendered into these by fill_tab().
tab_0 = widgets.Output()
tab_1 = widgets.Output()
tab_2 = widgets.Output()
tab_3 = widgets.Output()
tab_contents = ['Fare', 'Trip distance', 'Trip duration', 'Trip count']
item_layout = widgets.Layout(margin='20px 0 0 0')
tab = widgets.Tab([tab_0, tab_1, tab_2, tab_3], layout=item_layout)
for i, tab_title in enumerate(tab_contents):
    tab.set_title(i, tab_title)

@widgets.interact_manual(
  year=year,
  source=source,
  dest=dest
)
def fill_tab(year, source, dest):
  """Render the four route charts for the selected year, origin and destination.

  Fetches the daily fare/distance/count statistics from Socrata and, for years
  other than 2019, the daily average duration from BigQuery; joins them onto
  the daily weather rows of that year; and draws one chart per tab. For 2019
  the duration tab shows a notice instead of a chart.

  Args:
    year: '2017', '2018' or '2019'.
    source: Origin as shown in the dropdown (airport full name or borough).
    dest: Destination as shown in the dropdown.
  """
  alt.data_transformers.enable('default')

  # Start from empty tabs on every run.
  for child_tab in tab.children:
    child_tab.clear_output()

  # Dropdown labels use full airport names; the query keys use airport codes.
  airport_codes = {'JFK Airport': 'JFK', 'LaGuardia Airport': 'LGA', 'Newark Airport': 'EWR'}
  source = airport_codes.get(source, source)
  dest = airport_codes.get(dest, dest)

  source_key = source.replace(' ', '_')
  dest_key = dest.replace(' ', '_')
  sql_name_soc = '_'.join([f'fare_dist_dura_avail_soc_{year}', source_key, dest_key])
  sql_name_bq = '_'.join([f'fare_dist_dura_avail_bq_{year}', source_key, dest_key])

  # Duration comes from BigQuery, which is not queried for 2019.
  has_duration = year != '2019'

  soc_df = run_cached_soc(sql_name_soc, year)

  if has_duration:
    bq_df = run_cached_bq(sql_name_bq)
    bq_df['day'] = pd.to_datetime(bq_df['day'])

  # A result with no rows and no columns still needs the expected columns for
  # the conversions and merge below.
  if soc_df.shape == (0,0):
    soc_df = soc_df.reindex(columns = ['day', 'avg_fare', 'avg_dist', 'count'])

  soc_df['day'] = pd.to_datetime(soc_df['day'])
  for metric in ('avg_fare', 'avg_dist', 'count'):
    soc_df[metric] = soc_df[metric].astype('double')

  # NOTE(review): this adds a 'day' column to the module-level weather_ddf on
  # every call — confirm nothing downstream depends on its original columns.
  weather_ddf['day'] = pd.to_datetime(weather_ddf['datetime'].dt.date)

  year_weather = weather_ddf[weather_ddf['day'].dt.year == int(year)]
  combined_df = year_weather.merge(
      soc_df[['day', 'avg_fare', 'avg_dist', 'count']], how='left', on='day'
  )

  if has_duration:
    combined_df = combined_df.merge(bq_df[['day', 'avg_duration']], how='left', on='day')

  # Counts for March 2018 are halved.
  # NOTE(review): presumably compensating for duplicated March data — confirm.
  if year == '2018':
    combined_df['count'] = combined_df.apply(
        lambda row: row['count'] / 2 if row['day'].month == 3 else row['count'],
        axis=1,
    )

  with tab_0:
    display(draw_chart(combined_df, 'avg_fare'))

  with tab_1:
    display(draw_chart(combined_df, 'avg_dist'))

  with tab_2:
    if has_duration:
      tab_2_content = draw_chart(combined_df, 'avg_duration')
    else:
      tab_2_content = widgets.HTML('<span style="color: red">Trip duration information not available for 2019</span>')
    display(tab_2_content)

  with tab_3:
    display(draw_chart(combined_df, 'count'))

# Show the tab container; fill_tab() renders into its Output children.
display(tab)


interactive(children=(Dropdown(description='Year: ', options=('2017', '2018', '2019'), value='2017'), Dropdown…

Tab(children=(Output(), Output(), Output(), Output()), layout=Layout(margin='20px 0 0 0'), _titles={'0': 'Fare…