## Build service measures

This notebook constructs three station-level measures of service from the raw data:
- frequency the station had no bikes or no docks during morning or evening hours
- median duration of instances with no bikes or no docks
- portion of docks holding broken or out-of-service bikes

Before running this notebook, you will need to record data and construct `dataset.parquet` and `stations_geo.geojson` with [`Build dataset`](../Build%20dataset.ipynb)

In [5]:
import numpy as np
import pandas as pd
import geopandas as gpd
from tqdm import tqdm

In [6]:
# Load the recorded station samples produced by the "Build dataset" notebook.
dataset = pd.read_parquet(path='../dataset.parquet')

In [7]:
# Station geometries, keyed by station_id so that the per-station measures
# computed below can be aligned/joined on the index.
stations_geo = gpd.read_file('../stations_geo.geojson')
stations_geo = stations_geo.set_index('station_id')

#### Morning or evening frequency unavailable

subset to times of day/day of week.

In [9]:
# Morning window (07:00-10:59).
# Stations are pivoted into columns so the row index is the sample time only
# (between_time selects on the time of day of the index); stack() then moves
# station_id back into the row index.
am_rush = (
    dataset
    .unstack('station_id')
    .between_time(start_time='07:00', end_time='10:59')
    .stack()
)

In [11]:
# Evening window (16:00-21:59), selected the same way as the morning window:
# pivot stations into columns, filter on time of day, pivot back.
evening = (
    dataset
    .unstack('station_id')
    .between_time(start_time='16:00', end_time='21:59')
    .stack()
)

In [None]:
# Number of distinct sample timestamps in the morning window
# (used as part of the denominator of the frequency measure).
am_n_samples = am_rush.index.get_level_values('last_updated').nunique()

am_n_samples

In [None]:
# Number of distinct sample timestamps in the evening window
# (used as part of the denominator of the frequency measure).
evening_n_samples = evening.index.get_level_values('last_updated').nunique()

evening_n_samples

In [15]:
def _n_empty_or_full_samples(samples):
    """Count, per station, the samples with zero bikes or zero docks available.

    The result is aligned to ``stations_geo.index``; stations without any such
    sample get a count of 0.
    """
    empty_or_full = (
        (samples['num_bikes_available'] == 0)
        | (samples['num_docks_available'] == 0)
    )
    return (
        samples[empty_or_full]
        .index.get_level_values('station_id')
        .value_counts()
        .reindex(stations_geo.index)
        .fillna(0)
    )


# Share of morning + evening sample times at which the station had no bikes
# or no docks: (am count + evening count) / (am samples + evening samples).
stations_geo['freq_am_or_evening_no_bikes_or_no_docks'] = (
    _n_empty_or_full_samples(am_rush)
    .add(_n_empty_or_full_samples(evening))
    .div(am_n_samples + evening_n_samples)
)

#### Docks with broken bikes

We found a few instances (across many stations and many times) in which the number of bikes exceeds the station's capacity. That should not be possible, so those instances are dropped from any computation involving capacity.


In [16]:

# Docks occupied in each sample: usable bikes plus disabled bikes.
bikes_at_station = dataset['num_bikes_available'] + dataset['num_bikes_disabled']

# Disabled bikes as a fraction of capacity (a 0-1 ratio, not multiplied by 100).
disabled_share = dataset['num_bikes_disabled'] / dataset['capacity']

dataset['pct_of_docks_w_disabled_bikes'] = (
    disabled_share
    # blank out samples that report more bikes than the station can hold
    .where(bikes_at_station <= dataset['capacity'])
    # a zero capacity produces +/-inf; treat those as missing too
    .replace([-np.inf, np.inf], np.nan)
)

In [17]:
# Per-station median and mean of the share of docks holding disabled bikes.
disabled_share_by_station = (
    dataset
    .groupby('station_id')
    ['pct_of_docks_w_disabled_bikes']
    .agg(['median','mean'])
    .add_prefix('pct_of_docks_w_disabled_bikes_')
)

stations_geo = (
    stations_geo
    # Drop columns left by an earlier run of this cell: join() raises on
    # overlapping column names, so without this the cell cannot be re-run.
    .drop(columns=disabled_share_by_station.columns, errors='ignore')
    .join(disabled_share_by_station, how='left')
)

#### Duration zero docks or zero bikes in 6:00am-midnight hours

In [20]:
# Put the timestamp level first. Levels are addressed by name, so running
# this cell again leaves the order unchanged.
dataset = dataset.reorder_levels(order=['last_updated', 'station_id'])

In [22]:
# Daytime window (06:00-23:59), selected by pivoting stations into columns,
# filtering on time of day, and pivoting back. The result is indexed by
# (station_id, last_updated) and sorted, so each station's samples can be
# pulled out in time order.
daytime_data = (
    dataset
    .unstack('station_id')
    .between_time(start_time='06:00', end_time='23:59')
    .stack()
    .reorder_levels(order=['station_id', 'last_updated'])
    .sort_index()
)

- loop through each sample at each station. 
- start a duration counter when there are zero bikes or zero docks and the counter is not already running, i.e. there were not zero in the previous instance (separate counters for bikes and docks)
- stop the counter if there is a gap in the samples > 40 minutes. 
- or, stop the counter when there is >0 docks or bikes

In [None]:
# Consecutive samples further apart than this are treated as a break in the
# recording; an open zero-interval is closed rather than carried across it.
max_sample_gap = pd.Timedelta('40min')

# Each entry: (station_id, start, end, duration)
zero_daytime_dock_durations = []
zero_daytime_bike_durations = []
# Each entry: (type, station_id, start, end, duration, first sample after the gap)
ended_due_to_data_gap = []

# One independent timer per resource:
# (label, count column, flag column that must equal 1 to start timing, output list)
timed_resources = (
    ('dock', 'num_docks_available', 'is_returning', zero_daytime_dock_durations),
    ('bike', 'num_bikes_available', 'is_renting', zero_daytime_bike_durations),
)

stations_with_samples = set(
    daytime_data.index.get_level_values('station_id').unique()
)

for station_id in tqdm(stations_geo.index):

    # A station without any daytime sample has no interval to record
    # (and daytime_data.loc[...] would raise KeyError for it).
    if station_id not in stations_with_samples:
        continue

    station_subset = (
        daytime_data.loc[station_id]
        [[
            'num_bikes_available',
            'num_docks_available',
            'is_renting',
            'is_returning'
        ]]
        .reset_index()
        .assign(previous_time=lambda frame: frame['last_updated'].shift(1))
    )

    # Start time of the currently open zero-interval per resource
    # (None means no interval is open).
    open_since = {'dock': None, 'bike': None}

    # The first sample is skipped: its previous_time is missing.
    for _, row in station_subset.iloc[1:].iterrows():

        sample_time = row['last_updated']
        after_data_gap = (sample_time - row['previous_time']) > max_sample_gap

        # Each timer is evaluated on its own, so starting one never skips the
        # gap/end check of the other on the same row.
        for label, count_col, flag_col, durations in timed_resources:

            started = open_since[label]

            if started is not None:
                if after_data_gap:
                    # Recording broke off: close the interval at the last
                    # sample seen before the gap and log it separately.
                    ended = row['previous_time']
                    durations.append(
                        (station_id, started, ended, ended - started)
                    )
                    ended_due_to_data_gap.append((
                        label,
                        station_id,
                        started,
                        ended,
                        ended - started,
                        sample_time
                    ))
                    started = None
                elif row[count_col] != 0:
                    # The resource is available again: close at this sample.
                    ended = sample_time
                    durations.append(
                        (station_id, started, ended, ended - started)
                    )
                    started = None

            # Open a new interval when nothing is being timed and this sample
            # shows zero availability. This also covers the first sample
            # after a data gap, which may itself still be at zero.
            if (
                started is None
                and row[count_col] == 0
                and row[flag_col] == 1
            ):
                started = sample_time

            open_since[label] = started

In [25]:
# Turn the collected (station_id, start, end, duration) tuples into frames,
# one for dock intervals and one for bike intervals.
dock_interval_columns = [
    'station_id', 'zero_dock_start', 'zero_dock_end', 'zero_dock_duration'
]
bike_interval_columns = [
    'station_id', 'zero_bike_start', 'zero_bike_end', 'zero_bike_duration'
]

zero_daytime_dock_durations_data = pd.DataFrame.from_records(
    zero_daytime_dock_durations, columns=dock_interval_columns
)
zero_daytime_bike_durations_data = pd.DataFrame.from_records(
    zero_daytime_bike_durations, columns=bike_interval_columns
)


In [26]:
# Pool dock and bike intervals into one long table with a common
# 'zero_duration' column and a 'duration_type' tag ('dock' first, then 'bike').
zero_daytime_durations = pd.concat([
    (
        intervals
        [['station_id', duration_column]]
        .rename(columns={duration_column: 'zero_duration'})
        .assign(duration_type=type_label)
    )
    for type_label, intervals, duration_column in (
        ('dock', zero_daytime_dock_durations_data, 'zero_dock_duration'),
        ('bike', zero_daytime_bike_durations_data, 'zero_bike_duration'),
    )
])

In [27]:
# Per-station max / mean / median length of a zero-availability interval
# (dock and bike intervals pooled), expressed in hours. Stations with no
# recorded interval get 0.
# Dock-only or bike-only summaries can be built the same way from
# zero_daytime_dock_durations_data / zero_daytime_bike_durations_data.
zero_duration_summary = (
    zero_daytime_durations
    .groupby('station_id')
    ['zero_duration']
    .agg(['max','mean','median'])
    .add_prefix('zero_daytime_duration_')
    .reindex(index=stations_geo.index)
    .fillna(pd.Timedelta(0))
    .div(pd.Timedelta('1hour'))
)

stations_geo = (
    stations_geo
    # Drop columns left by an earlier run of this cell: join() raises on
    # overlapping column names, so without this the cell cannot be re-run.
    .drop(columns=zero_duration_summary.columns, errors='ignore')
    .join(zero_duration_summary)
)

save out

In [28]:
# Name the driver explicitly: older geopandas releases default to
# 'ESRI Shapefile' regardless of the file extension.
stations_geo.to_file('../stations_service_measures.geojson', driver='GeoJSON')

In [None]:
# Preview the first rows rather than rendering the whole frame.
stations_geo.head()