Author: **Mathis Konarski** <br>
Date: **17/06/2022**

This notebook makes predictions based on the historical average for NYC bike and NYC taxi data.

In [None]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt

import Training_functions as t_func

In [None]:
# Number of time periods used as training history; later periods are used for scoring.
# NOTE(review): 70*24*2 presumably means 70 days x 24 hours x 2 periods per hour - TODO confirm
TRAIN_PERIOD = 70*24*2 # TIME_PERIOD
# Volume threshold for the metrics: HA_scoring only scores cells whose observed
# volume is strictly greater than this value.
MIN_VOL_METRICS = 10

In [None]:
def HA_by_zone_and_period(data_fn_df, train_period):
    '''
    Compute the historical average demand for each zone, weekday and hour

    Trips are first counted per (period, zone, weekday, hour) over the training
    window, then the counts are averaged over the periods for each
    (zone, weekday, hour). Periods in which a group has no trip at all do not
    appear in the counts, so the average is taken over the periods with at
    least one trip.

    Parameters
    ----------
    data_fn_df : pandas.DataFrame with one row per trip and the columns
        starttime_period, stoptime_period, start_lat_zone, start_lon_zone,
        end_lat_zone, end_lon_zone, weekday and hour
    train_period : int with length of the train period, trips with a period
        lower or equal to this value are used

    Returns
    -------
    start_count_df, stop_count_df : pandas.DataFrame with the columns
        lat, lon, weekday, hour and n_trips (average number of trips)
    '''
    train_start_df = data_fn_df[data_fn_df.starttime_period <= train_period]
    start_count_df = _HA_mean_trips(train_start_df, 'starttime_period',
                                    'start_lat_zone', 'start_lon_zone')

    # NOTE(review): arrivals are grouped with the same 'weekday' and 'hour' columns
    # as departures; presumably these describe the trip start - confirm this is intended
    train_stop_df = data_fn_df[data_fn_df.stoptime_period <= train_period]
    stop_count_df = _HA_mean_trips(train_stop_df, 'stoptime_period',
                                   'end_lat_zone', 'end_lon_zone')
    return start_count_df, stop_count_df


def _HA_mean_trips(trips_df, period_col, lat_col, lon_col):
    '''
    Average the number of trips per period for each (lat, lon, weekday, hour)

    Parameters
    ----------
    trips_df : pandas.DataFrame with one row per trip
    period_col : str with the name of the period column
    lat_col : str with the name of the latitude zone column
    lon_col : str with the name of the longitude zone column
    '''
    # reset_index(name=...) names the count column explicitly: without it the column
    # is called 0 before pandas 2.0 and 'count' from pandas 2.0
    count_df = (trips_df
                .value_counts([period_col, lat_col, lon_col, 'weekday', 'hour'])
                .reset_index(name='n_trips')
                .rename(columns={period_col: 'period', lat_col: 'lat', lon_col: 'lon'}))
    return count_df.groupby(['lat', 'lon', 'weekday', 'hour'])['n_trips'].mean().reset_index()

In [None]:
def HA_formatting(count_df, train_period, val_shape, starting_hour, starting_weekday):
    '''
    Changes the format of the data to make it match the prediction format

    Every time step of the output receives the historical averages of the
    (weekday, hour) it falls in. Zone cells without any historical value for
    that (weekday, hour) are predicted as 0.

    Parameters
    ----------
    count_df : pandas.DataFrame with the columns lat, lon, weekday, hour and n_trips,
        lat and lon being 1-based integer zone indexes
    train_period : int with length of the train period (not used, kept for the callers)
    val_shape : tuple with target format, expected as (time, lat, lon)
    starting_hour : int with the hour used for the first time step of the output
    starting_weekday : int with the weekday (0 to 6) used for the first time step of the output

    Returns
    -------
    pred_arr : numpy.ndarray of shape val_shape
    '''
    # zeros and not empty: cells that are never written must not hold arbitrary memory
    pred_arr = np.zeros(val_shape)
    grid_by_slot = {} # one zone grid per (weekday, hour), built only once
    hour = starting_hour
    weekday = starting_weekday
    for i in tqdm(range(pred_arr.shape[0])):
        slot = (weekday, hour)
        if slot not in grid_by_slot:
            grid_by_slot[slot] = _HA_zone_grid(count_df, weekday, hour, pred_arr.shape[1:])
        pred_arr[i] = grid_by_slot[slot]
        # Day and hour evolution, two periods per hour are hard-coded here.
        # NOTE(review): the hour moves forward after steps 0, 2, 4... so the first step is
        # treated as the last period of starting_hour - confirm against the period indexing
        if i%2 == 0:
            hour += 1
            if hour == 24:
                hour = 0
                # the weekday only changes when the hour wraps around midnight
                weekday += 1
                if weekday == 7:
                    weekday = 0
    return pred_arr


def _HA_zone_grid(count_df, weekday, hour, grid_shape):
    '''
    Build the (lat, lon) grid of average trips for one weekday and one hour

    Parameters
    ----------
    count_df : pandas.DataFrame with the columns lat, lon, weekday, hour and n_trips
    weekday : int with the weekday to select
    hour : int with the hour to select
    grid_shape : tuple with the shape of the grid, expected as (lat, lon)
    '''
    grid_arr = np.zeros(grid_shape)
    value_df = count_df[(count_df.hour == hour) & (count_df.weekday == weekday)]
    if value_df.empty:
        return grid_arr
    # if a zone appears several times, the first row is the one used
    value_df = value_df.drop_duplicates(subset=['lat', 'lon'])
    # zones are 1-based in the data and 0-based in the array
    grid_arr[value_df.lat.to_numpy() - 1, value_df.lon.to_numpy() - 1] = value_df.n_trips.to_numpy()
    return grid_arr

In [None]:
def HA_scoring(data_fn_df, train_period, min_vol_metrics):
    '''
    Score the historical average baseline on one dataset and print RMSE and MAPE

    The historical averages are computed on the periods up to train_period and
    compared with the observed volumes of the remaining periods. Only the cells
    whose observed volume is strictly greater than min_vol_metrics are scored.
    Nothing is returned, the metrics are printed.

    Parameters
    ----------
    data_fn_df : pandas.DataFrame with the data to be tested
    train_period : int with length of the train period
    min_vol_metrics : int with minimal number of trips demand to be considered for scoring
    '''
    # Observed volumes after the train period, last axis: 0 for start and 1 for stop
    volume_arr = t_func.volume_data(data_fn_df)
    val_start_arr = volume_arr[train_period:, :, :, 0]
    val_stop_arr = volume_arr[train_period:, :, :, 1]

    # Hour and weekday of the first trip starting in period train_period+1
    # NOTE(review): the arrays are sliced from index train_period while the lookup
    # uses period train_period+1 - confirm both refer to the same period
    first_trip = data_fn_df[data_fn_df.starttime_period == train_period+1].iloc[0]
    starting_hour = first_trip.hour
    starting_weekday = first_trip.weekday

    # Historical averages put in the same format as the observed volumes
    count_start_df, count_stop_df = HA_by_zone_and_period(data_fn_df, train_period)
    pred_start_arr = HA_formatting(count_start_df, train_period, val_start_arr.shape,
                                   starting_hour, starting_weekday)
    pred_stop_arr = HA_formatting(count_stop_df, train_period, val_stop_arr.shape,
                                  starting_hour, starting_weekday)

    # Errors restricted to the cells with enough observed demand
    start_mask = val_start_arr > min_vol_metrics
    stop_mask = val_stop_arr > min_vol_metrics
    start_truth_arr = val_start_arr[start_mask]
    stop_truth_arr = val_stop_arr[stop_mask]
    start_err_arr = pred_start_arr[start_mask] - start_truth_arr
    stop_err_arr = pred_stop_arr[stop_mask] - stop_truth_arr

    start_rmse = np.sqrt((start_err_arr**2).mean())
    start_mape = (abs(start_err_arr)/start_truth_arr).mean()*100
    stop_rmse = np.sqrt((stop_err_arr**2).mean())
    stop_mape = (abs(stop_err_arr)/stop_truth_arr).mean()*100
    print("Start RMSE = %.2f" %start_rmse,
          "| Start MAPE = %.2f" %start_mape, "%",
          "| Stop RMSE = %.2f" %stop_rmse,
          "| Stop MAPE = %.2f" %stop_mape, "%")

In [None]:
# Load the prepared bike trips (first CSV column used as index) and print the
# historical average scores for this dataset
data_bike_df = pd.read_csv("Datasets/bike_prepared.csv", index_col=0)
HA_scoring(data_bike_df, TRAIN_PERIOD, MIN_VOL_METRICS)

In [None]:
# Load the prepared "ytaxi" trips (presumably yellow taxi - TODO confirm) and print
# the historical average scores for this dataset
data_ytaxi_df = pd.read_csv("Datasets/ytaxi_prepared.csv", index_col=0)
HA_scoring(data_ytaxi_df, TRAIN_PERIOD, MIN_VOL_METRICS)

In [None]:
# Load the prepared "gtaxi" trips (presumably green taxi - TODO confirm) and print
# the historical average scores for this dataset
data_gtaxi_df = pd.read_csv("Datasets/gtaxi_prepared.csv", index_col=0)
HA_scoring(data_gtaxi_df, TRAIN_PERIOD, MIN_VOL_METRICS)