In [1]:
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

# Data Preparation for Demand Prediction Models

In this notebook we prepare the taxi trip data for the prediction models by creating dataframes for each spatio-temporal resolution.

First we read in the cleaned taxi trips data and drop the columns that are not needed. Then we rename the remaining columns: since only pickup information is kept, the `pickup` qualifier in the column names is no longer needed.

In [2]:
# trip specific columns
trip_specific_cols = [
    "taxi_id",
    "trip_seconds",
    "trip_miles",
    "fare",
    "tips",
    "tolls",
    "Extras",
    "trip_total",
    "payment_type",
    "Company",
]

# time columns except the start timestamp
time_cols = [
    "trip_end_timestamp",
    "hour_end",
    "4_hour_block_end",
    "day_end",
    "week_end",
    "dayofweek_end",
    "month_end",
    "hour_start",
    "4_hour_block_start",
    "day_start",
    "week_start",
    "dayofweek_start",
    "month_start",
    "is_weekday",
]

# end location columns
dropoff_cols = [
    "dropoff_community_area",
    "dropoff_census_tract",
    "dropoff_centroid",
    "h3_05_dropoff",
    "h3_06_dropoff",
    "h3_07_dropoff",
    "h3_08_dropoff",
    "h3_09_dropoff",
]

# pickup census tract and community area are equal to the pickup centroid,
# and datetime is redundant because the trip start timestamp is sufficient
redundant_cols = [
    "pickup_census_tract",
    "pickup_community_area",
    "datetime",
]

# read the cleaned trips and keep only the columns needed for the demand features
df = pd.read_parquet('../../data/rides/Taxi_Trips_Cleaned.parquet').drop(
    columns=trip_specific_cols + time_cols + dropoff_cols + redundant_cols
)

df.columns

Index(['trip_start_timestamp', 'h3_05_pickup', 'h3_06_pickup', 'h3_07_pickup',
       'h3_08_pickup', 'h3_09_pickup', 'pickup_centroid', 'temp', 'precip'],
      dtype='object')

In [3]:
# rename columns: strip the "_pickup" suffix from the h3 columns and shorten the other names.
# The suffix is matched explicitly (instead of slicing the last 7 characters off every
# column containing "h3") so that re-running this cell does not truncate the already
# renamed columns (e.g. "h3_05" would otherwise become an empty name).
pickup_suffix = "_pickup"
new_column_names = {
    col: col[: -len(pickup_suffix)]
    for col in df.columns
    if "h3" in col and col.endswith(pickup_suffix)
}
new_column_names.update({"trip_start_timestamp": "datetime", "pickup_centroid": "centroid"})
df = df.rename(columns=new_column_names)
df.head()

Unnamed: 0,datetime,h3_05,h3_06,h3_07,h3_08,h3_09,centroid,temp,precip
0,2015-01-01 00:00:00,852664c3fffffff,862664c1fffffff,872664c1effffff,882664c1e1fffff,892664c1e0fffff,POINT (-87.626214906 41.892507781),-7.0115,0
1,2015-01-01 00:30:00,852664c3fffffff,862664c1fffffff,872664c1effffff,882664c1e7fffff,892664c1e73ffff,POINT (-87.63186395 41.892042136),-7.0115,0
2,2015-01-01 00:30:00,852664c3fffffff,862664c1fffffff,872664c1effffff,882664c1e7fffff,892664c1e73ffff,POINT (-87.63186395 41.892042136),-7.0115,0
3,2015-01-01 00:30:00,852664c3fffffff,862664c17ffffff,872664c12ffffff,882664c129fffff,892664c1293ffff,POINT (-87.661265218 41.936159071),-7.0115,0
4,2015-01-01 00:30:00,852664cbfffffff,862664c17ffffff,872664c12ffffff,882664c12dfffff,892664c12dbffff,POINT (-87.675821928 41.935983574),-7.0115,0


We take a look at the number of areas for each location discretization.

In [4]:
location_cols = ["h3_05", "h3_06", "h3_07", "h3_08", "h3_09", "centroid"]

## print number of distinct areas per location column
for col, n_areas in df[location_cols].nunique().items():
    print(f"{col}: {n_areas}")

h3_05: 8
h3_06: 24
h3_07: 90
h3_08: 298
h3_09: 437
centroid: 442


We define a function to create a dataframe for a spatio-temporal resolution with features about time, location, weather and points of interest.

In [5]:
def create_spatio_temporal_df(df, poi_df, time_bucket_length, location_bucket, poi_data=True):
    """Aggregate trips into demand per time bucket and location bucket.

    Parameters
    ----------
    df : pandas.DataFrame
        Trip-level data. Must contain a datetime column ``datetime``, the column
        named by ``location_bucket`` and the columns ``temp`` and ``precip``.
        The frame is not modified.
    poi_df : pandas.DataFrame or None
        Point-of-interest data with the columns ``h3``, ``h3_res`` and
        ``h3_incl_neighbors`` plus the POI feature columns. Only read when
        ``poi_data`` is true.
    time_bucket_length : int
        Length of one time bucket in hours. For 24 no ``hour_bucket`` column
        is produced.
    location_bucket : str
        Name of the location column to group by. When ``poi_data`` is true, the
        text after the last underscore must be the integer hexagon resolution
        (e.g. ``"h3_05"`` -> 5).
    poi_data : bool, default True
        Whether to left-merge the POI features of the matching resolution.

    Returns
    -------
    pandas.DataFrame
        One row per (time bucket, location) with the mean ``temp`` and
        ``precip``, the trip count ``demand``, the derived time features and,
        if requested, the POI features (NaN for locations without POI data).
    """
    group_keys = ["time_bucket_floored", location_bucket]

    # take only the columns that are needed instead of copying the whole frame
    bucket_df = df[[location_bucket, "temp", "precip"]].assign(
        # lowercase "h": the uppercase alias is deprecated in recent pandas versions
        time_bucket_floored=df["datetime"].dt.floor(f"{time_bucket_length}h")
    )

    # create demand and mean weather per time and location bucket
    grouped = bucket_df.groupby(group_keys)
    demand_feat_df = (
        grouped[["temp", "precip"]]
        .mean()
        .join(grouped.size().rename("demand"))
        .reset_index()
    )

    # derive time features
    bucket_start = demand_feat_df["time_bucket_floored"]
    if time_bucket_length != 24:
        # with daily buckets the hour bucket would be constant, so it is left out
        demand_feat_df["hour_bucket"] = bucket_start.dt.hour // time_bucket_length
    demand_feat_df["day_of_week"] = bucket_start.dt.dayofweek
    # dayofweek is 0 (Monday) .. 6 (Sunday), so weekdays are the values below 5
    demand_feat_df["is_weekday"] = (demand_feat_df["day_of_week"] < 5).astype(int)
    demand_feat_df["month"] = bucket_start.dt.month
    demand_feat_df = demand_feat_df.drop(columns=["time_bucket_floored"])

    if not poi_data:
        return demand_feat_df

    # get poi data of the matching hexagon resolution
    resolution = int(location_bucket.rsplit("_", 1)[-1])
    poi_location_bucket = (
        poi_df.loc[poi_df["h3_res"] == resolution]
        .drop(columns=["h3_res", "h3_incl_neighbors"])
        .rename(columns={"h3": location_bucket})
    )
    return demand_feat_df.merge(poi_location_bucket, on=location_bucket, how="left")

We list all time and location resolutions, then create and save a dataframe for each combination. For the hexagon resolutions we add point-of-interest (POI) data. If a hexagon is not listed in the POI dataframe, its POI values are filled with 0.

In [6]:
# temporal resolutions: bucket lengths in hours
time_bucket_lengths = [1, 2, 4, 6, 24]
# spatial resolutions: h3 hexagon resolutions 5 to 9 and the centroid
location_buckets = [f"h3_{resolution:02d}" for resolution in range(5, 10)] + ["centroid"]

In [7]:
# read poi data
poi_df = pd.read_parquet("../../data/poi/poi_hexagon_data.parquet")

# POI feature columns: every column except the hexagon id and the bookkeeping columns
# that create_spatio_temporal_df drops or renames before merging
poi_feature_cols = [
    col for col in poi_df.columns if col not in ("h3", "h3_res", "h3_incl_neighbors")
]

# create dataframe for each spatio temporal combination
for time_bucket_length in time_bucket_lengths:
    for location_bucket in location_buckets:
        # POI data only exists for hexagons, not for the centroid
        poi = location_bucket != "centroid"
        print(f"time bucket length {time_bucket_length} and location bucket {location_bucket}")
        df_st = create_spatio_temporal_df(df, poi_df, time_bucket_length, location_bucket, poi_data=poi)
        if poi:
            number_hexagons_nan = df_st.loc[df_st["public_transport_poi"].isna(), location_bucket].nunique()
            print(f"{number_hexagons_nan} hexagons not in poi data and filled with 0")
            # fill only the POI columns: a frame-wide fillna(0) would also turn missing
            # temp/precip values into 0, and only for the hexagon resolutions
            fill_cols = [col for col in poi_feature_cols if col in df_st.columns]
            df_st[fill_cols] = df_st[fill_cols].fillna(0)
        df_st.to_parquet(f"../../data/predictive/Taxi_Trips_Spatio_Temporal_{time_bucket_length}_{location_bucket}.parquet")

time bucket length 1 and location bucket h3_05
0 hexagons not in poi data and filled with 0
time bucket length 1 and location bucket h3_06
0 hexagons not in poi data and filled with 0
time bucket length 1 and location bucket h3_07
4 hexagons not in poi data and filled with 0
time bucket length 1 and location bucket h3_08
23 hexagons not in poi data and filled with 0
time bucket length 1 and location bucket h3_09
185 hexagons not in poi data and filled with 0
time bucket length 1 and location bucket centroid
time bucket length 2 and location bucket h3_05
0 hexagons not in poi data and filled with 0
time bucket length 2 and location bucket h3_06
0 hexagons not in poi data and filled with 0
time bucket length 2 and location bucket h3_07
4 hexagons not in poi data and filled with 0
time bucket length 2 and location bucket h3_08
23 hexagons not in poi data and filled with 0
time bucket length 2 and location bucket h3_09
185 hexagons not in poi data and filled with 0
time bucket length 2 and