In [1]:
import pandas as pd
import warnings
# NOTE(review): this silences every warning for the rest of the notebook,
# including pandas deprecation and SettingWithCopy warnings.
warnings.filterwarnings('ignore')

In [2]:
# load the cleaned taxi trips
df = pd.read_parquet('../../data/rides/Taxi_Trips_Cleaned.parquet')

# columns that describe a single trip and are not needed for demand aggregation
trip_specific_cols = [
    "Taxi ID", "Trip Seconds", "Trip Miles", "Fare", "Tips", "Tolls",
    "Extras", "Trip Total", "Payment Type", "Company",
]

# every time column except the start timestamp
time_parts = ["hour", "4_hour_block", "day", "week", "dayofweek", "month"]
time_cols = ["Trip End Timestamp"] + [
    f"{part}_{side}" for side in ("end", "start") for part in time_parts
]

# everything that refers to the end location of the trip
dropoff_cols = (
    ["Dropoff Census Tract"]
    + [f"h3_0{res}_Dropoff" for res in (7, 8, 9)]
    + ["Dropoff Centroid"]
)

# "Pickup Census Tract" is dropped because it is equal to "Pickup Centroid";
# "datetime" is dropped because "Trip Start Timestamp" is sufficient
redundant_cols = ["Pickup Census Tract", "datetime"]

df = df.drop(columns=trip_specific_cols + time_cols + dropoff_cols + redundant_cols)

df.columns

Index(['Trip Start Timestamp', 'h3_07_Pickup', 'h3_08_Pickup', 'h3_09_Pickup',
       'Pickup Centroid', 'temp', 'Precip'],
      dtype='object')

In [3]:
# rename columns: cut the trailing "_Pickup" (7 characters) off the h3 columns
# and give the remaining columns short lower-case names
column_names = {name: name[:-7] for name in df.columns if "h3" in name}
column_names.update({
    "Trip Start Timestamp": "datetime",
    "Pickup Centroid": "centroid",
    "Precip": "precip",
})
df = df.rename(columns=column_names)
df.head()

Unnamed: 0,datetime,h3_07,h3_08,h3_09,centroid,temp,precip
0,2015-01-01 00:00:00,872664c1effffff,882664c1e1fffff,892664c1e0fffff,POINT (-87.626214906 41.892507781),-7.0115,0
1,2015-01-01 00:30:00,872664c1effffff,882664c1e7fffff,892664c1e73ffff,POINT (-87.63186395 41.892042136),-7.0115,0
2,2015-01-01 00:30:00,872664c1effffff,882664c1e7fffff,892664c1e73ffff,POINT (-87.63186395 41.892042136),-7.0115,0
3,2015-01-01 00:30:00,872664c12ffffff,882664c129fffff,892664c1293ffff,POINT (-87.661265218 41.936159071),-7.0115,0
4,2015-01-01 00:30:00,872664c12ffffff,882664c12dfffff,892664c12dbffff,POINT (-87.675821928 41.935983574),-7.0115,0


In [4]:
location_cols = ["h3_07", "h3_08", "h3_09", "centroid"]

## print number of categories per location column
category_counts = df[location_cols].nunique()
for name, n_categories in category_counts.items():
    print(f"{name}: {n_categories}")

h3_07: 91
h3_08: 299
h3_09: 438
centroid: 443


In [5]:
# define function to create spatio-temporal dataframe with time, location, weather and poi data

def create_spatio_temporal_df(df, poi_df, time_bucket_length, location_bucket):
    """Aggregate trips into (time bucket, location bucket) cells and attach features.

    Neither ``df`` nor ``poi_df`` is modified.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per trip. Must contain a datetime column ``"datetime"``, the
        column named by ``location_bucket`` and the columns ``"temp"`` and
        ``"precip"``.
    poi_df : pandas.DataFrame
        POI features. Must contain the columns ``"h3"``, ``"h3_res"`` and
        ``"h3_incl_neighbors"``; all remaining columns are merged into the
        result as features.
    time_bucket_length : int
        Length of a time bucket in hours. For ``24`` no ``hour_bucket``
        column is produced.
    location_bucket : str
        Name of the location column to group by. The number after the last
        underscore (e.g. ``7`` for ``"h3_07"``) selects the rows of ``poi_df``
        via ``h3_res``.

    Returns
    -------
    pandas.DataFrame
        One row per non-empty (time bucket, location) cell with the columns
        ``time_bucket_floored``, ``location_bucket``, mean ``temp``, mean
        ``precip``, ``demand`` (number of trips), ``hour_bucket`` (unless the
        bucket length is 24), ``day_of_week``, ``is_weekday``, ``month`` and
        the POI columns (left join, so cells without POI data get NaN).

    Raises
    ------
    ValueError
        If the suffix of ``location_bucket`` is not an integer.
    """
    group_keys = ["time_bucket_floored", location_bucket]

    # create time bucket; lowercase "h" is used because the uppercase "H"
    # frequency alias is deprecated in recent pandas versions
    bucket_df = df[[location_bucket, "temp", "precip"]].copy()
    bucket_df["time_bucket_floored"] = df["datetime"].dt.floor(f"{time_bucket_length}h")

    # demand (number of trips) and mean weather per time and location bucket
    demand_feat_df = (
        bucket_df.groupby(group_keys)
        .agg(
            temp=("temp", "mean"),
            precip=("precip", "mean"),
            demand=("temp", "size"),  # size counts rows, including NaN values
        )
        .reset_index()
    )

    # derive time features from the start of each bucket
    bucket_start = demand_feat_df["time_bucket_floored"].dt
    if time_bucket_length != 24:
        # with daily buckets the hour bucket would always be 0, so it is skipped
        demand_feat_df["hour_bucket"] = bucket_start.hour // time_bucket_length
    demand_feat_df["day_of_week"] = bucket_start.dayofweek
    # dayofweek is 0 (Monday) .. 6 (Sunday): Monday-Friday are weekdays
    demand_feat_df["is_weekday"] = (demand_feat_df["day_of_week"] < 5).astype(int)
    demand_feat_df["month"] = bucket_start.month

    # get poi data for the matching h3 resolution; parse the whole numeric
    # suffix so that two-digit resolutions are handled as well
    resolution = int(location_bucket.rsplit("_", 1)[-1])
    poi_location_bucket = (
        poi_df.loc[poi_df["h3_res"] == resolution]
        .drop(columns=["h3_res", "h3_incl_neighbors"])
        .rename(columns={"h3": location_bucket})
    )

    # return merged features
    return demand_feat_df.merge(poi_location_bucket, on=location_bucket, how="left")

In [6]:
# time bucket lengths (in hours) and location columns for which datasets are created
time_bucket_lengths = [1, 2, 6, 24]
location_buckets = ["h3_07", "h3_08", "h3_09"] # "centroid" is still missing because it can only be aggregated via latitude/longitude
# TODO centroid implementation in function


In [7]:
# load the poi features per hexagon
poi_df = pd.read_parquet("../../data/poi/poi_hexagon_data.parquet")

# build and store one dataframe per (time bucket length, location bucket) pair
bucket_combinations = [
    (length, bucket)
    for length in time_bucket_lengths
    for bucket in location_buckets
]
for time_bucket_length, location_bucket in bucket_combinations:
    print(f"create spatio-temporal df for time bucket length {time_bucket_length} and location bucket {location_bucket}")
    df_st = create_spatio_temporal_df(df, poi_df, time_bucket_length, location_bucket)
    target_path = f"../../data/predictive/Taxi_Trips_Spatio_Temporal_{time_bucket_length}_{location_bucket}.parquet"
    df_st.to_parquet(target_path)

create spatio-temporal df for time bucket length 1 and location bucket h3_07
create spatio-temporal df for time bucket length 1 and location bucket h3_08
create spatio-temporal df for time bucket length 1 and location bucket h3_09
create spatio-temporal df for time bucket length 2 and location bucket h3_07
create spatio-temporal df for time bucket length 2 and location bucket h3_08
create spatio-temporal df for time bucket length 2 and location bucket h3_09
create spatio-temporal df for time bucket length 6 and location bucket h3_07
create spatio-temporal df for time bucket length 6 and location bucket h3_08
create spatio-temporal df for time bucket length 6 and location bucket h3_09
create spatio-temporal df for time bucket length 24 and location bucket h3_07
create spatio-temporal df for time bucket length 24 and location bucket h3_08
create spatio-temporal df for time bucket length 24 and location bucket h3_09
