In [1]:
import pandas as pd

In [2]:
df = pd.read_parquet('../../data/rides/Taxi_Trips_Sampled_Cleaned.parquet')

df.drop(columns=[
    # drop trip specific columns
    "Trip Seconds",
    "Trip Miles",
    "Fare",
    "Tips",
    "Tolls",
    "Extras",
    "Trip Total",
    "Payment Type",
    "Company",
    # drop end time columns
    "Trip End Timestamp",
    "hour_end",
    "4_hour_block_end",
    "day_end",
    "week_end",
    "month_end",
    # drop end location columns
    "Dropoff Census Tract",
    "h3_07_Dropoff",
    "h3_08_Dropoff",
    "h3_09_Dropoff",
    "Dropoff Centroid",
    # drop Pickup Census Tract because equal to Pickup Centroid
    "Pickup Census Tract",
    # drop datetime because Time Start Timestamp sufficient
    "datetime"
    ],
    inplace=True)

df["day_of_week"] = df["Trip Start Timestamp"].apply(lambda x: x.dayofweek)
df["is_weekday"] = df["day_of_week"] >= 5

df.columns

Index(['Trip Start Timestamp', 'hour_start', '4_hour_block_start', 'day_start',
       'week_start', 'month_start', 'h3_07_Pickup', 'h3_08_Pickup',
       'h3_09_Pickup', 'Pickup Centroid', 'temp', 'Precip', 'day_of_week',
       'is_weekday'],
      dtype='object')

In [3]:
df.rename(columns={col:col[:-6] for col in df.columns if "start" in col}, inplace=True) # remove _start from column names
df.rename(columns={col:col[:-7] for col in df.columns if "h3" in col}, inplace=True) # remove _Pickup from column names
df.rename(columns={"Trip Start Timestamp": "datetime", "Pickup Centroid": "centroid", "Precip": "precip"}, inplace=True)
df.columns

Index(['datetime', 'hour', '4_hour_block', 'day', 'week', 'month', 'h3_07',
       'h3_08', 'h3_09', 'centroid', 'temp', 'precip', 'day_of_week',
       'is_weekday'],
      dtype='object')

In [4]:
location_cols = ["h3_07", "h3_08", "h3_09", "centroid"]
time_related_columns = ["datetime", "hour", "4_hour_block", "day", "week", "month", "day_of_week", "is_weekday"]
wheather_related_columns = ["temp", "precip"]

df = df[time_related_columns + wheather_related_columns + location_cols]
df.head()

Unnamed: 0,datetime,hour,4_hour_block,day,week,month,day_of_week,is_weekday,temp,precip,h3_07,h3_08,h3_09,centroid
0,2015-01-01 00:00:00,0,0,1,1,1,3,False,-7.0115,0,872664c11ffffff,882664c115fffff,892664c114fffff,POINT (-87.631717366 41.914616286)
1,2015-01-01 00:30:00,0,0,1,1,1,3,False,-7.0115,0,872664ca5ffffff,882664c165fffff,892664ca597ffff,POINT (-87.676182496 41.950545696)
2,2015-01-01 00:15:00,0,0,1,1,1,3,False,-7.0115,0,872664c1affffff,882664c1a9fffff,892664c1a8bffff,POINT (-87.632746489 41.880994471)
3,2015-01-01 00:00:00,0,0,1,1,1,3,False,-7.0115,0,872664c13ffffff,882664c13bfffff,892664c13afffff,POINT (-87.63576009 41.90749193)
4,2015-01-01 00:30:00,0,0,1,1,1,3,False,-7.0115,0,872664c1effffff,882664c1e3fffff,892664c1e2fffff,POINT (-87.620992913 41.884987192)


In [5]:
df.to_parquet("../../data/predictive/Taxi_Trips_Sampled_Predictive.parquet")