In [None]:
import pandas as pd
import numpy as np
import glob

# Path (relative to the working directory) of the single parquet file inspected in this cell.
file_path = "fhvhv_tripdata_2024-01.parquet"

# Loads the file with every column; no `columns=` filter is applied here.
df = pd.read_parquet(file_path)

# Print the column index so the available field names are visible.
print(df.columns)


Index(['hvfhs_license_num', 'dispatching_base_num', 'originating_base_num',
       'request_datetime', 'on_scene_datetime', 'pickup_datetime',
       'dropoff_datetime', 'PULocationID', 'DOLocationID', 'trip_miles',
       'trip_time', 'base_passenger_fare', 'tolls', 'bcf', 'sales_tax',
       'congestion_surcharge', 'airport_fee', 'tips', 'driver_pay',
       'shared_request_flag', 'shared_match_flag', 'access_a_ride_flag',
       'wav_request_flag', 'wav_match_flag'],
      dtype='object')


In [None]:

# Collect every file matching the 2024 monthly pattern in the working directory.
# glob returns matches in an unspecified order, so sort for a deterministic load order.
parquet_files = sorted(glob.glob("./fhvhv_tripdata_2024-*.parquet"))
if not parquet_files:
    # Without this guard pd.concat([]) fails below with the less helpful
    # "No objects to concatenate" (also a ValueError).
    raise ValueError("No parquet files matched ./fhvhv_tripdata_2024-*.parquet")

df_list = []
for file in parquet_files:
    # Read only the two columns required for the hourly counts.
    df = pd.read_parquet(file, columns=["pickup_datetime", "PULocationID"])
    df["pickup_datetime"] = pd.to_datetime(df["pickup_datetime"])
    # Truncate each pickup to the start of its hour. The lowercase "h" alias
    # replaces the deprecated uppercase "H", which emits a FutureWarning.
    df["pickup_hour"] = df["pickup_datetime"].dt.floor("h")
    df_list.append(df)

df_all = pd.concat(df_list, ignore_index=True)

# group the data by PULocationID and pickup_hour
# NOTE(review): a (zone, hour) pair with no pickups produces no row here, so
# row-based lags computed later are not guaranteed to be exactly N clock hours
# apart for sparse zones - confirm this is acceptable.
region_hourly = (
    df_all.groupby(["PULocationID", "pickup_hour"])
    .size()
    .reset_index(name="trip_count")
)

# add new features
region_hourly["hour"] = region_hourly["pickup_hour"].dt.hour
region_hourly["weekday"] = region_hourly["pickup_hour"].dt.dayofweek
# dayofweek is 0 for Monday, so 5 and 6 are Saturday and Sunday.
region_hourly["is_weekend"] = (region_hourly["weekday"] >= 5).astype(int)
region_hourly["is_peak_hour"] = region_hourly["hour"].isin([7, 8, 9, 17, 18, 19]).astype(int)

# NOTE(review): only three dates are flagged as holidays - confirm whether
# other 2024 holidays should be added.
holiday_list = ["2024-01-01", "2024-05-27", "2024-07-04"]
region_hourly["is_holiday"] = region_hourly["pickup_hour"].dt.date.astype(str).isin(holiday_list).astype(int)

def add_lag_features(df, group_key="PULocationID"):
    """Add per-group lag and rolling-mean features of ``trip_count``.

    The frame is sorted by ``group_key`` and ``pickup_hour``; all features are
    then computed within each group, in that row order.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``group_key``, ``pickup_hour`` and ``trip_count`` columns.
        The caller's frame is not modified; a sorted copy is returned.
    group_key : str, default "PULocationID"
        Column whose values define the independent series.

    Returns
    -------
    pandas.DataFrame
        Sorted copy of ``df`` with four extra columns:
        ``lag_1`` / ``lag_24`` (``trip_count`` 1 / 24 rows earlier in the same
        group) and ``rolling_mean_6`` / ``rolling_mean_24`` (mean of the
        previous 6 / 24 rows' ``trip_count`` in the same group, excluding the
        current row). Rows without enough history hold NaN.
    """
    df = df.sort_values(by=[group_key, "pickup_hour"])
    counts = df.groupby(group_key)["trip_count"]

    # Shift by one row so the rolling windows never include the current value.
    previous = counts.shift(1)
    df["lag_1"] = previous
    df["lag_24"] = counts.shift(24)

    # groupby().shift() returns a flat Series, so it is re-grouped before rolling.
    # Dropping only the group level (level=0) keeps the original row labels, so
    # the assignment aligns correctly even when the input was not pre-sorted or
    # did not carry a 0..n-1 index.
    previous_by_group = previous.groupby(df[group_key])
    for window in (6, 24):
        df[f"rolling_mean_{window}"] = (
            previous_by_group.rolling(window).mean().reset_index(level=0, drop=True)
        )
    return df

# Attach the lag / rolling features, discard every row that still holds a NaN
# in any column, and renumber the surviving rows from zero.
region_hourly = (
    add_lag_features(region_hourly)
    .dropna()
    .reset_index(drop=True)
)

# Persist the finished feature table without writing the row index.
region_hourly.to_csv("region_hourly_order_features_full.csv", index=False)


  df["pickup_hour"] = df["pickup_datetime"].dt.floor("H")
  df["pickup_hour"] = df["pickup_datetime"].dt.floor("H")
  df["pickup_hour"] = df["pickup_datetime"].dt.floor("H")
  df["pickup_hour"] = df["pickup_datetime"].dt.floor("H")
  df["pickup_hour"] = df["pickup_datetime"].dt.floor("H")
  df["pickup_hour"] = df["pickup_datetime"].dt.floor("H")


In [None]:
from sklearn.preprocessing import MinMaxScaler


# Load the hourly feature table.
# NOTE(review): this file name differs from the CSV written by the
# feature-building cell ("region_hourly_order_features_full.csv") - confirm
# where this input is produced.
df_transport = pd.read_csv("transport_hourly_order_features.csv")

# Columns to rescale, and the names given to their scaled counterparts.
cols_to_normalize = ["trip_count", "lag_1", "lag_24", "rolling_mean_6", "rolling_mean_24"]
normalized_names = [f"{col}_normalized" for col in cols_to_normalize]

# Fit the Min-Max scaler on the full table and transform it in one step.
scaler = MinMaxScaler()
normalized_values = scaler.fit_transform(df_transport[cols_to_normalize])
normalized_df = pd.DataFrame(normalized_values, columns=normalized_names)

# Keep the original columns and append the scaled ones alongside them.
df_transport_normalized = pd.concat([df_transport, normalized_df], axis=1)

output_name = "transport_hourly_normalized.csv"
df_transport_normalized.to_csv(output_name, index=False)
print("Saved as " + output_name)
