In [6]:
import pandas as pd
import glob
import os

parquet_folder = 'data/train'
# glob.glob returns paths in an arbitrary, filesystem-dependent order; sort them
# so the row order of the concatenated frame is reproducible between runs.
all_files = sorted(glob.glob(os.path.join(parquet_folder, 'yellow_tripdata_2024-*.parquet')))
if not all_files:
    # Fail with an actionable message instead of pd.concat's generic
    # "No objects to concatenate" error.
    raise FileNotFoundError(
        f"No files matching 'yellow_tripdata_2024-*.parquet' found in {parquet_folder!r}"
    )
# Read each matching file into its own frame, then stack them with a fresh
# 0..n-1 index.
df_list = [pd.read_parquet(file) for file in all_files]
df = pd.concat(df_list, ignore_index=True)

In [7]:
# Restrict the frame to the columns selected for the training data;
# every other column is dropped here.
keep_columns = [
    'PULocationID',
    'DOLocationID',
    'tpep_pickup_datetime',
    'tpep_dropoff_datetime',
    'trip_distance',
    'total_amount',
]
df = df[keep_columns]

In [8]:
# Width of each pickup-time bucket, in minutes. Named here so the bin size is
# visible and tunable instead of being buried in a format string.
TIME_BIN_MINUTES = 60

df['tpep_pickup_datetime'] = pd.to_datetime(df['tpep_pickup_datetime'])
print("Creating time bins...")
# Round every pickup timestamp down to the start of its bucket.
df['time_bin'] = df['tpep_pickup_datetime'].dt.floor(f'{TIME_BIN_MINUTES}min')

print("Preparing features...")
# Calendar features are derived from the bucket start (time_bin), not from the
# raw pickup timestamp.
time_bin_dt = df['time_bin'].dt
df['day_of_week'] = time_bin_dt.dayofweek  # pandas convention: Monday=0 ... Sunday=6
df['hour'] = time_bin_dt.hour
df['month'] = time_bin_dt.month
df['day'] = time_bin_dt.day
df['year'] = time_bin_dt.year
df['weekend'] = df['day_of_week'] >= 5  # True for Saturday (5) and Sunday (6)

# Write the frame, including the new feature columns, next to the input files.
output_csv = f'{parquet_folder}/train.csv'
print(f"Saving processed data to {output_csv}...")
df.to_csv(output_csv, index=False)
print("Data processing completed.")

Creating time bins...
Preparing features...
Saving processed data to data/train/train.csv...
Data processing completed.
