In [2]:
import pandas as pd
import glob
import os

In [None]:
# Preliminary Preprocess code

def data_preprocess(dir: str, train_b: bool):
    if train_b:
        all_files = glob.glob(os.path.join(dir, 'yellow_tripdata_2023-*.parquet'))
    else:
        all_files = glob.glob(os.path.join(dir, 'yellow_tripdata_2024-*.parquet'))
        
    df_list = [pd.read_parquet(file) for file in all_files]
    df = pd.concat(df_list, ignore_index=True)
    
    df = df[['PULocationID', 'DOLocationID', 'tpep_pickup_datetime', 'tpep_dropoff_datetime', 'trip_distance', 'total_amount']]
    df['tpep_pickup_datetime'] = pd.to_datetime(df['tpep_pickup_datetime'])
    df['time_bin'] = df['tpep_pickup_datetime'].dt.floor(f'{60}min')
    df['day_of_week'] = df['time_bin'].dt.dayofweek
    df['hour'] = df['time_bin'].dt.hour
    df['month'] = df['time_bin'].dt.month
    df['weekend'] = df['day_of_week'].apply(lambda x: 1 if x >= 5 else 0)
    df['travel_time'] = (df['tpep_dropoff_datetime'] - df['tpep_pickup_datetime']).dt.total_seconds()
    df = df.drop(columns=['tpep_pickup_datetime', 'tpep_dropoff_datetime', 'time_bin'])
    
    # Save the preprocessed data as csv
    if train_b:
        df.to_csv('data/train.csv', index=False)
    else:
        df.to_csv('data/test.csv', index=False)
        
    return df

In [8]:
# Preprocessing the training and testing data

data_preprocess('data/train', True)
data_preprocess('data/test', False)

Unnamed: 0,PULocationID,DOLocationID,trip_distance,total_amount,day_of_week,hour,month,weekend,travel_time
0,68,236,4.39,26.78,3,0,2,0,913.0
1,48,243,7.71,45.00,3,0,2,0,862.0
2,132,261,28.69,82.69,3,0,2,0,2122.0
3,161,163,1.10,17.15,3,0,2,0,538.0
4,246,79,2.60,20.60,3,0,2,0,820.0
...,...,...,...,...,...,...,...,...,...
41169715,130,218,4.27,15.77,6,23,3,1,755.0
41169716,79,100,0.00,24.80,6,23,3,1,859.0
41169717,63,181,6.44,31.50,6,23,3,1,1949.0
41169718,161,148,3.88,31.58,6,23,3,1,1071.0
