In [2]:
import pandas as pd
import glob
import os
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# Preliminary Preprocess code
def plot_df_column_distribution(df, column, sample_frac=0.05):
    """
    Draw a histogram with a KDE overlay for one column of a DataFrame.

    Parameters:
        df (pd.DataFrame): Frame holding the data.
        column (str): Name of the column whose values are plotted.
        sample_frac (float): Share of rows drawn at random before plotting
            (default: 0.05 = 5%). With 1.0 or more, every row is plotted.
    """
    # Fixed seed: repeated calls on the same frame render the same figure.
    plotted = df.sample(frac=sample_frac, random_state=42) if sample_frac < 1.0 else df

    fig, ax = plt.subplots(figsize=(8, 4))
    sns.histplot(plotted[column], kde=True, bins=100, ax=ax)
    ax.set_title(f'Distribution of {column}')
    ax.set_xlabel(column)
    ax.set_ylabel('Frequency')
    fig.tight_layout()
    plt.show()

# Remove outliers using IQR method
def remove_outliers_iqr(df, cols, factor=1.5):
    """
    Removes rows with outliers in specified columns using the IQR method.

    Columns are processed one after another: the quartiles of each column are
    computed on the rows that survived the previous columns, so the order of
    `cols` can influence the result. For every column a row is kept only when
    its value lies in [Q1 - factor * IQR, Q3 + factor * IQR] (both bounds
    inclusive). Rows whose value in a checked column is NaN fail that test and
    are dropped as well.

    Parameters:
        df (pd.DataFrame): The input DataFrame. It is never modified.
        cols (list): List of column names to check for outliers.
        factor (float): Multiplier for IQR (default=1.5). Larger values widen
            the accepted range and therefore remove FEWER rows (e.g. 3 only
            removes extreme outliers); smaller values filter more aggressively.

    Returns:
        pd.DataFrame: Filtered DataFrame with outliers removed. The original
        index labels are preserved (the index is not reset). The result is
        always a different object from `df`.
    """
    # No upfront deep copy: boolean indexing below already yields a new frame,
    # and copying first would double peak memory on large inputs.
    df_clean = df
    for col in cols:
        q1, q3 = df_clean[col].quantile([0.25, 0.75])
        iqr = q3 - q1
        lower_bound = q1 - factor * iqr
        upper_bound = q3 + factor * iqr
        df_clean = df_clean[df_clean[col].between(lower_bound, upper_bound)]

    # With an empty `cols` nothing was filtered; still hand back a copy so the
    # caller can never end up holding the input object itself.
    return df_clean.copy() if df_clean is df else df_clean


def data_preprocess(dir: str, train_b: bool) -> pd.DataFrame:
    """
    Load one year of yellow-taxi parquet files and build the feature table.

    Steps:
        1. Read every `yellow_tripdata_<year>-*.parquet` file found in `dir`
           (year 2023 when `train_b` is True, 2024 otherwise).
        2. Derive calendar features from the pickup timestamp and the trip
           duration in seconds from dropoff minus pickup.
        3. Drop rows whose `total_amount` or `travel_time` is not strictly
           positive (zero and negative values are both removed).
        4. Remove IQR outliers on `total_amount` and `travel_time`.
        5. Write the result to 'data/train.csv' or 'data/test.csv'.

    Parameters:
        dir (str): Directory containing the monthly parquet files.
        train_b (bool): True selects the 2023 files and saves 'data/train.csv';
            False selects the 2024 files and saves 'data/test.csv'.

    Returns:
        pd.DataFrame: Columns PULocationID, DOLocationID, trip_distance,
        total_amount, day_of_week, day_of_month, hour, month, weekend,
        travel_time. The index is not reset after filtering.

    Raises:
        ValueError: If no matching parquet file exists in `dir`.
    """
    year = 2023 if train_b else 2024
    pattern = f'yellow_tripdata_{year}-*.parquet'
    # glob returns files in filesystem order; sort so the row order is reproducible.
    all_files = sorted(glob.glob(os.path.join(dir, pattern)))
    if not all_files:
        raise ValueError(f'No files matching {pattern} found in {dir!r}')

    columns = ['PULocationID', 'DOLocationID', 'tpep_pickup_datetime',
               'tpep_dropoff_datetime', 'trip_distance', 'total_amount']
    # Read only the needed columns: less I/O and memory than loading everything,
    # and the concatenated frame owns its data (no SettingWithCopyWarning below).
    df = pd.concat([pd.read_parquet(file, columns=columns) for file in all_files],
                   ignore_index=True)
    if list(df.columns) != columns:
        # Guard against a parquet engine that does not honour the requested order.
        df = df[columns].copy()

    df['PULocationID'] = pd.to_numeric(df['PULocationID'], downcast='integer')
    df['DOLocationID'] = pd.to_numeric(df['DOLocationID'], downcast='integer')

    pickup = pd.to_datetime(df['tpep_pickup_datetime'])
    dropoff = pd.to_datetime(df['tpep_dropoff_datetime'])

    # Flooring the pickup time to the hour first would not change any of these
    # fields, so they are taken from the timestamp directly.
    df['day_of_week'] = pickup.dt.dayofweek
    df['day_of_month'] = pickup.dt.day
    df['hour'] = pickup.dt.hour
    df['month'] = pickup.dt.month
    # dayofweek: Monday=0 ... Sunday=6, so 5 and 6 are the weekend.
    df['weekend'] = (df['day_of_week'] >= 5).astype('int64')
    df['travel_time'] = (dropoff - pickup).dt.total_seconds()
    df = df.drop(columns=['tpep_pickup_datetime', 'tpep_dropoff_datetime'])

    # Keep only rows with strictly positive total amount and travel time
    df = df[(df['total_amount'] > 0) & (df['travel_time'] > 0)]

    # Remove outliers wrt total amount and travel time
    df = remove_outliers_iqr(df, ['total_amount', 'travel_time'])

    # Save the preprocessed data as csv
    os.makedirs('data', exist_ok=True)
    if train_b:
        df.to_csv('data/train.csv', index=False)
    else:
        df.to_csv('data/test.csv', index=False)

    return df

In [4]:
# Build the training set (2023 files) and the testing set (2024 files).
# Each call also writes its CSV under data/ as a side effect.

df_train = data_preprocess(dir='data/train', train_b=True)
df_test = data_preprocess(dir='data/test', train_b=False)

In [5]:
# Print info

# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would add a stray "None" line.
print('Training data:')
df_train.info()
print('Testing data:')
df_test.info()

Training data:
<class 'pandas.core.frame.DataFrame'>
Index: 32606864 entries, 0 to 38310225
Data columns (total 10 columns):
 #   Column         Dtype  
---  ------         -----  
 0   PULocationID   int16  
 1   DOLocationID   int16  
 2   trip_distance  float64
 3   total_amount   float64
 4   day_of_week    int32  
 5   day_of_month   int32  
 6   hour           int32  
 7   month          int32  
 8   weekend        int64  
 9   travel_time    float64
dtypes: float64(3), int16(2), int32(4), int64(1)
memory usage: 1.8 GB
None
Testing data:
<class 'pandas.core.frame.DataFrame'>
Index: 35066064 entries, 0 to 41169719
Data columns (total 10 columns):
 #   Column         Dtype  
---  ------         -----  
 0   PULocationID   int16  
 1   DOLocationID   int16  
 2   trip_distance  float64
 3   total_amount   float64
 4   day_of_week    int32  
 5   day_of_month   int32  
 6   hour           int32  
 7   month          int32  
 8   weekend        int64  
 9   travel_time    float64
dtyp

End of File