DMM-Style ETL Prototype\nUsing NYC Taxi Trip Data Sample

In [10]:
#imports
import pandas as pd
import numpy as np
from memory_profiler import memory_usage
import time

# Load the same 10K NYC taxi data sample
df = pd.read_csv('../data/processed/yellow_tripdata_sample_10k.csv')
df.head()

# Simulate sparse mapping matrix

#Step 1: Create a sparse transformation matrix for payment_type
payment_matrix = np.zeros((6, 6))  # Simulate a 6x6 identity-like mapping
np.fill_diagonal(payment_matrix, 1)
# Define label list to map
payment_labels = ['Credit Card', 'Cash', 'No Charge', 'Dispute', 'Unknown', 'Voided Trip']
payment_map_dmm = dict(zip(range(1, 7), payment_labels))

# Matrix-style transformation

# Step 2: Apply matrix-based transformation
def dmm_transform(df):
    df['payment_label'] = df['payment_type'].map(payment_map_dmm)
    df['trip_duration_min'] = (
        pd.to_datetime(df['tpep_dropoff_datetime']) - pd.to_datetime(df['tpep_pickup_datetime'])
    ).dt.total_seconds() / 60
    return df

# Profiling

# Step 3: Profile time and memory usage of DMM transformation
start_time = time.time()
mem_usage = memory_usage((dmm_transform, (df.copy(),)))
elapsed_time = time.time() - start_time
    
print(f"Memory usage: {max(mem_usage) - min(mem_usage):.2f} MB")
print(f"Execution time: {elapsed_time:.4f} seconds")

Memory usage: 0.06 MB
Execution time: 0.8117 seconds
