Simple DMM-Style ETL Prototype\nUsing NYC Taxi Trip Data Sample

In [None]:
# Imports
import pandas as pd
import numpy as np
from memory_profiler import memory_usage
import time
import gc

# Load the same 10K NYC taxi data sample
df = pd.read_csv('../data/processed/yellow_tripdata_sample_10k.csv')
df.head()

# Step 1: Create a DMM-style identity matrix for transformation
payment_matrix = np.zeros((6, 6))  # Simulate a 6x6 identity matrix
np.fill_diagonal(payment_matrix, 1)

# Define label list for mapping
payment_labels = ['Credit Card', 'Cash', 'No Charge', 'Dispute', 'Unknown', 'Voided Trip']
payment_map_dmm = dict(zip(range(1, 7), payment_labels))

# Step 2: Define matrix-based transformation function
def dmm_transform(df):
    df['payment_label'] = df['payment_type'].map(payment_map_dmm)
    df['trip_duration_min'] = (
        pd.to_datetime(df['tpep_dropoff_datetime']) - pd.to_datetime(df['tpep_pickup_datetime'])
    ).dt.total_seconds() / 60
    return df

# Step 3: Profile time and memory usage
start_time = time.time()
mem_usage = memory_usage((dmm_transform, (df.copy(),)), interval=0.01, retval=False)
elapsed_time = time.time() - start_time

print(f"Memory usage: {max(mem_usage) - min(mem_usage):.2f} MB")
print(f"Execution time: {elapsed_time:.4f} seconds")

# Step 4: Save transformed result
transformed_df = dmm_transform(df.copy())
transformed_df.to_csv('../data/processed/transformed/transformed_dmm_full.csv', index=False)
print("Transformed full DMM data saved.")

# Step 5: Clean up memory
del transformed_df
del df
gc.collect(0)

Memory usage: 0.03 MB
Execution time: 0.5468 seconds
Transformed full DMM data saved.


52

### Old Version:

Simple DMM-Style ETL Prototype\nUsing NYC Taxi Trip Data Sample

In [10]:
#imports
import pandas as pd
import numpy as np
from memory_profiler import memory_usage
import time

# Load the same 10K NYC taxi data sample
df = pd.read_csv('../data/processed/yellow_tripdata_sample_10k.csv')
df.head()

# Simulate sparse mapping matrix

#Step 1: Create a sparse transformation matrix for payment_type
payment_matrix = np.zeros((6, 6))  # Simulate a 6x6 identity-like mapping
np.fill_diagonal(payment_matrix, 1)
# Define label list to map
payment_labels = ['Credit Card', 'Cash', 'No Charge', 'Dispute', 'Unknown', 'Voided Trip']
payment_map_dmm = dict(zip(range(1, 7), payment_labels))

# Matrix-style transformation

# Step 2: Apply matrix-based transformation
def dmm_transform(df):
    df['payment_label'] = df['payment_type'].map(payment_map_dmm)
    df['trip_duration_min'] = (
        pd.to_datetime(df['tpep_dropoff_datetime']) - pd.to_datetime(df['tpep_pickup_datetime'])
    ).dt.total_seconds() / 60
    return df

# Profiling

# Step 3: Profile time and memory usage of DMM transformation
start_time = time.time()
mem_usage = memory_usage((dmm_transform, (df.copy(),)))
elapsed_time = time.time() - start_time
    
print(f"Memory usage: {max(mem_usage) - min(mem_usage):.2f} MB")
print(f"Execution time: {elapsed_time:.4f} seconds")

Memory usage: 0.06 MB
Execution time: 0.8117 seconds


Sparced(100*100) DMM-Style ETL Prototype\nUsing NYC Taxi Trip Data Sample

In [None]:
# Imports
import pandas as pd
import numpy as np
from memory_profiler import memory_usage
import time


# Load the same 10K NYC taxi data sample
df = pd.read_csv('../data/processed/yellow_tripdata_sample_10k.csv')
df.head()

# Simulate sparse mapping matrix

# Step 1: Create a sparse transformation matrix
rows, cols = 100, 100
sparse_matrix = np.zeros((rows, cols))
np.random.seed(42)

# Activate ~5% of the matrix randomly
active_indices = np.random.choice(rows * cols, size=int(0.05 * rows * cols), replace=False)
for index in active_indices:
    r, c = divmod(index, cols)
    sparse_matrix[r, c] = 1

# Create label mappings for a subset (e.g., payment_type 1–6)
labels = [f"Label_{i}" for i in range(1, 101)]
payment_map_sparse_dmm = {i: labels[i] for i in range(1, 7)}

# Define transformation function
def dmm_sparse_transform(df):
    df['payment_label'] = df['payment_type'].map(payment_map_sparse_dmm)
    df['trip_duration_min'] = (
        pd.to_datetime(df['tpep_dropoff_datetime']) - pd.to_datetime(df['tpep_pickup_datetime'])
    ).dt.total_seconds() / 60
    return df


# Profile time and memory usage
start_time = time.time()
mem_usage = memory_usage((dmm_sparse_transform, (df.copy(),)))
elapsed_time = time.time() - start_time

print(f"Memory usage: {max(mem_usage) - min(mem_usage):.2f} MB")
print(f"Execution time: {elapsed_time:.4f} seconds")

# Save the transformed DataFrame
transformed_df = dmm_sparse_transform(df.copy())
transformed_df.to_csv('../data/processed/transformed_dmm_sparse.csv', index=False)
print("Transformed sparse DMM data saved.")

Memory usage: 0.19 MB
Execution time: 0.8119 seconds
Transformed sparse DMM data saved.
