### DMM ETL compaction emulation
#### Using NYC Taxi Trip Data Sample with Sparse Matrix Simulation


In [None]:
# Imports
import pandas as pd
import numpy as np
from memory_profiler import memory_usage
import time
import gc

# Load the same 10K NYC taxi data sample
sample_csv = '../data/processed/yellow_tripdata_sample_10k.csv'
df = pd.read_csv(sample_csv)
# Preview call; its result is discarded here because it is not the last
# expression of the cell.
df.head()

# Step 1: Create a sparse transformation matrix
# NOTE(review): nothing in the visible code reads sparse_matrix after it is
# built; it appears to exist only to emulate the compacted structure.
rows, cols = 100, 100
sparse_matrix = np.zeros((rows, cols))
np.random.seed(42)  # fixed seed so the same cells are activated on every run

# Activate ~5% of the matrix randomly (compaction simulation).
# replace=False yields distinct flat indices, so exactly
# int(0.05 * rows * cols) cells end up active.
active_indices = np.random.choice(rows * cols, size=int(0.05 * rows * cols), replace=False)

# Translate the flat indices into (row, col) pairs in row-major order -- the
# same mapping as divmod(index, cols) -- and set every active cell with a
# single vectorized assignment instead of a Python-level loop.
active_rows, active_cols = np.unravel_index(active_indices, (rows, cols))
sparse_matrix[active_rows, active_cols] = 1

# Step 2: Create label mappings only for payment_type values 1–6 (emulated compaction)
# labels is 0-indexed (labels[0] == "Label_1") while payment types start at 1,
# so payment_type i reads labels[i - 1] to obtain "Label_i". Indexing with
# labels[i] would shift every label by one (payment_type 1 -> "Label_2").
labels = [f"Label_{i}" for i in range(1, 101)]
payment_map_sparse_dmm = {i: labels[i - 1] for i in range(1, 7)}

# Step 3: Define the sparse DMM transformation function
def dmm_sparse_transform(df):
    """Add the payment label and trip duration columns to ``df`` in place.

    Args:
        df: DataFrame with ``payment_type``, ``tpep_pickup_datetime`` and
            ``tpep_dropoff_datetime`` columns. It is modified directly, so
            pass a copy to keep the original untouched.

    Returns:
        The same DataFrame object, now carrying ``payment_label`` (looked up
        in ``payment_map_sparse_dmm``; values without an entry become NaN)
        and ``trip_duration_min`` (dropoff minus pickup, in minutes).
    """
    payment_codes = df['payment_type']
    df['payment_label'] = payment_codes.map(payment_map_sparse_dmm)

    # Parse both timestamp columns, then express the difference in minutes.
    dropoff_ts = pd.to_datetime(df['tpep_dropoff_datetime'])
    pickup_ts = pd.to_datetime(df['tpep_pickup_datetime'])
    ride_seconds = (dropoff_ts - pickup_ts).dt.total_seconds()
    df['trip_duration_min'] = ride_seconds / 60

    return df

# Step 4: Profile memory and time
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# short intervals; time.time() follows the wall clock and can jump.
# The timed span covers df.copy() and the whole memory_usage() call, not
# only the transformation itself.
start_time = time.perf_counter()
mem_usage = memory_usage((dmm_sparse_transform, (df.copy(),)), interval=0.01, retval=False)
elapsed_time = time.perf_counter() - start_time

# Reported memory is the spread between the highest and lowest sample taken
# while the transformation ran.
print(f"Memory usage: {max(mem_usage) - min(mem_usage):.2f} MB")
print(f"Execution time: {elapsed_time:.4f} seconds")

# Step 5: Save the transformed DataFrame
# The transformation runs once more on a fresh copy so that df itself stays
# unmodified.
output_csv = '../data/processed/transformed/transformed_dmm_sparse.csv'
transformed_df = dmm_sparse_transform(df.copy())
transformed_df.to_csv(output_csv, index=False)  # index=False: no index column in the file
print("Transformed sparse DMM data saved.")

# Step 6: Clean up memory
# Drop both DataFrame references, then run a full collection. gc.collect(0)
# would examine only the youngest generation and could leave cyclic garbage
# in older generations uncollected.
del transformed_df, df
gc.collect()

Memory usage: 0.02 MB
Execution time: 0.4611 seconds
Transformed sparse DMM data saved.


52