In [0]:
# This notebook simulates a taxi booking system dropping files into cloud storage.

import os
import time
from datetime import datetime, timedelta
import random
import pandas as pd

In [0]:
# Path to the Unity Catalog Volume created
volume_path = "/Volumes/workspace/taxi_assignment_db/taxi_raw_volume/"
os.makedirs(volume_path, exist_ok=True)

print(f"Target Volume: {volume_path}")

Target Volume: /Volumes/workspace/taxi_assignment_db/taxi_raw_volume/


In [0]:
# --- GENERATOR LOGIC ---
def generate_realistic_batch(batch_id):
    data = []
    
    # distinct start time for each batch to simulate data arriving over 3 days
    base_time = datetime.now() - timedelta(days=3 - batch_id)
    
    print(f"Generating Batch {batch_id} starting around {base_time}...")

    for i in range(20): 
        ride_time = base_time + timedelta(minutes=random.randint(1, 60) * i)
        
        # logic to create specific data scenarios
        trip_dist = round(random.uniform(1, 20), 2)
        fare = round(random.uniform(5, 50), 2)
    
        if i == 5:
            trip_dist = -5.0 # This should be dropped by the pipeline

        if i == 10:
            trip_dist = 0.5
            fare = 100.0 # This should be flagged as suspicious in Silver

        ride = {
            "ride_id": f"b{batch_id}_r{i}_{random.randint(1000,9999)}",
            "timestamp": ride_time,
            "passenger_id": random.randint(1, 50),
            "trip_distance": trip_dist,
            "fare_amount": fare
        }
        data.append(ride)
    
    # Save
    df = pd.DataFrame(data)
    file_name = f"{volume_path}taxi_batch_v2_{batch_id}.csv"
    df.to_csv(file_name, index=False)
    print(f"-> Written {file_name}")

In [0]:
# Creating 3 batches of simulated files
for i in range(1, 4):
    generate_batch(i)

print("Data Generation Complete!")

Created file: /Volumes/workspace/taxi_assignment_db/taxi_raw_volume/taxi_batch_1.csv
Created file: /Volumes/workspace/taxi_assignment_db/taxi_raw_volume/taxi_batch_2.csv
Created file: /Volumes/workspace/taxi_assignment_db/taxi_raw_volume/taxi_batch_3.csv
Data Generation Complete!
