## Data Generation and Loading into Volume

In [0]:
# import statements
from datetime import datetime, timedelta
import random
import pandas as pd
import uuid

# generating Customers Data
def generate_cdc_data_customers(num_row: int = 30) -> pd.DataFrame:
    """Generate a synthetic batch of customer CDC (change-data-capture) records.

    One record is produced per customer, with ``customerId`` running from 1
    to ``num_row``. Every record carries a randomly chosen operation and a
    timestamp lying 0 to 60 minutes before the moment it is generated.

    Args:
        num_row: Number of customer records to generate.

    Returns:
        A DataFrame with the columns ``customerId``, ``Name``, ``email``,
        ``phone``, ``operation`` and ``ts`` (formatted as
        ``%Y-%m-%d %H:%M:%S.%f``). The columns are present even when no
        rows are generated.
    """
    columns = ["customerId", "Name", "email", "phone", "operation", "ts"]
    operations = ['insert', 'update', 'delete']
    data = []
    for cust_id in range(1, num_row + 1):
        op = random.choice(operations)
        ts = datetime.now() - timedelta(minutes=random.randint(0, 60))

        data.append({
            "customerId": cust_id,
            "Name": f"Customer_{cust_id}",
            "email": f"customer{cust_id}@gmail.com",
            "phone": f"+91-954-321-{random.randint(1000,9999)}",
            "operation": op,
            "ts": ts.strftime("%Y-%m-%d %H:%M:%S.%f"),
        })
    # Naming the columns explicitly keeps the schema intact for an empty
    # batch; without it pandas returns a frame that has no columns at all.
    return pd.DataFrame(data, columns=columns)
    
# generating Orders Data
def generate_cdc_data_orders(
    num_row: int = 30, num_customers: int = 30
) -> pd.DataFrame:
    """Generate a synthetic batch of order CDC (change-data-capture) records.

    Each record gets an 8-character hexadecimal ``orderId`` taken from a
    random UUID, a random ``customerId`` in ``1..num_customers``, a random
    amount between 100 and 1000 rounded to two decimals, a random status and
    operation, and a timestamp lying 1 to 60 minutes before the moment the
    record is generated.

    Args:
        num_row: Number of order records to generate.
        num_customers: Upper bound (inclusive) of the customer ids that
            orders may reference. Defaults to 30, the previously hard-coded
            range.

    Returns:
        A DataFrame with the columns ``orderId``, ``customerId``,
        ``order_amount``, ``orderStatus``, ``operation`` and ``ts``
        (formatted as ``%Y-%m-%d %H:%M:%S.%f``). The columns are present
        even when no rows are generated.

    Raises:
        ValueError: If ``num_customers`` is smaller than 1.
    """
    if num_customers < 1:
        raise ValueError("num_customers must be at least 1")

    columns = [
        "orderId", "customerId", "order_amount", "orderStatus", "operation", "ts",
    ]
    operations = ['insert', 'update', 'delete']
    statuses = ["Pending", "Shipped", "Delivered", "Cancelled"]
    data = []
    for _ in range(num_row):
        order_id = uuid.uuid4().hex[:8]
        op = random.choice(operations)
        ts = datetime.now() - timedelta(minutes=random.randint(1, 60))

        data.append({
            "orderId": order_id,
            "customerId": random.randint(1, num_customers),
            "order_amount": round(random.uniform(100, 1000), 2),
            "orderStatus": random.choice(statuses),
            "operation": op,
            "ts": ts.strftime("%Y-%m-%d %H:%M:%S.%f"),
        })
    # Naming the columns explicitly keeps the schema intact for an empty
    # batch; without it pandas returns a frame that has no columns at all.
    return pd.DataFrame(data, columns=columns)

# Generate both synthetic batches as pandas DataFrames first ...
customer_df, order_df = generate_cdc_data_customers(), generate_cdc_data_orders()

# ... then rebind the same names to their Spark DataFrame equivalents
# (customers is converted first, orders second).
customer_df, order_df = (
    spark.createDataFrame(pandas_df) for pandas_df in (customer_df, order_df)
)

# Target directories inside the landing-zone Volume
customers_volume_path = "/Volumes/adb_catalog/landing_zone/raw_data/customers"
orders_volume_path = "/Volumes/adb_catalog/landing_zone/raw_data/orders"

## Writing Data into Volume
# Customers: collapse to a single partition and write as CSV with a header
# row, replacing whatever already exists at the target path.
(
    customer_df.coalesce(1)
    .write.mode("overwrite")
    .option("header", True)
    .csv(customers_volume_path)
)

# Show the customers data that was just written
display(customer_df)

# Orders: same write settings, into the orders path.
(
    order_df.coalesce(1)
    .write.mode("overwrite")
    .option("header", True)
    .csv(orders_volume_path)
)

# Show the orders data that was just written
display(order_df)