In [1]:
# Cell 0: Install dependencies
!pip install azure-eventhub faker pandas

Collecting azure-eventhub
  Downloading azure_eventhub-5.15.0-py3-none-any.whl.metadata (73 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m73.1/73.1 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting faker
  Downloading faker-37.6.0-py3-none-any.whl.metadata (15 kB)
Downloading azure_eventhub-5.15.0-py3-none-any.whl (327 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m327.8/327.8 kB[0m [31m19.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading faker-37.6.0-py3-none-any.whl (1.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m37.0 MB/s[0m eta [36m0:00:00[0m:00:01[0m
[?25hInstalling collected packages: faker, azure-eventhub
Successfully installed azure-eventhub-5.15.0 faker-37.6.0


In [None]:
# Cell 1: Imports & Configuration
import random
import uuid
import time
from datetime import datetime, timedelta
import threading
import json
import pandas as pd
from faker import Faker
from azure.eventhub import EventHubProducerClient, EventData

# Configurable Variables
# Event Hub target: replace both placeholders with the values of your own
# endpoint. Do not commit a real connection string with the notebook.
EVENT_HUB_CONNECTION_STR = "<your-custom-endpoint-eventhub-connection-string>"
EVENT_HUB_NAME = "<your-custom-endpoint-eventhub-name>"
EVENTS_PER_SECOND = 10  # target emit rate of the streaming simulation

# Number of rows generated for each dimension table
NUM_SITES = 10
NUM_ASSETS = 50
NUM_OPERATORS = 30
NUM_PRODUCTS = 20

# Seed BOTH random sources so that Restart & Run All reproduces the same
# dimension data: `random` drives the choice()/uniform() calls, while Faker
# uses its own generator for names, e-mails, serial numbers and coordinates.
SEED = 42
fake = Faker()
Faker.seed(SEED)
random.seed(SEED)

In [3]:
# Cell 2: Generate Site Locations
# City-to-country mapping; its keys (in insertion order) are the site cities.
city_country_map = {
    "Berlin": "Germany",
    "Shanghai": "China",
    "Austin": "USA",
    "Buffalo": "USA",
    "Barcelona": "Spain",
    "Mumbai": "India",
    "Paris": "France",
    "London": "UK",
    "Rome": "Italy",
    "Amsterdam": "Netherlands"
}
site_cities = list(city_country_map.keys())

sites = []
for i in range(NUM_SITES):
    # Cycle through the cities when NUM_SITES exceeds the number of cities.
    city = site_cities[i % len(site_cities)]
    country = city_country_map[city]
    sites.append({
        "SiteId": f"SITE{1000+i}",
        "City": city,
        "Country": country,
        # Faker hands back Decimal coordinates; cast to float so the DataFrame
        # columns are numeric instead of object dtype.
        # NOTE: the coordinates are random and not tied to the city above.
        "Latitude": round(float(fake.latitude()), 4),
        "Longitude": round(float(fake.longitude()), 4),
        "PlantType": random.choice(["Assembly", "Supplier", "Warehouse"])
    })
sites_df = pd.DataFrame(sites)
# sites_df.head(10)

In [5]:
# Cell 3: Generate Assets
# One record per asset; each asset is attached to a randomly picked site and
# gets a random name, serial number and maintenance status.
assets = [
    {
        "AssetId": f"ASSET{2000 + asset_no}",
        "SiteId": random.choice(sites)["SiteId"],
        "AssetName": random.choice(["Conveyor","Grinder","Printer","Assemblyline"]),
        "SerialNumber": fake.bothify(text='??##??##'),
        "MaintenanceStatus": random.choice(["Done", "Pending", "Scheduled"]),
    }
    for asset_no in range(NUM_ASSETS)
]
assets_df = pd.DataFrame(assets)

In [6]:
# Cell 4: Generate Operators
# Operator ids run from OP3000 upwards; contact details come from Faker and
# the site, role and shift are picked at random.
operators = []
for operator_no in range(3000, 3000 + NUM_OPERATORS):
    operator_record = {
        "OperatorId": f"OP{operator_no}",
        "Name": fake.name(),
        "Phone": fake.phone_number(),
        "Email": fake.email(),
        "SiteId": random.choice(sites)["SiteId"],
        "Role": random.choice(["Line Operator", "Quality Inspector", "Supervisor"]),
        "Shift": random.choice(["Day", "Night"]),
    }
    operators.append(operator_record)
operators_df = pd.DataFrame(operators)

In [7]:
# Cell 5: Generate Products
# ProductId and SKU share the same running number (4000, 4001, ...); name,
# brand, category and unit cost are drawn at random for every product.
product_types = ["Cyberpunk Hat", "Oldschool Cardigan", "TropicFeel Tshirt", "CloudShell Jacket", "UrbanStep Shoes", "ClassicWear Hoodie"]
products = [
    {
        "ProductId": f"PROD{4000 + product_no}",
        "ProductName": random.choice(product_types),
        "SKU": f"SKU{4000 + product_no}",
        "Brand": random.choice(["ZAVA", "AirRun", "StreetFlex", "UrbanStep", "ClassicWear"]),
        "Category": random.choice(["GenZ Pros", "Altars", "Colours", "Kids"]),
        "UnitCost": round(random.uniform(19.99, 299.99), 2),
    }
    for product_no in range(NUM_PRODUCTS)
]
products_df = pd.DataFrame(products)

In [9]:
# Cell 6: Event Generator Functions
def generate_production_event():
    """Build one synthetic production event as a JSON-serializable dict.

    An asset is drawn at random; the site is the one that asset belongs to and
    the operator is drawn from the operators of that site (falling back to any
    operator when the site has none). This keeps the event's SiteId consistent
    with the SiteId stored for the asset and operator in the dimension tables.
    The batch id is a fresh UUID4 and the sensor readings are uniform random
    values.

    Returns:
        dict: the event, with keys event_type, timestamp (UTC, ISO-8601 with a
        trailing "Z"), SiteId, City, AssetId, OperatorId, OperatorName,
        ProductId, SKU, BatchId, DefectProbability, Vibration, Temperature and
        Humidity.
    """
    # Local import: the notebook's import cell only brings in datetime/timedelta.
    from datetime import timezone

    asset = random.choice(assets)
    # Resolve the asset's own site; fall back to a random one if it is unknown.
    site = next((s for s in sites if s["SiteId"] == asset["SiteId"]), None)
    if site is None:
        site = random.choice(sites)
    site_operators = [op for op in operators if op["SiteId"] == site["SiteId"]]
    operator = random.choice(site_operators or operators)
    product = random.choice(products)
    batch_id = str(uuid.uuid4())
    # datetime.utcnow() is deprecated; an aware UTC datetime renders its offset
    # as "+00:00", which is rewritten to "Z" to keep the original format.
    timestamp = datetime.now(timezone.utc).isoformat().replace("+00:00", "Z")
    defect_probability = round(random.uniform(0, 0.2), 3)
    vibration = round(random.uniform(0.5, 1.5), 2)
    temperature = round(random.uniform(20, 30), 1)
    humidity = random.randint(30, 80)
    return {
        "event_type": "production",
        "timestamp": timestamp,
        "SiteId": site["SiteId"],
        "City": site["City"],
        "AssetId": asset["AssetId"],
        "OperatorId": operator["OperatorId"],
        "OperatorName": operator["Name"],
        "ProductId": product["ProductId"],
        "SKU": product["SKU"],
        "BatchId": batch_id,
        "DefectProbability": defect_probability,
        "Vibration": vibration,
        "Temperature": temperature,
        "Humidity": humidity
    }

In [10]:
# Cell 7: EventHub Producer Setup
# A single producer client for the whole notebook, built from the connection
# settings defined in the configuration cell.
producer = EventHubProducerClient.from_connection_string(
    conn_str=EVENT_HUB_CONNECTION_STR,
    eventhub_name=EVENT_HUB_NAME,
)

In [11]:
# Cell 8: Streaming Simulation
def emit_event(event):
    """Serialize an event to JSON, send it to the Event Hub and echo it.

    Args:
        event: JSON-serializable object (the generators in this notebook
            produce dicts).

    A failed send is reported on stdout and does not raise, so one bad send
    does not end the simulation. The JSON message is printed in either case.
    """
    message = json.dumps(event)
    event_data = EventData(message)
    try:
        # Send on the shared client WITHOUT wrapping it in `with producer:`.
        # Leaving a `with` block closes the client, so doing that per event
        # forces a new connection for every single message. Call
        # producer.close() once, when streaming is finished.
        producer.send_batch([event_data])
    except Exception as e:
        print(f"Failed to send event: {e}")
    print(message)

def start_simulation(events_per_second=EVENTS_PER_SECOND, max_events=None, stop_event=None):
    """Continuously generate production events and emit them at a fixed rate.

    Args:
        events_per_second: target number of events per second; must be > 0.
            The pause between events is 1 / events_per_second and does not
            account for the time spent sending.
        max_events: optional cap on the number of events to emit. None (the
            default) keeps streaming until stopped.
        stop_event: optional threading.Event. When it is set the loop ends,
            which is the way to stop a simulation running in a background
            thread.

    Raises:
        ValueError: if events_per_second is not positive.
    """
    if events_per_second <= 0:
        raise ValueError("events_per_second must be a positive number")
    interval = 1.0 / events_per_second
    emitted = 0
    try:
        # Keep emitting until the cap is reached or a stop is requested.
        while max_events is None or emitted < max_events:
            if stop_event is not None and stop_event.is_set():
                break
            evt = generate_production_event()
            emit_event(evt)
            emitted += 1
            if stop_event is not None:
                # Waits up to `interval`, but wakes up at once on a stop request.
                stop_event.wait(interval)
            else:
                time.sleep(interval)
    except KeyboardInterrupt:
        # Python delivers KeyboardInterrupt to the main thread, so this only
        # fires when the simulation is run there.
        print("Simulation stopped.")

In [12]:
# Cell 9: Run Simulation (Background Thread)
# The simulator runs on a daemon thread, so this cell returns immediately and
# the thread does not keep the process alive on shutdown.
# NOTE(review): Python raises KeyboardInterrupt in the main thread only, so an
# interrupt may not reach this background thread - confirm how it is stopped.
thread = threading.Thread(
    target=start_simulation,
    kwargs={"events_per_second": EVENTS_PER_SECOND},
    daemon=True,
)
thread.start()
print("Manufacturing data streaming simulator running. Press Ctrl+C to stop.")

Manufacturing data streaming simulator running. Press Ctrl+C to stop.
{"event_type": "quality", "timestamp": "2025-09-09T13:55:47.512503Z", "SiteId": "SITE1005", "City": "Mumbai", "AssetId": "ASSET2003", "OperatorId": "OP3001", "ProductId": "PROD4018", "SKU": "SKU4018", "BatchId": "07e86112-4f8e-4961-9319-6d89a2978c3b", "SensorType": "temperature", "SensorValue": 25.0, "AnomalyFlag": false}
{"event_type": "quality", "timestamp": "2025-09-09T13:55:48.151218Z", "SiteId": "SITE1002", "City": "Austin", "AssetId": "ASSET2003", "OperatorId": "OP3016", "ProductId": "PROD4002", "SKU": "SKU4002", "BatchId": "e5fd08e2-648b-4ef3-9b4e-ce1f87efb7bc", "SensorType": "vibration", "SensorValue": 0.57, "AnomalyFlag": false}
{"event_type": "production", "timestamp": "2025-09-09T13:55:49.194518Z", "SiteId": "SITE1003", "City": "Buffalo", "AssetId": "ASSET2025", "OperatorId": "OP3003", "OperatorName": "Joseph Hall", "ProductId": "PROD4018", "SKU": "SKU4018", "BatchId": "bd4b1318-919a-493b-90de-90f506090a27

In [8]:
# Cell 10: Load dimension data into Lakehouse Files
# Target folder inside the Lakehouse (adjust this to your environment)
lakehouse_path = "/lakehouse/default/Files"

# Each dimension table is written as <name>.csv without the index column.
dimension_tables = {
    "assets": assets_df,
    "sites": sites_df,
    "products": products_df,
    "operators": operators_df,
}
for table_name, table_df in dimension_tables.items():
    table_df.to_csv(f"{lakehouse_path}/{table_name}.csv", index=False)