# **GENERATION OF SYNTHETIC DATA**

In [0]:
# Databricks notebook cell
# Step 1: Import libraries
import pandas as pd
import numpy as np
import datetime as dt
import random
from pyspark.sql import SparkSession

# Obtain the Spark session handle: reuse the active session if one exists,
# otherwise build a new one.
spark = (
    SparkSession.builder
    .getOrCreate()
)

# Step 2: Synthetic data configuration
num_devices = 20                # Number of IoT devices (laptops, AC, fridge, etc.)
days = 30                       # Number of days of data
readings_per_day = 24           # Hourly readings

SEED = 42                           # Fixed seed so a fresh "Run All" draws the same random values
NOISE_FRACTION = 0.2                # Std-dev of per-reading noise, as a fraction of base usage
ANOMALY_PROBABILITY = 0.005         # Chance that a single reading becomes a spike anomaly
ANOMALY_MULTIPLIER_RANGE = (3, 6)   # Spike readings are multiplied by a factor drawn from this range

# Baseline energy per reading (kWh) for each device type.
# Defined once here instead of being rebuilt for every generated row.
BASE_USAGE_KWH = {
    'Laptop': 0.15, 'Smartphone': 0.05, 'TV': 0.20,
    'AC': 1.50, 'Fridge': 0.30, 'WashingMachine': 0.8,
    'Heater': 1.2, 'Fan': 0.10
}

# Seed BOTH generators used below (stdlib `random` and NumPy's global RNG)
# before the first random draw, otherwise every run yields a different dataset.
random.seed(SEED)
np.random.seed(SEED)

# Step 3: Generate device metadata
device_types = ['Laptop', 'Smartphone', 'TV', 'AC', 'Fridge', 'WashingMachine', 'Heater', 'Fan']
locations = ['Home', 'Office', 'Factory']

# Fail fast if a device type is added without a matching baseline.
assert set(device_types) <= set(BASE_USAGE_KWH), "device type without a base usage entry"

devices = [f'device_{i:03d}' for i in range(1, num_devices + 1)]
device_info = {d: {'type': random.choice(device_types),
                   'location': random.choice(locations)}
               for d in devices}


def generate_readings(devices, device_info, start_date, days, readings_per_day,
                      carbon_factors, seed=None):
    """Build one synthetic energy reading per device per hourly slot.

    Args:
        devices: Iterable of device ids, visited in order for every slot.
        device_info: Mapping device id -> {'type': ..., 'location': ...}.
        start_date: ``datetime`` of the first slot; later slots are offset
            from it by whole days and hours.
        days: Number of days to generate.
        readings_per_day: Number of hourly slots generated per day.
        carbon_factors: Mapping location -> kg CO2 emitted per kWh.
        seed: Optional int. When given, both ``random`` and ``np.random`` are
            re-seeded first so the call is reproducible on its own. When
            ``None`` the current global RNG state is used as-is.

    Returns:
        List of rows ``[timestamp, device_id, device_type, location,
        energy_kWh, carbon_kg]`` with both numeric fields rounded to 3 places.

    Raises:
        KeyError: If a device type has no entry in ``BASE_USAGE_KWH`` or a
            location has no entry in ``carbon_factors``.
    """
    if seed is not None:
        random.seed(seed)
        np.random.seed(seed)

    rows = []
    for day in range(days):
        for hour in range(readings_per_day):
            timestamp = start_date + dt.timedelta(days=day, hours=hour)
            for device in devices:
                info = device_info[device]
                base_usage = BASE_USAGE_KWH[info['type']]

                # Add random daily/hourly variation
                usage = base_usage + np.random.normal(0, base_usage * NOISE_FRACTION)

                # Gaussian noise is unbounded, so a rare draw can go below zero;
                # negative energy consumption is meaningless, clamp it.
                usage = max(usage, 0.0)

                # Add spike anomalies
                if random.random() < ANOMALY_PROBABILITY:
                    usage *= random.uniform(*ANOMALY_MULTIPLIER_RANGE)

                # Carbon footprint (kg CO2), computed from the unrounded usage
                location = info['location']
                carbon = usage * carbon_factors[location]

                rows.append([timestamp, device, info['type'], location,
                             round(usage, 3), round(carbon, 3)])
    return rows


# Step 4: Generate timestamped energy data
# NOTE: the time window is anchored to the wall clock at execution time, so the
# timestamp column still differs between runs even though the values are seeded.
start_date = dt.datetime.now() - dt.timedelta(days=days)
carbon_factors = {'Home': 0.6, 'Office': 0.4, 'Factory': 0.7}  # kg CO2 per kWh

data = generate_readings(devices, device_info, start_date, days, readings_per_day, carbon_factors)

# Step 5: Create DataFrame
# Column labels follow the field order of each generated row.
column_names = [
    'timestamp',
    'device_id',
    'device_type',
    'location',
    'energy_kWh',
    'carbon_kg',
]
df = pd.DataFrame(data, columns=column_names)

# Step 6: Save as CSV under the /Volumes path (the pandas index is not written)
csv_path = "/Volumes/data/energy_volume/energy_power_data/energy_power_data.csv"
df.to_csv(csv_path, index=False)

# Report where the file went and how many rows it holds
print(f"✅ Synthetic dataset saved at {csv_path}")
print(f"Total records: {df.shape[0]}")

✅ Synthetic dataset saved at /Volumes/data/energy_volume/energy_power_data/energy_power_data.csv
Total records: 14400


In [0]:
# Read the generated CSV back through Spark, taking column names from the
# first line. No schema is supplied or inferred here, so per Spark's CSV
# reader defaults every column is loaded as a string.
df = spark.read.csv(
    "/Volumes/data/energy_volume/energy_power_data/energy_power_data.csv",
    header=True,
)

# Render the Spark DataFrame in the notebook
display(df)

timestamp,device_id,device_type,location,energy_kWh,carbon_kg
2025-10-12 08:33:55.663224,device_001,Smartphone,Home,0.044,0.026
2025-10-12 08:33:55.663224,device_002,Fan,Office,0.081,0.032
2025-10-12 08:33:55.663224,device_003,Laptop,Office,0.106,0.042
2025-10-12 08:33:55.663224,device_004,Fan,Home,0.103,0.062
2025-10-12 08:33:55.663224,device_005,AC,Office,1.457,0.583
2025-10-12 08:33:55.663224,device_006,Fridge,Office,0.248,0.099
2025-10-12 08:33:55.663224,device_007,Fan,Factory,0.109,0.076
2025-10-12 08:33:55.663224,device_008,AC,Factory,1.923,1.346
2025-10-12 08:33:55.663224,device_009,Heater,Factory,1.043,0.73
2025-10-12 08:33:55.663224,device_010,Fan,Home,0.116,0.07


In [0]:
# Persist the readings as a table with properly typed columns.
# `df` comes from a header-only CSV read, so all of its columns are strings;
# cast the timestamp and the two measurements explicitly so the saved table
# supports time-based and numeric aggregation regardless of how `df` was read.
(
    df.selectExpr(
        "CAST(`timestamp` AS TIMESTAMP) AS `timestamp`",
        "device_id",
        "device_type",
        "location",
        "CAST(energy_kWh AS DOUBLE) AS energy_kWh",
        "CAST(carbon_kg AS DOUBLE) AS carbon_kg",
    )
    .write
    .mode("overwrite")
    # Allow the overwrite to replace the schema of a table that an earlier
    # run may have created with string-typed columns.
    .option("overwriteSchema", "true")
    .saveAsTable("data.energy_volume.energy_power_data")
)

In [0]:
# Pull the persisted table back into a Spark DataFrame
energy_table_name = "data.energy_volume.energy_power_data"
df_energy = spark.read.table(energy_table_name)

# Render the DataFrame in the notebook
display(df_energy)

timestamp,device_id,device_type,location,energy_kWh,carbon_kg
2025-10-12 08:33:55.663224,device_001,Smartphone,Home,0.044,0.026
2025-10-12 08:33:55.663224,device_002,Fan,Office,0.081,0.032
2025-10-12 08:33:55.663224,device_003,Laptop,Office,0.106,0.042
2025-10-12 08:33:55.663224,device_004,Fan,Home,0.103,0.062
2025-10-12 08:33:55.663224,device_005,AC,Office,1.457,0.583
2025-10-12 08:33:55.663224,device_006,Fridge,Office,0.248,0.099
2025-10-12 08:33:55.663224,device_007,Fan,Factory,0.109,0.076
2025-10-12 08:33:55.663224,device_008,AC,Factory,1.923,1.346
2025-10-12 08:33:55.663224,device_009,Heater,Factory,1.043,0.73
2025-10-12 08:33:55.663224,device_010,Fan,Home,0.116,0.07
