# USE CASE 3 - GREEN CHARGING/DIGITAL CARBON

## 1. Load data into Spark

In [0]:
# Read the Unity Catalog table that holds the per-device energy readings.
df_energy = spark.table("data.energy_volume.energy_power_data")

# Render the DataFrame in the notebook's table viewer.
display(df_energy)

timestamp,device_id,device_type,location,energy_kWh,carbon_kg
2025-10-12 13:42:51.985120,device_001,Smartphone,Office,0.04,0.016
2025-10-12 13:42:51.985120,device_002,Heater,Factory,1.133,0.793
2025-10-12 13:42:51.985120,device_003,TV,Factory,0.197,0.138
2025-10-12 13:42:51.985120,device_004,TV,Office,0.234,0.094
2025-10-12 13:42:51.985120,device_005,WashingMachine,Factory,0.998,0.699
2025-10-12 13:42:51.985120,device_006,Heater,Home,1.262,0.757
2025-10-12 13:42:51.985120,device_007,Laptop,Factory,0.119,0.083
2025-10-12 13:42:51.985120,device_008,Heater,Office,1.419,0.568
2025-10-12 13:42:51.985120,device_009,Fridge,Office,0.365,0.146
2025-10-12 13:42:51.985120,device_010,WashingMachine,Factory,0.801,0.561


## 2. Filter for charging devices

In [0]:
from pyspark.sql.functions import col, when, rand

# Read the source readings from Unity Catalog.
df = spark.read.table(
    "data.energy_volume.energy_power_data"
)

# Synthetic relabelling: the when-chain below overwrites `device_type` with
# "Mobile" or "EV Charger" for a random subset of rows. The `rand` calls are
# seeded so a re-run over an unchanged table and partitioning reproduces the
# same relabelling instead of producing different rows every time. The two
# seeds are deliberately different: with a shared seed both draws would be
# identical and the "EV Charger" branch (< 0.1, only reached when the first
# draw is >= 0.2) could never fire.
df = df.withColumn(
    "device_type",
    when(rand(seed=42) < 0.2, "Mobile")
    .when(rand(seed=43) < 0.1, "EV Charger")
    .otherwise(col("device_type"))
)

# Keep only the device types treated as chargeable in this use case.
chargeable_df = df.filter(
    col("device_type").isin(
        ["Laptop", "Mobile", "EV Charger"]
    )
)
display(chargeable_df)

timestamp,device_id,device_type,location,energy_kWh,carbon_kg
2025-10-12 13:42:51.985120,device_004,Mobile,Office,0.234,0.094
2025-10-12 13:42:51.985120,device_007,Mobile,Factory,0.119,0.083
2025-10-12 13:42:51.985120,device_012,Laptop,Home,0.181,0.109
2025-10-12 13:42:51.985120,device_015,Mobile,Office,0.13,0.052
2025-10-12 14:42:51.985120,device_005,Mobile,Factory,0.794,0.556
2025-10-12 14:42:51.985120,device_007,Laptop,Factory,0.156,0.109
2025-10-12 14:42:51.985120,device_010,Mobile,Factory,0.788,0.552
2025-10-12 14:42:51.985120,device_012,Laptop,Home,0.192,0.115
2025-10-12 14:42:51.985120,device_013,Mobile,Home,0.642,0.385
2025-10-12 14:42:51.985120,device_014,EV Charger,Office,0.138,0.055


## 3. Data preprocessing: Charging devices only

In [0]:
# -----------------------------------------------
# 🧩 Use Case 3: Green Charging / Digital Carbon
# -----------------------------------------------

from pyspark.sql.functions import col

# Directory containing the raw CSV files (adjust to your S3 bucket or
# Unity Catalog volume).
csv_path = "/Volumes/data/energy_volume/energy_power_data/"

# Read the CSV files, taking column names from the header row and letting
# Spark infer the column types.
df = spark.read.csv(csv_path, header=True, inferSchema=True)

# Align the measurement column names with the naming used for this use case.
df = df.withColumnRenamed("energy_kWh", "power_consumption_kwh")
df = df.withColumnRenamed("carbon_kg", "carbon_emission_kg")

print("✅ Data loaded and columns aligned")
display(df.limit(5))

✅ Data loaded and columns aligned


timestamp,device_id,device_type,location,power_consumption_kwh,carbon_emission_kg
2025-10-12 13:42:51.985120,device_001,Smartphone,Office,0.04,0.016
2025-10-12 13:42:51.985120,device_002,Heater,Factory,1.133,0.793
2025-10-12 13:42:51.985120,device_003,TV,Factory,0.197,0.138
2025-10-12 13:42:51.985120,device_004,TV,Office,0.234,0.094
2025-10-12 13:42:51.985120,device_005,WashingMachine,Factory,0.998,0.699


## 4. Digital carbon computation for charging devices

In [0]:
from pyspark.sql import functions as F

# Load from Unity Catalog
df = spark.table("data.energy_volume.energy_power_data")

# Confirm structure
print("✅ Data Loaded from Unity Catalog")
df.printSchema()

# The table stores every column as a string (see the printed schema), so the
# timestamp and the energy reading are cast explicitly instead of relying on
# Spark's implicit string coercion. The hour expression is built once and
# reused by both branches of the intensity rule.
hour_of_day = F.hour(F.to_timestamp(F.col("timestamp")))

# Add carbon intensity factor: a step function of the hour of day
# (00-06 -> 0.35, 07-17 -> 0.50, every other hour -> 0.65).
df = df.withColumn(
    "carbon_intensity",
    F.when(hour_of_day.between(0, 6), 0.35)
     .when(hour_of_day.between(7, 17), 0.50)
     .otherwise(0.65)
)

# Calculate digital carbon = energy reading * intensity factor, rounded to 4 dp.
df = df.withColumn(
    "digital_carbon_kg",
    F.round(F.col("energy_kWh").cast("double") * F.col("carbon_intensity"), 4)
)

display(df.select("timestamp", "device_type", "energy_kWh", "carbon_intensity", "digital_carbon_kg"))

✅ Data Loaded from Unity Catalog
root
 |-- timestamp: string (nullable = true)
 |-- device_id: string (nullable = true)
 |-- device_type: string (nullable = true)
 |-- location: string (nullable = true)
 |-- energy_kWh: string (nullable = true)
 |-- carbon_kg: string (nullable = true)



timestamp,device_type,energy_kWh,carbon_intensity,digital_carbon_kg
2025-10-12 13:42:51.985120,Smartphone,0.04,0.5,0.02
2025-10-12 13:42:51.985120,Heater,1.133,0.5,0.5665
2025-10-12 13:42:51.985120,TV,0.197,0.5,0.0985
2025-10-12 13:42:51.985120,TV,0.234,0.5,0.117
2025-10-12 13:42:51.985120,WashingMachine,0.998,0.5,0.499
2025-10-12 13:42:51.985120,Heater,1.262,0.5,0.631
2025-10-12 13:42:51.985120,Laptop,0.119,0.5,0.0595
2025-10-12 13:42:51.985120,Heater,1.419,0.5,0.7095
2025-10-12 13:42:51.985120,Fridge,0.365,0.5,0.1825
2025-10-12 13:42:51.985120,WashingMachine,0.801,0.5,0.4005


## 4. Identifying green charging hours

In [0]:
# Average the carbon intensity factor per hour of day and list the hours
# from lowest to highest average intensity.
green_hours = (
    df.groupBy(F.hour("timestamp").alias("hour"))
      .agg(F.avg("carbon_intensity").alias("avg_carbon_intensity"))
      .orderBy(F.col("avg_carbon_intensity"))
)

display(green_hours)

hour,avg_carbon_intensity
1,0.3499999999999963
6,0.3499999999999963
5,0.3499999999999963
2,0.3499999999999963
3,0.3499999999999963
4,0.3499999999999963
0,0.3499999999999963
17,0.5
12,0.5
15,0.5


## 5. Recommending green charging hours

In [0]:
# Keep the hours whose average intensity is below 0.45 and tag each of them
# with a fixed recommendation label.
recommendations = green_hours.where(F.col("avg_carbon_intensity") < 0.45).select(
    "*",
    F.lit("🌿 Preferred Green Charging Window").alias("recommendation"),
)
display(recommendations)

hour,avg_carbon_intensity,recommendation
3,0.3499999999999963,🌿 Preferred Green Charging Window
5,0.3499999999999963,🌿 Preferred Green Charging Window
0,0.3499999999999963,🌿 Preferred Green Charging Window
2,0.3499999999999963,🌿 Preferred Green Charging Window
1,0.3499999999999963,🌿 Preferred Green Charging Window
4,0.3499999999999963,🌿 Preferred Green Charging Window
6,0.3499999999999963,🌿 Preferred Green Charging Window


## 6. Carbon intensity prediction using Spark MLlib

In [0]:
# --------------------------------------------
# MLlib Model: Predict Carbon Intensity by Hour
# --------------------------------------------
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.sql import functions as F

# Training frame: hour of day (the single feature) and carbon intensity (the label).
ml_df = df.select(F.hour("timestamp").alias("hour"), "carbon_intensity")

# Pack the hour into the vector column that MLlib estimators consume.
assembler = VectorAssembler(inputCols=["hour"], outputCol="features")
training_data = assembler.transform(ml_df).select("features", "carbon_intensity")

# Fit a linear regression of carbon intensity on hour.
lr = LinearRegression().setFeaturesCol("features").setLabelCol("carbon_intensity")
model = lr.fit(training_data)

# Score every hour of the day (0-23) with the fitted model.
future_hours = spark.createDataFrame([(h,) for h in range(24)], ["hour"])
future_features = assembler.transform(future_hours)
predictions = model.transform(future_features)

# Show the predictions in hour order.
display(predictions.orderBy("hour"))

hour,features,prediction
0,"{""type"":""1"",""size"":null,""indices"":null,""values"":[""0.0""]}",0.3235000000000295
1,"{""type"":""1"",""size"":null,""indices"":null,""values"":[""1.0""]}",0.3383043478261143
2,"{""type"":""1"",""size"":null,""indices"":null,""values"":[""2.0""]}",0.3531086956521992
3,"{""type"":""1"",""size"":null,""indices"":null,""values"":[""3.0""]}",0.3679130434782841
4,"{""type"":""1"",""size"":null,""indices"":null,""values"":[""4.0""]}",0.382717391304369
5,"{""type"":""1"",""size"":null,""indices"":null,""values"":[""5.0""]}",0.3975217391304538
6,"{""type"":""1"",""size"":null,""indices"":null,""values"":[""6.0""]}",0.4123260869565388
7,"{""type"":""1"",""size"":null,""indices"":null,""values"":[""7.0""]}",0.4271304347826236
8,"{""type"":""1"",""size"":null,""indices"":null,""values"":[""8.0""]}",0.4419347826087085
9,"{""type"":""1"",""size"":null,""indices"":null,""values"":[""9.0""]}",0.4567391304347934


## 7. Save the model

In [0]:
# Persist the fitted regression model, replacing any model already stored at
# the target path.
# NOTE(review): this target sits inside the directory that the CSV-loading
# cell reads from ("/Volumes/data/energy_volume/energy_power_data/") —
# confirm that storing model artifacts alongside the raw data is intended.
model_writer = model.write().overwrite()
model_writer.save(
    "/Volumes/data/energy_volume/energy_power_data/green_charging_lr_model"
)
print("✅ Model saved successfully.")

✅ Model saved successfully.


## 8. Reduce the data

In [0]:
# Reload the energy readings from the Unity Catalog table.
df_energy = spark.table("data.energy_volume.energy_power_data")

# Down-size the data for modelling: take a seeded 10% sample without
# replacement, drop rows containing nulls, and cap the result at 10,000 rows.
model_df_small = (
    df_energy
    .sample(withReplacement=False, fraction=0.1, seed=42)
    .na.drop()
    .limit(10000)
)

display(model_df_small)

timestamp,device_id,device_type,location,energy_kWh,carbon_kg
2025-10-12 13:42:51.985120,device_001,Smartphone,Office,0.04,0.016
2025-10-12 13:42:51.985120,device_003,TV,Factory,0.197,0.138
2025-10-12 13:42:51.985120,device_005,WashingMachine,Factory,0.998,0.699
2025-10-12 13:42:51.985120,device_007,Laptop,Factory,0.119,0.083
2025-10-12 13:42:51.985120,device_017,WashingMachine,Factory,0.274,0.192
2025-10-12 14:42:51.985120,device_003,TV,Factory,0.192,0.134
2025-10-12 14:42:51.985120,device_007,Laptop,Factory,0.156,0.109
2025-10-12 14:42:51.985120,device_018,WashingMachine,Home,0.875,0.525
2025-10-12 15:42:51.985120,device_012,Laptop,Home,0.125,0.075
2025-10-12 15:42:51.985120,device_018,WashingMachine,Home,0.577,0.346


## 9. Feature assembler

In [0]:
from pyspark.ml.feature import VectorAssembler
from pyspark.sql import functions as F

# Prepare numeric model inputs. `model_df_small` only carries the raw table
# columns, all of them string-typed, so:
#   * "hour" has to be derived from the timestamp here — it is not a column
#     of the sampled frame, and assembling it without this step fails;
#   * the energy and carbon readings are cast to double, because
#     VectorAssembler does not accept string input columns.
# Rows where any of the three values is null after casting are dropped, since
# the assembler's default handling raises on null inputs.
model_features = (
    model_df_small
    .withColumn("hour", F.hour(F.to_timestamp(F.col("timestamp"))).cast("double"))
    .withColumn("energy_kWh", F.col("energy_kWh").cast("double"))
    .withColumn("carbon_kg", F.col("carbon_kg").cast("double"))
    .dropna(subset=["hour", "energy_kWh", "carbon_kg"])
)

# "carbon_kg" is the label selected below, so it is deliberately NOT part of
# the feature vector: feeding the target in as a feature leaks the answer to
# any model trained on it.
assembler = VectorAssembler(
    inputCols=["energy_kWh", "hour"],
    outputCol="features"
)

# Feature vector plus label, split 80/20 with a fixed seed.
data_ml = assembler.transform(model_features).select("features", "carbon_kg")
train, test = data_ml.randomSplit([0.8, 0.2], seed=42)