# USE CASE 4 - ENERGY FORECASTING & ANOMALY DETECTION

## 1. Load data & create time features 

In [0]:
from pyspark.sql import functions as F
from pyspark.sql.types import TimestampType, DoubleType

# Read the raw energy readings from the catalog table.
energy_raw = spark.table("data.energy_volume.energy_power_data")

# Give the two columns used downstream explicit types: a real timestamp and a
# numeric label for the regression.
# NOTE(review): carbon_kg is not cast here, so it keeps whatever type the table
# stores - confirm whether it should be numeric as well.
energy_typed = (
    energy_raw
    .withColumn("timestamp", F.col("timestamp").cast(TimestampType()))
    .withColumn("energy_kWh", F.col("energy_kWh").cast(DoubleType()))
)

# Calendar features derived from the typed timestamp.
# dayofweek follows Spark's convention: 1 = Sunday ... 7 = Saturday.
ts_col = F.col("timestamp")
energy_with_time = (
    energy_typed
    .withColumn("hour", F.hour(ts_col))
    .withColumn("day", F.dayofmonth(ts_col))
    .withColumn("dow", F.dayofweek(ts_col))
)

# Remove every row that has a null in any column.
df_energy = energy_with_time.dropna(how="any")

# Quick look at the prepared data and the resulting column types.
display(df_energy.limit(10))
df_energy.printSchema()

timestamp,device_id,device_type,location,energy_kWh,carbon_kg,hour,day,dow
2025-10-12T14:33:24.237Z,device_001,Smartphone,Factory,0.064,0.045,14,12,1
2025-10-12T14:33:24.237Z,device_002,AC,Factory,1.679,1.175,14,12,1
2025-10-12T14:33:24.237Z,device_003,TV,Home,0.191,0.114,14,12,1
2025-10-12T14:33:24.237Z,device_004,Fan,Factory,0.088,0.061,14,12,1
2025-10-12T14:33:24.237Z,device_005,Heater,Factory,1.236,0.865,14,12,1
2025-10-12T14:33:24.237Z,device_006,Fan,Home,0.094,0.056,14,12,1
2025-10-12T14:33:24.237Z,device_007,AC,Factory,1.067,0.747,14,12,1
2025-10-12T14:33:24.237Z,device_008,Fan,Home,0.081,0.049,14,12,1
2025-10-12T14:33:24.237Z,device_009,Heater,Factory,1.322,0.925,14,12,1
2025-10-12T14:33:24.237Z,device_010,Heater,Office,1.07,0.428,14,12,1


root
 |-- timestamp: timestamp (nullable = true)
 |-- device_id: string (nullable = true)
 |-- device_type: string (nullable = true)
 |-- location: string (nullable = true)
 |-- energy_kWh: double (nullable = true)
 |-- carbon_kg: string (nullable = true)
 |-- hour: integer (nullable = true)
 |-- day: integer (nullable = true)
 |-- dow: integer (nullable = true)



## 2. Feature engineering & linear regression forecasting

In [0]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import RegressionEvaluator

# Assemble the three time-derived numeric columns into a single feature vector.
# NOTE(review): the recorded run of this cell reports R² ≈ 0, i.e. hour/day/dow
# alone explain almost none of the variance in energy_kWh - consider encoding
# device_type / location as additional features.
assembler = VectorAssembler(
    inputCols=["hour", "day", "dow"],
    outputCol="features"
)

# Ordinary linear regression predicting energy_kWh from the assembled vector.
lr = LinearRegression(
    featuresCol="features",
    labelCol="energy_kWh"
)

# Feature-preparation pipeline (the regressor is fitted separately below).
pipeline = Pipeline(stages=[assembler])

# Seeded random 80/20 split of the prepared rows.
train, test = df_energy.randomSplit([0.8, 0.2], seed=42)

# Fit the feature pipeline once on the training split and reuse that fitted
# model for both splits; refitting it just to transform the test split is
# redundant work and hides the fact that test must use train-fitted stages.
feature_model = pipeline.fit(train)
train_vec = feature_model.transform(train)
test_vec = feature_model.transform(test)

# Train on the training vectors, then score the held-out split.
lr_model = lr.fit(train_vec)
pred_lr = lr_model.transform(test_vec)

# Evaluate the held-out predictions with RMSE and R².
evaluator = RegressionEvaluator(labelCol="energy_kWh", predictionCol="prediction")
rmse_lr = evaluator.evaluate(pred_lr, {evaluator.metricName: "rmse"})
r2_lr = evaluator.evaluate(pred_lr, {evaluator.metricName: "r2"})

print(f"Use Case 4 Linear Regression → RMSE: {rmse_lr:.4f}, R²: {r2_lr:.4f}")

display(pred_lr.select("timestamp", "energy_kWh", "prediction").limit(20))

Use Case 4 Linear Regression → RMSE: 0.7356, R²: -0.0018


timestamp,energy_kWh,prediction
2025-10-12T14:33:24.237Z,1.322,0.7983757700283018
2025-10-12T14:33:24.237Z,1.207,0.7983757700283018
2025-10-12T14:33:24.237Z,0.829,0.7983757700283018
2025-10-12T15:33:24.237Z,1.337,0.7991034593817194
2025-10-12T15:33:24.237Z,0.062,0.7991034593817194
2025-10-12T15:33:24.237Z,1.164,0.7991034593817194
2025-10-12T15:33:24.237Z,0.122,0.7991034593817194
2025-10-12T15:33:24.237Z,0.073,0.7991034593817194
2025-10-12T15:33:24.237Z,1.613,0.7991034593817194
2025-10-12T15:33:24.237Z,1.376,0.7991034593817194


## 3. Anomaly detection (Residual analysis)

In [0]:
# Residual-based anomaly rule: a row is anomalous when its absolute prediction
# error exceeds mean + RESIDUAL_SIGMA * std of all absolute errors.
RESIDUAL_SIGMA = 3          # how many standard deviations above the mean residual
MIN_RESIDUAL_STD = 0.001    # fallback spread when the stddev is 0 or NULL

# Absolute error between the observed and the predicted energy.
pred_lr = pred_lr.withColumn("residual", F.abs(F.col("energy_kWh") - F.col("prediction")))

stats = pred_lr.select(
    F.mean("residual").alias("mean"),
    F.stddev("residual").alias("std"),
).first()

# With no scored rows the aggregates come back as NULL; stop with a clear
# message instead of failing later with a TypeError on None arithmetic.
if stats["mean"] is None:
    raise ValueError("pred_lr has no residuals; cannot derive an anomaly threshold")

# Key access avoids any clash between the field names and Row/tuple attributes.
residual_std = stats["std"] if stats["std"] else MIN_RESIDUAL_STD
threshold = stats["mean"] + RESIDUAL_SIGMA * residual_std

anomalies = pred_lr.filter(F.col("residual") > threshold)

# Show the 50 largest deviations first.
display(anomalies.orderBy("residual", ascending=False).limit(50))

timestamp,device_id,device_type,location,energy_kWh,carbon_kg,hour,day,dow,features,prediction,residual
2025-11-06T21:33:24.237Z,device_011,AC,Home,10.899,6.539,21,6,5,"{""type"":""1"",""size"":null,""indices"":null,""values"":[""21.0"",""6.0"",""5.0""]}",0.7857557276205491,10.11324427237945
2025-11-10T22:33:24.237Z,device_009,Heater,Factory,9.07,6.349,22,10,2,"{""type"":""1"",""size"":null,""indices"":null,""values"":[""22.0"",""10.0"",""2.0""]}",0.7996423759598843,8.270357624040116
2025-10-14T09:33:24.237Z,device_002,AC,Factory,7.814,5.47,9,14,3,"{""type"":""1"",""size"":null,""indices"":null,""values"":[""9.0"",""14.0"",""3.0""]}",0.7871448085737729,7.026855191426227
2025-11-02T00:33:24.237Z,device_005,Heater,Factory,6.654,4.658,0,2,1,"{""type"":""1"",""size"":null,""indices"":null,""values"":[""0.0"",""2.0"",""1.0""]}",0.7856592805736597,5.868340719426341
2025-10-27T11:33:24.237Z,device_010,Heater,Office,6.428,2.571,11,27,2,"{""type"":""1"",""size"":null,""indices"":null,""values"":[""11.0"",""27.0"",""2.0""]}",0.7959368185338415,5.632063181466158
2025-11-05T20:33:24.237Z,device_014,Heater,Home,6.403,3.842,20,5,4,"{""type"":""1"",""size"":null,""indices"":null,""values"":[""20.0"",""5.0"",""4.0""]}",0.7888242956108518,5.614175704389147
2025-10-23T15:33:24.237Z,device_012,Heater,Factory,5.984,4.189,15,23,5,"{""type"":""1"",""size"":null,""indices"":null,""values"":[""15.0"",""23.0"",""5.0""]}",0.7856886169615945,5.198311383038406
2025-10-18T01:33:24.237Z,device_015,Heater,Office,5.866,2.346,1,18,7,"{""type"":""1"",""size"":null,""indices"":null,""values"":[""1.0"",""18.0"",""7.0""]}",0.7661382643715503,5.0998617356284495
2025-10-13T02:33:24.237Z,device_002,AC,Factory,5.787,4.051,2,13,2,"{""type"":""1"",""size"":null,""indices"":null,""values"":[""2.0"",""13.0"",""2.0""]}",0.7858472404435697,5.001152759556431
2025-10-26T03:33:24.237Z,device_020,Heater,Factory,5.104,3.573,3,26,1,"{""type"":""1"",""size"":null,""indices"":null,""values"":[""3.0"",""26.0"",""1.0""]}",0.7939115610502205,4.31008843894978
