In [1]:
# STEP 1: Install and set up Spark
!apt-get install openjdk-11-jdk -qq > /dev/null
!wget -q https://archive.apache.org/dist/spark/spark-3.3.1/spark-3.3.1-bin-hadoop3.tgz
!tar xf spark-3.3.1-bin-hadoop3.tgz
!pip install -q findspark

import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.3.1-bin-hadoop3"

import findspark
findspark.init()


In [2]:
# STEP 2: Start SparkSession
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, to_timestamp, date_format, weekofyear, sum as _sum, avg as _avg

spark = SparkSession.builder \
    .appName("SmartEnergyETL") \
    .getOrCreate()


In [3]:
# Load cleaned CSV from Drive
input_path = "/content/drive/MyDrive/smart_energy/cleaned_energy_logs.csv"  # update path
df_raw = spark.read.option("header", "true").option("inferSchema", "true").csv(input_path)

# Enrich with timestamp, date, and week
df = (df_raw
      .withColumn("event_ts", to_timestamp("timestamp"))
      .withColumn("date", date_format("event_ts", "yyyy-MM-dd"))
      .withColumn("week", weekofyear("event_ts")))

df.show(5)


+---------+-------------------+----------+--------+-------------------+----------+----+
|device_id|          timestamp|energy_kwh|power_kw|           event_ts|      date|week|
+---------+-------------------+----------+--------+-------------------+----------+----+
|   dev001|2025-06-01 08:00:00|       1.5|     0.6|2025-06-01 08:00:00|2025-06-01|  22|
|   dev001|2025-06-01 12:00:00|       2.0|     0.8|2025-06-01 12:00:00|2025-06-01|  22|
|   dev002|2025-06-01 08:15:00|       1.2|     0.5|2025-06-01 08:15:00|2025-06-01|  22|
|   dev002|2025-06-01 13:30:00|       1.7|     0.7|2025-06-01 13:30:00|2025-06-01|  22|
|   dev003|2025-06-01 09:00:00|       0.9|     0.4|2025-06-01 09:00:00|2025-06-01|  22|
+---------+-------------------+----------+--------+-------------------+----------+----+
only showing top 5 rows



In [4]:
# Daily Summary (kWh, avg power)
daily_summary = (df.groupBy("date")
                 .agg(_sum("energy_kwh").alias("total_kwh"),
                      _avg("power_kw").alias("avg_power_kw")))

daily_summary.show()


+----------+-----------------+------------------+
|      date|        total_kwh|      avg_power_kw|
+----------+-----------------+------------------+
|2025-06-03|              5.0|0.7333333333333334|
|2025-06-01|7.300000000000001|0.5999999999999999|
|2025-06-05|              5.1|0.7333333333333334|
|2025-06-02|              5.0|0.7000000000000001|
|2025-06-04|4.300000000000001|               0.6|
|2025-06-06|              5.1|0.7333333333333334|
+----------+-----------------+------------------+



In [5]:
# Weekly Summary
weekly_summary = (df.groupBy("week")
                  .agg(_sum("energy_kwh").alias("total_kwh"),
                       _avg("power_kw").alias("avg_power_kw")))

weekly_summary.show()


+----+-----------------+------------------+
|week|        total_kwh|      avg_power_kw|
+----+-----------------+------------------+
|  22|7.300000000000001|0.5999999999999999|
|  23|             24.5|0.6999999999999998|
+----+-----------------+------------------+



In [6]:
# Save summaries to CSV (in Drive)
daily_path = "/content/drive/MyDrive/smart_energy/output/daily_summary"
weekly_path = "/content/drive/MyDrive/smart_energy/output/weekly_summary"

daily_summary.coalesce(1).write.mode("overwrite").option("header", "true").csv(daily_path)
weekly_summary.coalesce(1).write.mode("overwrite").option("header", "true").csv(weekly_path)


In [7]:
# Optional - Detect over-usage
device_usage = (df.groupBy("device_id", "date")
                .agg(_sum("energy_kwh").alias("device_kwh")))

daily_avg = (device_usage.groupBy("date")
             .agg(_avg("device_kwh").alias("avg_kwh")))

# Join and filter for devices using > 20% above average
overuse = (device_usage.alias("d")
           .join(daily_avg.alias("a"), on="date")
           .filter(col("d.device_kwh") > col("a.avg_kwh") * 1.2)
           .select("d.device_id", "d.date", "d.device_kwh", "a.avg_kwh"))

overuse.show()


+---------+----------+----------+------------------+
|device_id|      date|device_kwh|           avg_kwh|
+---------+----------+----------+------------------+
|   dev002|2025-06-02|       2.2|1.6666666666666667|
|   dev001|2025-06-01|       3.5|2.4333333333333336|
|   dev001|2025-06-03|       2.1|1.6666666666666667|
+---------+----------+----------+------------------+

