In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, hour, sum as _sum, when, desc


# Energy usage analysis: reads sensor logs, labels each reading as peak or
# off-peak, totals usage per device and period, ranks devices by overall
# usage, prints both tables and writes the ranking to CSV.

# ---------------------------
# 1. Create Spark session
# ---------------------------

spark = (
    SparkSession.builder
    .appName("EnergyUsageAnalysis")
    .getOrCreate()
)

# Everything after session creation runs inside try/finally so the session is
# stopped even when the read, the aggregation, or the write fails.
try:
    # ---------------------------
    # 2. Load dataset
    # ---------------------------

    # The code below reads the columns `timestamp`, `device_id` and
    # `usage_kwh`; the CSV must provide them.
    # NOTE(review): hour() needs `timestamp` to be inferred as (or castable
    # to) a timestamp type -- confirm against the real file.
    df = spark.read.csv("path/to/sensor_logs.csv", header=True, inferSchema=True)

    # ---------------------------
    # 3. Add peak/off-peak column
    # ---------------------------

    df = df.withColumn("hour", hour(col("timestamp")))

    # Peak covers hours 8 through 19 (08:00 inclusive to 20:00 exclusive);
    # every other hour is off-peak.
    is_peak_hour = (col("hour") >= 8) & (col("hour") < 20)
    df = df.withColumn(
        "period",
        when(is_peak_hour, "peak").otherwise("off-peak"),
    )

    # ---------------------------
    # 4. Group by device & period
    # ---------------------------

    usage_summary = (
        df.groupBy("device_id", "period")
        .agg(_sum("usage_kwh").alias("total_usage_kwh"))
    )

    # ---------------------------
    # 5. Find top devices by usage
    # ---------------------------

    # Roll the per-period totals up to a single figure per device.
    total_usage = (
        usage_summary.groupBy("device_id")
        .agg(_sum("total_usage_kwh").alias("overall_usage_kwh"))
    )

    # Highest consumers first. No secondary sort key is given, so the relative
    # order of devices with equal totals is whatever Spark happens to return.
    top_devices = total_usage.orderBy(desc("overall_usage_kwh"))

    # ---------------------------
    # 6. Show results
    # ---------------------------

    print("=== Usage Summary (Peak vs Off-Peak) ===")
    usage_summary.show()

    print("=== Top Devices by Total Usage ===")
    top_devices.show()

    # ---------------------------
    # 7. Save to CSV
    # ---------------------------

    output_path = "top_devices_usage"
    # coalesce(1) collapses the result to one partition so the output folder
    # holds a single part file; mode="overwrite" replaces any earlier run.
    top_devices.coalesce(1).write.csv(output_path, header=True, mode="overwrite")

    print(f"Results saved to folder: {output_path}")
finally:
    # Stop Spark session
    spark.stop()


=== Usage Summary (Peak vs Off-Peak) ===
+---------+--------+---------------+
|device_id|  period|total_usage_kwh|
+---------+--------+---------------+
|  DeviceC|    peak|            5.5|
|  DeviceB|off-peak|            6.0|
|  DeviceA|off-peak|            2.0|
|  DeviceA|    peak|            3.5|
|  DeviceB|    peak|            5.0|
|  DeviceD|    peak|            8.0|
|  DeviceE|    peak|            7.0|
|  DeviceE|off-peak|            4.5|
|  DeviceD|off-peak|            3.0|
+---------+--------+---------------+

=== Top Devices by Total Usage ===
+---------+-----------------+
|device_id|overall_usage_kwh|
+---------+-----------------+
|  DeviceE|             11.5|
|  DeviceD|             11.0|
|  DeviceB|             11.0|
|  DeviceA|              5.5|
|  DeviceC|              5.5|
+---------+-----------------+

Results saved to folder: top_devices_usage
