In [5]:
import os

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, hour, when, sum as spark_sum

# Sensor log analysis: classify each reading as Peak / Off-Peak by hour of
# day, total the energy usage per device, and export the top consumers.

# Input CSV and output directory on the mounted Google Drive.
# Make sure sensor_logs.csv is in: /content/drive/My Drive/sensor_logs.csv
INPUT_PATH = "/content/drive/My Drive/sensor_logs.csv"
OUTPUT_DIR = "/content/drive/My Drive/output/top_energy_devices"

# Peak window is the half-open hour range [8, 20): 8 AM up to, not including, 8 PM.
PEAK_START_HOUR = 8
PEAK_END_HOUR = 20

# How many of the highest-consuming devices to report.
TOP_N = 5

# Step 1: Initialize Spark session
spark = SparkSession.builder.appName("SensorLogAnalysis").getOrCreate()

# Everything that uses the session lives inside try/finally so the session is
# stopped even when a step fails (missing file, bad schema, write error, ...).
try:
    # Step 2: Load dataset from Google Drive
    df = spark.read.csv(INPUT_PATH, header=True, inferSchema=True)

    # Step 3: Extract hour from timestamp
    df = df.withColumn("Hour", hour(col("Timestamp")))

    # Step 4: Categorize as Peak (8 AM - 8 PM) or Off-Peak
    # NOTE(review): a null Hour makes this condition null rather than true, so
    # rows with a missing/unparseable Timestamp fall through to "Off-Peak".
    # Confirm that is acceptable for the input data.
    is_peak = (col("Hour") >= PEAK_START_HOUR) & (col("Hour") < PEAK_END_HOUR)
    df = df.withColumn("UsageType", when(is_peak, "Peak").otherwise("Off-Peak"))

    # Step 5: Calculate usage per type per device
    usage_summary = df.groupBy("DeviceID", "UsageType").agg(
        spark_sum("EnergyUsage").alias("TotalUsage")
    )

    # Step 6: Pivot table to separate Peak and Off-Peak columns.
    # The pivot values are listed explicitly so both columns always exist,
    # even if the data contains only one usage type.
    # fillna is limited to the two usage columns so a null DeviceID is never
    # rewritten to 0 and confused with a real device.
    pivot_df = (
        usage_summary.groupBy("DeviceID")
        .pivot("UsageType", ["Peak", "Off-Peak"])
        .sum("TotalUsage")
        .fillna(0, subset=["Peak", "Off-Peak"])
    )

    # Step 7: Calculate total energy consumption
    pivot_df = pivot_df.withColumn("TotalEnergy", col("Peak") + col("Off-Peak"))

    # Step 8: Identify top devices by total energy consumption.
    # DeviceID is a secondary sort key so that ties in TotalEnergy resolve the
    # same way on every run.
    top_devices = pivot_df.orderBy(
        col("TotalEnergy").desc(), col("DeviceID").asc()
    ).limit(TOP_N)

    # Step 8.1: Show result in notebook output
    print("=== Top Energy Consuming Devices ===")
    top_devices.show()

    # Step 9: Save the result to CSV in Google Drive.
    # coalesce(1) forces a single part file inside OUTPUT_DIR.
    top_devices.coalesce(1).write.mode("overwrite").option("header", True).csv(
        OUTPUT_DIR
    )

    # Optional: List the saved CSV file to verify.
    # Uses os.listdir instead of the IPython-only `!ls` so the script also runs
    # outside a notebook; dot-files are hidden to match what `ls` would show.
    print("=== Files written to Google Drive ===")
    written_files = sorted(
        name for name in os.listdir(OUTPUT_DIR) if not name.startswith(".")
    )
    print("  ".join(written_files))
finally:
    # Step 10: Stop Spark session
    spark.stop()


=== Top Energy Consuming Devices ===
+--------+----+--------+-----------+
|DeviceID|Peak|Off-Peak|TotalEnergy|
+--------+----+--------+-----------+
|    D001| 4.1|     2.5|        6.6|
|    D002| 3.2|     0.0|        3.2|
|    D003| 0.0|     1.2|        1.2|
+--------+----+--------+-----------+

=== Files written to Google Drive ===
part-00000-44a2f534-d5a5-46f0-9543-ca8d2ead325c-c000.csv  _SUCCESS
