In [1]:
from pyspark.sql import SparkSession

# Create (or reuse, if one is already running in this kernel) the Spark
# session for the ETL job. "local[*]" runs Spark in-process on this machine.
spark = (
    SparkSession.builder
    .appName("FootpathETL")
    .master("local[*]")
    .getOrCreate()
)


In [4]:
# Load the phase-1 CSV. The first row is treated as the header and Spark
# samples the data to infer column types (explicit casts follow later).
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("../data/processed/footpath_phase1.csv")
)


In [5]:
from pyspark.sql.functions import *

# Pin the column types explicitly instead of relying on schema inference.
# Numeric measurements become doubles ...
for _numeric_col in ("lat", "lon", "duration_hours"):
    df = df.withColumn(_numeric_col, col(_numeric_col).cast("double"))

# ... and the two time columns become proper timestamps.
for _time_col in ("timestamp", "last_activity"):
    df = df.withColumn(_time_col, to_timestamp(_time_col))


In [6]:
# Derive calendar features from the `timestamp` column.
#
# Columns added: year, month, day (day of month), week (week of year),
# weekday (abbreviated day name via the "E" pattern), hour, and day_period.
# `hour` is added before `day_period` because the bucketing below refers to
# it by name.
df = (
    df.withColumn("year", year("timestamp"))
      .withColumn("month", month("timestamp"))
      .withColumn("day", dayofmonth("timestamp"))
      .withColumn("week", weekofyear("timestamp"))
      .withColumn("weekday", date_format("timestamp", "E"))
      .withColumn("hour", hour("timestamp"))
      .withColumn(
          "day_period",
          # Buckets: [0, 6) night, [6, 12) morning, [12, 18) afternoon,
          # [18, 24) evening.
          # The evening bucket is an explicit condition and there is no
          # .otherwise(): when `hour` is NULL (missing or unparseable
          # timestamp) every comparison is NULL, so the row gets a NULL
          # day_period instead of being silently labelled "evening".
          when(col("hour") < 6, "night")
          .when(col("hour") < 12, "morning")
          .when(col("hour") < 18, "afternoon")
          .when(col("hour") >= 18, "evening")
      )
)


In [7]:
# Number of records per district.
df_district = df.groupBy("district").count()

# Persist the summary as CSV with a header row, replacing any previous output.
# NOTE(review): the input is read from "../data/..." but this writes under
# "data/..." relative to the working directory — confirm that is intended.
df_district.write.csv(
    "data/spark_output/summary_district",
    mode="overwrite",
    header=True,
)


In [8]:
# Number of records per (year, month) pair.
df_month = df.groupBy("year", "month").count()

# Persist the summary as CSV with a header row, replacing any previous output.
df_month.write.csv(
    "data/spark_output/summary_month",
    mode="overwrite",
    header=True,
)


In [9]:
# Median duration per district, computed with Spark SQL's `percentile`
# aggregate at the 0.5 quantile.
df_median = (
    df.groupBy("district")
    .agg(expr("percentile(duration_hours, 0.5)").alias("median_duration"))
)

# Persist the summary as CSV with a header row, replacing any previous output.
df_median.write.csv(
    "data/spark_output/summary_median",
    mode="overwrite",
    header=True,
)


In [10]:
# NOTE(review): this cell repeats the previous cell exactly — same aggregation,
# same output path — so it recomputes the median and overwrites the same
# directory with the same data. It looks like a leftover copy; confirm whether
# a different summary was intended here or the cell can be removed.
df_median = (
    df.groupBy("district")
    .agg(expr("percentile(duration_hours, 0.5)").alias("median_duration"))
)

df_median.write.csv(
    "data/spark_output/summary_median",
    mode="overwrite",
    header=True,
)


In [11]:
# Write the full cleaned dataset as CSV with a header row, replacing any
# previous output. coalesce(1) collapses the data to a single partition
# before the write.
(
    df.coalesce(1)
    .write
    .csv(
        "data/spark_output/footpath_clean_final",
        mode="overwrite",
        header=True,
    )
)
