In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

In [2]:
# Start (or reuse) a local Spark session; "local[*]" runs Spark in-process
# with one worker thread per available core
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("WearableDataStandardization")
    .getOrCreate()
)

In [3]:
# Read each vendor's export into its own Spark DataFrame. multiLine=True lets
# a single JSON record (or a top-level array) span several lines of the file.
df_apple = spark.read.json("mock_data/apple_watch_data.json", multiLine=True)
df_fitbit = spark.read.json("mock_data/fitbit_data.json", multiLine=True)
df_garmin = spark.read.json("mock_data/garmin_data.json", multiLine=True)

In [7]:
# Fitbit -> shared schema, written as (source field, standard name) pairs.
# The output column order is kept identical to the other vendors' frames.
df_fitbit_std = df_fitbit.select(
    *[
        col(source).alias(standard)
        for source, standard in [
            ("user_id", "user_id"),
            ("timestamp", "timestamp"),
            ("heartRate", "heart_rate"),
            ("hrv", "hrv_ms"),
            ("sleep", "sleep_stage"),
            ("steps", "steps_count"),
        ]
    ]
)

# Garmin -> shared schema, written as (source field, standard name) pairs.
# The output column order is kept identical to the other vendors' frames.
# NOTE(review): "sleep_score" is mapped onto "sleep_stage"; the name suggests
# a score rather than a stage label — confirm the two are comparable.
df_garmin_std = df_garmin.select(
    *[
        col(source).alias(standard)
        for source, standard in [
            ("user_id", "user_id"),
            ("timestamp", "timestamp"),
            ("bpm", "heart_rate"),
            ("hrv", "hrv_ms"),
            ("sleep_score", "sleep_stage"),
            ("steps_taken", "steps_count"),
        ]
    ]
)

# Apple Watch -> shared schema, written as (source field, standard name)
# pairs. Only the heart-rate field needs renaming; the rest already use the
# standard names. Column order is kept identical to the other vendors' frames.
df_apple_std = df_apple.select(
    *[
        col(source).alias(standard)
        for source, standard in [
            ("user_id", "user_id"),
            ("timestamp", "timestamp"),
            ("heart_rate_bpm", "heart_rate"),
            ("hrv_ms", "hrv_ms"),
            ("sleep_stage", "sleep_stage"),
            ("steps_count", "steps_count"),
        ]
    ]
)

# Stack the three standardized frames into one.
# unionByName matches columns by name instead of by position, so reordering
# the select() of any vendor frame cannot silently place values under the
# wrong column (plain union() is purely positional).
# heart_rate is then cast to integer so the column has a single, explicit type.
# Built as one expression so df_combined is assigned exactly once.
df_combined = (
    df_apple_std
    .unionByName(df_fitbit_std)
    .unionByName(df_garmin_std)
    .withColumn("heart_rate", col("heart_rate").cast("integer"))
)

# Show the result. toPandas() collects every row to the driver, which is fine
# for the small mock files; apply .limit(n) first on larger data.
df_combined.toPandas()

Unnamed: 0,user_id,timestamp,heart_rate,hrv_ms,sleep_stage,steps_count
0,P001,2025-02-26T08:04:49.940638Z,63,37,light,9717
1,P001,2025-02-26T12:04:49.940699Z,69,79,light,5637
2,P001,2025-02-26T13:04:49.940713Z,65,21,deep,7780
3,P001,2025-02-26T18:04:49.940722Z,77,45,REM,9051
4,P001,2025-02-26T11:04:49.940730Z,94,83,REM,4588
5,P002,2025-02-26T18:04:49.940739Z,93,99,light,1284
6,P002,2025-02-26T05:04:49.940746Z,92,51,light,8519
7,P002,2025-02-26T21:04:49.940753Z,74,71,deep,3290
8,P002,2025-02-26T09:04:49.940760Z,61,51,light,9908
9,P002,2025-02-26T09:04:49.940766Z,78,25,light,152
