In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql import functions as F

spark = SparkSession \
    .builder \
    .appName("Python Spark SQL basic example") \
    .config("spark.some.config.option", "some-value") \
    .getOrCreate()

def load_and_preprocess(file_path, schema):
    return spark.read.format("csv") \
        .option("header", "false") \
        .option("inferSchema", "false") \
        .option("delimiter", ",") \
        .schema(schema) \
        .load(file_path) \
        .drop("_index", "_arrive", "_creation", "_device")

def compute_group_statistics(data):
    return data.groupBy(data['user'], data['model'], data['gt']).agg(
        F.mean(data['x']).alias('mean_x'),
        F.mean(data['y']).alias('mean_y'),
        F.mean(data['z']).alias('mean_z'),
        F.stddev(data['x']).alias('std_x'),
        F.stddev(data['y']).alias('std_y'),
        F.stddev(data['z']).alias('std_z'),
        F.max(data['x']).alias('max_x'),
        F.max(data['y']).alias('max_y'),
        F.max(data['z']).alias('max_z'),
        F.min(data['x']).alias('min_x'),
        F.min(data['y']).alias('min_y'),
        F.min(data['z']).alias('min_z')
    )

fields = [
    StructField("_index", BooleanType(), False), StructField("_arrive", BooleanType(), False), StructField("_creation", BooleanType(), False),
    StructField("x", DoubleType(), False), StructField("y", DoubleType(), False), StructField("z", DoubleType(), False),
    StructField("user", StringType(), False), StructField("model", StringType(), False), StructField("_device", StringType(), False),
    StructField("gt", StringType(), False)
]
schema = StructType(fields)

24/12/13 18:06:08 WARN Utils: Your hostname, Zhijie resolves to a loopback address: 127.0.1.1; using 10.27.109.93 instead (on interface wlo1)
24/12/13 18:06:08 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/12/13 18:06:08 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
# data1 = spark.read.format("csv").option(header=False, inferScheme=False, delimiter=",").schema(schema).load("Phones_accelerometer.csv")
# data1 = spark.read.csv("Phones_accelerometer.csv", schema=schema, nullValue="null")

data1 = load_and_preprocess("Phones_accelerometer.csv", schema)
data2 = load_and_preprocess("Phones_gyroscope.csv", schema)
data3 = load_and_preprocess("Watch_accelerometer.csv", schema)
data4 = load_and_preprocess("Watch_gyroscope.csv", schema)

In [3]:
group1 = compute_group_statistics(data1)
group2 = compute_group_statistics(data2)
group3 = compute_group_statistics(data3)
group4 = compute_group_statistics(data4)

join1 = group1.join(group2, ['user', 'model', 'gt'], 'inner')
join2 = group3.join(group4, on=['user', 'model', 'gt'], how='inner')

results = join1.union(join2)

In [4]:
results.show()

24/12/13 18:06:13 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

+----+------+-----+------------------+-------------------+-------------------+--------------------+--------------------+-------------------+----------+------------------+----------+-----------+-----------+----------+--------------------+--------------------+--------------------+--------------------+-------------------+--------------------+------------------+-------------------+------------+-----------+-----------+-------------------+
|user| model|   gt|            mean_x|             mean_y|             mean_z|               std_x|               std_y|              std_z|     max_x|             max_y|     max_z|      min_x|      min_y|     min_z|              mean_x|              mean_y|              mean_z|               std_x|              std_y|               std_z|             max_x|              max_y|       max_z|      min_x|      min_y|              min_z|
+----+------+-----+------------------+-------------------+-------------------+--------------------+--------------------+----

24/12/13 18:06:24 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors
