In [1]:
# Importing libraries
import pyspark
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType
from pyspark.sql.functions import mean, stddev, min, max, count

In [2]:
# Create the Spark session for this notebook, or reuse one that already exists
spark = (
    pyspark.sql.SparkSession.builder
    .appName("CGE P5")
    .getOrCreate()
)

22/12/19 20:07:05 WARN Utils: Your hostname, martin resolves to a loopback address: 127.0.1.1; using 192.168.0.24 instead (on interface wlp1s0)
22/12/19 20:07:05 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/12/19 20:07:16 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Directory that holds the CSV files
common_path = "./"                      # Default dataset path
common_path = "../datasets/small_data/" # Custom dataset path (overrides the default above)

# Full path of every CSV file
phones_acc_path = f"{common_path}Phones_accelerometer.csv"
phones_gyr_path = f"{common_path}Phones_gyroscope.csv"
watch_acc_path = f"{common_path}Watch_accelerometer.csv"
# NOTE: "gry" is a misspelling of "gyr"; the name is left untouched because
# the cell that loads the datasets refers to it.
watch_gry_path = f"{common_path}Watch_gyroscope.csv"

# Schema applied to all four CSV files; every column is declared nullable
dataset_schema = (
    StructType()
    .add("Index", IntegerType(), True)
    .add("Arrival_Time", StringType(), True)
    .add("Creation_Time", StringType(), True)
    .add("x", FloatType(), True)
    .add("y", FloatType(), True)
    .add("z", FloatType(), True)
    .add("User", StringType(), True)
    .add("Model", StringType(), True)
    .add("Device", StringType(), True)
    .add("gt", StringType(), True)
)

In [4]:
# Read the four CSV files into dataframes, applying the same explicit schema to each
df_acc_phones = spark.read.csv(phones_acc_path, schema=dataset_schema)
df_gyr_phones = spark.read.csv(phones_gyr_path, schema=dataset_schema)
df_acc_watches = spark.read.csv(watch_acc_path, schema=dataset_schema)
df_gyr_watches = spark.read.csv(watch_gry_path, schema=dataset_schema)

In [5]:
# Use the phone accelerometer dataframe as a sample: first its schema,
# then its first 20 rows with long cell values truncated
df_acc_phones.printSchema()
df_acc_phones.show(n=20, truncate=True)

root
 |-- Index: integer (nullable = true)
 |-- Arrival_Time: string (nullable = true)
 |-- Creation_Time: string (nullable = true)
 |-- x: float (nullable = true)
 |-- y: float (nullable = true)
 |-- z: float (nullable = true)
 |-- User: string (nullable = true)
 |-- Model: string (nullable = true)
 |-- Device: string (nullable = true)
 |-- gt: string (nullable = true)



                                                                                

+-----+-------------+-------------------+----------+----------+---------+----+------+--------+-----+
|Index| Arrival_Time|      Creation_Time|         x|         y|        z|User| Model|  Device|   gt|
+-----+-------------+-------------------+----------+----------+---------+----+------+--------+-----+
|    0|1424696633908|1424696631913248572| -5.958191| 0.6880646| 8.135345|   a|nexus4|nexus4_1|stand|
|    1|1424696633909|1424696631918283972|  -5.95224| 0.6702118| 8.136536|   a|nexus4|nexus4_1|stand|
|    2|1424696633918|1424696631923288855|-5.9950867| 0.6535492| 8.204376|   a|nexus4|nexus4_1|stand|
|    3|1424696633919|1424696631928385290|-5.9427185| 0.6761627| 8.128204|   a|nexus4|nexus4_1|stand|
|    4|1424696633929|1424696631933420691| -5.991516|0.64164734| 8.135345|   a|nexus4|nexus4_1|stand|
|    5|1424696633929|1424696631938456091| -5.965332| 0.6297455| 8.128204|   a|nexus4|nexus4_1|stand|
|    6|1424696633938|1424696631943522009| -5.991516| 0.6356964|  8.16272|   a|nexus4|nexus4

In [6]:
# Per-group summary statistics of the x, y and z columns
def compute_stats(df):
    """Aggregate x, y and z for every (User, Model, gt) group.

    Args:
        df: DataFrame exposing the columns User, Model, gt, x, y and z.

    Returns:
        A DataFrame with one row per (User, Model, gt) combination and one
        column per entry of ``stat_specs``, in that order.
    """
    # (aggregate function, source column, output column).
    # `min` and `max` are the pyspark.sql.functions aggregates imported at the
    # top of the notebook, not the Python builtins.
    # Order matters: it fixes the output column order (note that min_z sits
    # between the max_* columns and max_z comes last).
    stat_specs = [
        (mean, "x", "mean_x"),
        (mean, "y", "mean_y"),
        (mean, "z", "mean_z"),
        (stddev, "x", "stddev_x"),
        (stddev, "y", "stddev_y"),
        (stddev, "z", "stddev_z"),
        (max, "x", "max_x"),
        (max, "y", "max_y"),
        (min, "z", "min_z"),
        (min, "x", "min_x"),
        (min, "y", "min_y"),
        (max, "z", "max_z"),
    ]
    stat_columns = [
        aggregate(source).alias(label)
        for aggregate, source, label in stat_specs
    ]

    return df.groupBy("User", "Model", "gt").agg(*stat_columns)

In [7]:
# Replace every dataframe with its grouped statistics.
# NOTE: each name is rebound from the loaded frame to its aggregated version,
# and the aggregated frames no longer carry the x/y/z columns compute_stats
# reads, so this cell cannot be re-run without reloading the datasets first.
df_acc_phones, df_gyr_phones, df_acc_watches, df_gyr_watches = map(
    compute_stats,
    (df_acc_phones, df_gyr_phones, df_acc_watches, df_gyr_watches),
)

In [12]:
# Display the phone gyroscope dataframe (up to 20 rows, long cell values truncated)
df_gyr_phones.show(n=20, truncate=True)

+----+------+-----+--------------------+-------------------+--------------------+-------------------+--------------------+--------------------+---------+---------+----------+-----------+-----------+----------+
|User| Model|   gt|              mean_x|             mean_y|              mean_z|           stddev_x|            stddev_y|            stddev_z|    max_x|    max_y|     min_z|      min_x|      min_y|     max_z|
+----+------+-----+--------------------+-------------------+--------------------+-------------------+--------------------+--------------------+---------+---------+----------+-----------+-----------+----------+
|   a|nexus4|stand|0.001588851928710...|0.00100946044921875|4.421844482421875E-4|0.04277706587606125|0.028614446701775695|0.045943341278186764|0.6321869|0.3497162|-0.6001587|-0.16569519|-0.15550232|0.44873047|
+----+------+-----+--------------------+-------------------+--------------------+-------------------+--------------------+--------------------+---------+-------