In [1]:
# Importing libraries
import pyspark
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType
from pyspark.sql.functions import mean, stddev, min, max, collect_list, flatten, concat, col

In [2]:
# Create (or reuse) the Spark session used throughout the notebook
spark = (
    pyspark.sql.SparkSession.builder
    .appName("CGE P6")
    .getOrCreate()
)

22/12/22 18:01:02 WARN Utils: Your hostname, martin resolves to a loopback address: 127.0.1.1; using 192.168.0.24 instead (on interface wlp1s0)
22/12/22 18:01:02 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/12/22 18:01:04 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


# Tarea 1
En esta tarea se va a realizar el mismo proceso que en la práctica anterior, pero con DataFrames en lugar de RDDs. Esto es: importar cada dataset en su correspondiente DataFrame, agruparlo por la terna 'User', 'Model', 'gt' y calcular la media, la desviación típica, el mínimo y el máximo de las columnas 'x', 'y' y 'z'; posteriormente, unir los DataFrames de un mismo tipo de dispositivo y, finalmente, unir todos los DataFrames en uno final.

In [3]:
# Directory that holds the four CSV files. It must end with a path separator
# because the file names below are appended to it verbatim.
# Use "./" instead when the files sit next to the notebook.
common_path = "../datasets/small_data/"

# Full path of each sensor dataset
phones_acc_path = f"{common_path}Phones_accelerometer.csv"
phones_gyr_path = f"{common_path}Phones_gyroscope.csv"
watch_acc_path = f"{common_path}Watch_accelerometer.csv"
# NOTE: "gry" is a typo for "gyr"; the name is kept because the loading cell
# refers to it.
watch_gry_path = f"{common_path}Watch_gyroscope.csv"

# Schema shared by the four CSV files; every column is declared nullable
dataset_schema = (
    StructType()
    .add("Index", IntegerType(), True)
    .add("Arrival_Time", StringType(), True)
    .add("Creation_Time", StringType(), True)
    .add("x", FloatType(), True)
    .add("y", FloatType(), True)
    .add("z", FloatType(), True)
    .add("User", StringType(), True)
    .add("Model", StringType(), True)
    .add("Device", StringType(), True)
    .add("gt", StringType(), True)
)

In [4]:
# Load every CSV file into its own Dataframe, all parsed with the shared schema
df_acc_phones, df_gyr_phones, df_acc_watches, df_gyr_watches = [
    spark.read.format("csv").schema(dataset_schema).load(csv_path)
    for csv_path in (phones_acc_path, phones_gyr_path, watch_acc_path, watch_gry_path)
]

In [5]:
# Sanity check on one of the loaded Dataframes (phone accelerometer):
# print its schema first, then a sample of its rows
df_acc_phones.printSchema()
df_acc_phones.show()

root
 |-- Index: integer (nullable = true)
 |-- Arrival_Time: string (nullable = true)
 |-- Creation_Time: string (nullable = true)
 |-- x: float (nullable = true)
 |-- y: float (nullable = true)
 |-- z: float (nullable = true)
 |-- User: string (nullable = true)
 |-- Model: string (nullable = true)
 |-- Device: string (nullable = true)
 |-- gt: string (nullable = true)



[Stage 0:>                                                          (0 + 1) / 1]

+-----+-------------+-------------------+----------+----------+---------+----+------+--------+-----+
|Index| Arrival_Time|      Creation_Time|         x|         y|        z|User| Model|  Device|   gt|
+-----+-------------+-------------------+----------+----------+---------+----+------+--------+-----+
|    0|1424696633908|1424696631913248572| -5.958191| 0.6880646| 8.135345|   a|nexus4|nexus4_1|stand|
|    1|1424696633909|1424696631918283972|  -5.95224| 0.6702118| 8.136536|   a|nexus4|nexus4_1|stand|
|    2|1424696633918|1424696631923288855|-5.9950867| 0.6535492| 8.204376|   a|nexus4|nexus4_1|stand|
|    3|1424696633919|1424696631928385290|-5.9427185| 0.6761627| 8.128204|   a|nexus4|nexus4_1|stand|
|    4|1424696633929|1424696631933420691| -5.991516|0.64164734| 8.135345|   a|nexus4|nexus4_1|stand|
|    5|1424696633929|1424696631938456091| -5.965332| 0.6297455| 8.128204|   a|nexus4|nexus4_1|stand|
|    6|1424696633938|1424696631943522009| -5.991516| 0.6356964|  8.16272|   a|nexus4|nexus4

                                                                                

In [6]:
# Defining function to group Dataframe by User, Model, and class (gt) and performing required aggregates
def computeStats(df):
    """Aggregate the x, y and z readings per (User, Model, gt) group.

    Args:
        df: Dataframe that has the columns "User", "Model", "gt", "x", "y"
            and "z".

    Returns:
        A Dataframe with the three grouping columns plus one column per
        statistic and axis, named "<stat>_<axis>" with stat in
        mean, stddev, max, min.
    """
    # `max` and `min` are the Spark column functions imported at the top of
    # the notebook, not the Python builtins.
    aggregators = {"mean": mean, "stddev": stddev, "max": max, "min": min}

    # (statistic, axis) pairs in output-column order. The max/min part is not
    # sorted by axis (min_z precedes min_x and max_z comes last); that order
    # is kept so the resulting column layout stays the same.
    layout = [
        ("mean", "x"), ("mean", "y"), ("mean", "z"),
        ("stddev", "x"), ("stddev", "y"), ("stddev", "z"),
        ("max", "x"), ("max", "y"), ("min", "z"),
        ("min", "x"), ("min", "y"), ("max", "z"),
    ]
    stat_columns = [
        aggregators[stat](axis).alias(f"{stat}_{axis}") for stat, axis in layout
    ]

    return df.groupBy("User", "Model", "gt").agg(*stat_columns)

In [7]:
# Replace each raw Dataframe with its per-group statistics.
# NOTE: the names are rebound in place, so running this cell again without
# re-running the loading cell would pass already-aggregated Dataframes
# (which no longer have the x, y, z columns) to computeStats.
df_acc_phones, df_gyr_phones, df_acc_watches, df_gyr_watches = map(
    computeStats,
    (df_acc_phones, df_gyr_phones, df_acc_watches, df_gyr_watches),
)

In [8]:
# Display the per-group statistics computed for the phone accelerometer data
df_acc_phones.show()

[Stage 1:>                                                          (0 + 1) / 1]

+----+------+-----+------------------+------------------+----------------+-------------------+------------------+-------------------+----------+---------+--------+----------+-----------+--------+
|User| Model|   gt|            mean_x|            mean_y|          mean_z|           stddev_x|          stddev_y|           stddev_z|     max_x|    max_y|   min_z|     min_x|      min_y|   max_z|
+----+------+-----+------------------+------------------+----------------+-------------------+------------------+-------------------+----------+---------+--------+----------+-----------+--------+
|   a|nexus4|stand|-6.026499951171875|0.9334959503173829|8.01364601135254|0.18456097451689402|0.2404461811789261|0.17600866155107173|-5.5202026|1.9472809|7.149872|-7.0448303|-0.84251404|8.638794|
+----+------+-----+------------------+------------------+----------------+-------------------+------------------+-------------------+----------+---------+--------+----------+-----------+--------+



                                                                                

In [9]:
# Defining function to group a Dataframe of statistics by User, Model, and class (gt) and collect each statistic column into a list
def regroupJoinedDataframes(df):
    """Gather every statistic into a list per (User, Model, gt) group.

    Args:
        df: Dataframe with the columns "User", "Model", "gt" and the twelve
            "<stat>_<axis>" statistic columns (stat in mean, stddev, max,
            min; axis in x, y, z).

    Returns:
        A Dataframe with the three grouping columns plus one
        "<stat>_<axis>_list" column per statistic column, holding the values
        collected from all rows of the group.
    """
    # NOTE(review): Spark documents collect_list as non-deterministic in
    # element order after a shuffle -- confirm before relying on the position
    # of a value inside the lists.
    list_columns = [
        collect_list(f"{stat}_{axis}").alias(f"{stat}_{axis}_list")
        for stat in ("mean", "stddev", "max", "min")
        for axis in ("x", "y", "z")
    ]

    return df.groupBy("User", "Model", "gt").agg(*list_columns)

In [10]:
# Stacking the accelerometer and gyroscope statistics of each device type
# with union (rows are appended, columns matched by position) and regrouping
# them so every statistic becomes a list per (User, Model, gt)
df_phones = regroupJoinedDataframes(df_acc_phones.union(df_gyr_phones))
df_watches = regroupJoinedDataframes(df_acc_watches.union(df_gyr_watches))

# Showing one of the combined Dataframes as an example
df_phones.show()

[Stage 5:>                                                          (0 + 1) / 1]

+----+------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|User| Model|   gt|         mean_x_list|         mean_y_list|         mean_z_list|       stddev_x_list|       stddev_y_list|       stddev_z_list|          max_x_list|          max_y_list|          max_z_list|          min_x_list|          min_y_list|          min_z_list|
+----+------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|   a|nexus4|stand|[-6.0264999511718...|[0.93349595031738...|[8.01364601135254...|[0.18456097451689...|[0.24044618117892...|[0.17600866155107...|[-5.5202026, 0.63...|[1.9472809, 0.349.

                                                                                

In [11]:
# Combining the phone and watch Dataframes into a single one: rows are
# stacked with union, regrouped by (User, Model, gt), and the per-group lists
# are flattened back into one list per statistic.
# The column order below is intentional and kept as is (min_z_list precedes
# min_x_list and max_z_list comes last).
df_full_data = (
    df_phones.union(df_watches)
    .groupBy(['User', 'Model', 'gt'])
    .agg(*[
        flatten(collect_list(list_col)).alias(list_col)
        for list_col in (
            'mean_x_list', 'mean_y_list', 'mean_z_list',
            'stddev_x_list', 'stddev_y_list', 'stddev_z_list',
            'max_x_list', 'max_y_list', 'min_z_list',
            'min_x_list', 'min_y_list', 'max_z_list',
        )
    ])
)

# Showing result Dataframe
df_full_data.show()

[Stage 16:>                                                         (0 + 1) / 1]

+----+------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|User| Model|   gt|         mean_x_list|         mean_y_list|         mean_z_list|       stddev_x_list|       stddev_y_list|       stddev_z_list|          max_x_list|          max_y_list|          min_z_list|          min_x_list|          min_y_list|          max_z_list|
+----+------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|   a|nexus4|stand|[-6.0264999511718...|[0.93349595031738...|[8.01364601135254...|[0.18456097451689...|[0.24044618117892...|[0.17600866155107...|[-5.5202026, 0.63...|[1.9472809, 0.349.

                                                                                