In [1]:
# Importing libraries
import pyspark
import numpy as np

In [2]:
# Creating the Spark environment.
# getOrCreate() hands back the already-running context when there is one, so this
# cell can be re-run without restarting the kernel (a second SparkContext() call
# raises because only one context may be active at a time).
sc = pyspark.SparkContext.getOrCreate()

22/12/21 21:35:44 WARN Utils: Your hostname, martin resolves to a loopback address: 127.0.1.1; using 192.168.0.24 instead (on interface wlp1s0)
22/12/21 21:35:44 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/12/21 21:35:56 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Directory that holds the CSV files.
# Use "./" instead when the datasets sit in the same folder as the notebook.
common_path = "../datasets/small_data/"

# Full path of each dataset file (one per device type and sensor)
phones_acc_path = f"{common_path}Phones_accelerometer.csv"
phones_gyr_path = f"{common_path}Phones_gyroscope.csv"
watch_acc_path = f"{common_path}Watch_accelerometer.csv"
watch_gry_path = f"{common_path}Watch_gyroscope.csv"

In [4]:
# This function parses the input string based RDD into tuples
def parseRawRDD(rawRDD):
    """Turn an RDD of comma-separated text lines into (key, value) tuples.

    Every line is split on "," and mapped to
    ((field 6, field 7, field 9), (float(field 3), float(field 4), float(field 5))),
    i.e. ((User, Model, gt), (x, y, z)) in the notebook's terms.
    """
    def to_keyed_reading(fields):
        # Key fields stay as strings; the three readings become floats
        key = (fields[6], fields[7], fields[9])
        reading = tuple(float(fields[idx]) for idx in (3, 4, 5))
        return key, reading

    split_rows = rawRDD.map(lambda row: row.split(","))
    return split_rows.map(to_keyed_reading)

In [5]:
# Reading every dataset file as text and parsing it into a keyed RDD
rdd_acc_phones, rdd_gyr_phones, rdd_acc_watches, rdd_gyr_watches = (
    parseRawRDD(sc.textFile(csv_path))
    for csv_path in (phones_acc_path, phones_gyr_path, watch_acc_path, watch_gry_path)
)

# Printing the first element of one RDD as an example
print(rdd_acc_phones.first())

[Stage 0:>                                                          (0 + 1) / 1]

(('a', 'nexus4', 'stand'), (-5.958191, 0.6880646, 8.135345))


                                                                                

In [6]:
# This function returns the given RDD with elements grouped by key and values as a list
def groupData(rdd):
    """Collect all values that share a key into a single list per key.

    Returns an RDD of (key, [value, ...]) pairs.
    """
    values_by_key = rdd.groupByKey()
    # Turn each key's grouped values into a plain Python list
    return values_by_key.mapValues(list)

In [7]:
# Defining function that will compute stats for each list of values
def computeStats(values):
    """Compute column-wise mean, std, max and min for a list of samples.

    Parameters
    ----------
    values : sequence of equal-length numeric sequences
        One entry per sample; the notebook passes (x, y, z) tuples.

    Returns
    -------
    list
        The per-column statistics flattened into one list, in the order
        mean, std, max, min (np.std is called with its default settings).
    """
    # Convert the Python list to an array once; passing the raw list to each
    # reduction below would repeat that conversion four times.
    samples = np.asarray(values)

    # Names carry a col_ prefix so the builtins max() and min() are not shadowed
    col_mean = np.mean(samples, axis=0)
    col_std = np.std(samples, axis=0)
    col_max = np.max(samples, axis=0)
    col_min = np.min(samples, axis=0)

    return np.concatenate([col_mean, col_std, col_max, col_min]).ravel().tolist()

In [8]:
# Defining function to group RDD by User, Model, and class (gt) and performing required aggregates
def transformRDD(rdd):
    """Aggregate a keyed RDD into one list of statistics per key.

    The values of each key are gathered with groupData and reduced with
    computeStats, so every element of the returned RDD looks like:
    (User, Model, gt), [mean_x, mean_y, mean_z, std_x, std_y, std_z,
                        max_x, max_y, max_z, min_x, min_y, min_z]
    """
    # Group first, then collapse each group of values into its statistics
    return groupData(rdd).mapValues(computeStats)

In [9]:
# Replacing every parsed RDD with its per-key aggregated version.
# NOTE: the names are rebound to the aggregated result, so running this cell a
# second time would feed already-aggregated RDDs back into transformRDD.
rdd_acc_phones, rdd_gyr_phones, rdd_acc_watches, rdd_gyr_watches = map(
    transformRDD,
    (rdd_acc_phones, rdd_gyr_phones, rdd_acc_watches, rdd_gyr_watches),
)

# Printing the first element of one RDD as an example
print(rdd_acc_phones.first())

                                                                                

(('a', 'nexus4', 'stand'), [-6.02649995057012, 0.9334959509016011, 8.013646013119995, 0.18455174673530322, 0.24043415892742395, 0.17599985821420452, -5.5202026, 1.9472808999999998, 8.638794, -7.0448303, -0.84251404, 7.149872])


In [10]:
# Combining accelerometer and gyroscope statistics per device type:
# both RDDs are concatenated and the entries sharing a key are grouped together
rdd_phones_union = rdd_acc_phones.union(rdd_gyr_phones)
rdd_watches_union = rdd_acc_watches.union(rdd_gyr_watches)

rdd_phones = groupData(rdd_phones_union)
rdd_watches = groupData(rdd_watches_union)

# Printing up to 5 elements of one combined RDD as an example
print(rdd_phones.take(5))

[(('a', 'nexus4', 'stand'), [[-6.02649995057012, 0.9334959509016011, 8.013646013119995, 0.18455174673530322, 0.24043415892742395, 0.17599985821420452, -5.5202026, 1.9472808999999998, 8.638794, -7.0448303, -0.84251404, 7.149872], [0.0015888519490950379, 0.001009460465647035, 0.00044218442934900517, 0.042774927054953664, 0.028613015987344143, 0.04594104405657258, 0.6321869000000001, 0.34971620000000003, 0.44873047, -0.16569519, -0.15550232, -0.6001586999999999]])]


                                                                                

In [11]:
# Merging the phone and watch RDDs into a single one, grouped by key
rdd_all_devices = rdd_phones.union(rdd_watches)
rdd_full_data = groupData(rdd_all_devices)

# Printing up to 5 elements of the resulting RDD
print(rdd_full_data.take(5))

[Stage 20:>                                                         (0 + 1) / 1]

[(('a', 'gear', 'stand'), [[[-9.28975320121918, -3.1371565242007433, -1.0662863009969377, 0.4137041474158963, 1.0628532344784534, 0.6173240991928324, -0.5650316, -0.5781997, 1.01574, -12.600683, -11.08276, -2.2625206], [0.022146662344973197, -0.03286732380555572, -0.07143672518383175, 0.07314944656109798, 0.05570243854351154, 0.05017749590092546, 0.81039995, 0.35446674, 1.1475562, -2.0383835, -0.55287224, -1.2319783]]]), (('a', 'gear', 'null'), [[[-9.260130103197667, -3.4726260017441857, -1.0407208369767433, 0.031091553472846954, 0.05145894400802322, 0.03143016039412707, -9.173983, -3.3207579, -0.9307459, -9.3583355, -3.5942953, -1.1420342], [0.022570096625655994, -0.036608760294460654, -0.07169493794752187, 0.0177998638098211, 0.011561341072093032, 0.008039675979049183, 0.0561927, -0.018109497, -0.055127434, -0.0314253, -0.08042747, -0.09347696]]]), (('a', 'nexus4', 'stand'), [[[-6.02649995057012, 0.9334959509016011, 8.013646013119995, 0.18455174673530322, 0.24043415892742395, 0.17599

                                                                                