Content of the primary data files:
There are 60 'csv.gz' files, one for each participant (user, subject) in the data collection.
Each of these files has a filename of the form:
[UUID].features_labels.csv.gz
where each user has a unique (randomly generated) universal user identification (UUID) number.
Each file is a textual CSV file, compressed using the gzip format.

Within every user's CSV file:
-----------------------------
- The first row specifies the columns of the file.
- Every other row refers to an example from the user. The examples are sorted according to the primary key - the timestamp.
- The columns:

-- First column is 'timestamp'. This is represented as the standard number of seconds since the epoch. Every example has its own timestamp, indicating the minute when the example was recorded.

-- Second come the columns for the extracted features.
   Unavailable features are represented with 'nan'.
   The name of each feature contains a reference to the sensor it was extracted from, in the form [sensor_name]:[feature_name].
   The current version contains features from the following sensors, with sensor names:
--- raw_acc: Accelerometer from the phone. The 'raw' version of acceleration (as opposed to the decomposed versions of gravity and user-acceleration).   
--- proc_gyro: Gyroscope from the phone. Processed version of gyroscope measurements (the OS calculates a version that removes drift).
--- raw_magnet: Magnetometer from the phone. Raw version (as opposed to bias-fixed version that the OS also provides).
--- watch_acceleration: Accelerometer from the watch.
--- watch_heading: Heading from the compass on the watch.
--- location: Location services. These features were extracted offline for every example from the sequence of latitude-longitude-altitude updates from the example's minute.
              These features relate only to relative location (not absolute location in the world) - meaning, they describe variability of movement within the minute.
--- location_quick_features: Location services. These features were calculated on the phone when data was collected. 
                             These are available even in cases where the other location features are not, because the user wanted to conceal their absolute location coordinates.
							 These quick features are very simple heuristics that approximate the more thoughtful offline features.
--- audio_naive: Microphone. These naive features are simply averages and standard deviations of the 13 MFCCs from the ~20sec recording window of every example.
--- discrete: Phone-state. These are binary indicators for the state of the phone.
              Notice that time_of_day features are also considered phone-state features (they also have the prefix 'discrete:'), but their columns do not appear immediately after the other 'discrete' columns.
              
              
http://extrasensory.ucsd.edu/

In [None]:
import findspark
# Register the PostgreSQL JDBC driver jar, then initialise findspark.
# The jar is registered before init() is called, as in the original cell order.
POSTGRES_JDBC_JAR = '/app/postgresql-42.1.4.jar'

findspark.add_jars(POSTGRES_JDBC_JAR)
findspark.init()

In [None]:
from pyspark.sql import SparkSession
# Build (or reuse) the Spark session used by the whole ETL notebook.
#
# "spark.driver.memory" is configured exactly once. Setting the same key
# several times on the builder keeps only the last value, so a second,
# different value for the same key would be silently ignored.
# NOTE(review): the driver gets 15g while executors get 512m - confirm this
# is the intended sizing for the target machine.
spark = (
    SparkSession.builder
    .appName("HumanActivityETL")
    .config("spark.driver.cores", "1")
    .config("spark.driver.memory", "15g")
    .config("spark.executor.memory", "512m")
    .config("spark.executor.cores", "1")
    .config("spark.sql.shuffle.partitions", "2")
    .getOrCreate()
)

In [None]:
from pyspark.sql.functions import *;
from pyspark.sql.types import *;
from scipy.stats import *
from  pyspark.sql.functions import input_file_name

# Import all files from volume /dataset/features_labels

In [None]:
from functools import reduce
from pyspark.sql import DataFrame
import glob

# Load every per-user 'csv.gz' file from the docker volume and combine them
# into one DataFrame named `df`.
path = r'/dataset/features_labels' # files in docker volume
all_files = glob.glob(path + "/*.csv.gz")

# Fail early with a clear message instead of leaving `df` undefined.
if not all_files:
    raise FileNotFoundError("No '*.csv.gz' files found in " + path)

li = []

for filename in all_files:
    # NOTE(review): only the literal 'NA' is mapped to null here, while the
    # dataset description says unavailable features are written as 'nan' -
    # confirm that 'nan' values are handled later.
    df_import = (
        spark.read
        .option("delimiter", ',')
        .csv(
            filename,
            #schema=schema,
            header=True,
            ignoreLeadingWhiteSpace=True,
            ignoreTrailingWhiteSpace=True,
            nullValue='NA',
        )
    )
    li.append(df_import)

# Append all imported files into a single dataframe. This is done once, after
# the loop, rather than rebuilding the whole union on every iteration.
# NOTE(review): unionAll resolves columns by position - this presumes every
# file has the same header layout; confirm.
df = reduce(DataFrame.unionAll, li)

In [None]:
# Get the user ID in a new column from each file name

# Record, for every row, the path of the file it was read from
df = df.withColumn("filename", input_file_name())

# Files are named "[UUID].features_labels.csv.gz": take the part of the
# basename in front of that suffix. Matching on the file name (instead of a
# fixed character offset into the full path) keeps this correct if the
# dataset directory or the URI scheme prefix changes.
df = df.withColumn(
    "user_id",
    regexp_extract("filename", r"([^/]+)\.features_labels\.csv\.gz$", 1),
)

# Show values
df.select("user_id").show(n=5, vertical=True, truncate=100)

In [None]:
# Drop the columns that are not kept: the helper "filename" column,
# "label_source", the magnitude-autocorrelation features of four sensors,
# all audio_naive MFCC mean/std features and the audio_properties columns.
autocorrelation_columns = [
    "{}:magnitude_autocorrelation:{}".format(sensor, stat)
    for sensor in ("raw_acc", "proc_gyro", "raw_magnet", "watch_acceleration")
    for stat in ("period", "normalized_ac")
]

# mfcc0..mfcc12, first every ":mean" column and then every ":std" column
mfcc_columns = [
    "audio_naive:mfcc{}:{}".format(index, stat)
    for stat in ("mean", "std")
    for index in range(13)
]

columns_to_drop = (
    ["filename", "label_source"]
    + autocorrelation_columns
    + mfcc_columns
    + [
        "audio_properties:max_abs_value",
        "audio_properties:normalization_multiplier",
    ]
)

df = df.drop(*columns_to_drop)

In [None]:
# Rename column names
#
# Table of (original column name, new column name) pairs, applied in order
# with withColumnRenamed. The new names replace the "sensor:group:feature"
# form with short underscore-separated names.
#
# NOTE(review): a few target names are inconsistent with their siblings and
# are kept exactly as they were so that later cells keep working:
#   - raw_acc:3d:ro_xy / ro_xz / ro_yz become acc_3d_ro_x / _y / _z
#     (gyro, magnet and watch keep the "_xy" / "_xz" / "_yz" suffix)
#   - watch_heading:mean_cos / mean_sin become "..._men_cos" / "..._men_sin"
#   - location_quick_features:long_change becomes "loc_features_log_change"
#   - discrete:ringer_mode:missing becomes "discrete_ringer_ringer_missing"
COLUMN_RENAMES = [
    ("timestamp", "timestamp"),
    # Phone accelerometer
    ("raw_acc:magnitude_stats:mean", "acc_magnitude_mean"),
    ("raw_acc:magnitude_stats:std", "acc_magnitude_std"),
    ("raw_acc:magnitude_stats:moment3", "acc_magnitude_moment3"),
    ("raw_acc:magnitude_stats:moment4", "acc_magnitude_moment4"),
    ("raw_acc:magnitude_stats:percentile25", "acc_magnitude_perc25"),
    ("raw_acc:magnitude_stats:percentile50", "acc_magnitude_perc50"),
    ("raw_acc:magnitude_stats:percentile75", "acc_magnitude_perc75"),
    ("raw_acc:magnitude_stats:value_entropy", "acc_magnitude_value_entropy"),
    ("raw_acc:magnitude_stats:time_entropy", "acc_magnitude_time_entropy"),
    ("raw_acc:magnitude_spectrum:log_energy_band0", "acc_spec_log_energy0"),
    ("raw_acc:magnitude_spectrum:log_energy_band1", "acc_spec_log_energy1"),
    ("raw_acc:magnitude_spectrum:log_energy_band2", "acc_spec_log_energy2"),
    ("raw_acc:magnitude_spectrum:log_energy_band3", "acc_spec_log_energy3"),
    ("raw_acc:magnitude_spectrum:log_energy_band4", "acc_spec_log_energy4"),
    ("raw_acc:magnitude_spectrum:spectral_entropy", "acc_spec_spectral_entropy"),
    ("raw_acc:3d:mean_x", "acc_3d_mean_x"),
    ("raw_acc:3d:mean_y", "acc_3d_mean_y"),
    ("raw_acc:3d:mean_z", "acc_3d_mean_z"),
    ("raw_acc:3d:std_x", "acc_3d_std_x"),
    ("raw_acc:3d:std_y", "acc_3d_std_y"),
    ("raw_acc:3d:std_z", "acc_3d_std_z"),
    ("raw_acc:3d:ro_xy", "acc_3d_ro_x"),
    ("raw_acc:3d:ro_xz", "acc_3d_ro_y"),
    ("raw_acc:3d:ro_yz", "acc_3d_ro_z"),
    # Phone gyroscope
    ("proc_gyro:magnitude_stats:mean", "gyro_magnitude_mean"),
    ("proc_gyro:magnitude_stats:std", "gyro_magnitude_std"),
    ("proc_gyro:magnitude_stats:moment3", "gyro_magnitude_moment3"),
    ("proc_gyro:magnitude_stats:moment4", "gyro_magnitude_moment4"),
    ("proc_gyro:magnitude_stats:percentile25", "gyro_magnitude_perc25"),
    ("proc_gyro:magnitude_stats:percentile50", "gyro_magnitude_perc50"),
    ("proc_gyro:magnitude_stats:percentile75", "gyro_magnitude_perc75"),
    ("proc_gyro:magnitude_stats:value_entropy", "gyro_magnitude_value_entropy"),
    ("proc_gyro:magnitude_stats:time_entropy", "gyro_magnitude_time_entropy"),
    ("proc_gyro:magnitude_spectrum:log_energy_band0", "gyro_spec_log_energy0"),
    ("proc_gyro:magnitude_spectrum:log_energy_band1", "gyro_spec_log_energy1"),
    ("proc_gyro:magnitude_spectrum:log_energy_band2", "gyro_spec_log_energy2"),
    ("proc_gyro:magnitude_spectrum:log_energy_band3", "gyro_spec_log_energy3"),
    ("proc_gyro:magnitude_spectrum:log_energy_band4", "gyro_spec_log_energy4"),
    ("proc_gyro:magnitude_spectrum:spectral_entropy", "gyro_spec_spectral_entropy"),
    ("proc_gyro:3d:mean_x", "gyro_3d_mean_x"),
    ("proc_gyro:3d:mean_y", "gyro_3d_mean_y"),
    ("proc_gyro:3d:mean_z", "gyro_3d_mean_z"),
    ("proc_gyro:3d:std_x", "gyro_3d_std_x"),
    ("proc_gyro:3d:std_y", "gyro_3d_std_y"),
    ("proc_gyro:3d:std_z", "gyro_3d_std_z"),
    ("proc_gyro:3d:ro_xy", "gyro_3d_ro_xy"),
    ("proc_gyro:3d:ro_xz", "gyro_3d_ro_xz"),
    ("proc_gyro:3d:ro_yz", "gyro_3d_ro_yz"),
    # Phone magnetometer
    ("raw_magnet:magnitude_stats:mean", "magnet_magnitude_mean"),
    ("raw_magnet:magnitude_stats:std", "magnet_magnitude_std"),
    ("raw_magnet:magnitude_stats:moment3", "magnet_magnitude_moment3"),
    ("raw_magnet:magnitude_stats:moment4", "magnet_magnitude_moment4"),
    ("raw_magnet:magnitude_stats:percentile25", "magnet_magnitude_perc25"),
    ("raw_magnet:magnitude_stats:percentile50", "magnet_magnitude_perc50"),
    ("raw_magnet:magnitude_stats:percentile75", "magnet_magnitude_perc75"),
    ("raw_magnet:magnitude_stats:value_entropy", "magnet_magnitude_value_entropy"),
    ("raw_magnet:magnitude_stats:time_entropy", "magnet_magnitude_time_entropy"),
    ("raw_magnet:magnitude_spectrum:log_energy_band0", "magnet_spec_log_energy0"),
    ("raw_magnet:magnitude_spectrum:log_energy_band1", "magnet_spec_log_energy1"),
    ("raw_magnet:magnitude_spectrum:log_energy_band2", "magnet_spec_log_energy2"),
    ("raw_magnet:magnitude_spectrum:log_energy_band3", "magnet_spec_log_energy3"),
    ("raw_magnet:magnitude_spectrum:log_energy_band4", "magnet_spec_log_energy4"),
    ("raw_magnet:magnitude_spectrum:spectral_entropy", "magnet_spec_spectral_entropy"),
    ("raw_magnet:3d:mean_x", "magnet_3d_mean_x"),
    ("raw_magnet:3d:mean_y", "magnet_3d_mean_y"),
    ("raw_magnet:3d:mean_z", "magnet_3d_mean_z"),
    ("raw_magnet:3d:std_x", "magnet_3d_std_x"),
    ("raw_magnet:3d:std_y", "magnet_3d_std_y"),
    ("raw_magnet:3d:std_z", "magnet_3d_std_z"),
    ("raw_magnet:3d:ro_xy", "magnet_3d_ro_xy"),
    ("raw_magnet:3d:ro_xz", "magnet_3d_ro_xz"),
    ("raw_magnet:3d:ro_yz", "magnet_3d_ro_yz"),
    ("raw_magnet:avr_cosine_similarity_lag_range0", "magnet_avr_cos_similarity_lag0"),
    ("raw_magnet:avr_cosine_similarity_lag_range1", "magnet_avr_cos_similarity_lag1"),
    ("raw_magnet:avr_cosine_similarity_lag_range2", "magnet_avr_cos_similarity_lag2"),
    ("raw_magnet:avr_cosine_similarity_lag_range3", "magnet_avr_cos_similarity_lag3"),
    ("raw_magnet:avr_cosine_similarity_lag_range4", "magnet_avr_cos_similarity_lag4"),
    # Watch accelerometer
    ("watch_acceleration:magnitude_stats:mean", "acc_watch_magnitude_mean"),
    ("watch_acceleration:magnitude_stats:std", "acc_watch_magnitude_std"),
    ("watch_acceleration:magnitude_stats:moment3", "acc_watch_magnitude_moment3"),
    ("watch_acceleration:magnitude_stats:moment4", "acc_watch_magnitude_moment4"),
    ("watch_acceleration:magnitude_stats:percentile25", "acc_watch_magnitude_perc25"),
    ("watch_acceleration:magnitude_stats:percentile50", "acc_watch_magnitude_perc50"),
    ("watch_acceleration:magnitude_stats:percentile75", "acc_watch_magnitude_perc75"),
    ("watch_acceleration:magnitude_stats:value_entropy", "acc_watch_magnitude_value_entropy"),
    ("watch_acceleration:magnitude_stats:time_entropy", "acc_watch_magnitude_time_entropy"),
    ("watch_acceleration:magnitude_spectrum:log_energy_band0", "acc_watch_spec_log_energy0"),
    ("watch_acceleration:magnitude_spectrum:log_energy_band1", "acc_watch_spec_log_energy1"),
    ("watch_acceleration:magnitude_spectrum:log_energy_band2", "acc_watch_spec_log_energy2"),
    ("watch_acceleration:magnitude_spectrum:log_energy_band3", "acc_watch_spec_log_energy3"),
    ("watch_acceleration:magnitude_spectrum:log_energy_band4", "acc_watch_spec_log_energy4"),
    ("watch_acceleration:magnitude_spectrum:spectral_entropy", "acc_watch_spec_spectral_entropy"),
    ("watch_acceleration:3d:mean_x", "acc_watch_3d_mean_x"),
    ("watch_acceleration:3d:mean_y", "acc_watch_3d_mean_y"),
    ("watch_acceleration:3d:mean_z", "acc_watch_3d_mean_z"),
    ("watch_acceleration:3d:std_x", "acc_watch_3d_std_x"),
    ("watch_acceleration:3d:std_y", "acc_watch_3d_std_y"),
    ("watch_acceleration:3d:std_z", "acc_watch_3d_std_z"),
    ("watch_acceleration:3d:ro_xy", "acc_watch_3d_ro_xy"),
    ("watch_acceleration:3d:ro_xz", "acc_watch_3d_ro_xz"),
    ("watch_acceleration:3d:ro_yz", "acc_watch_3d_ro_yz"),
    ("watch_acceleration:spectrum:x_log_energy_band0", "acc_watch_spec_x_log_energy0"),
    ("watch_acceleration:spectrum:x_log_energy_band1", "acc_watch_spec_x_log_energy1"),
    ("watch_acceleration:spectrum:x_log_energy_band2", "acc_watch_spec_x_log_energy2"),
    ("watch_acceleration:spectrum:x_log_energy_band3", "acc_watch_spec_x_log_energy3"),
    ("watch_acceleration:spectrum:x_log_energy_band4", "acc_watch_spec_x_log_energy4"),
    ("watch_acceleration:spectrum:y_log_energy_band0", "acc_watch_spec_y_log_energy0"),
    ("watch_acceleration:spectrum:y_log_energy_band1", "acc_watch_spec_y_log_energy1"),
    ("watch_acceleration:spectrum:y_log_energy_band2", "acc_watch_spec_y_log_energy2"),
    ("watch_acceleration:spectrum:y_log_energy_band3", "acc_watch_spec_y_log_energy3"),
    ("watch_acceleration:spectrum:y_log_energy_band4", "acc_watch_spec_y_log_energy4"),
    ("watch_acceleration:spectrum:z_log_energy_band0", "acc_watch_spec_z_log_energy0"),
    ("watch_acceleration:spectrum:z_log_energy_band1", "acc_watch_spec_z_log_energy1"),
    ("watch_acceleration:spectrum:z_log_energy_band2", "acc_watch_spec_z_log_energy2"),
    ("watch_acceleration:spectrum:z_log_energy_band3", "acc_watch_spec_z_log_energy3"),
    ("watch_acceleration:spectrum:z_log_energy_band4", "acc_watch_spec_z_log_energy4"),
    ("watch_acceleration:relative_directions:avr_cosine_similarity_lag_range0", "acc_watch_avr_cos_similarity_lag0"),
    ("watch_acceleration:relative_directions:avr_cosine_similarity_lag_range1", "acc_watch_avr_cos_similarity_lag1"),
    ("watch_acceleration:relative_directions:avr_cosine_similarity_lag_range2", "acc_watch_avr_cos_similarity_lag2"),
    ("watch_acceleration:relative_directions:avr_cosine_similarity_lag_range3", "acc_watch_avr_cos_similarity_lag3"),
    ("watch_acceleration:relative_directions:avr_cosine_similarity_lag_range4", "acc_watch_avr_cos_similarity_lag4"),
    # Watch heading
    ("watch_heading:mean_cos", "acc_watch_head_men_cos"),
    ("watch_heading:std_cos", "acc_watch_head_std_cos"),
    ("watch_heading:mom3_cos", "acc_watch_head_mom3_cos"),
    ("watch_heading:mom4_cos", "acc_watch_head_mom4_cos"),
    ("watch_heading:mean_sin", "acc_watch_head_men_sin"),
    ("watch_heading:std_sin", "acc_watch_head_std_sin"),
    ("watch_heading:mom3_sin", "acc_watch_head_mom3_sin"),
    ("watch_heading:mom4_sin", "acc_watch_head_mom4_sin"),
    ("watch_heading:entropy_8bins", "acc_watch_head_entropy_8bins"),
    # Location
    ("location:num_valid_updates", "loc_valid_updates"),
    ("location:log_latitude_range", "loc_log_latitude_range"),
    ("location:log_longitude_range", "loc_log_longitude_range"),
    ("location:min_altitude", "loc_min_altitude"),
    ("location:max_altitude", "loc_max_altitude"),
    ("location:min_speed", "loc_min_speed"),
    ("location:max_speed", "loc_max_speed"),
    ("location:best_horizontal_accuracy", "loc_best_horizontal_accuracy"),
    ("location:best_vertical_accuracy", "loc_best_vertical_accuracy"),
    ("location:diameter", "loc_diameter"),
    ("location:log_diameter", "loc_log_diameter"),
    ("location_quick_features:std_lat", "loc_features_std_lat"),
    ("location_quick_features:std_long", "loc_features_std_long"),
    ("location_quick_features:lat_change", "loc_features_lat_change"),
    ("location_quick_features:long_change", "loc_features_log_change"),
    ("location_quick_features:mean_abs_lat_deriv", "loc_features_mean_abs_lat_deriv"),
    ("location_quick_features:mean_abs_long_deriv", "loc_features_mean_abs_long_deriv"),
    # Phone state
    ("discrete:app_state:is_active", "discrete_state_active"),
    ("discrete:app_state:is_inactive", "discrete_state_inactive"),
    ("discrete:app_state:is_background", "discrete_state_background"),
    ("discrete:app_state:missing", "discrete_state_missing"),
    ("discrete:battery_plugged:is_ac", "discrete_plugged_ac"),
    ("discrete:battery_plugged:is_usb", "discrete_plugged_usb"),
    ("discrete:battery_plugged:is_wireless", "discrete_plugged_wireless"),
    ("discrete:battery_plugged:missing", "discrete_plugged_missing"),
    ("discrete:battery_state:is_unknown", "discrete_battery_unknown"),
    ("discrete:battery_state:is_unplugged", "discrete_battery_unplugged"),
    ("discrete:battery_state:is_not_charging", "discrete_battery_nocharging"),
    ("discrete:battery_state:is_discharging", "discrete_battery_discharging"),
    ("discrete:battery_state:is_charging", "discrete_battery_charging"),
    ("discrete:battery_state:is_full", "discrete_battery_full"),
    ("discrete:battery_state:missing", "discrete_battery_missing"),
    ("discrete:on_the_phone:is_False", "discrete_on_phone_False"),
    ("discrete:on_the_phone:is_True", "discrete_on_phone_True"),
    ("discrete:on_the_phone:missing", "discrete_on_phone_missing"),
    ("discrete:ringer_mode:is_normal", "discrete_ringer_normal"),
    ("discrete:ringer_mode:is_silent_no_vibrate", "discrete_ringer_silent_novibrate"),
    ("discrete:ringer_mode:is_silent_with_vibrate", "discrete_ringer_silent_vibrate"),
    ("discrete:ringer_mode:missing", "discrete_ringer_ringer_missing"),
    ("discrete:wifi_status:is_not_reachable", "discrete_wifi_noreachable"),
    ("discrete:wifi_status:is_reachable_via_wifi", "discrete_wifi_reachable_via_wifi"),
    ("discrete:wifi_status:is_reachable_via_wwan", "discrete_wifi_reachable_via_wwan"),
    ("discrete:wifi_status:missing", "discrete_wifi_missing"),
    # lf_measurements
    ("lf_measurements:light", "lf_meas_light"),
    ("lf_measurements:pressure", "lf_meas_pressure"),
    ("lf_measurements:proximity_cm", "lf_meas_proximity_cm"),
    ("lf_measurements:proximity", "lf_meas_proximity"),
    ("lf_measurements:relative_humidity", "lf_meas_relative_humidity"),
    ("lf_measurements:battery_level", "lf_meas_battery_level"),
    ("lf_measurements:screen_brightness", "lf_meas_screen_brightness"),
    ("lf_measurements:temperature_ambient", "lf_meas_temperature_ambient"),
    # Time of day
    ("discrete:time_of_day:between0and6", "time_of_day_between0_6"),
    ("discrete:time_of_day:between3and9", "time_of_day_between3_9"),
    ("discrete:time_of_day:between6and12", "time_of_day_between6_12"),
    ("discrete:time_of_day:between9and15", "time_of_day_between9_15"),
    ("discrete:time_of_day:between12and18", "time_of_day_between12_18"),
    ("discrete:time_of_day:between15and21", "time_of_day_between15_21"),
    ("discrete:time_of_day:between18and24", "time_of_day_between18_24"),
    ("discrete:time_of_day:between21and3", "time_of_day_between21_3"),
    # Labels
    ("label:LYING_DOWN", "lab_LYING_DOWN"),
    ("label:SITTING", "lab_SITTING"),
    ("label:FIX_walking", "lab_FIX_walking"),
    ("label:FIX_running", "lab_FIX_running"),
    ("label:BICYCLING", "lab_BICYCLING"),
    ("label:SLEEPING", "lab_SLEEPING"),
    ("label:LAB_WORK", "lab_LAB_WORK"),
    ("label:IN_CLASS", "lab_IN_CLASS"),
    ("label:IN_A_MEETING", "lab_IN_A_MEETING"),
    ("label:LOC_main_workplace", "lab_LOC_main_workplace"),
    ("label:OR_indoors", "lab_OR_indoors"),
    ("label:OR_outside", "lab_OR_outside"),
    ("label:IN_A_CAR", "lab_IN_A_CAR"),
    ("label:ON_A_BUS", "lab_ON_A_BUS"),
    ("label:DRIVE_-_I_M_THE_DRIVER", "lab_DRIVE_I_M_THE_DRIVER"),
    ("label:DRIVE_-_I_M_A_PASSENGER", "lab_DRIVE_I_M_A_PASSENGER"),
    ("label:LOC_home", "lab_LOC_home"),
    ("label:FIX_restaurant", "lab_FIX_restaurant"),
    ("label:PHONE_IN_POCKET", "lab_PHONE_IN_POCKET"),
    ("label:OR_exercise", "lab_OR_exercise"),
    ("label:COOKING", "lab_COOKING"),
    ("label:SHOPPING", "lab_SHOPPING"),
    ("label:STROLLING", "lab_STROLLING"),
    ("label:DRINKING__ALCOHOL_", "lab_DRINKING__ALCOHOL_"),
    ("label:BATHING_-_SHOWER", "lab_BATHING_SHOWER"),
    ("label:CLEANING", "lab_CLEANING"),
    ("label:DOING_LAUNDRY", "lab_DOING_LAUNDRY"),
    ("label:WASHING_DISHES", "lab_WASHING_DISHES"),
    ("label:WATCHING_TV", "lab_WATCHING_TV"),
    ("label:SURFING_THE_INTERNET", "lab_SURFING_THE_INTERNET"),
    ("label:AT_A_PARTY", "lab_AT_A_PARTY"),
    ("label:AT_A_BAR", "lab_AT_A_BAR"),
    ("label:LOC_beach", "lab_LOC_beach"),
    ("label:SINGING", "lab_SINGING"),
    ("label:TALKING", "lab_TALKING"),
    ("label:COMPUTER_WORK", "lab_COMPUTER_WORK"),
    ("label:EATING", "lab_EATING"),
    ("label:TOILET", "lab_TOILET"),
    ("label:GROOMING", "lab_GROOMING"),
    ("label:DRESSING", "lab_DRESSING"),
    ("label:AT_THE_GYM", "lab_AT_THE_GYM"),
    ("label:STAIRS_-_GOING_UP", "lab_STAIRS_GOING_UP"),
    ("label:STAIRS_-_GOING_DOWN", "lab_STAIRS_GOING_DOWN"),
    ("label:ELEVATOR", "lab_ELEVATOR"),
    ("label:OR_standing", "lab_OR_standing"),
    ("label:AT_SCHOOL", "lab_AT_SCHOOL"),
    ("label:PHONE_IN_HAND", "lab_PHONE_IN_HAND"),
    ("label:PHONE_IN_BAG", "lab_PHONE_IN_BAG"),
    ("label:PHONE_ON_TABLE", "lab_PHONE_ON_TABLE"),
    ("label:WITH_CO-WORKERS", "lab_WITH_COWORKERS"),
    ("label:WITH_FRIENDS", "lab_WITH_FRIENDS"),
]

# Apply the renames one after another, in table order.
for original_name, short_name in COLUMN_RENAMES:
    df = df.withColumnRenamed(original_name, short_name)

In [None]:
# Dimensions of the dataframe at this point: row count (runs a Spark job via
# count()) and number of columns.
orig_columns = len(df.columns)
orig_rows = df.count()

original_shape = (orig_rows, orig_columns)
print("Dataframe dimensions: ", original_shape)

# Transformations: Data types

In [None]:
# Convert the epoch-seconds `timestamp` column into a "yyyy-MM-dd HH:mm:ss" datetime string
df = df.withColumn("timestamp", from_unixtime("timestamp", "yyyy-MM-dd HH:mm:ss"))

# Shift the value from UTC to US Pacific time, according to http://extrasensory.ucsd.edu/papers/vaizman2017a_pervasiveAcceptedVersion.pdf
# A region-based zone ID is used rather than the short name 'PST': Spark's documentation
# advises against short names because they can be ambiguous.
# NOTE(review): from_unixtime renders the string in the Spark session time zone, so this
# shift presumably assumes the session runs in UTC - confirm.
df = df.withColumn("timestamp", from_utc_timestamp(df["timestamp"], "America/Los_Angeles"))

# Show values
df.select("timestamp").show(n=10, vertical=True)

In [None]:
# Change numeric data types
#
# All casts are applied in a single select() instead of one withColumn() call per
# column: every withColumn() adds a projection to the query plan, and chaining
# hundreds of them produces a very large plan (see the Spark withColumn notes).

# Columns cast to DoubleType
double_cast_columns = [
    # acc_*
    "acc_magnitude_mean", "acc_magnitude_std",
    "acc_magnitude_moment3", "acc_magnitude_moment4",
    "acc_magnitude_perc25", "acc_magnitude_perc50", "acc_magnitude_perc75",
    "acc_magnitude_value_entropy", "acc_magnitude_time_entropy",
    "acc_spec_log_energy0", "acc_spec_log_energy1", "acc_spec_log_energy2",
    "acc_spec_log_energy3", "acc_spec_log_energy4",
    "acc_spec_spectral_entropy",
    "acc_3d_mean_x", "acc_3d_mean_y", "acc_3d_mean_z",
    "acc_3d_std_x", "acc_3d_std_y", "acc_3d_std_z",
    "acc_3d_ro_x", "acc_3d_ro_y", "acc_3d_ro_z",
    # gyro_*
    "gyro_magnitude_mean", "gyro_magnitude_std",
    "gyro_magnitude_moment3", "gyro_magnitude_moment4",
    "gyro_magnitude_perc25", "gyro_magnitude_perc50", "gyro_magnitude_perc75",
    "gyro_magnitude_value_entropy", "gyro_magnitude_time_entropy",
    "gyro_spec_log_energy0", "gyro_spec_log_energy1", "gyro_spec_log_energy2",
    "gyro_spec_log_energy3", "gyro_spec_log_energy4",
    "gyro_spec_spectral_entropy",
    "gyro_3d_mean_x", "gyro_3d_mean_y", "gyro_3d_mean_z",
    "gyro_3d_std_x", "gyro_3d_std_y", "gyro_3d_std_z",
    "gyro_3d_ro_xy", "gyro_3d_ro_xz", "gyro_3d_ro_yz",
    # magnet_*
    "magnet_magnitude_mean", "magnet_magnitude_std",
    "magnet_magnitude_moment3", "magnet_magnitude_moment4",
    "magnet_magnitude_perc25", "magnet_magnitude_perc50", "magnet_magnitude_perc75",
    "magnet_magnitude_value_entropy", "magnet_magnitude_time_entropy",
    "magnet_spec_log_energy0", "magnet_spec_log_energy1", "magnet_spec_log_energy2",
    "magnet_spec_log_energy3", "magnet_spec_log_energy4",
    "magnet_spec_spectral_entropy",
    "magnet_3d_mean_x", "magnet_3d_mean_y", "magnet_3d_mean_z",
    "magnet_3d_std_x", "magnet_3d_std_y", "magnet_3d_std_z",
    "magnet_3d_ro_xy", "magnet_3d_ro_xz", "magnet_3d_ro_yz",
    "magnet_avr_cos_similarity_lag0", "magnet_avr_cos_similarity_lag1",
    "magnet_avr_cos_similarity_lag2", "magnet_avr_cos_similarity_lag3",
    "magnet_avr_cos_similarity_lag4",
    # acc_watch_*
    "acc_watch_magnitude_mean", "acc_watch_magnitude_std",
    "acc_watch_magnitude_moment3", "acc_watch_magnitude_moment4",
    "acc_watch_magnitude_perc25", "acc_watch_magnitude_perc50", "acc_watch_magnitude_perc75",
    "acc_watch_magnitude_value_entropy", "acc_watch_magnitude_time_entropy",
    "acc_watch_spec_log_energy0", "acc_watch_spec_log_energy1", "acc_watch_spec_log_energy2",
    "acc_watch_spec_log_energy3", "acc_watch_spec_log_energy4",
    "acc_watch_spec_spectral_entropy",
    "acc_watch_3d_mean_x", "acc_watch_3d_mean_y", "acc_watch_3d_mean_z",
    "acc_watch_3d_std_x", "acc_watch_3d_std_y", "acc_watch_3d_std_z",
    "acc_watch_3d_ro_xy", "acc_watch_3d_ro_xz", "acc_watch_3d_ro_yz",
    "acc_watch_spec_x_log_energy0", "acc_watch_spec_x_log_energy1",
    "acc_watch_spec_x_log_energy2", "acc_watch_spec_x_log_energy3",
    "acc_watch_spec_x_log_energy4",
    "acc_watch_spec_y_log_energy0", "acc_watch_spec_y_log_energy1",
    "acc_watch_spec_y_log_energy2", "acc_watch_spec_y_log_energy3",
    "acc_watch_spec_y_log_energy4",
    "acc_watch_spec_z_log_energy0", "acc_watch_spec_z_log_energy1",
    "acc_watch_spec_z_log_energy2", "acc_watch_spec_z_log_energy3",
    "acc_watch_spec_z_log_energy4",
    "acc_watch_avr_cos_similarity_lag0", "acc_watch_avr_cos_similarity_lag1",
    "acc_watch_avr_cos_similarity_lag2", "acc_watch_avr_cos_similarity_lag3",
    "acc_watch_avr_cos_similarity_lag4",
    "acc_watch_head_men_cos", "acc_watch_head_std_cos",
    "acc_watch_head_mom3_cos", "acc_watch_head_mom4_cos",
    "acc_watch_head_men_sin", "acc_watch_head_std_sin",
    "acc_watch_head_mom3_sin", "acc_watch_head_mom4_sin",
    "acc_watch_head_entropy_8bins",
    # loc_* (loc_valid_updates is an integer column, see below)
    "loc_log_latitude_range", "loc_log_longitude_range",
    "loc_min_altitude", "loc_max_altitude",
    "loc_min_speed", "loc_max_speed",
    "loc_best_horizontal_accuracy", "loc_best_vertical_accuracy",
    "loc_diameter", "loc_log_diameter",
    "loc_features_std_lat", "loc_features_std_long",
    "loc_features_lat_change", "loc_features_log_change",
    "loc_features_mean_abs_lat_deriv", "loc_features_mean_abs_long_deriv",
    # lf_meas_* (lf_meas_proximity_cm is an integer column, see below)
    "lf_meas_light", "lf_meas_pressure", "lf_meas_proximity",
    "lf_meas_relative_humidity", "lf_meas_battery_level",
    "lf_meas_screen_brightness", "lf_meas_temperature_ambient",
]

# Columns cast to IntegerType
integer_cast_columns = [
    "loc_valid_updates",
    # discrete_*
    "discrete_state_active", "discrete_state_inactive",
    "discrete_state_background", "discrete_state_missing",
    "discrete_plugged_ac", "discrete_plugged_usb",
    "discrete_plugged_wireless", "discrete_plugged_missing",
    "discrete_battery_unknown", "discrete_battery_unplugged",
    "discrete_battery_nocharging", "discrete_battery_discharging",
    "discrete_battery_charging", "discrete_battery_full",
    "discrete_battery_missing",
    "discrete_on_phone_False", "discrete_on_phone_True", "discrete_on_phone_missing",
    "discrete_ringer_normal", "discrete_ringer_silent_novibrate",
    "discrete_ringer_silent_vibrate", "discrete_ringer_ringer_missing",
    "discrete_wifi_noreachable", "discrete_wifi_reachable_via_wifi",
    "discrete_wifi_reachable_via_wwan", "discrete_wifi_missing",
    "lf_meas_proximity_cm",
    # time_of_day_*
    "time_of_day_between0_6", "time_of_day_between3_9",
    "time_of_day_between6_12", "time_of_day_between9_15",
    "time_of_day_between12_18", "time_of_day_between15_21",
    "time_of_day_between18_24", "time_of_day_between21_3",
    # lab_*
    "lab_LYING_DOWN", "lab_SITTING", "lab_FIX_walking", "lab_FIX_running",
    "lab_BICYCLING", "lab_SLEEPING", "lab_LAB_WORK", "lab_IN_CLASS",
    "lab_IN_A_MEETING", "lab_LOC_main_workplace", "lab_OR_indoors", "lab_OR_outside",
    "lab_IN_A_CAR", "lab_ON_A_BUS",
    "lab_DRIVE_I_M_THE_DRIVER", "lab_DRIVE_I_M_A_PASSENGER",
    "lab_LOC_home", "lab_FIX_restaurant", "lab_PHONE_IN_POCKET", "lab_OR_exercise",
    "lab_COOKING", "lab_SHOPPING", "lab_STROLLING", "lab_DRINKING__ALCOHOL_",
    "lab_BATHING_SHOWER", "lab_CLEANING", "lab_DOING_LAUNDRY", "lab_WASHING_DISHES",
    "lab_WATCHING_TV", "lab_SURFING_THE_INTERNET", "lab_AT_A_PARTY", "lab_AT_A_BAR",
    "lab_LOC_beach", "lab_SINGING", "lab_TALKING", "lab_COMPUTER_WORK",
    "lab_EATING", "lab_TOILET", "lab_GROOMING", "lab_DRESSING",
    "lab_AT_THE_GYM", "lab_STAIRS_GOING_UP", "lab_STAIRS_GOING_DOWN", "lab_ELEVATOR",
    "lab_OR_standing", "lab_AT_SCHOOL",
    "lab_PHONE_IN_HAND", "lab_PHONE_IN_BAG", "lab_PHONE_ON_TABLE",
    "lab_WITH_COWORKERS", "lab_WITH_FRIENDS",
]

# Target type for every column that has to be cast
cast_types = {name: DoubleType() for name in double_cast_columns}
cast_types.update({name: IntegerType() for name in integer_cast_columns})

# Fail early, as the attribute-based access did, when an expected column is absent
available_columns = set(df.columns)
missing_columns = [name for name in cast_types if name not in available_columns]
if missing_columns:
    raise ValueError("Columns to cast are missing from the dataframe: {}".format(missing_columns))

# Columns keep their position; the ones without a target type pass through unchanged
df = df.select([
    df[name].cast(cast_types[name]).alias(name) if name in cast_types else df[name]
    for name in df.columns
])

In [None]:
# Print the dataframe schema to check the column data types
df.printSchema()

In [None]:
# Show the first 2 rows, printed vertically (one line per column value)
df.show(n=2, vertical=True)

# Transformations: Null values

In [None]:
# Evaluate null values for each column
# for col in df.columns:
#   print(col, "\t", "Nulls: ", df.filter(df[col].isNull()).count())


In [None]:
# Replace null values with 0 for all label columns

# Label columns are the ones carrying the "lab_" prefix. A prefix test is used
# instead of a substring test so that a column merely containing "lab" somewhere
# in its name is not filled by accident.
label_columns = [name for name in df.columns if name.startswith("lab_")]

df = df.na.fill(value=0, subset=label_columns)

In [None]:
# Drop every row whose columns are all null
# NOTE(review): the label columns were filled with 0 just above, so a row can
# presumably no longer be null in every column - confirm this step still removes anything.
df = df.dropna(how="all")

removed_rows = orig_rows - df.count()
print("Removed rows: ", removed_rows)

In [None]:
# Discard rows with a null in any column of the high-frequency motion-reactive sensors: accelerometer, gyroscope
# The magnetometer and the watch accelerometer are motion-reactive sensors too, but their columns contain more than 20% of null values,
# so another null-handling strategy will be used for those columns

# Feature suffixes that exist for both the "acc_" and the "gyro_" prefix
motion_feature_suffixes = [
    "magnitude_mean",
    "magnitude_std",
    "magnitude_moment3",
    "magnitude_moment4",
    "magnitude_perc25",
    "magnitude_perc50",
    "magnitude_perc75",
    "magnitude_value_entropy",
    "magnitude_time_entropy",
    "spec_log_energy0",
    "spec_log_energy1",
    "spec_log_energy2",
    "spec_log_energy3",
    "spec_log_energy4",
    "spec_spectral_entropy",
    "3d_mean_x",
    "3d_mean_y",
    "3d_mean_z",
    "3d_std_x",
    "3d_std_y",
    "3d_std_z",
]

# The "ro" columns are named differently for the two sensors, so they are spelled out
motion_sensor_columns = (
    ["acc_" + suffix for suffix in motion_feature_suffixes]
    + ["acc_3d_ro_x", "acc_3d_ro_y", "acc_3d_ro_z"]
    + ["gyro_" + suffix for suffix in motion_feature_suffixes]
    + ["gyro_3d_ro_xy", "gyro_3d_ro_xz", "gyro_3d_ro_yz"]
)

df = df.dropna(subset=motion_sensor_columns)

# Cumulative count: rows removed since the original dataframe was measured
print("Removed rows: ", (orig_rows - df.count()))
# print("Removed ", ((orig_rows - df.count())*100)/orig_rows, "% of total rows")

In [None]:
# Double-typed columns, leaving out the motion sensor columns
double_columns = [
    field.name
    for field in df.schema.fields
    if isinstance(field.dataType, DoubleType)
    and field.name not in motion_sensor_columns
]
print("number of double columns: ", len(double_columns))
double_columns

In [None]:
# Remove double columns in which at least 80% of the values are missing

from pyspark.sql import functions as F

nan_percent = 0.8
threshold = df.count() * nan_percent

# Candidate columns, kept in dataframe order
double_column_set = set(double_columns)
candidate_columns = [name for name in df.columns if name in double_column_set]

rem_columns = []

if candidate_columns:
    # A single aggregation counts the missing values of every candidate column,
    # instead of launching one filter/count job per column.
    # NaN is counted together with null: DataFrame.na.drop and Imputer treat both as missing.
    missing_counts = df.agg(*[
        F.sum((df[name].isNull() | F.isnan(df[name])).cast("int")).alias(name)
        for name in candidate_columns
    ]).first()

    # sum() yields None on an empty dataframe; treat that as zero missing values
    rem_columns = [
        name for name in candidate_columns
        if (missing_counts[name] or 0) >= threshold
    ]

df = df.drop(*rem_columns)

print("Removed columns: ", rem_columns)

# java.lang.OutOfMemoryError: Java heap space

# Transformations : Replace values

In [None]:
# Kolmogorov-Smirnov test (no assumption about the distribution of the data)
# Tests how well the distribution of sample data conforms to some theoretical distribution. 
# Compare between some theoretical cumulative distribution function, (Ft(x)), and a samples’ cumulative distribution function , (Fs(x)) 
# In this case: theoretical cumulative distribution function (Ft(x)) = normal distribution
# H0: Fs(x) is equal to Ft(x) for all x --> variable normally distributed
# HA: Fs(x) is not equal to Ft(x) for at least one x  --> variable not normally distributed

#from scipy.stats import *

#df1 = df.select(*double_columns)
#pddf1 = df1.toPandas()

#for col in pddf1.columns:
#    df.filter(df[col].isNotNull())
#    print([col])
#    a,b = stats.kstest(pddf1[[col]], "norm")
#    print ("Statistics: ", a,"p-value: ", b)
#    if b < alpha:
#        print("Not normally ditributed (Null hypothesis rejected)")
#    else:
#        print("Normally ditributed (Null hypothesis not rejected)")

 # MemoryError: Unable to allocate 244. GiB for an array with shape       

In [None]:
# Replace null values with mean values for columns with sensor measures (no labels)

from pyspark.ml.feature import Imputer

# Select columns to impute with mean value: the double columns other than the
# motion sensor ones
double_columns = [
    f.name
    for f in df.schema.fields
    if isinstance(f.dataType, DoubleType) and f.name not in motion_sensor_columns
]
sensor_columns = list(double_columns)

# Imputer needs at least one input column, so skip the step when nothing is left to impute
if sensor_columns:
    imputed_columns = ["{}_imputed".format(c) for c in sensor_columns]

    # Apply imputer on df (the Imputer default strategy is the mean)
    imputer = Imputer(inputCols=sensor_columns, outputCols=imputed_columns)
    df = imputer.fit(df).transform(df)

    # Swap each original column for its imputed counterpart. Only the Imputer
    # outputs are renamed, so an unrelated column that happens to contain
    # "_imputed" in its name is left alone.
    df = df.drop(*sensor_columns)
    for original_name, imputed_name in zip(sensor_columns, imputed_columns):
        df = df.withColumnRenamed(imputed_name, original_name)

In [None]:
# Report the dataframe size at this point of the notebook
new_dimensions = (df.count(), len(df.columns))
print("Dataframe new dimensions: ", new_dimensions)

In [None]:
# Replace null values with mean values for columns with sensor measures (no labels)

from pyspark.sql.functions import avg
from pyspark.sql.types import NumericType


def replace_with_mean(df_input, exclude=None):
    """Return `df_input` with missing values in numeric columns replaced by the column mean.

    Args:
        df_input: Spark DataFrame to fill.
        exclude: optional iterable of column names that must be left untouched.

    Returns:
        A new DataFrame with the means filled in, or `df_input` itself when
        there is no column to fill.
    """
    excluded = set(exclude) if exclude is not None else set()

    # Only numeric columns have a mean; avg() over e.g. a timestamp column is rejected by Spark
    numeric_columns = [
        field.name
        for field in df_input.schema.fields
        if isinstance(field.dataType, NumericType) and field.name not in excluded
    ]
    if not numeric_columns:
        return df_input

    means = df_input.agg(*[avg(c).alias(c) for c in numeric_columns]).first().asDict()

    # avg() yields None for a column without any non-null value, and na.fill
    # only accepts int, float, bool or string replacement values
    fill_values = {name: mean for name, mean in means.items() if mean is not None}
    if not fill_values:
        return df_input

    return df_input.na.fill(fill_values)


In [None]:
# Mean-fill df while leaving the label columns out, then preview the result
res = replace_with_mean(df, exclude=label_columns)
res.show()

In [None]:
# Normality test (scipy.stats.normaltest) for each double column
# H0: the column is normally distributed

from scipy import stats

# NOTE(review): no `alpha` is defined in the visible part of the notebook;
# a 0.05 significance level is assumed here - confirm.
significance_level = 0.05

for column_name in double_columns:
    # scipy works on in-memory arrays, not Spark DataFrames: collect one column
    # at a time (missing values removed) so only a single column sits on the driver
    sample = df.select(column_name).dropna().toPandas()[column_name]

    print([column_name])
    if len(sample) < 8:
        # normaltest relies on skewtest, which needs at least 8 observations
        print("Not enough values to run the test")
        continue

    statistic, p_value = stats.normaltest(sample)
    print(statistic, p_value)
    if p_value < significance_level:
        print("The null hypothesis can be rejected: the variable is not normally distributed")
    else:
        print("The null hypothesis cannot be rejected")


In [None]:
import pandas as pd
from scipy import stats

# Collect the double columns to the driver as a pandas DataFrame
df1 = df.select(double_columns)
pddf1 = df1.toPandas()

# normaltest runs along axis 0 by default, i.e. once per column;
# only the p-values are printed
normality_result = stats.normaltest(pddf1)
k2, p = normality_result
print(p)

In [None]:
import pandas as pd
from scipy import stats

df1 = df.select(*double_columns)
pddf1 = df1.toPandas()

# Test every column on its own, after removing that column's missing values,
# and print the p-value next to the column name
for column_name in double_columns:
    values = pddf1[column_name].dropna()
    if len(values) < 8:
        # normaltest relies on skewtest, which needs at least 8 observations
        print(column_name, "not enough values to run the test")
        continue
    k2, p = stats.normaltest(values)
    print(column_name, p)

# Transformations: Clean outliers

In [None]:
# Per-user lower/upper quartile, median and quartile deviation of `duration`
# (quartile deviation = half of the interquartile range)
# NOTE(review): `user_id` and `duration` are not among the columns handled in the
# visible part of this notebook - confirm they exist in df or replace them.
# The expression is wrapped in parentheses so the chained calls can span several lines.
IQRdf = (
    df.groupby('user_id')
    .agg(
        F.expr('percentile(duration, array(0.25))')[0].alias('lower_quartile'),
        F.expr('percentile(duration, array(0.75))')[0].alias('upper_quartile'),
        F.expr('percentile(duration, array(0.5))')[0].alias('duration_median'),
    )
    .withColumn("quartile_deviation", (F.col("upper_quartile") - F.col("lower_quartile")) / 2)
)

In [None]:
# Keep only the rows whose `duration` lies within 2.2 quartile deviations of the
# per-user median, i.e. drop the outliers.
# NOTE(review): df is joined with IQRdf on `user_id`, the key IQRdf is grouped by;
# `duration` presumably has to exist in df - confirm before relying on this cell.
outliersremoved = (
    df.join(IQRdf, "user_id", "left")
    .filter(
        F.abs(F.col("duration") - F.col("duration_median"))
        < (F.col("quartile_deviation") * 2.2)
    )
)

In [None]:
# New column "lab_new_geographical_shift"

In [None]:
### Write to postgres

In [None]:
import os

# Connection settings can be overridden through environment variables; the
# defaults keep the values this notebook has used so far.
# NOTE(review): the default password is still stored in the notebook - rotate it
# and remove the default once HA_JDBC_PASSWORD is provisioned.
jdbc_url = os.environ.get("HA_JDBC_URL", "jdbc:postgresql://postgres/human_activity")
jdbc_user = os.environ.get("HA_JDBC_USER", "ha_user")
jdbc_password = os.environ.get("HA_JDBC_PASSWORD", "hnm4/4c71v1tY")

# Write df to the human_activity.ha_dataset table; 'overwrite' replaces existing content
(
    df.write
    .format("jdbc")
    .option("url", jdbc_url)
    .option("dbtable", "human_activity.ha_dataset")
    .option("user", jdbc_user)
    .option("password", jdbc_password)
    .option("driver", "org.postgresql.Driver")
    .mode("overwrite")
    .save()
)

In [None]:
# Stop `spark` (presumably the SparkSession created earlier in the notebook) as the last step
spark.stop()