# __HoWDe__ 
### _a Home and Work location Detection algorithm for GPS data analytics_

This notebook is intended to work as a brief tutorial on how to use "HoWDe". It leverages functions contained in the "HoWDe_utils.py" file (source code). 

In [1]:
# Ask the inline plotting backend for 'retina' figure output.
%config InlineBackend.figure_format = 'retina'
# NB: Home and Work location labelling
# 1. Load pre-computed stop locations (in our case computed using infostop)
# 2. Assign labels to stop locations: Home ("H"), Work ("W"), and Other ("O")
# NOTE(review): the wildcard import is presumably what brings HoWDe_labelling
# (called in the cells below) into scope -- confirm against the howde package.
from howde import *

In [4]:
# Folder holding the pre-computed stop-location parquet files.
# None means "not configured yet"; the assignment below overrides it.
HW_PATH = None
# NOTE: machine-specific absolute path -- replace it with the location of your own data.
HW_PATH = '/Users/lorentz/JupyterDir/20-04_COVID_all/World_Bank/02-home_work_detection/data/stop_location_results/veraset_location_clustered/'

# Explicit type check instead of `try: assert ... except:`: assert statements
# are removed when Python runs with -O, and a bare `except` would also swallow
# unrelated errors. The notebook still only prints a hint rather than raising.
if not isinstance(HW_PATH, str):
    print("Path to data is missing.")

#### 1. Use HoWDe by providing pre-loaded data and a Spark session

In [None]:
#### Load DATA and PASS it to HoWDe
import os
from pyspark.sql.types import *
from pyspark.sql import SparkSession
import warnings

# Suppress every Python warning from here on.
warnings.filterwarnings("ignore")

# ---- Spark set-up ----
driver_memory = 250
packages = "data/work/shared/tools/spark-avro_2.12-3.0.0.jar"
# Hand the extra jar to PySpark through PYSPARK_SUBMIT_ARGS; this is set
# before the session is built below.
os.environ["PYSPARK_SUBMIT_ARGS"] = "--jars {0} pyspark-shell ".format(packages)

# Build the local session step by step instead of one long chained expression.
session_builder = SparkSession.builder.master("local[50]")
session_builder = session_builder.config("spark.sql.files.ignoreCorruptFiles", "true")
session_builder = session_builder.config("spark.driver.memory", f"{driver_memory}g")
session_builder = session_builder.config("spark.executor.memory", "250g")
spark = session_builder.getOrCreate()

spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")
spark.sparkContext.setLogLevel("ERROR")

# ---- Load DATA ----
# Read every *.parquet file found under HW_PATH into one Spark DataFrame.
parquet_reader = spark.read.format("parquet")
input_data = parquet_reader.load(HW_PATH, pathGlobFilter="*.parquet")

# ---- Label the stops ----
# The already-loaded DataFrame and the live Spark session are passed in directly.
res1 = HoWDe_labelling(
    input_data=input_data,
    spark=spark,
    HW_PATH='./',
    SAVE_PATH=None,
    SAVE_NAME='',
    save_multiple=False,
    edit_config_default=None,
    range_window=42,
    bnd_none_day=6,
    bnd_none_home=[0.4, 0.6],
    bnd_none_work=0.8,
    range_freq_home=0.2,
    range_freq_work_h=0.2,
    range_freq_work_d=0.2,
    driver_memory=250,
)

res1.printSchema()

#### 2. Use HoWDe in a self-contained way (providing path to data and location to save)

In [5]:
### Let HoWDe load data
# No DataFrame and no Spark session are passed here; only the data folder
# (HW_PATH) is handed to HoWDe_labelling.
res2 = HoWDe_labelling(
    input_data=None,
    spark=None,
    HW_PATH=HW_PATH,
    SAVE_PATH=None,
    SAVE_NAME='',
    save_multiple=False,
    edit_config_default=None,
    range_window=42,
    bnd_none_day=6,
    bnd_none_home=[0.4, 0.6],
    bnd_none_work=0.8,
    range_freq_home=0.2,
    range_freq_work_h=0.2,
    range_freq_work_d=0.2,
    driver_memory=250,
)
res2.printSchema()

25/01/23 17:11:36 WARN Utils: Your hostname, LooP.local resolves to a loopback address: 127.0.0.1; using 192.168.129.24 instead (on interface en0)
25/01/23 17:11:36 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
25/01/23 17:11:36 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/01/23 17:11:37 WARN DependencyUtils: Local jar /Users/lorentz/JupyterDir/GIT/projects/home_work_detection/HoWDe/data/work/shared/tools/spark-avro_2.12-3.0.0.jar does not exist, skipping.
25/01/23 17:11:37 INFO SparkContext: Running Spark version 3.5.4
25/01/23 17:11:37 INFO SparkContext: OS info Mac OS X, 15.1.1, aarch64
25/01/23 17:11:37 INFO SparkContext: Java version 19.0.2
25/01/23 17:11:37 INFO ResourceUtils: No custom resources configured for spark.driver.
25/01/23 17:11:37 INFO SparkContext: Submitted application: pyspark-shell
25/01/23 17:11:37 INFO ResourceProfile: Default ResourceProfile created, execu

HoWDe Labelling: computing LABs ...


100%|██████████| 2/2 [00:02<00:00,  1.17s/it]

HoWDe Labelling: computations completed!
root
 |-- useruuid: string (nullable = true)
 |-- country: string (nullable = true)
 |-- loc: string (nullable = true)
 |-- lat: double (nullable = true)
 |-- lon: double (nullable = true)
 |-- date: timestamp (nullable = true)
 |-- start: long (nullable = true)
 |-- end: long (nullable = true)
 |-- stop_duration: long (nullable = true)
 |-- location_type: string (nullable = false)






In [8]:
# Print a preview of the labelled stops as a text table.
res2.show()


                                                                                

+--------------------+-------+---+----------+----------+-------------------+----------+----------+-------------+-------------+
|            useruuid|country|loc|       lat|       lon|               date|     start|       end|stop_duration|location_type|
+--------------------+-------+---+----------+----------+-------------------+----------+----------+-------------+-------------+
|63c29bed022b6c8dd...|     BR| 30| -23.63578| -46.56485|2021-06-06 00:00:00|1623005440|1623006208|          768|            O|
|63c29bed022b6c8dd...|     BR| 30| -23.63582| -46.56487|2021-06-07 00:00:00|1623020288|1623023872|         3584|            O|
|63c29bed022b6c8dd...|     BR| 30| -23.63579| -46.56487|2021-06-07 00:00:00|1623032448|1623034752|         2304|            O|
|63c29bed022b6c8dd...|     BR| 30| -23.63582| -46.56488|2021-06-07 00:00:00|1623062528|1623063424|          896|            O|
|63c29bed022b6c8dd...|     BR| 30| -23.63583| -46.56485|2021-06-07 00:00:00|1623068032|1623068416|          384