## 2b_Traditional feature extraction notebook

Project: clustering-analysis-domain-agnostic-features-2018

Authors: Jordan Perr-Sauer, Caleb Phillips

License: BSD 3-Clause

Copyright (c) 2021 Alliance for Sustainable Energy LLC

## Description

This notebook uses a Spark context to produce the traditional features CSV file from the FleetDNA dataset. The notebook must be run with a Spark context. The easiest way to do this is to boot up Spark using start_spark_jupyter_notebook.sh

In [1]:
# Metrics Classes for FleetDNA Big Data SDK
# Author: Jordan Perr-Sauer <jordan.perr-sauer@nrel.gov>


import pyspark.sql.functions as func
import calendar


# Unit-conversion factors and physical constants used by the metric formulas.
# miles per hour -> meters per second
mph2ms = 0.44704
# (miles per hour) per second -> feet per second squared
mph2ftss = 1.466667
# presumably standard gravity in m/s^2; only referenced by a commented-out
# formula in the visible code
g = 9.80665
# feet -> meters
ft2m = 0.3048
# miles -> meters
mi2m = 1609.344

# Short aliases for the Spark SQL functions used throughout the metrics.
spow = func.pow
ssqrt = func.sqrt
col = func.col

# Column handles shared by every metric. Index 0 is the "lag_*" column and
# index 1 is the plain column (presumably previous-row and current-row
# values -- confirm against the ETL that produces the dataset).
speed = [col("lag_speed"), col("speed")]
time = [col("lag_ts"), col("ts")]
# Only referenced by a commented-out formula in the visible code.
grade = col("grade")


class Features:
    """Row-level Spark column expressions built from the speed/time columns.

    Each property returns an unevaluated Spark ``Column`` expression derived
    from the module-level ``speed`` and ``time`` column pairs (index 0 is the
    ``lag_*`` column, index 1 the plain column). Speeds are assumed to be in
    miles per hour, as implied by the ``mph2ms`` conversion factor.
    """

    @property
    def driving_speeds(self):
        """``lag_speed`` where it is strictly positive, otherwise NULL."""
        lag_speed = speed[0]
        return func.when(lag_speed > 0, lag_speed)

    @property
    def vbar(self):
        """Mean of the two speed samples, scaled by ``mph2ms``."""
        lag_speed, cur_speed = speed
        return ((lag_speed + cur_speed) / 2.0) * mph2ms

    @property
    def v3bar(self):
        """Mean cubed speed over the interval, in ``mph2ms``-scaled units.

        Computed as (v0^3 + v0^2*v1 + v0*v1^2 + v1^3) / 4 where v0 and v1 are
        the two scaled speed samples.
        """
        v0 = speed[0] * mph2ms
        v1 = speed[1] * mph2ms
        return (
            spow(v0, 3.0)
            + (spow(v0, 2.0) * v1)
            + (v0 * spow(v1, 2.0))
            + spow(v1, 3.0)
        ) / 4.0

    @property
    def dv(self):
        """Speed change across the interval: ``speed - lag_speed``."""
        lag_speed, cur_speed = speed
        return cur_speed - lag_speed

    @property
    def dt(self):
        """Elapsed time across the interval: ``ts - lag_ts``."""
        lag_ts, cur_ts = time
        return cur_ts - lag_ts

    @property
    def dd(self):
        """Distance covered in the interval.

        ``vbar`` is divided by ``mph2ms`` again (undoing its scaling) and the
        product with ``dt`` is divided by 3600, which yields miles if ``dt``
        is in seconds -- presumably the case; confirm against the dataset.
        """
        return (self.vbar * self.dt) / (3600.0 * mph2ms)

    @property
    def a(self):
        """Half the difference of the squared, ``mph2ms``-scaled speeds.

        NOTE: a variant that additionally added ``g*(grade*ft2m)`` existed as
        commented-out code here; it is not part of the computation.
        """
        v0_sq = spow((speed[0] * mph2ms), 2.0)
        v1_sq = spow((speed[1] * mph2ms), 2.0)
        return 0.5 * (v1_sq - v0_sq)

    @property
    def acceleration(self):
        """Speed change per unit time, scaled by ``mph2ftss``."""
        return self.dv / self.dt * mph2ftss


class Aggregates:
    """Group-level aggregate expressions built on the row-level features.

    This class is a mixin: it reads ``self.driving_speeds``, ``self.dd``,
    ``self.a``, ``self.v3bar``, ``self.dt`` and ``self.acceleration``, which
    must be supplied by another base class of the concrete object. Every
    property returns an unevaluated Spark aggregate ``Column`` intended for
    use inside ``DataFrame.groupBy(...).agg(...)``.

    Conditional sums use ``.otherwise(0)`` so that a group in which no row
    matches yields 0 rather than NULL. Spark's ``sum`` over only-NULL input
    is NULL, and NULLs are skipped by a later ``avg``/``mean``, which would
    silently drop such groups from any downstream average.
    """

    @property
    def avg_driving_speed(self):
        """Average of the non-zero driving speeds (zero speeds stay NULL)."""
        return func.avg(self.driving_speeds)

    @property
    def driving_speed_standard_deviation(self):
        """Sample standard deviation of the non-zero driving speeds."""
        return func.stddev(self.driving_speeds)

    @property
    def max_speed(self):
        """Maximum of the non-zero driving speeds."""
        return func.max(self.driving_speeds)

    @property
    def percent_zero(self):
        """Percentage of rows in the group whose ``lag_speed`` equals zero."""
        # otherwise(0): a group with no zero-speed rows must report 0%, not NULL.
        zero_rows = func.sum(func.when(speed[0] == 0.0, 1).otherwise(0))
        all_rows = func.sum(func.lit(1))
        return zero_rows / all_rows * 100

    @property
    def distance_below_55(self):
        """Sum of ``dd`` over rows whose ``lag_speed`` is below 55."""
        # otherwise(0.0): a group never below 55 must report 0, not NULL.
        return func.sum(func.when(speed[0] < 55, self.dd).otherwise(0.0))

    @property
    def total_distance(self):
        """Sum of ``dd`` over every row in the group."""
        return func.sum(self.dd)

    @property
    def percent_distance_below_55(self):
        """``distance_below_55`` as a percentage of ``total_distance``."""
        return self.distance_below_55 / self.total_distance * 100

    @property
    def characteristic_acceleration(self):
        """Sum of the positive ``a`` values divided by distance in meters."""
        a = self.a
        # otherwise(0.0): a group with no positive `a` must report 0, not NULL.
        positive_a = func.sum(func.when(a > 0, a).otherwise(0.0))
        return positive_a / (self.total_distance * mi2m)

    @property
    def ca_standard(self):
        """``characteristic_acceleration`` divided by ``ft2m``."""
        return self.characteristic_acceleration / ft2m

    @property
    def aerodynamic_speed(self):
        """Square root of sum(``v3bar * dt``) over distance in meters."""
        return ssqrt(func.sum(self.v3bar * self.dt) / (self.total_distance * mi2m))

    @property
    def as_standard(self):
        """``aerodynamic_speed`` divided by ``ft2m``."""
        return self.aerodynamic_speed / ft2m

    @property
    def number_of_stops(self):
        """Count of rows that decelerate to (effectively) zero speed.

        A row counts as a stop when ``acceleration`` is negative and the
        current ``speed`` is below 0.0001.
        """
        is_stop = (self.acceleration < 0.0) & (speed[1] < 0.0001)
        # otherwise(0): a group without any stop must report 0 stops, not NULL.
        return func.sum(func.when(is_stop, func.lit(1)).otherwise(func.lit(0)))

    @property
    def stops_per_mile(self):
        """``number_of_stops`` divided by ``total_distance``."""
        return self.number_of_stops / self.total_distance


# NOTE(review): this class name shadows the builtin `all` for the rest of the
# module; it is kept because later code instantiates it by this name.
class all(Aggregates, Features):
    """Concrete metrics object exposing both the aggregates and the features."""

    def __init__(self):
        """Create the metrics object; it holds no per-instance state."""



In [2]:
# Load the input dataset from Parquet. `spark` is not defined in this file;
# it is expected to be supplied by the Spark-enabled notebook session.
df = spark.read.parquet("./data/FleetDNAETL_CoDA_epaprime")

In [3]:
# Row count of the loaded DataFrame; the notebook displays the result.
df.count()

746507968

In [None]:
from pyspark.sql.functions import mean, col

TRADITIONAL_OUTPUT = "FleetDNA_CoDA_Paper_traditional"

# Names of the metric properties to compute; each is also used as the
# output column name.
traditional_features = [
    "avg_driving_speed",
    "max_speed",
    "percent_zero",
    "driving_speed_standard_deviation",
    "ca_standard",
    "as_standard",
    "percent_distance_below_55",
    "stops_per_mile",
]
mm = all()

# Stage 1: evaluate every metric once per ("vdir", "trip") group.
per_trip_exprs = [getattr(mm, name).alias(name) for name in traditional_features]
per_trip_df = df.groupBy("vdir", "trip").agg(*per_trip_exprs)

# Stage 2: average the per-trip metrics within each "vdir".
per_vdir_exprs = [mean(col(name)).alias(name) for name in traditional_features]
traditional_df = per_trip_df.groupby(['vdir']).agg(*per_vdir_exprs)

# Collect to the driver, replace missing values with 0, and write the CSV.
traditional_pandas = traditional_df.toPandas().fillna(0)
traditional_pandas.to_csv("./data/FleetDNAETL_CoDA_epaprime_traditional_nolimit.csv", index=False)