## 2b_Agnostic feature extraction notebook

Project: clustering-analysis-domain-agnostic-features-2018

Authors: Jordan Perr-Sauer, Caleb Phillips

License: BSD 3-Clause

Copyright (c) 2021 Alliance for Sustainable Energy LLC

## Description

This notebook uses a Spark context and the TSFresh Python library to produce the application-agnostic features CSV file from the FleetDNA dataset. The notebook must be run with a Spark context. The easiest way to do this is to boot up Spark using start_spark_jupyter_notebook.sh


In [1]:
import pandas
pandas.__version__

u'0.20.3'

In [2]:
import tsfresh
tsfresh.__version__

'0.10.1'

In [3]:
import sys
import pickle
sys.version
pickle.HIGHEST_PROTOCOL

2

In [4]:
# Jordan Perr-Sauer <jordan.perr-sauer@nrel.gov>

from pyspark.sql.functions import collect_list, col, struct, array
from pyspark.sql.functions import col, size, max, avg
from pyspark.sql.functions import udf, explode
from pyspark.sql.types import StructType, DoubleType, StructField, IntegerType, ArrayType,  MapType, StringType
import pandas
import tsfresh
from tsfresh.feature_extraction.settings import EfficientFCParameters, MinimalFCParameters, ComprehensiveFCParameters
import numpy as np
import sklearn.preprocessing as preprocessing
import json


def post_processing(features):
    """
    Convert a feature table to pandas, fill nulls with zero and min-max
    scale every column except "vdir".

    :param features: object exposing ``toPandas()`` (a Spark DataFrame in this
        notebook) whose pandas form holds a "vdir" identifier column plus any
        number of numeric columns
    :return: pandas DataFrame in which "vdir" is kept as a regular, unscaled
        column and every other column has been scaled with MinMaxScaler. If
        the table has no rows or no columns besides "vdir", it is returned
        with only the null filling applied.
    """
    df = features.toPandas()

    # "vdir" deliberately stays a regular column. The previous
    # ``df.set_index("vdir")`` call discarded its result and therefore did
    # nothing; promoting "vdir" to the index for real would drop it from any
    # output written with ``index=False``.
    df = df.fillna(0)

    # NOTE(review): every column other than "vdir" is scaled, which includes a
    # "trip" column when one is present -- confirm that this is intended.
    columns = df.columns.difference(['vdir'])

    # MinMaxScaler cannot be fitted on zero rows or zero columns, so there is
    # nothing to normalize in those cases.
    if df.empty or len(columns) == 0:
        return df

    min_max_scaler = preprocessing.MinMaxScaler()
    df[columns] = min_max_scaler.fit_transform(df[columns])

    return df


def tsfresh_udf(params="minimal", features=("speed", "grade", "ts")):
    """
    Build a function that extracts TSFresh features from the raw records of
    a single trip. The returned function is meant to be wrapped in a UDF.

    :param params: name of the TSFresh settings to use: "minimal",
        "efficient" or "full"
    :param features: column names, in order, of the fields of each raw
        record. Must contain "ts", which is used as the sort column.
    :return: function taking the raw records of one trip and returning a
        dict mapping feature name (str) to feature value (float)
    :raises KeyError: if ``params`` is not one of the known settings names
    """
    param_dict = {"minimal": MinimalFCParameters, "full": ComprehensiveFCParameters, "efficient": EfficientFCParameters}

    # Resolve the settings when the function is built, so that a misspelled
    # name fails immediately instead of when the first trip is processed.
    if params not in param_dict:
        raise KeyError("Unknown params %r; expected one of %s" % (params, sorted(param_dict)))
    settings_factory = param_dict[params]

    # Private copy: the caller's sequence is never shared or mutated.
    column_names = list(features)

    def agnostic_features_distributed_pertrip(raw_data):
        data = pandas.DataFrame(raw_data)
        data.columns = column_names
        data_features = data.fillna(0)
        # Every row gets the same id so the whole input is treated as one
        # time series and yields exactly one row of features.
        data_features['id'] = 'foo'
        extract_agnostic = tsfresh.extract_features(data_features,
                                                    column_id="id",
                                                    column_sort="ts",
                                                    default_fc_parameters=settings_factory(),
                                                    n_jobs=0)
        # Infinite feature values are turned into NaN.
        ef = extract_agnostic.replace([np.inf, -np.inf], np.nan)
        d = ef.to_dict(orient='records')[0]
        # Plain str keys and float values, matching a string -> double map.
        dd = {str(key): float(val) for (key, val) in d.items()}
        return dd

    return agnostic_features_distributed_pertrip


# The following functions return feature vectors and should be called by a driver program


def halfday_minimal_trip_means(df, averageTo="vehicle", parameters="minimal", maximum_trip=1000):
    """
    Agnostic Features EPA Prime Full - Trip Means Method

    Compute TSFresh features for every (vdir, trip) group of ``df`` and
    return them as a post-processed pandas DataFrame.

    (Original author's note: couldn't this be done so much easier with an RDD?)

    :param df: Spark DataFrame with the columns "vdir", "trip", "speed",
        "grade" and "ts"
    :param averageTo: "vehicle" averages each feature over all trips of a
        vdir (one row per vdir); any other value keeps one row per
        (vdir, trip)
    :param parameters: TSFresh settings name, passed on to ``tsfresh_udf``
    :param maximum_trip: trips with more than this many records are dropped
    :return: the pandas DataFrame produced by ``post_processing``
    """
    features = ["speed", "grade", "ts"]

    # One row per trip, carrying all of its (speed, grade, ts) records.
    afd = df.groupBy("vdir", "trip").agg(collect_list(struct(*features)).alias("raw_data"))

    afd_repartition = afd.repartition(2500)

    afd_limited = afd_repartition.withColumn("data_length", size(col("raw_data"))).filter(col("data_length") <= maximum_trip)

    # The same ``features`` list defines the struct field order above and the
    # column names inside the UDF, so the two cannot drift apart.
    agnostic_features_distributed_udf = udf(tsfresh_udf(parameters, features), MapType(StringType(), DoubleType()))

    afd2 = afd_limited.withColumn("features", agnostic_features_distributed_udf(col("raw_data")))

    # afd2 is consumed twice below (key discovery, then the final
    # aggregation/selection); caching avoids running the UDF twice.
    afd2.cache()

    try:
        # The feature map is dynamically keyed, so collect the distinct keys
        # to turn each one into its own column.
        keys = afd2.select(explode("features")).select("key").distinct().collect()

        if averageTo == "vehicle":
            averages = [avg(col("features").getItem(k.key)).alias(k.key) for k in keys]
            afd3 = afd2.groupBy("vdir").agg(*averages)
        else:
            trips = [col("features").getItem(k.key).alias(k.key) for k in keys]
            afd3 = afd2.select("vdir", "trip", *trips)

        return post_processing(afd3)
    finally:
        # post_processing has already pulled the result into pandas (or an
        # error occurred), so the cached data is no longer needed.
        afd2.unpersist()


In [None]:
# Load the input dataset from Parquet. `spark` is not defined in this
# notebook; presumably it is the session supplied by the environment that
# launched it -- confirm against start_spark_jupyter_notebook.sh.
df = spark.read.parquet("./data/FleetDNAETL_CoDA_epaprime")

In [None]:
# Extract features with the "efficient" TSFresh settings, keeping only trips
# of at most 50000 records; averageTo keeps its default of "vehicle".
agnostic_pandas = halfday_minimal_trip_means(df, parameters="efficient", maximum_trip=50000)

In [None]:
# Write the feature table to CSV; index=False leaves out the pandas row index.
agnostic_pandas.to_csv("./data/FleetDNAETL_CoDA_epaprime_agnostic_50klimit.csv", index=False)