In [1]:
from pyspark.ml.classification import RandomForestClassificationModel
from pyspark.ml.classification import GBTClassificationModel
from pyspark.ml.tuning import CrossValidatorModel
from pyspark.ml.pipeline import PipelineModel
from pyspark.sql.types import *


class Loader():
    """
    Includes all methods to read or write from the ADLS storage
    """

    def __init__(self, spark_session, adls_account_name='REDACTED'):
        self.spark_session = spark_session
        self.spark_context = spark_session.sparkContext
        self.adls_account_name = adls_account_name

    def readDataframeFromAdls(self, table_path, container_name='data'):
        """
        Reads a dataframe in parquet format from ADLS

        Paramters
        -----
        table_path: str, required
            absolute path to table in ADLS container
        container_name: str, optional
            name of the container in ADLS

        Returns
        -----
        Spark dataframe
        """
        return (
            self
            .spark_session
            .read
            .parquet(
                f'abfss://{container_name}@{self.adls_account_name}.dfs.core.windows.net{table_path}')
        )

    def readModelFromAdls(self, model_path, model_class, container_name='data'):
        """
        Reads a RandomForestClassificationModel from ADLS

        Paramters
        -----
        model_class; str, required
            class name of the model e.g. RandomForestClassificationModel
        model_path: str, required
            absolute path to model in ADLS container
        container_name: str, optional
            name of the container in ADLS

        Returns
        -----
        PipelineModel
        """
        model = globals()[model_class]
        return (
            model
            .load(
                f"abfss://{container_name}@{self.adls_account_name}.dfs.core.windows.net{model_path}"
            )
        )

In [1]:
import pyspark
from pyspark.sql.types import *
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext
import pyspark.sql.functions as F
import pyspark.ml.feature as ML
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
from pyspark.mllib.evaluation import BinaryClassificationMetrics
from sklearn.metrics import classification_report, confusion_matrix, auc, roc_curve
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
account_name = "REDACTED"
account_key = "5S/FS3ZJpxmiu8CzAVWCyKnFhK0tymvu3thvsvvUqUvkaP2w75skbLHMkzHpycSB35wo4Tf26v1rjiYTEmV9Gw=="

spark = (
    SparkSession
        .builder
        .getOrCreate()
)

sc = spark.sparkContext

Py4JJavaError: An error occurred while calling None.org.apache.spark.api.java.JavaSparkContext.
: Configuration property REDACTED.dfs.core.windows.net not found.
	at org.apache.hadoop.fs.azurebfs.AbfsConfiguration.getStorageAccountKey(AbfsConfiguration.java:372)
	at org.apache.hadoop.fs.azurebfs.AzureBlobFileSystemStore.initializeClient(AzureBlobFileSystemStore.java:1133)
	at org.apache.hadoop.fs.azurebfs.AzureBlobFileSystemStore.<init>(AzureBlobFileSystemStore.java:174)
	at org.apache.hadoop.fs.azurebfs.AzureBlobFileSystem.initialize(AzureBlobFileSystem.java:110)
	at org.apache.hadoop.fs.FileSystem.createFileSystem(FileSystem.java:3303)
	at org.apache.hadoop.fs.FileSystem.access$200(FileSystem.java:124)
	at org.apache.hadoop.fs.FileSystem$Cache.getInternal(FileSystem.java:3352)
	at org.apache.hadoop.fs.FileSystem$Cache.get(FileSystem.java:3320)
	at org.apache.hadoop.fs.FileSystem.get(FileSystem.java:479)
	at org.apache.spark.util.Utils$.getHadoopFileSystem(Utils.scala:1915)
	at org.apache.spark.deploy.history.EventLogFileWriter.<init>(EventLogFileWriters.scala:60)
	at org.apache.spark.deploy.history.SingleEventLogFileWriter.<init>(EventLogFileWriters.scala:213)
	at org.apache.spark.deploy.history.EventLogFileWriter$.apply(EventLogFileWriters.scala:181)
	at org.apache.spark.scheduler.EventLoggingListener.<init>(EventLoggingListener.scala:66)
	at org.apache.spark.SparkContext.<init>(SparkContext.scala:603)
	at org.apache.spark.api.java.JavaSparkContext.<init>(JavaSparkContext.scala:58)
	at sun.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)
	at sun.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:62)
	at sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)
	at java.lang.reflect.Constructor.newInstance(Constructor.java:423)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:247)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:238)
	at py4j.commands.ConstructorCommand.invokeConstructor(ConstructorCommand.java:80)
	at py4j.commands.ConstructorCommand.execute(ConstructorCommand.java:69)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)


In [None]:
loader = Loader(spark, account_name)

In [None]:
df_predict = loader.readDataframeFromAdls('/transformation/df_transform_training')
gbt_hyper = loader.readModelFromAdls('/model/gbt_hyper', 'GBTClassificationModel')
rfc_hyper = loader.readModelFromAdls('/model/rfc_hyper', 'RandomForestClassificationModel')

In [None]:
df_predicted_gbt = gbt_hyper.transform(df_predict)
df_predicted_rfc = rfc_hyper.transform(df_predict)

In [None]:
def getPerformanceMetrics(df):
    metrics_dictionary = {}
    evaluator_multi = MulticlassClassificationEvaluator(
        labelCol='Y', predictionCol="prediction")
    evaluator_binary = BinaryClassificationEvaluator(
        labelCol='Y', rawPredictionCol="prediction", metricName='areaUnderROC')

    # Get metrics
    metrics_dictionary['acc'] = evaluator_multi.evaluate(
        df, {evaluator_multi.metricName: "accuracy"})
    metrics_dictionary['f1'] = evaluator_multi.evaluate(
        df, {evaluator_multi.metricName: "f1"})
    metrics_dictionary['weighted_precision'] = evaluator_multi.evaluate(
        df, {evaluator_multi.metricName: "weightedPrecision"})
    metrics_dictionary['weighted_recall'] = evaluator_multi.evaluate(
        df, {evaluator_multi.metricName: "weightedRecall"})
    metrics_dictionary['auc'] = evaluator_binary.evaluate(df)

    print(f"Accuracy: {metrics_dictionary['acc']}")
    print(f"F1: {metrics_dictionary['f1']}")
    print(
        f"Weighted Precision: {metrics_dictionary['weighted_precision']}")
    print(f"Weighted Recall: {metrics_dictionary['weighted_recall']}")
    print(f"AUC: {metrics_dictionary['auc']}")

    return metrics_dictionary

In [None]:
def printConfusionMatrix(df):
    y_true = df.select(['Y']).collect()
    y_pred = df.select(['prediction']).collect()

    print(classification_report(y_true, y_pred))
    print(confusion_matrix(y_true, y_pred))

In [None]:
class CurveMetrics(BinaryClassificationMetrics):
    def __init__(self, *args):
        super(CurveMetrics, self).__init__(*args)

    def _to_list(self, rdd):
        points = []
        # Note this collect could be inefficient for large datasets 
        # considering there may be one probability per datapoint (at most)
        # The Scala version takes a numBins parameter, 
        # but it doesn't seem possible to pass this from Python to Java
        for row in rdd.collect():
            # Results are returned as type scala.Tuple2, 
            # which doesn't appear to have a py4j mapping
            points += [(float(row._1()), float(row._2()))]
        return points

    def get_curve(self, method):
        rdd = getattr(self._java_model, method)().toJavaRDD()
        return self._to_list(rdd)

def plotROC(df):
    preds = df.select('Y','probability').rdd.map(lambda row: (float(row['probability'][1]), float(row['Y'])))
    points = CurveMetrics(preds).get_curve('roc')

    plt.figure()
    x_val = [x[0] for x in points]
    y_val = [x[1] for x in points]
    plt.title('title')
    plt.xlabel('xlabel')
    plt.ylabel('ylabel')
    plt.plot(x_val, y_val)

In [None]:
def ExtractFeatureImp(model, df, featuresCol='X'):
    featureImp = model.featureImportances
    list_extract = []
    for i in df.schema[featuresCol].metadata["ml_attr"]["attrs"]:
        list_extract = list_extract + df.schema[featuresCol].metadata["ml_attr"]["attrs"][i]
    varlist = pd.DataFrame(list_extract)
    varlist['score'] = varlist['idx'].apply(lambda x: featureImp[x])
    return(varlist.sort_values('score', ascending = False))