# Loading in Required Libraries

In [0]:
# Standard libraries/functions
import pandas as pd
import numpy as np

# Pyspark libraries/functions
import pyspark.sql.functions as F
from pyspark.sql.functions import col, sum, isnan, when, count, year, month, dayofmonth, date_format, concat_ws, acos, cos, radians, sin, udf, concat
from pyspark.sql.types import IntegerType, DateType, DoubleType, StringType, FloatType
from pyspark.sql.window import Window

# from pyspark.ml.linalg import Vectors#, VectorUDT
# from pyspark.ml.linalg import VectorType
from pyspark.mllib.linalg import Vectors, VectorUDT
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler, StandardScaler, StringIndexer, OneHotEncoder, Imputer
from pyspark.ml.functions import vector_to_array
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.stat import Correlation
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import PCA
from pyspark.mllib.evaluation import BinaryClassificationMetrics, MulticlassMetrics

# Sklearn libraries/functions
from sklearn.utils import shuffle
# from sklearn.linear_model import LogisticRegression
# from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from pyspark.ml.classification import LogisticRegression, GBTClassifier, RandomForestClassifier, LinearSVC
import time

import mlflow
import mlflow.spark
# from pyspark.mllib.linalg import Vectors, VectorUDT




In [0]:
## library settings
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)


# Setting Up Storage

In [0]:
## Place this cell in any team notebook that needs access to the team cloud storage.

# The following blob storage is accessible to team members only (read and write)
# access key is valid til TTL
# after that you will need to create a new SAS key and authenticate access again via DataBrick command line
blob_container  = "smsj-261"       # The name of your container created in https://portal.azure.com
storage_account = "smsj"  # The name of your Storage account created in https://portal.azure.com
secret_scope    = "smsjscope"           # The name of the scope created in your local computer using the Databricks CLI
secret_key      = "smsjkey"             # The name of the secret key created in your local computer using the Databricks CLI
team_blob_url        = f"wasbs://{blob_container}@{storage_account}.blob.core.windows.net"  #points to the root of your team storage bucket

# the 261 course blob storage is mounted here on the DataBricks workspace.
mids261_mount_path      = "/mnt/mids-w261"

# SAS Token: Grant the team limited access to Azure Storage resources
spark.conf.set(
  f"fs.azure.sas.{blob_container}.{storage_account}.blob.core.windows.net",
  dbutils.secrets.get(scope = secret_scope, key = secret_key)
)
import pandas as pd
pdf = pd.DataFrame([[1, 2, 3, "Jane"], [2, 2,2, None], [12, 12,12, "John"]], columns=["x", "y", "z", "a_string"])
df = spark.createDataFrame(pdf) # Create a Spark dataframe from a pandas DF

# The following can write the dataframe to the team's Cloud Storage  
# Navigate back to your Storage account in https://portal.azure.com, to inspect the partitions/files.
# df.write.parquet(f"{team_blob_url}/test")

# see what's in the blob storage root folder 
display(dbutils.fs.ls(f"{team_blob_url}"))

path,name,size,modificationTime
wasbs://smsj-261@smsj.blob.core.windows.net/test/,test/,0,1689534418000


# Loading the Data

In [0]:
# import spark
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("ReadingParquet").getOrCreate()

# df = spark.read.parquet("/path/to/parquet/file.parquet")
# df.show()

# spark = spark.sql.SparkSession.builder.appName("Call Parquet File").getOrCreate()

df_test_60m = spark.read.parquet("dbfs:/user/hive/warehouse/test_60m")
df_train_60m = spark.read.parquet("dbfs:/user/hive/warehouse/train_60m")



# df = spark.read.parquet("s3://my-bucket/test_3m.parquet")

df_test_60m.createOrReplaceTempView("test_60m")
df_train_60m.createOrReplaceTempView("df_train_60m")
# Use the DataFrame in your ML Flow project

In [0]:
df_val = spark.sql("SELECT * FROM train_60m where YEAR(FL_DATE) = 2018")
df_train_60m = spark.sql("SELECT * FROM train_60m where YEAR(FL_DATE) < 2018")

# Initializing Functions

## Reading the Data Function

## Modelling Functions

In [0]:

def build_model (model_name, param_dict):
    '''
    Description: Contains models and hyperparameters
    Inputs:
    Outputs: 
    '''
    # Build Logistic Regression model
    if model_name == 'log':
        model = LogisticRegression(featuresCol = "allFeatures", 
                                    regParam=param_dict['regParam'], 
                                    maxIter=param_dict['maxIter'], 
                                    elasticNetParam=param_dict['elasticNetParam'])
    # Build Gradient Boosted Tree model
    elif model_name == 'gbt':
        model = GBTClassifier(featuresCol = "allFeatures",
                              maxDepth=param_dict['maxDepth'], # Default: 5
                              maxIter=param_dict['maxIter'], # Default: 20
                              maxBins=param_dict['maxBins'], # Default: 32
                              stepSize=param_dict['stepSize']) # Default: 0.1
    # Build Support Vector Machine model
    elif model_name == 'svm':
        model = LinearSVC(featuresCol="scaledFeatures",
                          maxIter=10)
    # Build Random Forest model
    elif model_name == 'rf':
        model = RandomForestClassifier(featuresCol = "scaledFeatures",
                                       numTrees=70,
                                       maxDepth=3, 
                                       seed=42)
    elif model_name == 'mpc':
        model = MultilayerPerceptronClassifier(labelCol="label",
                                               featuresCol="allFeatures",
                                               maxIter=param_dict['maxIter'],
                                               layers=param_dict['layers'], 
                                               stepSize=param_dict['stepSize'],
                                               solver=param_dict['solver'],
                                               blockSize=param_dict['blockSize'],
                                               seed=123)
    return model

In [0]:
# Obtain parameters from provided parameter dictionary
def udf_grid_search_params(params_grid):
    '''
    Description:
    Input:
    Output:
    '''
    cleaned_dict_list = []

    for param_dict in test_output:
        cleaned_dict = {param.name: value for param, value in param_dict.items()}
        cleaned_dict_list.append(cleaned_dict)
    return cleaned_dict_list

In [0]:
# Create pipline
def build_model_pipeline(input_df, input_categoricals_columns, input_pca_columns, input_non_pca_columns, input_prediction_feature = 'model_delay'):
    '''
    Description:
    Input:
    Output:
    '''
    base = input_df

    # input the categorical columns
    categoricals_columns = input_categoricals_columns

    # impute the missing categorical data using the mode
    # flight time category was missing data
    # we can actuall swtich to the CRS_DEP_TIME to fix this issue, but this is a function that we might have needed anyways or might need in the future
    base = impute_categoricals(base,base,categoricals_columns)
    
    indexers = map(lambda c: StringIndexer(inputCol=c, outputCol=c+"_idx", handleInvalid = 'keep'), categoricals_columns)
    ohes = map(lambda c: OneHotEncoder(inputCol=c + "_idx", outputCol=c+"_class"),categoricals_columns)

    # Establish features columns
    categoricals = list(map(lambda c: c+"_class", categoricals_columns))

    # input the number columns to be reduced with PCA
    numerics_pca_columns = input_pca_columns

    # input the columns we decide not to reduce with PCA
    numerics_non_pca_columns = input_non_pca_columns

    # input the feature we are trying to predict
    prediction_feature = input_prediction_feature

    all_numerics = numerics_pca_columns + numerics_non_pca_columns

    # imputer should handle all numeric columns regardless of usage in pca or not
    imputers = Imputer(inputCols = all_numerics, outputCols = all_numerics)

    # grab only relevant columns, we need numerics, categorical, and predictor
    base = base[all_numerics +categoricals_columns + [prediction_feature]]

    # VectorAssembler
    assembler_numeric_pca = VectorAssembler( inputCols=numerics_pca_columns, outputCol='features_numeric_pca_pre_scale')
    assembler_numeric_non_pca = VectorAssembler( inputCols=numerics_non_pca_columns, outputCol='features_numeric_non_pca_pre_scale')

    scaler_non_pca = StandardScaler(inputCol="features_numeric_non_pca_pre_scale",
                            outputCol="features_numeric_non_pca_scaled",
                            withStd=True,
                            withMean=True)

    scaler_pca = StandardScaler(inputCol="features_numeric_pca_pre_scale",
                            outputCol="features_numeric_pca_scaled",
                            withStd=True,
                            withMean=True)

    pca = PCA(k=2, inputCol='features_numeric_pca_scaled', outputCol='dense_vect_pca_features')

    assemblerAll = VectorAssembler(inputCols= ["features_numeric_non_pca_scaled", "dense_vect_pca_features"] +categoricals , outputCol="allFeatures")
    print(assemblerAll.getInputCols)

    label = StringIndexer(inputCol="model_delay", outputCol="label")

    model_matrix_stages =   list(indexers) + list(ohes) + \
                            [imputers] + \
                            [assembler_numeric_non_pca] + \
                            [assembler_numeric_pca] + \
                            [scaler_non_pca] + [scaler_pca] + \
                            [pca] + \
                            [assemblerAll] + \
                            [label]
    return model_matrix_stages

In [0]:

## Run time series cross validation and perform hyperparameter tuning to select the best model
def tscv(dataset, model, pca_numerics, non_pca_numerics, categoricals, k=5):
    '''
    Description: Runs time series cross validation on the provided model
    Input: Dataset, built model, PCA numeric features, non-PCA numeric features, categorical features, number of folds (k)
    Output: Average F1 score of model
    '''

    # initialize variables 
    n = dataset.count()
    chunk_size = int(n/(k+1))
    scores_auc = []
    scores_accuracy = []
    scores_precision = []
    scores_recall = []
    scores_f1 = []

    # Assume that we are ordering by FL_DATE always 
    sort_dataset = dataset.withColumn("row_id", F.row_number().over(Window.partitionBy().orderBy("FL_DATE")))

    # Perform tscv and hyperparameter tuning 
    for i in range(k):
        train_df = sort_dataset.filter(F.col('row_id') <= chunk_size * (i+1)).cache()
        val_df = sort_dataset.filter((F.col('row_id') > chunk_size * (i+1)) & (F.col('row_id') <= chunk_size * (i+2))).cache()

        build_pipeline_matrix = build_model_pipeline(train_df, categoricals,pca_numerics,non_pca_numerics,'model_delay')

        pipeline = Pipeline (stages=build_pipeline_matrix + [model])

        #Train model with train_df
        pipeline_model = pipeline.fit(train_df)
        
        # impute null validation categoricals basked on mode of training categoricals
        val_df = impute_categoricals(train_df, val_df, categoricals)

        # Predict on validation set
        predictions = pipeline_model.transform(val_df).select("probability", "label", "prediction")
        predictions = predictions.rdd.map(extract).toDF(["p0", "p1", "label", "prediction"])

        valid_input_model = predictions

        # Create an evaluator
        evaluator_auc = BinaryClassificationEvaluator(labelCol='label')
        evaluator_auc.setRawPredictionCol('p1')

        # Compute the areaUnderROC on the test data
        areaUnderROC = evaluator_auc.evaluate(valid_input_model, {evaluator_auc.metricName: "areaUnderROC"})

        evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction")

        # Compute various evaluation metrics
        accuracy = evaluator.evaluate(valid_input_model, {evaluator.metricName: "accuracy"})
        precision = evaluator.evaluate(valid_input_model, {evaluator.metricName: "weightedPrecision"})
        recall = evaluator.evaluate(valid_input_model, {evaluator.metricName: "weightedRecall"})
        f1 = evaluator.evaluate(valid_input_model, {evaluator.metricName: "f1"})

        print("Accuracy = %g" % accuracy)
        print("Precision = %g" % precision)
        print("Recall = %g" % recall)
        print("F1 = %g" % f1)

        # Append the score to the scores list
        scores_auc.append(areaUnderROC)
        scores_accuracy.append(accuracy)
        scores_precision.append(precision)
        scores_recall.append(recall)
        scores_f1.append(f1)

    f1_score = np.mean(scores_f1)
    return f1_score


In [0]:

def get_hyperparameters(dataset, model_type, hyperparameter_list, k):
    '''
    Description:
    Input:
    Output:
    '''
    # Initialize variables
    best_score = 0
    best_param = None
    start_time = time.time()
    iterations = 0
    # Loop through all combinations of hyperparameters and pick the best set
    for parameters in hyperparameter_list:
        iterations = iterations+1
        new_model = build_model(model_type, parameters)
        avg_score = tscv(dataset = dataset, model = new_model, k=k)
        if avg_score > best_score:
            best_score = avg_score
            best_params = parameters
        print(parameters)
        print('best score:', best_score, '|best params:', best_params)
        print("iteration: " + str(iterations) + " --- %s seconds ---" % (time.time() - start_time))
    return best_score, best_params

In [0]:

## Run time series cross validation and perform hyperparameter tuning to select the best model
def single_model_run(dataset, model, pca_numerics, non_pca_numerics, categoricals, k=5):
    '''
    Description: Runs time series cross validation on the provided model
    Input: Dataset, built model, PCA numeric features, non-PCA numeric features, categorical features, number of folds (k)
    Output: Average F1 score of model
    '''
    # initialize variables 
    # n = dataset.count()
    # chunk_size = int(n/(k+1))
    scores_auc = []
    scores_accuracy = []
    scores_precision = []
    scores_recall = []
    scores_f1 = []

    # Assume that we are ordering by FL_DATE always 
    sort_dataset = dataset.withColumn("row_id", F.row_number().over(Window.partitionBy().orderBy("FL_DATE",F.rand())))

    model_n = sort_dataset.count()
    split_factor = 0.7

    # Perform tscv and hyperparameter tuning 

    train_df = sort_dataset.filter(F.col('row_id') <= (model_n * split_factor)).cache()
    val_df = sort_dataset.filter((F.col('row_id') > (model_n * split_factor))).cache()

    build_pipeline_matrix = build_model_pipeline(train_df, categoricals,pca_numerics,non_pca_numerics,'model_delay')

    pipeline = Pipeline (stages=build_pipeline_matrix + [model])

    #Train model with train_df
    pipeline_model = pipeline.fit(train_df)
    
    # impute null validation categoricals basked on mode of training categoricals
    val_df = impute_categoricals(train_df, val_df, categoricals)

    # Predict on validation set
    predictions = pipeline_model.transform(val_df).select("probability", "label", "prediction")
    predictions = predictions.rdd.map(extract).toDF(["p0", "p1", "label", "prediction"])

    valid_input_model = predictions

    # Create an evaluator
    evaluator_auc = BinaryClassificationEvaluator(labelCol='label')
    evaluator_auc.setRawPredictionCol('p1')

    # Compute the areaUnderROC on the test data
    areaUnderROC = evaluator_auc.evaluate(valid_input_model, {evaluator_auc.metricName: "areaUnderROC"})

    evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction")

    # Compute various evaluation metrics
    accuracy = evaluator.evaluate(valid_input_model, {evaluator.metricName: "accuracy"})
    precision = evaluator.evaluate(valid_input_model, {evaluator.metricName: "weightedPrecision"})
    recall = evaluator.evaluate(valid_input_model, {evaluator.metricName: "weightedRecall"})
    f1 = evaluator.evaluate(valid_input_model, {evaluator.metricName: "f1"})

    print("Accuracy = %g" % accuracy)
    print("Precision = %g" % precision)
    print("Recall = %g" % recall)
    print("F1 = %g" % f1)

    # Append the score to the scores list
    scores_auc.append(areaUnderROC)
    scores_accuracy.append(accuracy)
    scores_precision.append(precision)
    scores_recall.append(recall)
    scores_f1.append(f1)

    f1_score = np.mean(scores_f1)

    return f1_score


In [0]:

def get_hyperparameters_single_run(dataset, model_type, hyperparameter_list, k_folds, pca_numerics, non_pca_numerics, categoricals):
    '''
    Description:
    Input:
    Output:
    '''
    # Initialize variables
    best_score = 0
    best_param = None

    # Loop through all combinations of hyperparameters and pick the best set
    for parameters in hyperparameter_list:
        new_model = build_model(model_type, parameters)
        avg_score = single_model_run(dataset = dataset, model = new_model, pca_numerics = pca_num, non_pca_numerics = non_pca_num, categoricals = categoricals, k = k_folds)
        if avg_score > best_score:
            best_score = avg_score
            best_params = parameters
        print(parameters)
        print('best score:', best_score, '|best params:', best_params)
    return best_score, best_params


In [0]:
## Run time series cross validation and perform hyperparameter tuning to select the best model
def single_model_fit(dataset, model, pca_numerics, non_pca_numerics, categoricals, k=5):
    '''
    Description: Runs time series cross validation on the provided model
    Input: Dataset, built model, PCA numeric features, non-PCA numeric features, categorical features, number of folds (k)
    Output: Average F1 score of model
    '''
    # initialize variables 
    # n = dataset.count()
    # chunk_size = int(n/(k+1))
    scores_auc = []
    scores_accuracy = []
    scores_precision = []
    scores_recall = []
    scores_f1 = []

    # Assume that we are ordering by FL_DATE always 
    sort_dataset = dataset.withColumn("row_id", F.row_number().over(Window.partitionBy().orderBy("FL_DATE",F.rand())))

    model_n = sort_dataset.count()
    split_factor = 0.7

    # Perform tscv and hyperparameter tuning 

    train_df = sort_dataset.filter(F.col('row_id') <= (model_n * split_factor)).cache()
    val_df = sort_dataset.filter((F.col('row_id') > (model_n * split_factor))).cache()

    build_pipeline_matrix = build_model_pipeline(train_df, categoricals,pca_numerics,non_pca_numerics,'model_delay')

    pipeline = Pipeline(stages=build_pipeline_matrix + [model])

    #Train model with train_df
    pipeline_model = pipeline.fit(train_df)

    #################### Feature Importances Code ######################

    ###################################################################
    return pipeline_model


In [0]:
def extract(row):
    '''
    Description:
    Input:
    Output:
    '''
    
    return tuple(row.probability.toArray().tolist()) +  (row.label,) + (row.prediction,)

# 4. Model Pipeline (Modelling & Hyperparameter Tuning)

## 4.1 Neural Network

# ML Flow Model

In [0]:
def RegressionEvaluator(preds):
    # print(preds)
    rdd_preds_m = preds.select(['prediction', 'label']).rdd

    # predictions = pipeline_model.transform(df_test_3m).select("probability", "label", "prediction")
    # predictions = predictions.rdd.map(extract).toDF(["p0", "p1", "label", "prediction"])

    preds = preds.select("probability", "label", "prediction")
    preds = preds.rdd.map(extract).toDF(["p0", "p1", "label", "prediction"])


    # Create an binary evaluator
    evaluator_auc = BinaryClassificationEvaluator(labelCol='label')
    evaluator_auc.setRawPredictionCol('p1')

    # Compute the areaUnderROC on the test data
    areaUnderROC = evaluator_auc.evaluate(preds, {evaluator_auc.metricName: "areaUnderROC"})
    areaUnderPR = evaluator_auc.evaluate(preds, {evaluator_auc.metricName: "areaUnderPR"})

    multi_evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction")
    multi_evaluator2 = MulticlassMetrics(rdd_preds_m)
    # Compute various evaluation metrics
    accuracy = multi_evaluator.evaluate(preds, {multi_evaluator.metricName: "accuracy"})
    precision = multi_evaluator.evaluate(preds, {multi_evaluator.metricName: "precisionByLabel"})
    recall = multi_evaluator.evaluate(preds, {multi_evaluator.metricName: "recallByLabel"})
    f1 = multi_evaluator.evaluate(preds, {multi_evaluator.metricName: "f1"})

    f2 = np.round(multi_evaluator2.fMeasure(label=1.0, beta=2.0), 5)
    # pr = binary_evaluator.areaUnderPR

    return accuracy, precision, recall, f1, f2 ,areaUnderROC,areaUnderPR



In [0]:
df_train = df_train_60m
df_test = df_test_60m

In [0]:
# CHANGES HERE
from hyperopt import hp
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.classification import MultilayerPerceptronClassifier

model_nn = MultilayerPerceptronClassifier(labelCol="label", featuresCol="allFeatures", seed=123)
pipeline = Pipeline(stages=[model_nn])
feature_size = df_train.schema["allFeatures"].metadata["ml_attr"]["num_attrs"]

## String format : MLP - 37 - 4 Relu - 2 Softmax; MLP - 37 - 4 Relu - 2 relu- 2 Softmax
# search_space = {
#     "maxIter": hp.quniform("maxIter", 50, 200, 25),
#     # "layers": hp.choice('layers', [[feature_size, 4, 2], [feature_size, 4, 2, 2]]),
#     "layers": hp.choice('layers', [[feature_size, 4, 2, 2], [feature_size, 4, 2, 2]]),
#     "stepSize": hp.quniform("stepSize", 0.1, 1, 0.1),
#     "solver": hp.choice('solver', ['gd', 'l-bfgs']),
#     'blockSize': hp.quniform('blockSize', 16, 128, 16)
# }


## String format : MLP - 37 - 4 Relu - 2 Softmax; MLP - 37 - 32 Relu - 16 relu - 2 Softmax; MLP - 37 - 2 Relu - 1 relu - 2 Softmax 
# search_space = {
#     "maxIter": hp.quniform("maxIter", 175, 200, 25),
#     "layers": hp.choice('layers', [[feature_size, 4, 2, 2], [feature_size, 32, 16, 2], [feature_size, 2, 1, 2]]),# maybe throw away
#     "stepSize": hp.quniform("stepSize", 0.7, 1, 0.1),
#     "solver": hp.choice('solver', ['l-bfgs']),
#     'blockSize': hp.quniform('blockSize', 96, 128, 16)
# }

search_space = {
    "maxIter": hp.choice("maxIter", [175]),
    "layers": hp.choice('layers', [[feature_size, 32, 16, 2]]),# maybe throw away
    "stepSize": hp.choice("stepSize", [0.8]),
    "solver": hp.choice('solver', ['l-bfgs']),
    'blockSize': hp.choice('blockSize', [96])
}

search_space

{'maxIter': <hyperopt.pyll.base.Apply at 0x7f38614280d0>,
 'layers': <hyperopt.pyll.base.Apply at 0x7f38614284f0>,
 'stepSize': <hyperopt.pyll.base.Apply at 0x7f3861428310>,
 'solver': <hyperopt.pyll.base.Apply at 0x7f38614281f0>,
 'blockSize': <hyperopt.pyll.base.Apply at 0x7f3861428070>}

In [0]:
def objective_function(params):
    # CHANGES HERE
    maxIter = params["maxIter"]
    layers = params["layers"]
    stepSize = params["stepSize"]
    solver = params["solver"]
    blockSize = params["blockSize"]


    with mlflow.start_run():
        # CHANGES HERE
        estimator = pipeline.copy({model_nn.maxIter:maxIter,
                                  model_nn.layers:layers,
                                  model_nn.stepSize:stepSize,
                                  model_nn.solver:solver,
                                  model_nn.blockSize:blockSize})
        

        model = estimator.fit(df_train)

        preds_training = model.transform(df_train)       
        pred_calc_training = RegressionEvaluator(preds_training)  

        train_accuracy = pred_calc_training[0]
        train_precision = pred_calc_training[1]
        train_recall = pred_calc_training[2]
        train_f1_score = pred_calc_training[3]
        train_f2_score = pred_calc_training[4]
        train_areaUnderROC = pred_calc_training[5]
        train_areaUnderPR = pred_calc_training[6]

        mlflow.log_metric('train_accuracy', train_accuracy)
        mlflow.log_metric('train_precision', train_precision)
        mlflow.log_metric('train_recall', train_recall)
        mlflow.log_metric('train_f1_score', train_f1_score)
        mlflow.log_metric('train_f2_score', train_f2_score)
        mlflow.log_metric('train_areaUnderROC', train_areaUnderROC)
        mlflow.log_metric('train_areaUnderPR', train_areaUnderPR)


        print('-------------------')
        print('Train Metrics:')
        print('accuracy:',train_accuracy)
        print('precision:',train_precision)
        print('recall:',train_recall)

        print('f1_score:',train_f1_score)
        print('f2_score:',train_f2_score)

        print('areaUnderROC:',str(train_areaUnderROC))
        print('areaUnderPR:',str(train_areaUnderPR))

        preds = model.transform(df_val)
        pred_calc = RegressionEvaluator(preds)
        val_accuracy = pred_calc[0]
        val_precision = pred_calc[1]
        val_recall = pred_calc[2]
        val_f1_score = pred_calc[3]
        val_f2_score = pred_calc[4]
        val_areaUnderROC = pred_calc[5]
        val_areaUnderPR = pred_calc[6]

        mlflow.log_metric('val_accuracy', val_accuracy)
        mlflow.log_metric('val_precision', val_precision)
        mlflow.log_metric('val_recall', val_recall)
        mlflow.log_metric('val_f1_score', val_f1_score)
        mlflow.log_metric('val_f2_score', val_f2_score)
        mlflow.log_metric('val_areaUnderROC', val_areaUnderROC)
        mlflow.log_metric('val_areaUnderPR', val_areaUnderPR)
        print('-------------------')
        print('Validation Metrics:')
        print('accuracy:',val_accuracy)
        print('precision:',val_precision)
        print('recall:',val_recall)
        print('f1_score:',val_f1_score)
        print('f2_score:',val_f2_score)
        print('areaUnderROC:',val_areaUnderROC)
        print('areaUnderPR:',val_areaUnderPR)


        print('-------------------')
        print('Model Params:')
        print('maxIter:',maxIter)
        print('layers:',layers)
        print('stepSize:',stepSize)
        print('solver:',solver)
        print('blockSize:',blockSize)

        mlflow.spark.log_model(model, "NN_Model_Test_60m")
        # print('model_logging_complete' + " --- %s seconds ---" % (time.time() - start_time))
    return val_areaUnderPR

In [0]:
# def objective_function(params):
#     # CHANGES HERE
#     maxIter = params["maxIter"]
#     layers = params["layers"]
#     stepSize = params["stepSize"]
#     solver = params["solver"]
#     blockSize = params["blockSize"]

#     start_time = time.time()

#     with mlflow.start_run():
#         # CHANGES HERE
#         estimator = pipeline.copy({model_nn.maxIter:maxIter,
#                                   model_nn.layers:layers,
#                                   model_nn.stepSize:stepSize,
#                                   model_nn.solver:solver,
#                                   model_nn.blockSize:blockSize})
#         model = estimator.fit(df_train)
#         # print('fit_complete'+ " --- %s seconds ---" % (time.time() - start_time))

#         preds = model.transform(df_test)
#         # print('transform_complete' + " --- %s seconds ---" % (time.time() - start_time))

#         pred_calc = RegressionEvaluator(preds)
#         # print('evaluation_metrics_complete' + " --- %s seconds ---" % (time.time() - start_time))

#         mlflow.spark.log_model(model, "NN_Model_1YEAR")
#         print('model_logging_complete' + " --- %s seconds ---" % (time.time() - start_time))
#         f1_score = pred_calc[3]
#         f2_score = pred_calc[4]
#         mlflow.log_metric('f1', f1_score)

#         print(f1_score,maxIter,layers,stepSize,solver,blockSize)
#     return f2_score


In [0]:
from hyperopt import fmin, tpe, Trials
import mlflow

mlflow.pyspark.ml.autolog(log_models=False)
num_evals = 1
trials = Trials()
best_hyperparam = fmin(fn=objective_function,
                       space = search_space,
                       algo=tpe.suggest,
                       max_evals = num_evals,
                       trials=trials,
                       rstate=np.random.default_rng(42))


  0%|          | 0/1 [00:00<?, ?trial/s, best loss=?]





                                                     -------------------
  0%|          | 0/1 [41:32<?, ?trial/s, best loss=?]                                                     Train Metrics:
  0%|          | 0/1 [41:32<?, ?trial/s, best loss=?]                                                     accuracy:
  0%|          | 0/1 [41:32<?, ?trial/s, best loss=?]                                                     0.8226789893561179
  0%|          | 0/1 [41:32<?, ?trial/s, best loss=?]                                                     precision:
  0%|          | 0/1 [41:32<?, ?trial/s, best loss=?]                                                     0.825025096299883
  0%|          | 0/1 [41:32<?, ?trial/s, best loss=?]                                                     recall:
  0%|          | 0/1 [41:32<?, ?trial/s, best loss=?]                                                     0.9952883140576727
  0%|          | 0/1 [41:32<?, ?trial/s, best loss=?]       

2023/08/08 21:40:17 INFO mlflow.spark: Inferring pip requirements by reloading the logged model from the databricks artifact repository, which can be time-consuming. To speed up, explicitly specify the conda_env or pip_requirements when calling log_model().




100%|██████████| 1/1 [43:57<00:00, 2637.70s/trial, best loss: 0.33109242467464406]100%|██████████| 1/1 [43:57<00:00, 2637.70s/trial, best loss: 0.33109242467464406]


In [0]:
best_hyperparam

{'blockSize': 0, 'layers': 0, 'maxIter': 0, 'solver': 0, 'stepSize': 0}

In [0]:
MultilayerPerceptronClassifier()

MultilayerPerceptronClassifier_c41af4305c0c