In [None]:
%%html
<style>
.output_subarea.output_text.output_stream.output_stdout > pre {
    width:max-content;
}
.p-Widget.jp-RenderedText.jp-OutputArea-output > pre {
   width:max-content;
}
</style>

In [None]:
import html
import visualize

In [None]:
import pandas as pd
import sys
# Make the parent directory importable by appending it to the module search path
# (the alternative sys.path.insert(0, '..') would put it first instead).
sys.path.append('..')
# Raise the pandas display limits to a very large value so columns, rows and line
# width are not truncated when a frame is printed.
pd.set_option('display.max_columns', 10000000)
pd.set_option('display.max_rows', 10000000)
pd.set_option('display.width', 10000000)

In [None]:
import visualize
from com.ophelia.OpheliaVendata import OpheliaVendata

In [None]:
ophelia = OpheliaVendata()

In [None]:
# Session object exposed by the Ophelia wrapper; it is passed to the reader below.
spark = ophelia.ophelia_session
# Input location, relative to the notebook's working directory.
path = "data/ophelia/out/model/RiskClassifier/"
# Load the data through the project's reader helper ("parquet" is presumably the file format — confirm).
customer_banking = ophelia.ophelia_read.read_file(spark, path, "parquet")

In [None]:
customer_banking.show(5)

## Se ha decidido aplicar el algoritmo Gradient-Boosted Tree Classifier porque incorpora el potenciador del gradiente descendente y ha demostrado tener buenos resultados. Dado que el algoritmo no soporta clasificación multiclase, se aplicará un tratamiento especial a los datos.

## Se crearán 5 GBTClassifier, uno para cada clase de riesgo {'A', 'MA', 'M', 'MC', 'C'}, convertiremos la clase $k_{i}$ en 1 y el resto en cero.

In [None]:
import numpy as np

from pyspark.sql import DataFrame
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, VectorAssembler, OneHotEncoder
from pyspark.sql.functions import when, col, lit

In [None]:
# Categorical predictors, kept as strings at this stage.
initial_string_columns = ["job", "marital", "education", "gender"]
# Numeric predictors, each cast to float.
cast_numeric_columns = [
    col(numeric_name).cast("float")
    for numeric_name in ("age", "child", "saving", "insight", "backup")
]
# One-vs-rest target: 1.0 when risk_label is "C", 0.0 otherwise.
build_target = when(col("risk_label") == "C", lit(1.0)).otherwise(0.0)
mapped_classes = customer_banking.select(
    *(initial_string_columns + cast_numeric_columns),
    build_target.alias("label")
)
mapped_classes.show(5, False)
mapped_classes.printSchema()

## Bitácora de experimentos:
- Para el primer experimento entrenamos un modelo Gradient Boosted Tree Classifier que predice la clase 'C' (conservador) como variable 'target' = 1. La distribución es [1: 526, 0: 10636]; la clase 1 (positiva) presenta un desbalanceo de 20:1. Observaremos los resultados producidos por el modelo de facto a entrenar.

In [None]:
# Count both classes with a single aggregation instead of one Spark job per class.
label_counts = {row["label"]: row["count"] for row in mapped_classes.groupBy("label").count().collect()}
# A class with no rows is simply missing from the aggregation, so default it to 0
# rather than indexing into an empty result.
negative_class = label_counts.get(0.0, 0)
positive_class = label_counts.get(1.0, 0)

if positive_class:
    print("Class proportion", str(round(negative_class/positive_class)) + ":1")
else:
    # Without positive rows the ratio is undefined; report it instead of dividing by zero.
    print("Class proportion undefined: no positive rows")

In [None]:
def pipe(transform_list):
    """
    Wrap the given stages in a new, unfitted Pipeline.

    :param transform_list: stages handed to ``Pipeline(stages=...)``
    :return: the Pipeline built from those stages
    """
    pipeline = Pipeline(stages=transform_list)
    return pipeline

def fit(pipe, df):
    """
    Fit ``pipe`` on ``df``.

    :param pipe: any object exposing a ``fit(df)`` method
    :param df: data passed to ``pipe.fit``
    :return: whatever ``pipe.fit(df)`` returns
    """
    fitted = pipe.fit(df)
    return fitted

def transform(model, df):
    """
    Apply ``model`` to ``df``.

    :param model: any object exposing a ``transform(df)`` method
    :param df: data passed to ``model.transform``
    :return: whatever ``model.transform(df)`` returns
    """
    transformed = model.transform(df)
    return transformed

In [None]:
def string_indexer(col_list):
    """
    Build one StringIndexer per column name.

    :param col_list: iterable of input column names
    :return: list of StringIndexer objects, each writing to ``<column>_index``
    """
    indexers = []
    for column in col_list:
        output_name = "{0}_index".format(column)
        indexers.append(StringIndexer(inputCol=column, outputCol=output_name))
    return indexers

def build_string_index(df, col_list):
    """
    Index every column in ``col_list`` and return ``df`` with the index columns added.

    :param df: DataFrame holding the columns to index
    :param col_list: names of the columns to index
    :return: result of transforming ``df`` with the fitted indexer pipeline
    """
    indexers = string_indexer(col_list)
    # The local must not be named `pipe`: assigning to that name makes it local to this
    # function, and the call to the module-level pipe() helper then raises UnboundLocalError.
    indexer_pipeline = pipe(indexers)
    fit_model = fit(indexer_pipeline, df)
    return transform(fit_model, df)

In [None]:
a = []
# `a is list` compared the instance with the type object itself and was always False;
# isinstance() is the type test.
if isinstance(a, list):
    print("hola")

In [None]:
from typing import List

In [None]:
def single_string_indexer(single_col: str) -> StringIndexer:
    """
    Build the StringIndexer for one column.

    :param single_col: input column name
    :return: StringIndexer writing to ``<single_col>_index``
    """
    output_name = single_col + "_index"
    return StringIndexer(inputCol=single_col, outputCol=output_name)

def multi_string_indexer(multi_col: list) -> List[StringIndexer]:
    """
    Build one StringIndexer per column name.

    :param multi_col: input column names
    :return: list of StringIndexer objects, each writing to ``<column>_index``
    """
    indexer = []
    for column in multi_col:
        output_name = "{0}_index".format(column)
        indexer.append(StringIndexer(inputCol=column, outputCol=output_name))
    return indexer

In [None]:
def build_string_index(df: DataFrame, indexer_type: str, col_name: str = None, col_list: list = None) -> DataFrame:
    """
    Fit string indexers on ``df`` and return ``df`` with the index columns added.

    :param df: DataFrame holding the columns to index
    :param indexer_type: "single" (one column) or "multi" (several columns)
    :param col_name: column(s) to index; used only when ``col_list`` is None
    :param col_list: columns to index; takes precedence over ``col_name``
    :return: result of transforming ``df`` with the fitted indexer pipeline
    :raises ValueError: if ``indexer_type`` is unknown or no column was supplied
    """
    dict_indexer = {
        "single": single_string_indexer,
        "multi": multi_string_indexer
    }
    if indexer_type not in dict_indexer:
        # Report the offending value instead of leaking a KeyError from the lookup.
        raise ValueError('Unexpected indexer type:{}'.format(indexer_type))
    # col_list wins when both are given, as before.
    columns = col_list if col_list is not None else col_name
    if columns is None:
        raise ValueError("Either 'col_name' or 'col_list' must be provided")
    stages = dict_indexer[indexer_type](columns)
    if not isinstance(stages, list):
        # The "single" builder returns one bare indexer, but a pipeline needs a list of stages.
        stages = [stages]
    pipe_ml = pipe(stages)
    fit_model = fit(pipe_ml, df)
    return transform(fit_model, df)

In [None]:
categorical_columns = ['job', 'marital', 'education', 'gender']

In [None]:
# Several columns are indexed at once, so the list goes through `col_list`
# (`col_name` is annotated as a single string).
transform_df = build_string_index(df=mapped_classes, indexer_type="multi", col_list=categorical_columns)
transform_df.show(5, False)
transform_df.printSchema()

In [None]:
from pyspark.ml.feature import Binarizer, BucketedRandomProjectionLSH, BucketedRandomProjectionLSHModel, Bucketizer, \
    ChiSqSelector, ChiSqSelectorModel, CountVectorizer, CountVectorizerModel, DCT, ElementwiseProduct, FeatureHasher, \
    HashingTF, IDF, IDFModel, Imputer, ImputerModel, IndexToString, MaxAbsScaler, MaxAbsScalerModel, MinHashLSH, \
    MinHashLSHModel, MinMaxScaler, MinMaxScalerModel, NGram, Normalizer, OneHotEncoder, \
    OneHotEncoderModel, PCA, PCAModel, PolynomialExpansion, QuantileDiscretizer, RegexTokenizer, RFormula, \
    RFormulaModel, StringIndexer, StringIndexerModel, VectorSlicer, Word2Vec, VectorSizeHint, StopWordsRemover, \
    StandardScalerModel, StandardScaler, SQLTransformer, Tokenizer, VectorAssembler, VectorIndexer, VectorIndexerModel, \
    Word2VecModel


class OpheliaJavaTransformers:
    """Namespace listing the ``pyspark.ml.feature`` classes imported in this cell."""

    # The entries are the class objects themselves, not name strings.
    __all__ = [Binarizer, BucketedRandomProjectionLSH, BucketedRandomProjectionLSHModel,
               Bucketizer, ChiSqSelector, ChiSqSelectorModel, CountVectorizer, CountVectorizerModel,
               DCT, ElementwiseProduct, FeatureHasher, HashingTF, IDF, IDFModel, Imputer, ImputerModel,
               IndexToString, MaxAbsScaler, MaxAbsScalerModel, MinHashLSH, MinHashLSHModel, MinMaxScaler,
               MinMaxScalerModel, NGram, Normalizer, OneHotEncoder, OneHotEncoderModel,
               PCA, PCAModel, PolynomialExpansion, QuantileDiscretizer, RegexTokenizer, RFormula, RFormulaModel,
               SQLTransformer, StandardScaler, StandardScalerModel, StopWordsRemover, StringIndexer, StringIndexerModel,
               Tokenizer, VectorAssembler, VectorIndexer, VectorIndexerModel, VectorSizeHint, VectorSlicer, Word2Vec,
               Word2VecModel]

In [None]:
def class_type(dtype):
    """
    Return the class of an object.

    :param dtype: any object
    :return: the object's ``__class__``
    """
    kind = dtype.__class__
    return kind

def class_name(dtype):
    """
    Return the name of an object's class.

    :param dtype: any object
    :return: string, ``__name__`` of the object's ``__class__``
    """
    kind = dtype.__class__
    return kind.__name__

In [None]:
def fit(pipe, df: DataFrame):
    """
    Fit ``pipe`` on ``df`` after checking that ``pipe`` is an accepted object.

    :param pipe: a Pipeline, or an instance of a class listed in ``OpheliaJavaTransformers.__all__``
    :param df: DataFrame passed to ``pipe.fit``
    :return: whatever ``pipe.fit(df)`` returns
    :raises TypeError: if ``pipe`` is neither a Pipeline nor a listed class
    """
    # build_string_index() hands a Pipeline to this function, so a Pipeline has to pass
    # the guard even though it is not one of the listed feature classes.
    if class_type(pipe) not in OpheliaJavaTransformers.__all__ and not isinstance(pipe, Pipeline):
        raise TypeError("'pipe' must be OpheliaMLObjects not " + class_name(pipe))
    return pipe.fit(df)

In [None]:
def ohe_estimator(col_list):
    """
    Build the OneHotEncoder for the indexed versions of ``col_list``.

    :param col_list: names of the original (un-indexed) columns
    :return: OneHotEncoder reading each ``<column>_index`` and writing ``<column>_index_encoded``
    """
    index_columns = [indexer.getOutputCol() for indexer in string_indexer(col_list)]
    encoded_columns = ["{0}_encoded".format(name) for name in index_columns]
    return OneHotEncoder(inputCols=index_columns, outputCols=encoded_columns)

def build_one_hot_encoder(df, col_list):
    """
    Fit the one-hot encoder for ``col_list`` on ``df`` and apply it to the same frame.

    :param df: DataFrame that already contains the ``<column>_index`` columns
    :param col_list: names of the original (un-indexed) columns
    :return: ``df`` transformed by the fitted encoder
    """
    fitted_encoder = fit(ohe_estimator(col_list), df)
    return transform(fitted_encoder, df)

In [None]:
# Predicate selecting the intermediate "<name>_index" columns.
columns_lambda = lambda k: k.endswith('_index')
indexed_columns = [name for name in transform_df.columns if columns_lambda(name)]
# One-hot encode, then drop the index columns.
encode_vector_df = build_one_hot_encoder(transform_df, categorical_columns).drop(*indexed_columns)
encode_vector_df.show(5)

In [None]:
def indexer_encoded(index_list):
    """
    Return the encoded-column names matching ``index_list``.

    :param index_list: names of the original (un-indexed) columns
    :return: list of strings, each indexer's output column followed by ``_encoded``
    """
    return [indexer.getOutputCol() + "_encoded" for indexer in string_indexer(index_list)]

In [None]:
# Names of the one-hot encoded columns: each indexer's output column plus "_encoded".
indexers = string_indexer(categorical_columns)
string_indexer_cols = [indexer.getOutputCol() + "_encoded" for indexer in indexers]

In [None]:
string_indexer_cols

In [None]:
# Numeric predictors that go into the feature vector as-is.
numericCols = ['age', 'child', 'saving', 'insight', 'backup']
# Feature vector inputs: the encoded categorical columns followed by the numeric ones.
assemblerInputs = string_indexer_cols + numericCols
# Assembles the inputs above into a single "features" column.
assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")

In [None]:
assemble_model = assembler.transform(encode_vector_df)
assemble_model.show(5, False)

In [None]:
# Keep the columns of mapped_classes plus the assembled "features" vector.
cols = mapped_classes.columns
selectedCols = cols + ['features']
vectorized_customer_banking = assemble_model.select(selectedCols)

In [None]:
vectorized_customer_banking.show(5, False)

In [None]:
# 70/30 split with a fixed seed.
train, test = vectorized_customer_banking.randomSplit([0.7, 0.3], seed=2020)
print("Training Dataset Count: {}".format(train.count()))
print("Test Dataset Count: {}".format(test.count()))

# Label distribution of each split.
print("\nTrain distribution label")
train.groupBy("label").count().show()

print("Test distribution label")
test.groupBy("label").count().show()

## Logistic Regression Model

In [None]:
from pyspark.ml.classification import LogisticRegression
# Logistic regression on the assembled "features" column, limited to 10 iterations.
lr = LogisticRegression(featuresCol='features', labelCol='label', maxIter=10)
lrModel = lr.fit(train)

In [None]:
class_type(lrModel)

In [None]:
class_name(lrModel)

In [None]:
def plot_summary(model):
    """
    Plot the model's coefficients in ascending order.

    :param model: fitted model exposing ``coefficientMatrix``
    :return: None, the figure is shown
    """
    # Imported here because the notebook only imports pyplot in a later cell.
    import matplotlib.pyplot as plt

    matrix = model.coefficientMatrix
    # A Spark matrix is not an ndarray: convert it first. Anything without toArray()
    # is passed to numpy unchanged.
    values = matrix.toArray() if hasattr(matrix, "toArray") else matrix
    # Flatten so np.sort orders all coefficients together and the plot is a single line.
    beta = np.sort(np.asarray(values).ravel())
    plt.plot(beta)
    plt.ylabel('Beta Coefficients')
    plt.show()

In [None]:
lrModel.coefficientMatrix

In [None]:
lrModel.coefficients

In [None]:
lrModel.elasticNetParam

In [None]:
lrModel.intercept

In [None]:
lrModel.interceptVector

In [None]:
import matplotlib.pyplot as plt
plt.figure(figsize=(5,5))
# Dashed red diagonal from (0, 0) to (1, 1) as a visual reference.
plt.plot([0, 1], [0, 1], 'r--')
# Training ROC curve: FPR on the x axis, TPR on the y axis (each column is collected separately).
plt.plot(lrModel.summary.roc.select('FPR').collect(),
         lrModel.summary.roc.select('TPR').collect())
plt.xlabel('FPR')
plt.ylabel('TPR')
plt.show()

In [None]:
lrModel.summary.roc.show(5, False)

In [None]:
import matplotlib.pyplot as plt
import numpy as np
# Logistic-regression coefficients in ascending order.
beta = np.sort(lrModel.coefficients)
plt.plot(beta)
plt.ylabel('Beta Coefficients')
plt.show()

In [None]:
trainingSummary = lrModel.summary
roc = trainingSummary.roc.toPandas()
plt.plot(roc['FPR'],roc['TPR'])
# FPR is plotted on the x axis and TPR on the y axis, so the labels follow that order.
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.show()
print('Training set areaUnderROC: ' + str(trainingSummary.areaUnderROC))

In [None]:
# Precision-recall curve of the training summary: recall on x, precision on y.
pr = trainingSummary.pr.toPandas()
plt.plot(pr['recall'],pr['precision'])
plt.ylabel('Precision')
plt.xlabel('Recall')
plt.show()

In [None]:
mapped_classes.columns

In [None]:
test.printSchema()

In [None]:
# Columns to display: the model inputs plus the three columns read from the model output.
predict_cols = mapped_classes.columns + ['prediction', 'probability', 'rawPrediction']
predictions = lrModel.transform(test)
# Show only the rows whose true label is the positive class.
predictions.select(*predict_cols).where(col("label") == 1.0).show(20, False)

In [None]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator
# Evaluator built with its default settings; the printed label states the metric is the area under the ROC curve.
evaluator = BinaryClassificationEvaluator()
print('Test Area Under ROC', evaluator.evaluate(predictions))

In [None]:
import numpy as np

from typing import Any
from pyspark.sql import DataFrame
from pyspark.ml.linalg import DenseVector, DenseMatrix
from pyspark.ml.classification import LogisticRegression, LogisticRegressionModel, \
    BinaryLogisticRegressionTrainingSummary

def __build_parameters_model(**kargs: Any) -> dict:
    """
    Translate LogisticRegression-style keyword arguments into a complete settings dictionary.

    Every keyword is optional. A keyword that is missing or None gets the default listed
    below; a supplied value is cast to the listed type, and entries without a type are
    passed through untouched.

    :param kargs: any of featuresCol, labelCol, maxIter, predictionCol, regParam,
        elasticNetParam, tol, fitIntercept, threshold, thresholds, probabilityCol,
        rawPredictionCol, standardization, weightCol, aggregationDepth, family,
        lowerBoundsOnCoefficients, upperBoundsOnCoefficients, lowerBoundsOnIntercepts,
        upperBoundsOnIntercepts
    :return: dict keyed by the snake_case setting name
    """
    # (settings key, keyword name, default, cast applied to supplied values)
    spec = (
        ("features_col", "featuresCol", 'features', str),
        ("label_col", "labelCol", 'label', str),
        ("max_iter", "maxIter", 10, int),
        ("prediction_col", "predictionCol", 'prediction', str),
        ("reg_param", "regParam", 0.0, float),
        ("elastic_net_param", "elasticNetParam", 0.0, float),
        ("tolerance", "tol", 1e-6, float),
        ("fit_intercept", "fitIntercept", True, bool),
        ("threshold", "threshold", 0.5, float),
        ("thresholds", "thresholds", None, None),
        ("probability_col", "probabilityCol", 'probability', str),
        ("raw_prediction_col", "rawPredictionCol", 'rawPrediction', str),
        ("standardization", "standardization", True, bool),
        ("weight_col", "weightCol", None, None),
        ("aggregation_depth", "aggregationDepth", 2, int),
        ("family", "family", 'auto', str),
        ("lower_bounds_coefficients", "lowerBoundsOnCoefficients", None, None),
        ("upper_bounds_coefficients", "upperBoundsOnCoefficients", None, None),
        ("lower_bounds_intercepts", "lowerBoundsOnIntercepts", None, None),
        ("upper_bounds_intercepts", "upperBoundsOnIntercepts", None, None),
    )
    conf = {}
    for conf_key, keyword, default, cast in spec:
        value = kargs.get(keyword)
        if value is None:
            # Test for None BEFORE casting: int(None)/float(None) raise and str(None)
            # is the string "None", which made the defaults unreachable.
            conf[conf_key] = default
        elif cast is None:
            conf[conf_key] = value
        else:
            conf[conf_key] = cast(value)
    return conf

def train(df: DataFrame, **kargs: Any) -> LogisticRegressionModel:
    """
    Fit a LogisticRegression on ``df`` with settings resolved from ``kargs``.

    :param df: training DataFrame
    :param kargs: LogisticRegression keyword arguments, resolved by ``__build_parameters_model``
    :return: the fitted LogisticRegressionModel
    """
    settings = __build_parameters_model(**kargs)
    # LogisticRegression keyword -> key in the resolved settings dictionary.
    keyword_to_setting = {
        "featuresCol": "features_col",
        "labelCol": "label_col",
        "maxIter": "max_iter",
        "predictionCol": "prediction_col",
        "regParam": "reg_param",
        "elasticNetParam": "elastic_net_param",
        "tol": "tolerance",
        "fitIntercept": "fit_intercept",
        "threshold": "threshold",
        "thresholds": "thresholds",
        "probabilityCol": "probability_col",
        "rawPredictionCol": "raw_prediction_col",
        "standardization": "standardization",
        "weightCol": "weight_col",
        "aggregationDepth": "aggregation_depth",
        "family": "family",
        "lowerBoundsOnCoefficients": "lower_bounds_coefficients",
        "upperBoundsOnCoefficients": "upper_bounds_coefficients",
        "lowerBoundsOnIntercepts": "lower_bounds_intercepts",
        "upperBoundsOnIntercepts": "upper_bounds_intercepts",
    }
    estimator_kwargs = {keyword: settings[key] for keyword, key in keyword_to_setting.items()}
    estimator = LogisticRegression(**estimator_kwargs)
    return estimator.fit(df)

In [None]:
# NOTE(review): the `def train(...)` in the previous cell rebinds the name `train`, which until
# then held the training DataFrame returned by randomSplit. On a top-to-bottom run the first
# argument here is therefore the function itself, not a DataFrame — confirm and rename one of them.
train(train, maxIter=10)

# Exploraremos el valor de la información (iv) y el peso de la evidencia (woe) que aporta cada variable predictora al resultado de la variable dependiente

In [None]:
def __class_mark(observation, min_val, class_length, bins):
    """
    Computes the zero-based bin of an observation in an equal-width histogram.

    :param observation: float, value of which is required to obtain the bin number
    :param min_val: float, lower edge of the first bin
    :param class_length: float, length of each sub interval
    :param bins: int, number of sub intervals
    :return: int, bin corresponding to the given observation, never above ``bins - 1``
    """
    if class_length == 0:
        # Zero-width intervals (e.g. minimum equals maximum): the division below is
        # undefined, so every observation goes to the first bin.
        return 0
    interval = int((observation - min_val) / class_length)
    # An observation on or past the upper edge would land in bin `bins`; clamp it to the last one.
    return min(interval, bins - 1)
    
def __frequency_rdd_continuous(data_set_rdd, min_val, class_length, bins, n):
    """
    Builds the frequency table rdd of a continuous column rdd.

    :param data_set_rdd: rdd holding the values of the column
    :param min_val: float, lower edge of the first bin
    :param class_length: float, length of each sub interval
    :param bins: int, number of sub intervals
    :param n: int, divisor used for the relative frequency
    :return: rdd of tuples (1-based bin, lower limit, upper limit, count, count / n)
    """
    def to_bin_pair(value):
        return __class_mark(value, min_val, class_length, bins), 1

    def add_counts(left, right):
        return left + right

    def to_row(bin_count):
        bin_index, count = bin_count
        lower_limit = min_val + class_length * bin_index
        upper_limit = min_val + class_length * (bin_index + 1)
        return bin_index + 1, lower_limit, upper_limit, count, count / n

    counts_rdd = data_set_rdd.map(to_bin_pair).reduceByKey(add_counts)
    return counts_rdd.map(to_row)

def __frequency_rdd_discrete(data_set_rdd, n):
    """
    Builds the frequency table rdd of a discrete column rdd.

    :param data_set_rdd: rdd holding the values of the column
    :param n: int, divisor used for the relative frequency
    :return: rdd of tuples (value, count, count / n)
    """
    def to_pair(value):
        return value, 1

    def add_counts(left, right):
        return left + right

    def to_row(value_count):
        return value_count[0], value_count[1], value_count[1] / n

    counts_rdd = data_set_rdd.map(to_pair).reduceByKey(add_counts)
    return counts_rdd.map(to_row)

def frequency_table_continuous(data_set_df, column, bins=None, suffix=''):
    """
    Computes the histogram frequency table of a column with continuous values.

    The column is cut into ``bins`` equal-width intervals between its minimum and maximum.

    :param data_set_df: DataFrame holding the column
    :param column: string, column with continuous values whose histogram is computed
    :param bins: int, number of sub intervals; when None it is derived from the row count
        with Sturges' rule
    :param suffix: string, suffix appended to the limit and frequency column names
    :return: DataFrame with the bin number, its limits, absolute and relative frequency,
        and both cumulative frequencies
    """
    freq_schema = ['bin', 'lower_limit'+suffix, 'upper_limit'+suffix, 'fa_'+column+suffix, 'f_'+column+suffix]
    # Running total over the bins in ascending order.
    window_freq = Window.orderBy('bin').rangeBetween(Window.unboundedPreceding, 0)

    cumulative_rel_freq = spark_sum('f_'+column+suffix)\
        .over(window_freq)\
        .alias('F_cumulative_'+column+suffix)

    cumulative_abs_freq = spark_sum('fa_'+column+suffix)\
        .over(window_freq)\
        .alias('Fa_cumulative_'+column+suffix)

    data_set_rdd = data_set_df.select(column).rdd.map(lambda row: (row[0]))
    n = data_set_rdd.count()

    if bins is None:
        # Sturges' rule: k = 1 + log2(n) = 1 + 3.322 * log10(n). The 3.322 factor belongs
        # to the base-10 logarithm; with np.log (natural log) it over-counted the bins.
        bins = 1 + int(3.322 * np.log10(n))

    maximum = data_set_rdd.max()
    minimum = data_set_rdd.min()
    class_length = (maximum - minimum) / bins

    frequency_table_df = __frequency_rdd_continuous(data_set_rdd, minimum, class_length, bins, n)\
        .toDF(freq_schema)\
        .select('*', cumulative_abs_freq, cumulative_rel_freq)
    return frequency_table_df

def frequency_table_discrete(data_set_df, column, suffix=''):
    """
    Computes the frequency table of a column with discrete values.

    :param data_set_df: DataFrame holding the column
    :param column: string, column whose distinct values are counted
    :param suffix: string, suffix appended to the frequency column names
    :return: DataFrame with one row per distinct value ('bin'), its absolute and relative
        frequency, and both cumulative frequencies
    """
    abs_name = 'fa_' + column + suffix
    rel_name = 'f_' + column + suffix
    # Running total over the bins in ascending order.
    running_window = Window.orderBy('bin').rangeBetween(Window.unboundedPreceding, 0)

    cumulative_rel_freq = spark_sum(rel_name).over(running_window).alias('F_cumulative_' + column + suffix)
    cumulative_abs_freq = spark_sum(abs_name).over(running_window).alias('Fa_cumulative_' + column + suffix)

    values_rdd = data_set_df.select(column).rdd.map(lambda row: (row[0]))
    total_rows = values_rdd.count()

    frequency_rdd = __frequency_rdd_discrete(values_rdd, total_rows)
    frequency_df = frequency_rdd.toDF(['bin', abs_name, rel_name])
    return frequency_df.select('*', cumulative_abs_freq, cumulative_rel_freq)

def compute_information_value(mixed_dist_df, column):
    """
    Adds the weight of evidence (woe) and information value (iv) columns to a table that
    holds the relative frequencies of the negative and positive populations.

    :param mixed_dist_df: DataFrame with columns 'bin', 'f_<column>_0' and 'f_<column>_1'
    :param column: string, name of the column the frequencies belong to
    :return: DataFrame ordered by 'bin' with 'woe_<column>' and 'iv_<column>' added
    """
    neg_freq = col('f_' + column + '_0')
    pos_freq = col('f_' + column + '_1')
    woe_name = 'woe_' + column
    iv_name = 'iv_' + column
    # Added to both frequencies so the ratio and its logarithm stay defined for empty bins.
    epsilon = 0.000000001

    woe_expr = spark_log((neg_freq + epsilon) / (pos_freq + epsilon))
    iv_expr = (neg_freq - pos_freq) * col(woe_name)

    filled_df = mixed_dist_df.fillna(0)
    scored_df = filled_df.withColumn(woe_name, woe_expr).withColumn(iv_name, iv_expr)
    return scored_df.fillna(0, subset=[woe_name, iv_name]).orderBy('bin')

def information_value_continuous(data_set_df, column, target, neg_label=0, pos_label=1, bins=None):
    """
    Computes the frequency histograms of the negative and positive populations over a common
    set of bins and then the weight of evidence (woe) and information value (iv) table.

    :param data_set_df: DataFrame holding ``column`` and ``target``
    :param column: string, column with continuous values whose histogram is computed
    :param target: string, target column separating negative and positive observations
    :param neg_label: int, value identifying a negative row
    :param pos_label: int, value identifying a positive row
    :param bins: int, number of sub intervals; when None it is derived from the row count
        with Sturges' rule
    :return: DataFrame with the woe and iv of each bin, plus the bin limits
    """
    freq_neg_schema = ['bin', 'lower_limit_' + column, 'upper_limit_' + column, 'fa_' + column + '_0', 'f_' + column + '_0']
    freq_pos_schema = ['bin', 'lower_limit_1', 'upper_limit_1', 'fa_' + column + '_1', 'f_' + column + '_1']

    data_set_rdd = data_set_df.select(column).rdd.map(lambda row: (row[0]))
    n = data_set_rdd.count()

    if bins is None:
        # Sturges' rule: k = 1 + log2(n) = 1 + 3.322 * log10(n). The 3.322 factor belongs
        # to the base-10 logarithm; with np.log (natural log) it over-counted the bins.
        bins = 1 + int(3.322 * np.log10(n))

    # Bin edges come from the whole column so both populations share the same intervals.
    maximum = data_set_rdd.max()
    minimum = data_set_rdd.min()
    class_length = (maximum - minimum) / bins

    # Limits are rebuilt from the 1-based bin number after the join.
    lower_lim_expr = (lit(minimum) + lit(class_length) * (col('bin') - lit(1))).alias('lower_limit')
    upper_lim_expr = (lit(minimum) + lit(class_length) * col('bin')).alias('upper_limit')

    neg_rdd = data_set_df\
        .where(col(target) == neg_label)\
        .select(column).rdd\
        .map(lambda row: (row[0]))

    pos_rdd = data_set_df\
        .where(col(target) == pos_label)\
        .select(column).rdd\
        .map(lambda row: (row[0]))

    # NOTE(review): both histograms are divided by the total row count `n`, not by the size
    # of their own population; WoE is usually defined on per-class proportions — confirm intent.
    freq_neg_df = __frequency_rdd_continuous(neg_rdd, minimum, class_length, bins, n)\
        .toDF(freq_neg_schema)\
        .select('bin', 'f_' + column + '_0')

    freq_pos_df = __frequency_rdd_continuous(pos_rdd, minimum, class_length, bins, n)\
        .toDF(freq_pos_schema)\
        .select('bin', 'f_' + column + '_1')

    # Full join keeps bins that only one of the populations falls into.
    mixed_dist_df = freq_pos_df.join(freq_neg_df, on='bin', how='full')\
        .select('*', lower_lim_expr, upper_lim_expr)

    return compute_information_value(mixed_dist_df, column)

def information_value_discrete(data_set_df, column, target, neg_label=0, pos_label=1):
    """
    Computes the frequency tables of the negative and positive populations for a discrete
    column and then the weight of evidence (woe) and information value (iv) table.

    :param data_set_df: DataFrame holding ``column`` and ``target``
    :param column: string, column whose distinct values act as bins
    :param target: string, target column separating negative and positive observations
    :param neg_label: int, value identifying a negative row
    :param pos_label: int, value identifying a positive row
    :return: DataFrame with the woe and iv of each distinct value
    """
    # Total row count; it is the divisor of both populations' relative frequencies.
    total_rows = data_set_df.select(column).rdd.map(lambda row: (row[0])).count()

    def class_frequencies(label_value, label_suffix):
        # Relative frequency per distinct value for the rows whose target equals label_value.
        class_rdd = data_set_df.where(col(target) == label_value)\
            .select(column).rdd\
            .map(lambda row: (row[0]))
        rel_name = 'f_' + column + label_suffix
        schema = ['bin', 'fa_' + column + label_suffix, rel_name]
        return __frequency_rdd_discrete(class_rdd, total_rows).toDF(schema).select('bin', rel_name)

    freq_neg_df = class_frequencies(neg_label, '_0')
    freq_pos_df = class_frequencies(pos_label, '_1')

    # Full join keeps values seen in only one population; their missing frequency becomes 0.
    mixed_dist_df = freq_pos_df.join(freq_neg_df, on='bin', how='full').fillna(0)
    return compute_information_value(mixed_dist_df, column)

In [None]:
import time
import traceback
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from matplotlib.pyplot import figure
from pyspark_dist_explore import hist
from pyspark.sql import SparkSession
from pyspark.sql.window import Window
from pyspark.sql.functions import col, lit, sum as spark_sum, log as spark_log

def create_frequency_table(df, coln, targt, function_type):
    """
    Dispatch to the continuous or discrete information-value computation.

    :param df: DataFrame holding the column and the target
    :param coln: string, column to analyse
    :param targt: string, target column
    :param function_type: "continuous" or "discrete"
    :return: DataFrame returned by the selected information-value function
    """
    builders = dict(
        continuous=information_value_continuous,
        discrete=information_value_discrete,
    )
    build_table = builders[function_type]
    return build_table(data_set_df=df, column=coln, target=targt)

In [None]:
# One information-value table per predictor, using the "prediction" column as target.
for column_name, column_kind in (
    ("job", "discrete"),
    ("age", "continuous"),
    ("education", "discrete"),
    ("gender", "discrete"),
):
    create_frequency_table(predictions, column_name, "prediction", column_kind).show(100, False)

## Decision Tree Classifier

In [None]:
from pyspark.ml.classification import DecisionTreeClassifier

# NOTE(review): an earlier cell defines a function called `train`, which rebinds the name that
# held the training DataFrame; on a top-to-bottom run `train` here is that function — confirm.
dt = DecisionTreeClassifier(featuresCol='features', labelCol='label', maxDepth = 3)
dtModel = dt.fit(train)
# Rebinds `predictions` to the decision-tree output on the test split.
predictions = dtModel.transform(test)
predictions.select(*predict_cols).show(5, False)

In [None]:
# Area under the ROC curve of the current `predictions`, requested explicitly by metric name.
evaluator = BinaryClassificationEvaluator()
print("Test Area Under ROC: {}".format(evaluator.evaluate(predictions, {evaluator.metricName: "areaUnderROC"})))

## Random Forest Classifier

In [None]:
from pyspark.ml.classification import RandomForestClassifier
# NOTE(review): an earlier cell defines a function called `train`, which rebinds the name that
# held the training DataFrame; on a top-to-bottom run `train` here is that function — confirm.
rf = RandomForestClassifier(featuresCol='features', labelCol='label')
rfModel = rf.fit(train)
# Rebinds `predictions` to the random-forest output on the test split.
predictions = rfModel.transform(test)
predictions.select(*predict_cols).show(5, False)

In [None]:
# Area under the ROC curve of the current `predictions`, requested explicitly by metric name.
evaluator = BinaryClassificationEvaluator()
print("Test Area Under ROC: {}".format(evaluator.evaluate(predictions, {evaluator.metricName: "areaUnderROC"})))

## Gradient-Boosted Tree Classifier

In [None]:
from pyspark.ml.classification import GBTClassifier

# NOTE(review): an earlier cell defines a function called `train`, which rebinds the name that
# held the training DataFrame; on a top-to-bottom run `train` here is that function — confirm.
# Only maxIter is set; every other GBTClassifier setting is left at its default.
gbt = GBTClassifier(maxIter=10)
gbtModel = gbt.fit(train)
# Rebinds `predictions` to the gradient-boosted-tree output on the test split.
predictions = gbtModel.transform(test)
predictions.select(*predict_cols).show(10, False)

In [None]:
predict_cols

In [None]:
gbtModel.featureImportances

In [None]:
gbtModel.getNumTrees

In [None]:
gbtModel.treeWeights

In [None]:
gbtModel.featureSubsetStrategy

In [None]:
gbtModel.cacheNodeIds

In [None]:
gbtModel.checkpointInterval

In [None]:
gbtModel.impurity

In [None]:
gbtModel.lossType

In [None]:
# Area under the ROC curve of the current `predictions`, requested explicitly by metric name.
evaluator = BinaryClassificationEvaluator()
print("Test Area Under ROC: {}".format(evaluator.evaluate(predictions, {evaluator.metricName: "areaUnderROC"})))

In [None]:
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

# 3 maxDepth x 2 maxBins x 2 maxIter = 12 parameter combinations.
paramGrid = ParamGridBuilder()\
                .addGrid(gbt.maxDepth, [2, 4, 6])\
                .addGrid(gbt.maxBins, [20, 60])\
                .addGrid(gbt.maxIter, [10, 20])\
                .build()

# Run cross validation. Each combination is fitted once per fold (12 x 5 = 60 GBT fits),
# so this cell is slow. The seed pins the fold assignment, like the seeded train/test split.
cv = CrossValidator(estimator=gbt, estimatorParamMaps=paramGrid, evaluator=evaluator, numFolds=5, seed=2020)
cvModel = cv.fit(train)

boosting_predictions = cvModel.transform(test)
evaluator.evaluate(boosting_predictions)

In [None]:
cvModel.bestModel.featureImportances.values

In [None]:
cvModel.bestModel.evaluateEachIteration(train)

In [None]:
cvModel.bestModel.extractParamMap()

In [None]:
print(cvModel.bestModel.explainParams())

In [None]:
cvModel.bestModel.totalNumNodes

In [None]:
# `risk_label` is not among the selected columns; `label` is 1.0 exactly when risk_label
# is "C" (see the target construction), so filter on it as the logistic-regression cell does.
boosting_predictions.select(*predict_cols).where(col("label") == 1.0).show(1000, False)

# Multilayer Perceptron Classifier

# SVC

# KMeans

# KNN

# word2vec
# tokenizer
# CountVectorizer
# TF-IDF
# FeatureHasher
# StopWordsRemover
# n-gram
# binarizer
# PCA
# PolynomialExpansion
# Discrete Cosine Transform
# IndexToString
# Interaction
# VectorIndexer
# Normalizer
# StandardScaler
# RobustScaler
# MinMaxScaler
# MaxAbsScaler
# Bucketizer
# ElementwiseProduct
# SQLTransformer
# VectorSizeHint
# QuantileDiscretizer
# Imputer
# VectorSlicer
# RFormula
# ChiSqSelector
# Locality Sensitive Hashing
# Approx Nearest Neighbors