In [None]:
%%html
<style>
/* Let plain-text stdout output (classic Notebook and JupyterLab class names)
   grow to the width of its content, so wide DataFrame.show() tables are not wrapped. */
.output_subarea.output_text.output_stream.output_stdout > pre,
.p-Widget.jp-RenderedText.jp-OutputArea-output > pre {
    width: max-content;
}
</style>

In [None]:
import sys
import os
import operator

from pyspark.sql import Row
from pyspark.sql.functions import udf, when, rand, struct, col, lit, current_date, current_timestamp, monotonically_increasing_id
from pyspark.sql.types import StringType, IntegerType, StructField, StructType, DoubleType, ArrayType, FloatType

In [None]:
import visualize
from ophelia.ophelib.OpheliaMain import Ophelia

In [None]:
# Absolute path of the directory one level above the current working directory.
module_path = os.path.abspath(os.pardir)

In [None]:
# Create the Ophelia application wrapper, then ask its Spark helper for a Spark context.
ophelia = Ophelia("Risk Classification")
spark_helper = ophelia.Spark
sc = spark_helper.build_spark_context()
# Disabled: would register the sibling "enquire" directory with the Spark context.
# sc.addFile(module_path + "/enquire/", recursive=True)

In [None]:
# Session handle exposed by the Ophelia wrapper.
spark = ophelia.SparkSession
path = "data/raw/csv/bank.csv"

# The first CSV row is the header; column types are inferred from the data.
csv_reader = spark.read.options(header=True, inferSchema=True)
customer_data = csv_reader.csv(path)

# Quick sanity check of the inferred schema and the first rows (untruncated).
customer_data.printSchema()
customer_data.show(n=5, truncate=False)

In [None]:
# (column name, Spark SQL type) for every customer attribute, in output order.
feature_types = [
    ("age", "int"),
    ("job", "string"),
    ("marital", "string"),
    ("education", "string"),
    ("gender", "string"),
    ("child", "float"),
    ("saving", "float"),
    ("insight", "float"),
    ("backup", "float"),
]

# Each attribute is cast to its target type and keeps its original name.
column_selection = [
    col(feature).cast(sql_type).alias(feature)
    for feature, sql_type in feature_types
]

# Keep the typed columns and additionally pack them into one struct column,
# which is what the classification UDF receives as its single argument.
packed_features = struct(*column_selection).alias("struct_assembler")
struct_assembler = customer_data.select(*column_selection, packed_features)
struct_assembler.show(n=5, truncate=False)
struct_assembler.printSchema()

In [None]:
def build_tree_classifier():
    """Wrap ``ophelia.opClassify.tree_generator`` as a Spark UDF.

    Returns:
        A UDF whose declared return type is an array of structs with the
        non-nullable fields ``vote`` (float), ``weight`` (float) and
        ``risk_label`` (string).
    """
    tree_fields = [
        StructField("vote", FloatType(), False),
        StructField("weight", FloatType(), False),
        StructField("risk_label", StringType(), False),
    ]
    schema_tree = ArrayType(StructType(tree_fields))
    tree_generator = ophelia.opClassify.tree_generator
    return udf(tree_generator, schema_tree)

def build_risk_class():
    """Wrap ``ophelia.opClassify.run_classification_risk`` as a Spark UDF.

    Returns:
        A UDF whose declared return type is a string.
    """
    classify_risk = ophelia.opClassify.run_classification_risk
    return udf(classify_risk, StringType())

In [None]:
# Build each UDF once so the cells below reuse the same wrapper objects.
tree_model_udf, risk_class_udf = (
    build_tree_classifier(),
    build_risk_class(),
)

In [None]:
# Apply the tree UDF to the packed feature struct and append its result as "tree".
tree_column = tree_model_udf(col("struct_assembler")).alias("tree")

# Cached because this frame is evaluated again by the cells that follow.
tree_model_df = struct_assembler.select("*", tree_column).cache()
tree_model_df.show(n=5, truncate=False)
tree_model_df.printSchema()

In [None]:
# Output projection: the risk label from the UDF, the typed customer
# attributes, and two columns recording when this run was executed.
risk_class_columns = [
    risk_class_udf(col("tree")).alias("risk_label"),
    *column_selection,
    current_date().alias("information_date"),
    current_timestamp().alias("model_date"),
]
risk_class_df = tree_model_df.select(*risk_class_columns)

# Label distribution first, then a sample of rows and the resulting schema.
label_counts = risk_class_df.groupBy("risk_label").count()
label_counts.show(n=10, truncate=False)
risk_class_df.show(n=10, truncate=False)
risk_class_df.printSchema()

In [None]:
# Constant added to the generated row id so that customer ids start at this value.
CUSTOMER_ID_OFFSET = 100000000

columns_risk = risk_class_df.columns

# Map each risk label to a numeric id with one flat CASE WHEN chain instead of
# nested otherwise(when(...)) calls. No otherwise() is given, so any label
# outside the five listed here yields a null risk_label_id.
risk_label_col = col("risk_label")
when_conditions = (
    when(risk_label_col == "A", lit(5.0))
    .when(risk_label_col == "MA", lit(4.0))
    .when(risk_label_col == "M", lit(3.0))
    .when(risk_label_col == "MC", lit(2.0))
    .when(risk_label_col == "C", lit(1.0))
)

# NOTE: monotonically_increasing_id() is unique and increasing but NOT
# consecutive, and its values depend on how the data is partitioned, so
# customer_id is not guaranteed to be stable between runs.
mapped_risk_label = risk_class_df.select(
    (monotonically_increasing_id() + CUSTOMER_ID_OFFSET).alias("customer_id"),
    when_conditions.alias("risk_label_id"),
    *columns_risk
)

# Check the label -> id mapping, then preview the final rows.
mapped_risk_label.groupBy('risk_label_id', 'risk_label').count().orderBy('risk_label_id').show()
mapped_risk_label.show(5, False)

In [None]:
# Persist the labelled customers through Ophelia's writer helper.
# NOTE(review): `part` is presumably the partition column and the return value
# presumably the output location — confirm against opWrite.write_parquet.
parquet_writer = ophelia.opWrite
riskClassPath = parquet_writer.write_parquet(
    df=mapped_risk_label, output_type="model", project="RiskClassifier", part="information_date"
)