In [1]:
from pyspark.sql.functions import udf, when, rand, struct, col
from pyspark.sql.types import StringType, IntegerType, StructField, StructType, DoubleType, ArrayType

In [3]:
from enquire.vendetta import Ophelia

In [4]:
def factor_weights(gender, age, education, occupation, marital):
    """Pack the five per-factor weights into a single ordered list.

    Args:
        gender: Weight given to the gender factor.
        age: Weight given to the age factor.
        education: Weight given to the education factor.
        occupation: Weight given to the occupation factor.
        marital: Weight given to the marital-status factor.

    Returns:
        list: ``[gender, age, education, occupation, marital]`` -- the values
        are passed through unchanged, always in that order.
    """
    ordered_weights = [gender, age, education, occupation, marital]
    return ordered_weights

In [5]:
# Weight assigned to each demographic factor; the list is later handed to Ophelia.
W = factor_weights(gender=0.21, age=0.49, education=0.16, occupation=0.09, marital=0.05)

# Sanity check: display the total of the weights as the cell output.
sum(W)

1.0

In [6]:
# Build the Ophelia helper from the factor weights. Later cells use its `spk`,
# `tr` and `clss` attributes. Running this cell prints Ophelia's welcome banner.
ophelia = Ophelia(W=W)


-Ophelia: ¡Hullo! My Name Is Ophelia, I Am Pleased To Meet You     [...]
-Ophelia: I Am An Artificial Assistant For Intelligent Investment  [...]
-Ophelia: Welcome To Your Asset Allocation System                  [...]


-Ophelia: V For VenData                                            [...]

                    - By. Vendetta Gentleman Club -                     

                         - Author. @LuisFalva -                         

      █ █ █ █ █ █ █ █ █ █ █ █ █ █ █ █ █ █ █ █ █ █ █ █ █ █ █ █ █ █ █
      █ █ █ █ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ █ █ █ █
      █ █ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ █ █
      █ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ █
      █ ╬ ╬ ╬ █ █ █ █ █ █ █ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ █ █ █ █ █ █ █ ╬ ╬ ╬ █
      █ ╬ ╬ █ █ ╬ ╬ ╬ ╬ █ █ █ ╬ ╬ ╬ ╬ ╬ ╬ ╬ █ █ █ ╬ ╬ ╬ ╬ █ █ ╬ ╬ █
      █ ╬ █ █ ╬ ╬ ╬ ╬ ╬ ╬ ╬ █ █ ╬ ╬ ╬ ╬ ╬ █ █ ╬ ╬ ╬ ╬ ╬ ╬ ╬ █ █ ╬ █
      █ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ █
      █ ╬ ╬ 

In [7]:
# Reuse the `spk` object held by Ophelia (presumably its SparkSession -- confirm)
# instead of creating a second one in the notebook.
spark = ophelia.spk

# Read the customer banking CSV data through Ophelia's `tr.read_file` helper.
customer_banking = ophelia.tr.read_file(
    "data-resources/customer_banking/",
    "csv",
    spark,
)

-Ophelia[INFO]: Reading Spark File [...]
-Ophelia[INFO]: Read CSV Successfully From Path: data-resources/customer_banking/ [...]


In [8]:
# Return schema for the tree UDF: an array of structs with three nullable
# fields -- weight (double), risk_label (string) and vote (integer).
schema_tree = ArrayType(
    StructType([
        StructField(field_name, field_type, True)
        for field_name, field_type in (
            ("weight", DoubleType()),
            ("risk_label", StringType()),
            ("vote", IntegerType()),
        )
    ])
)

In [9]:
# Expose Ophelia's classifier methods to Spark as UDFs.
# `tree_udf` returns values shaped like `schema_tree`.
tree_udf = udf(ophelia.clss.tree_generator, returnType=schema_tree)
# `classification_udf` returns a plain string.
classification_udf = udf(
    ophelia.clss.run_classification_risk,
    returnType=StringType(),
)

In [10]:
# Build the scoring input: four demographic columns from the banking data plus
# a randomly assigned `gender` column, all packed into a single `struct` column
# that is later passed to the tree UDF.
#
# The seed is fixed so that re-running the notebook on the same input layout
# assigns the same genders; an unseeded rand() draws a fresh seed on every run,
# which makes the downstream risk labels impossible to reproduce.
#
# NOTE(review): the struct field order is (gender, age, marital, education, job)
# while factor_weights orders W as (gender, age, education, occupation, marital).
# Confirm that tree_generator reads the fields by name rather than by position.
test_customer_df = (
    customer_banking
    .select("age", "job", "marital", "education")
    .withColumn("gender", when(rand(seed=42) > 0.5, "male").otherwise("female"))
    .withColumn(
        "struct",
        struct(col("gender"), col("age"), col("marital"), col("education"), col("job")),
    )
)

In [11]:
# Score every customer: build the per-row tree from the packed `struct` column,
# reduce it to a single risk label, and preview the result next to the inputs.
(
    test_customer_df
    .withColumn("tree", tree_udf(col("struct")))
    .select(
        "age",
        "job",
        "marital",
        "education",
        "gender",
        classification_udf(col("tree")).alias("risk_label"),
    )
    .show()
)

+---+-----------+--------+---------+------+----------+
|age|        job| marital|education|gender|risk_label|
+---+-----------+--------+---------+------+----------+
| 59|     admin.| married|secondary|female|        MC|
| 56|     admin.| married|secondary|  male|        MC|
| 41| technician| married|secondary|  male|        MC|
| 55|   services| married|secondary|female|        MC|
| 54|     admin.| married| tertiary|  male|        MC|
| 42| management|  single| tertiary|  male|        MA|
| 56| management| married| tertiary|female|        MC|
| 60|    retired|divorced|secondary|female|        MC|
| 37| technician| married|secondary|  male|        MC|
| 28|   services|  single|secondary|  male|        MC|
| 38|     admin.|  single|secondary|female|        MC|
| 30|blue-collar| married|secondary|  male|        MC|
| 29| management| married| tertiary|  male|        MA|
| 46|blue-collar|  single| tertiary|female|        MA|
| 31| technician|  single| tertiary|  male|        MA|
| 35| mana

**============================================================ FINAL CLASSIFICATION PROCESS =============================================================**