In [1]:
from collections import defaultdict, Counter
from pyspark.sql.functions import lit, col, round as spark_round, sum as spark_sum

In [2]:
from enquire.vendetta import Ophelia

In [3]:
def factor_weights(gender, age, education, occupation, marital):
    """Pack the five factor weights into a list.

    :param gender: weight of the gender factor
    :param age: weight of the age factor
    :param education: weight of the education factor
    :param occupation: weight of the occupation factor
    :param marital: weight of the marital-status factor
    :return: ``[gender, age, education, occupation, marital]``, in that order
    """
    ordered_weights = (gender, age, education, occupation, marital)
    return list(ordered_weights)

In [4]:
# Factor weights, later handed to Ophelia(W=W).
W = factor_weights(
    gender=0.21,
    age=0.49,
    education=0.16,
    occupation=0.09,
    marital=0.05
)
# Display the total of the weights (the saved output shows 1.0).
sum(W)

1.0

In [5]:
# Create the Ophelia instance with the factor weights; the saved output below shows the banner printed by this cell.
ophelia = Ophelia(W=W)


-Ophelia: ¡Hullo! My Name Is Ophelia, I Am Pleased To Meet You     [...]
-Ophelia: I Am An Artificial Assistant For Intelligent Investment  [...]
-Ophelia: Welcome To Your Asset Allocation System                  [...]


-Ophelia: V For VenData                                            [...]

                    - By. Vendetta Gentleman Club -                     

                         - Author. @LuisFalva -                         

      █ █ █ █ █ █ █ █ █ █ █ █ █ █ █ █ █ █ █ █ █ █ █ █ █ █ █ █ █ █ █
      █ █ █ █ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ █ █ █ █
      █ █ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ █ █
      █ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ █
      █ ╬ ╬ ╬ █ █ █ █ █ █ █ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ █ █ █ █ █ █ █ ╬ ╬ ╬ █
      █ ╬ ╬ █ █ ╬ ╬ ╬ ╬ █ █ █ ╬ ╬ ╬ ╬ ╬ ╬ ╬ █ █ █ ╬ ╬ ╬ ╬ █ █ ╬ ╬ █
      █ ╬ █ █ ╬ ╬ ╬ ╬ ╬ ╬ ╬ █ █ ╬ ╬ ╬ ╬ ╬ █ █ ╬ ╬ ╬ ╬ ╬ ╬ ╬ █ █ ╬ █
      █ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ █
      █ ╬ ╬ 

In [6]:
class CustomerObject(object):
    """Plain holder for one customer's demographic attributes.

    The constructor stores its arguments unchanged on the instance as
    ``gender``, ``age``, ``marital``, ``education`` and ``job``.
    """

    # Sentinel meaning "argument not supplied", so an explicit ``None`` is
    # still accepted as a value for ``job``.
    _UNSET = object()

    def __init__(self, gender, age, marital, education, job=_UNSET, occupation=_UNSET):
        """
        Constructor object for input parameters.
        :param gender: customer's gender, stored as ``self.gender``
        :param age: customer's age, stored as ``self.age``
        :param marital: customer's marital status, stored as ``self.marital``
        :param education: customer's education level, stored as ``self.education``
        :param job: customer's occupation, stored as ``self.job``
        :param occupation: alias for ``job``; cells in this notebook build
            customers with ``occupation=...``, so both spellings are accepted
        :raises TypeError: if neither ``job`` nor ``occupation`` is given, or
            if both are given with different values
        """
        unset = CustomerObject._UNSET
        if job is unset and occupation is unset:
            raise TypeError(
                "__init__() missing 1 required argument: 'job' (alias: 'occupation')"
            )
        if job is not unset and occupation is not unset and job != occupation:
            raise TypeError("'job' and 'occupation' were both given with different values")
        self.gender = gender
        self.age = age
        self.marital = marital
        self.education = education
        # Whichever spelling was used, the value always lands on ``self.job``.
        self.job = occupation if job is unset else job

In [None]:
def collect_data_structure(customer_object):
    """Build one record per entry of the customer's label-probability mapping.

    The customer's tree is generated with ``ophelia.demograph.tree_generator``
    and passed to ``ophelia.demograph.probability_label``. For every key of
    that result a dict is emitted holding the customer's attributes, the key
    itself (under ``key.risk_label``) and its value (under ``key.prob``).

    :param customer_object: object exposing ``gender``, ``age``, ``education``,
        ``marital`` and ``job`` attributes
    :return: list of dicts keyed by the column names found on ``ophelia.key``
    """
    key = ophelia.key
    label_probabilities = ophelia.demograph.probability_label(
        ophelia.demograph.tree_generator(customer_object)
    )
    return [
        {
            key.gender: customer_object.gender,
            key.age: customer_object.age,
            key.education: customer_object.education,
            key.marital: customer_object.marital,
            key.occupation: customer_object.job,
            key.prob: label_probabilities[risk_label],
            key.risk_label: risk_label,
        }
        for risk_label in label_probabilities
    ]

In [None]:
def object_to_dataframe(customer_object):
    """Turn a customer into a Spark DataFrame of per-label records.

    The records come from ``collect_data_structure``; a constant column named
    ``key.id`` holding ``key.index`` is appended to every row.

    :param customer_object: customer passed through to ``collect_data_structure``
    :return: Spark DataFrame built with ``ophelia.spark``
    """
    key = ophelia.key
    customer_rows = collect_data_structure(customer_object)
    return (
        ophelia.spark
        .createDataFrame(customer_rows)
        .withColumn(key.id, lit(key.index))
    )

def tree_to_dataframe(customer_object):
    """Generate the customer's tree and load it into a Spark DataFrame.

    :param customer_object: customer passed to ``ophelia.demograph.tree_generator``
    :return: Spark DataFrame created from the generated tree
    """
    generated_tree = ophelia.demograph.tree_generator(customer_object)
    return ophelia.spark.createDataFrame(generated_tree)

In [None]:
def join_tree(tree_df, structure_df, how="left"):
    """Join the tree with the customer records and derive the vote columns.

    The frames are joined on ``key.risk_label``. Two columns are then added:
    ``key.weighted_prob`` (``key.weight`` * ``key.prob``) and
    ``key.final_vote`` (``key.vote`` + ``key.weighted_prob``).

    :param tree_df: DataFrame of tree nodes (left side of the join)
    :param structure_df: DataFrame of customer records (right side of the join)
    :param how: Spark join type, ``"left"`` by default
    :return: joined DataFrame with the two derived columns appended
    """
    key = ophelia.key
    weighted_prob = (col(key.weight) * col(key.prob)).alias(key.weighted_prob)
    final_vote = (col(key.vote) + col(key.weighted_prob)).alias(key.final_vote)
    joined = tree_df.join(structure_df, on=key.risk_label, how=how)
    # Two separate selects: the second expression reads the column added by the first.
    return joined.select("*", weighted_prob).select("*", final_vote)

def group_tree(join_df):
    """Aggregate the joined tree per ``key.grouping``.

    For each group the result holds the rounded sum of ``key.final_vote`` and
    the plain sums of ``key.vote``, ``key.weight``, ``key.prob`` and
    ``key.weighted_prob``, each under its original column name.

    :param join_df: DataFrame produced by ``join_tree``
    :return: grouped and aggregated DataFrame
    """
    key = ophelia.key
    aggregations = [spark_round(spark_sum(key.final_vote)).alias(key.final_vote)]
    aggregations += [
        spark_sum(column_name).alias(column_name)
        for column_name in (key.vote, key.weight, key.prob, key.weighted_prob)
    ]
    return join_df.groupBy(key.grouping).agg(*aggregations)

def filter_max_vote(group_df):
    """Keep the rows with the highest vote, breaking ties by highest weight.

    The maximum of ``key.vote`` is collected to the driver and used to filter
    the frame; the maximum of ``key.weight`` among the remaining rows is then
    collected and used to filter again.

    :param group_df: aggregated DataFrame (see ``group_tree``)
    :return: DataFrame restricted to the max-vote, max-weight rows
    """
    key = ophelia.key

    def _column_max(frame, column_name):
        # Global aggregation (no grouping columns): one row, one value.
        return frame.groupBy().agg({column_name: "max"}).collect()[0][0]

    top_vote = _column_max(group_df, key.vote)
    top_voted = group_df.where(col(key.vote) == top_vote)
    top_weight = _column_max(top_voted, key.weight)
    return top_voted.where(col(key.weight) == top_weight)

**=================================================================== MÉTODO MAIN ====================================================================**

In [None]:
def create_risk_class(person):
    """Run the Spark classification pipeline for one customer.

    Steps: build the customer records and the tree as DataFrames, join them,
    aggregate per group, then keep the max-vote / max-weight rows.

    :param person: customer object accepted by ``object_to_dataframe`` and
        ``tree_to_dataframe``
    :return: DataFrame returned by ``filter_max_vote``
    """
    customer_records_df = object_to_dataframe(person)
    tree_df = tree_to_dataframe(person)
    grouped_df = group_tree(join_tree(tree_df, customer_records_df))
    return filter_max_vote(grouped_df)

**================================================================== CASOS TEST F&F ==================================================================**

In [7]:
# Sample customer used by the following cells to exercise the tree generator.
Luis = CustomerObject(
    gender="male",
    age=30,
    education="bachelor",
    marital="married",
    job="employee"
)

In [8]:
# Generate the tree for the sample customer and display it.
test_tree = ophelia.demograph.tree_generator(Luis)
test_tree

[{'vote': 1, 'weight': 0.09958602599250237, 'risk_label': 'A'},
 {'vote': 1, 'weight': 0.005163691320346524, 'risk_label': 'A'},
 {'vote': 1, 'weight': 0.14401142926954083, 'risk_label': 'MA'},
 {'vote': 1, 'weight': 0.3251857140491525, 'risk_label': 'MA'},
 {'vote': 1, 'weight': 0.42605313936845784, 'risk_label': 'MA'}]

In [None]:
# Hand-written sample tree in tuple form; each tuple looks like
# (weight, risk_label, vote) — TODO confirm the field order.
# NOTE(review): no other cell visible in this notebook reads `tree_test`.
tree_test = [(0.3879590227402336, 'A', 1),
             (0.012391383543770888, 'A', 1),
             (0.26102554704240866, 'MA', 1),
             (0.26562465382106204, 'MA', 1),
             (0.07299939285252478, 'MA', 1)]
tree_test

In [9]:
def filter_dict(dic):
    """Return a new dict with only the entries of ``dic`` whose value is > 0.

    :param dic: mapping with values comparable to ``0``
    :return: new dict, original key order preserved
    """
    return {name: value for name, value in dic.items() if value > 0}

In [10]:
def groupBy_key(tree, key, a=0, ma=0, m=0, mc=0, c=0, counter=0):
    """Sum ``item[key]`` per risk label over the nodes of ``tree``.

    :param tree: iterable of nodes supporting ``node["risk_label"]`` and
        ``node[key]``
    :param key: name of the field to sum (this notebook uses ``"weight"`` and
        ``"vote"``)
    :param a, ma, m, mc, c: starting values of the sums for labels ``"A"``,
        ``"MA"``, ``"M"``, ``"MC"`` and ``"C"``
    :param counter: starting value of the node count
    :return: dict with the per-label sums plus the node count under ``"tot"``,
        passed through ``filter_dict`` (entries that are not > 0 are dropped)
    """
    totals = {"A": a, "MA": ma, "M": m, "MC": mc, "C": c}
    for item in tree:
        # Count every node exactly once. The earlier `tree.count(item)` added
        # n for each of n identical nodes (n*n in total) and rescanned the
        # whole list on every iteration.
        counter += 1
        label = item["risk_label"]
        # Nodes with any other label still count towards "tot" but add to no sum.
        if label in totals:
            totals[label] += item[key]
    totals["tot"] = counter
    return filter_dict(totals)

In [11]:
def prob_label(dic, a=0, ma=0, m=0, mc=0, c=0):
    """Divide each label's value by ``dic["tot"]``.

    :param dic: mapping of labels to values, with the total under ``"tot"``
        (the shape returned by ``groupBy_key``)
    :param a, ma, m, mc, c: values used for labels ``"A"``, ``"MA"``, ``"M"``,
        ``"MC"`` and ``"C"`` when they are absent from ``dic``
    :return: dict of label -> value / total, passed through ``filter_dict``
    """
    shares = {"A": a, "MA": ma, "M": m, "MC": mc, "C": c}
    for label in dic:
        # Keys other than the five labels (e.g. "tot") are skipped.
        if label in shares:
            shares[label] = dic[label] / dic["tot"]
    return filter_dict(shares)

In [12]:
def solver(w):
    """Return ``1 / (len(w) - 1)`` as a float, or ``1.0`` when ``w`` has one element.

    :param w: sized collection
    :return: float; note that an empty ``w`` gives ``-1.0`` (``1 / -1``)
    """
    N = len(w) - 1
    # Compare by value: `is 0` tests object identity, which only happens to
    # work for small ints on CPython and emits a SyntaxWarning on Python 3.8+.
    if N == 0:
        return float(1.0)
    return float(1 / N)

In [14]:
def truncate(x, threshold=0.6, truncate=0.5):
    """Round ``x - threshold + truncate`` with the built-in ``round``.

    :param x: number to shift and round
    :param threshold: amount subtracted from ``x`` (default 0.6)
    :param truncate: amount added back before rounding (default 0.5)
    :return: the rounded result
    """
    shifted = x - threshold
    return round(shifted + truncate)

In [15]:
def matmul_dict(weight, prob, w_a=0, w_ma=0, w_m=0, w_mc=0, w_c=0):
    """Compute ``weight * prob + weight`` for each risk label found in ``weight``.

    :param weight: mapping of label -> weight; keys other than the five risk
        labels are ignored
    :param prob: mapping of label -> probability; must contain every risk
        label present in ``weight``
    :param w_a, w_ma, w_m, w_mc, w_c: values used for labels absent from
        ``weight``
    :return: dict of label -> score, passed through ``filter_dict``
    """
    scores = {"A": w_a, "MA": w_ma, "M": w_m, "MC": w_mc, "C": w_c}
    for label in weight:
        if label in scores:
            label_weight = weight[label]
            scores[label] = (label_weight * prob[label]) + label_weight
    return filter_dict(scores)

In [16]:
def result(dot, freq, w_a=0, w_ma=0, w_m=0, w_mc=0, w_c=0):
    """Combine score and frequency per risk label into a rounded vote.

    For every risk label in ``dot`` the value is
    ``truncate(dot[label] + freq[label])``.

    :param dot: mapping of label -> score (see ``matmul_dict``); keys other
        than the five risk labels are ignored
    :param freq: mapping of label -> frequency; must contain every risk label
        present in ``dot``
    :param w_a, w_ma, w_m, w_mc, w_c: values used for labels absent from ``dot``
    :return: dict of label -> vote, passed through ``filter_dict``
    """
    votes = {"A": w_a, "MA": w_ma, "M": w_m, "MC": w_mc, "C": w_c}
    for label in dot:
        if label in votes:
            votes[label] = truncate(dot[label] + freq[label])
    return filter_dict(votes)

In [17]:
def assign_label(dic):
    """Pick the label with the highest value.

    :param dic: mapping of label -> final vote (see ``result``)
    :return: the winning label as ``str``, or ``"null"`` when ``dic`` is empty;
        on a tie the label that comes first in ``dic`` wins
    """
    if len(dic) == 0:
        return str("null")
    # `max(dic)` alone compares the keys, so it returned the alphabetically
    # last label (e.g. "MC" over "A") regardless of the votes.
    return str(max(dic, key=dic.get))

In [18]:
def run_classification_risk(tree):
    """Classify a tree with the pure-Python pipeline.

    Sums weights and votes per label, turns the votes into probabilities,
    combines weights and probabilities with ``matmul_dict``, rounds the votes
    with ``result`` and returns the label chosen by ``assign_label``.

    :param tree: iterable of nodes with ``"risk_label"``, ``"weight"`` and
        ``"vote"`` entries
    :return: label string returned by ``assign_label``
    """
    weight_by_label = groupBy_key(tree, "weight")
    votes_by_label = groupBy_key(tree, "vote")
    weighted_scores = matmul_dict(weight_by_label, prob_label(votes_by_label))
    return assign_label(result(weighted_scores, votes_by_label))

In [19]:
# Classify the sample tree with the pure-Python pipeline.
run_classification_risk(test_tree)

'MA'

In [None]:
class ClassificationEngine:
    """Aggregate a tree per risk label and score each label.

    Both methods are static: they use no instance state and can be called on
    the class or on an instance.
    """

    @staticmethod
    def __compute(dataset, dic, key):
        """Score one label as ``round(weight * (vote / N) + weight + vote)``.

        :param dataset: iterable of nodes with a ``'vote'`` entry; ``N`` is the
            sum of those votes
        :param dic: mapping of label -> mapping holding ``"weight"`` and ``"vote"``
        :param key: label to score
        :return: the rounded score
        """
        weight = dic[key]["weight"]
        vote = dic[key]["vote"]
        N = sum(data['vote'] for data in dataset)
        # With no votes at all the vote share is taken as 0 instead of
        # dividing by zero.
        vote_share = (vote / N) if N else 0.0
        return round(((weight * vote_share) + weight) + vote)

    @staticmethod
    def compute_probability(dataset):
        """Sum votes and weights per ``risk_label`` and attach each label's score.

        :param dataset: re-iterable collection of nodes with ``"risk_label"``,
            ``"vote"`` and ``"weight"`` entries
        :return: ``defaultdict`` mapping each label to
            ``{"vote": ..., "weight": ..., "dotp": ...}``
        """
        group_by_key = "risk_label"
        sum_value_keys = ["vote", "weight"]
        dic = defaultdict(Counter)
        for item in dataset:
            key = item[group_by_key]
            vals = {k: item[k] for k in sum_value_keys}
            # Counter.update adds the values to the running per-label sums.
            dic[key].update(vals)
        for key in dic:
            # Replacing the value of an existing key is safe while iterating.
            dic[key] = {
                "vote": dic[key]["vote"],
                "weight": dic[key]["weight"],
                "dotp": ClassificationEngine.__compute(dataset, dic, key)}
        return dic

In [None]:
# Aggregate the sample tree per risk label with ClassificationEngine and display the result.
r = ClassificationEngine.compute_probability(test_tree)
r

In [None]:
# CustomerObject takes the occupation under the `job` keyword.
Bruno = CustomerObject(
    gender="masculino",
    age=62,
    education="primaria",
    marital="casado",
    job="retirado"
)

create_risk_class(Bruno).show()

In [None]:
# CustomerObject takes the occupation under the `job` keyword.
Claudia = CustomerObject(
    gender="femenino",
    age=52,
    education="media",
    marital="casado",
    job="hogar"
)

create_risk_class(Claudia).show()

In [None]:
# CustomerObject takes the occupation under the `job` keyword.
Jose = CustomerObject(
    gender="masculino",
    age=62,
    education="superior",
    marital="casado",
    job="retirado"
)

create_risk_class(Jose).show()

In [None]:
# CustomerObject takes the occupation under the `job` keyword.
Jorge = CustomerObject(
    gender="masculino",
    age=52,
    education="postgrado",
    marital="casado",
    job="empleado"
)

create_risk_class(Jorge).show()

In [None]:
# CustomerObject takes the occupation under the `job` keyword.
Javier = CustomerObject(
    gender="masculino",
    age=48,
    education="media",
    marital="casado",
    job="independiente"
)

create_risk_class(Javier).show()

In [None]:
# CustomerObject takes the occupation under the `job` keyword.
Carolina = CustomerObject(
    gender="femenino",
    age=46,
    education="superior",
    marital="casado",
    job="independiente"
)

create_risk_class(Carolina).show()

In [None]:
# CustomerObject takes the occupation under the `job` keyword.
Magu = CustomerObject(
    gender="femenino",
    age=74,
    education="secundaria",
    marital="viuda",
    job="retirado"
)

create_risk_class(Magu).show()

**================================================================== CASOS TEST H ==================================================================**

In [None]:
# Hypothetical case: virile older man.
# CustomerObject takes the occupation under the `job` keyword.
Hipotetico = CustomerObject(
    gender="masculino",
    age=55,
    education="superior",
    marital="casado",
    job="empleado"
)
create_risk_class(Hipotetico).show()

In [None]:
# Hypothetical case: woman running her own "arreglos" (arrangements/alterations) business.
# CustomerObject takes the occupation under the `job` keyword.
Hipotetico2 = CustomerObject(
    gender="femenino",
    age=55,
    education="superior",
    marital="casado",
    job="independiente"
)
create_risk_class(Hipotetico2).show()

In [None]:
# ATYPICAL CASE 1
# Man in his prime earning years but down on his luck (just lost his job — severance? layoff?).
# CustomerObject takes the occupation under the `job` keyword.
Hipotetico3 = CustomerObject(
    gender="masculino",
    age=47,
    education="superior",
    marital="casado",
    job="desempleado"
)
create_risk_class(Hipotetico3).show()

In [None]:
# Hipster case: man coming back from Cancún with his partner (honeymoon?),
# owns a print shop on San Antonio Abad, studied graphic design.
# CustomerObject takes the occupation under the `job` keyword.
Hipotetico4 = CustomerObject(
    gender="masculino",
    age=30,
    education="superior",
    marital="casado",
    job="independiente"
)
create_risk_class(Hipotetico4).show()

In [None]:
# Hipster case: man coming back from Cancún with his partner (honeymoon?),
# owns a print shop on San Antonio Abad, studied graphic design.
# CustomerObject takes the occupation under the `job` keyword.
Hipotetico5 = CustomerObject(
    gender="masculino",
    age=35,
    education="secundaria",
    marital="casado",
    job="independiente"
)
create_risk_class(Hipotetico5).show()

**=================================================================== FINAL TEST H ===================================================================**

In [20]:
from pyspark.sql.functions import udf, when, rand, struct, array
from pyspark.sql.types import StringType, IntegerType, StructField, StructType, DoubleType, MapType, ArrayType

In [21]:
# Reuse the Spark session exposed by the Ophelia instance.
spark = ophelia.spark
# Read the customer banking data as CSV from data-resources/customer_banking/.
customer_banking = ophelia.mazterize.read_file("data-resources/customer_banking/", "csv", spark)

-Ophelia[INFO]: Reading Spark File [...]
-Ophelia[INFO]: Read CSV Successfully From Path: data-resources/customer_banking/ [...]


In [None]:
from collections import defaultdict, Counter

def __compute(dataset, dic, key):
    """Score one label as ``round(weight * (vote / N) + weight + vote)``.

    :param dataset: iterable of nodes with a ``'vote'`` entry; ``N`` is the sum
        of those votes
    :param dic: mapping of label -> mapping holding ``"weight"`` and ``"vote"``
    :param key: label to score
    :return: the rounded score
    """
    weight = dic[key]["weight"]
    vote = dic[key]["vote"]
    N = sum(data['vote'] for data in dataset)
    # With no votes at all the vote share is taken as 0 instead of dividing
    # by zero.
    vote_share = (vote / N) if N else 0.0
    return round(((weight * vote_share) + weight) + vote)

def compute_probability(obj):
    """Generate the tree for ``obj`` and aggregate it per risk label.

    Votes and weights are summed per ``"risk_label"``; each label's entry is
    then replaced by a dict with its ``"vote"``, ``"weight"`` and the
    ``"dotp"`` score returned by ``__compute``.

    :param obj: customer passed to ``ophelia.demograph.tree_generator``
    :return: plain ``dict`` mapping label -> ``{"vote", "weight", "dotp"}``
    """
    dataset = ophelia.demograph.tree_generator(obj)
    summed_fields = ("vote", "weight")
    totals = defaultdict(Counter)
    for node in dataset:
        label = node["risk_label"]
        node_values = {field: node[field] for field in summed_fields}
        # Counter.update adds the values to the running per-label sums.
        totals[label].update(node_values)
    for label in totals:
        label_totals = totals[label]
        totals[label] = {
            "vote": label_totals["vote"],
            "weight": label_totals["weight"],
            "dotp": __compute(dataset, totals, label),
        }
    return dict(totals)

In [None]:
# Re-create the sample customer (same values as the earlier `Luis` cell).
Luis = CustomerObject(
    gender="male",
    age=30,
    education="bachelor",
    marital="married",
    job="employee"
)

In [24]:
# Return type of the tree-generator UDF: an array with one struct per tree node.
schema_tree = ArrayType(StructType([StructField("weight", DoubleType(), True),
                                    StructField("risk_label", StringType(), True),
                                    StructField("vote", IntegerType(), True)]))

tree_generator_udf = udf(ophelia.demograph.tree_generator, schema_tree)
# run_classification_risk returns the label as a string; StringType is also
# udf's default, it is only spelled out here.
classification_risk_udf = udf(run_classification_risk, StringType())

# The data has no gender column, so one is drawn at random. A fixed seed keeps
# the draw repeatable across runs (for the same input and partitioning).
GENDER_SEED = 42

test_customer_df = (
    customer_banking
    .select("age", "job", "marital", "education")
    .withColumn("gender", when(rand(seed=GENDER_SEED) > 0.5, "male").otherwise("female"))
    # Bundle the attributes into one struct so the UDF receives a single row-like value.
    .withColumn("struct", struct(col("gender"), col("age"), col("marital"), col("education"), col("job")))
    .select("*", tree_generator_udf(col("struct")).alias("tree"))
    .select("*", classification_risk_udf(col("tree")).alias("risk_label"))
)
test_customer_df.show()

+---+-----------+--------+---------+------+--------------------+--------------------+----------+
|age|        job| marital|education|gender|              struct|                tree|risk_label|
+---+-----------+--------+---------+------+--------------------+--------------------+----------+
| 59|     admin.| married|secondary|  male|[male, 59, marrie...|[[0.0922014801652...|        MC|
| 56|     admin.| married|secondary|  male|[male, 56, marrie...|[[0.0922014801652...|        MC|
| 41| technician| married|secondary|  male|[male, 41, marrie...|[[0.0922014801652...|        MC|
| 55|   services| married|secondary|  male|[male, 55, marrie...|[[0.0922014801652...|        MC|
| 54|     admin.| married| tertiary|female|[female, 54, marr...|[[0.0922014801652...|        MC|
| 42| management|  single| tertiary|female|[female, 42, sing...|[[0.0922014801652...|        MA|
| 56| management| married| tertiary|female|[female, 56, marr...|[[0.0922014801652...|        MC|
| 60|    retired|divorced|seco

In [25]:
# Print the column types, including the nested `struct` and `tree` columns.
test_customer_df.printSchema()

root
 |-- age: integer (nullable = true)
 |-- job: string (nullable = true)
 |-- marital: string (nullable = true)
 |-- education: string (nullable = true)
 |-- gender: string (nullable = false)
 |-- struct: struct (nullable = false)
 |    |-- gender: string (nullable = false)
 |    |-- age: integer (nullable = true)
 |    |-- marital: string (nullable = true)
 |    |-- education: string (nullable = true)
 |    |-- job: string (nullable = true)
 |-- tree: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- weight: double (nullable = true)
 |    |    |-- risk_label: string (nullable = true)
 |    |    |-- vote: integer (nullable = true)
 |-- risk_label: string (nullable = true)

