In [1]:
import visualize
from com.ophelia.OpheliaVendata import OpheliaVendata

In [2]:
# Create the Ophelia assistant object used throughout the notebook (its session and
# build helpers are read from it in later cells).
# NOTE(review): the string argument is presumably the application name — confirm against OpheliaVendata.
ophelia = OpheliaVendata('Risk Recommend Analysis')


15:06:56.241 Ophelia [INFO] ¡Hi! My name is Ophelia
15:06:56.241 Ophelia [INFO] I am an artificial assistant for machine learning applications in Spark
15:06:56.241 Ophelia [INFO] Welcome to Ophelia Assisted Intelligence System (OAIS)
15:06:56.241 Ophelia [INFO] V for Vendata...

                                 - By. Vendata-Gentleman Club -                            

                   █ █ █ █ █ █ █ █ █ █ █ █ █ █ █ █ █ █ █ █ █ █ █ █ █ █ █ █ █ █ █
                   █ █ █ █ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ █ █ █ █
                   █ █ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ █ █
                   █ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ █
                   █ ╬ ╬ ╬ █ █ █ █ █ █ █ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ █ █ █ █ █ █ █ ╬ ╬ ╬ █
                   █ ╬ ╬ █ █ ╬ ╬ ╬ ╬ █ █ █ ╬ ╬ ╬ ╬ ╬ ╬ ╬ █ █ █ ╬ ╬ ╬ ╬ █ █ ╬ ╬ █
                   █ ╬ █ █ ╬ ╬ ╬ ╬ ╬ ╬ ╬ █ █ ╬ ╬ ╬ ╬ ╬ █ █ ╬ ╬ ╬ ╬ ╬ ╬ ╬ █ █ ╬ █
                   █ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬ ╬

In [3]:
from pyspark.sql import SparkSession, Window
from pyspark.sql.functions import col, max as spark_max, row_number
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler, StringIndexer, OneHotEncoder
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator

In [4]:
# Reuse the session exposed by the Ophelia object; it is used below as the entry point
# for reading Parquet data (spark.read.parquet).
spark = ophelia.ophelia_session

In [5]:
# Load the two model outputs that feed the recommendation step.
# Parquet files carry their own schema, so no `inferSchema` option is passed:
# that option belongs to schema-less text sources (CSV/JSON) and has no meaning here.
portfolio_train_df = spark.read.parquet("data/ophelia/out/model/TrainPortfolio")
risk_class_df = spark.read.parquet("data/ophelia/out/model/RiskClassifier")

In [6]:
# Quick look at the simulated portfolios: first 5 rows without truncation,
# the schema, and summary statistics for the `sharpe` column.
portfolio_train_df.show(5, False)
portfolio_train_df.printSchema()
portfolio_train_df.describe("sharpe").show(5, False)

+-------------------+------------------+------------------+-------------------+-------------------+--------------------+--------------------+--------------------+--------------------+---------------------+-------------------+--------------------+--------------------+------------+--------------------------+--------------------+-------------+----------------+
|ret                |stdev             |sharpe            |fund_w_zero        |fund_w_one         |fund_w_two          |fund_w_three        |fund_w_four         |fund_w_five         |fund_w_six           |fund_w_seven       |fund_w_eight        |fund_w_nine         |portfolio_id|model_date                |sharpe_centile      |sharpe_bucket|information_date|
+-------------------+------------------+------------------+-------------------+-------------------+--------------------+--------------------+--------------------+--------------------+---------------------+-------------------+--------------------+--------------------+------------+

In [7]:
# Quick look at the customer risk classification: first 5 rows without truncation, then the schema.
risk_class_df.show(5, False)
risk_class_df.printSchema()

+-----------+-------------+----------+---+----------+-------+---------+------+-----+------+-------+------+--------------------------+----------------+
|customer_id|risk_label_id|risk_label|age|job       |marital|education|gender|child|saving|insight|backup|model_date                |information_date|
+-----------+-------------+----------+---+----------+-------+---------+------+-----+------+-------+------+--------------------------+----------------+
|100000000  |4.0          |MC        |59 |admin.    |married|secondary|female|0.0  |1.0   |0.0    |1.0   |2020-09-02 17:33:15.009706|2020-09-02      |
|100000001  |2.0          |MA        |56 |admin.    |married|secondary|female|1.0  |0.0   |1.0    |0.0   |2020-09-02 17:33:15.009706|2020-09-02      |
|100000002  |2.0          |MA        |41 |technician|married|secondary|male  |1.0  |0.0   |1.0    |1.0   |2020-09-02 17:33:15.009706|2020-09-02      |
|100000003  |3.0          |M         |55 |services  |married|secondary|male  |0.0  |1.0   |0.0

In [8]:
# Columns removed before/after the join: run metadata on both sides, the centile
# on the portfolio side, and finally the two join keys themselves.
risk_class_col_prune = ["information_date", "model_date"]
portfolio_train_col_prune = ["information_date", "model_date", "sharpe_centile"]
join_over_col = [col("risk_label_id") == col("sharpe_bucket")]
join_col_prune = ["sharpe_bucket", "risk_label_id"]

# Pair every customer with every portfolio whose sharpe bucket equals the
# customer's risk label id.
customers_pruned = risk_class_df.drop(*risk_class_col_prune)
portfolios_pruned = portfolio_train_df.drop(*portfolio_train_col_prune)
join_recommend = (
    customers_pruned
    .join(portfolios_pruned, on=join_over_col, how="inner")
    .drop(*join_col_prune)
)

# One option is to keep, for each customer, the five candidate portfolios with the highest Sharpe ratio

In [9]:
# Rank each customer's candidate portfolios from highest to lowest sharpe.
w = Window.partitionBy("customer_id").orderBy(col("sharpe").desc())

# Append the rank as the last column and preview the five best-ranked rows per customer.
ranked_recommend = join_recommend.withColumn("rank_sharpe", row_number().over(w))
ranked_recommend.filter(col("rank_sharpe") <= 5).show(10, False)

+-----------+----------+---+-----------+--------+---------+------+-----+------+-------+------+-------------------+------------------+------------------+-------------------+-------------------+--------------------+--------------------+-------------------+--------------------+--------------------+--------------------+--------------------+--------------------+------------+-----------+
|customer_id|risk_label|age|job        |marital |education|gender|child|saving|insight|backup|ret                |stdev             |sharpe            |fund_w_zero        |fund_w_one         |fund_w_two          |fund_w_three        |fund_w_four        |fund_w_five         |fund_w_six          |fund_w_seven        |fund_w_eight        |fund_w_nine         |portfolio_id|rank_sharpe|
+-----------+----------+---+-----------+--------+---------+------+-----+------+-------+------+-------------------+------------------+------------------+-------------------+-------------------+--------------------+-----------------

In [10]:
# Highest sharpe available to each customer.
filter_max_sharpe = (
    join_recommend
    .groupBy('customer_id')
    .agg(spark_max('sharpe').alias('sharpe'))
)

# Keep only the candidate rows that match that per-customer maximum; the semi join
# filters join_recommend without adding columns from the right-hand side.
portfolio_base_table = join_recommend.join(
    filter_max_sharpe,
    on=["customer_id", "sharpe"],
    how='left_semi',
)

In [11]:
# Inspect the per-customer best-sharpe table: sample rows, schema, row counts per
# risk label and per portfolio, and finally the total number of rows.
portfolio_base_table.show(10, False)
portfolio_base_table.printSchema()
portfolio_base_table.groupBy("risk_label").count().show()
portfolio_base_table.groupBy("portfolio_id").count().show()
portfolio_base_table.count()

+-----------+------------------+----------+---+------------+--------+---------+------+-----+------+-------+------+-------------------+------------------+-------------------+-------------------+-------------------+--------------------+-------------------+--------------------+--------------------+--------------------+--------------------+--------------------+------------+
|customer_id|sharpe            |risk_label|age|job         |marital |education|gender|child|saving|insight|backup|ret                |stdev             |fund_w_zero        |fund_w_one         |fund_w_two         |fund_w_three        |fund_w_four        |fund_w_five         |fund_w_six          |fund_w_seven        |fund_w_eight        |fund_w_nine         |portfolio_id|
+-----------+------------------+----------+---+------------+--------+---------+------+-----+------+-------+------+-------------------+------------------+-------------------+-------------------+-------------------+--------------------+-------------------+

11162

In [13]:
# Index the categorical columns through the Ophelia build helper.
# The displayed result carries one extra `<column>_index` column per entry of multi_col.
# NOTE(review): the exact semantics of indexer_type='multi' live in build_string_index — confirm there.
opb = ophelia.ophelia_build
multi_col = ['risk_label', 'job', 'marital', 'education', 'gender']
indexed_df = opb.build_string_index(df=portfolio_base_table, col_name=multi_col, indexer_type='multi')
indexed_df.show(5, False)

+-----------+------------------+----------+---+-----------+--------+---------+------+-----+------+-------+------+-------------------+------------------+-------------------+-------------------+-------------------+--------------------+-------------------+--------------------+--------------------+--------------------+--------------------+--------------------+------------+----------------+---------+-------------+---------------+------------+
|customer_id|sharpe            |risk_label|age|job        |marital |education|gender|child|saving|insight|backup|ret                |stdev             |fund_w_zero        |fund_w_one         |fund_w_two         |fund_w_three        |fund_w_four        |fund_w_five         |fund_w_six          |fund_w_seven        |fund_w_eight        |fund_w_nine         |portfolio_id|risk_label_index|job_index|marital_index|education_index|gender_index|
+-----------+------------------+----------+---+-----------+--------+---------+------+-----+------+-------+------+---

- cambiar el index del benchmark
- hacer back-testing con datos actuales
- simular portafolios con constraint min 10% por fondo
- revisar que porcentaje de equity tiene cada portafolio
- pendiente definir constraint de equity por portafolio perfil
- nuevo cuestionario de perfilamiento
- pendiente definir benchmark

In [16]:
def ohe_estimator(col_list: list) -> OneHotEncoder:
    """Configure a OneHotEncoder over the string-indexed versions of ``col_list``.

    The ``@staticmethod`` decorator was dropped: this function lives at module
    level in the notebook, where the decorator serves no purpose and, on Python
    versions before 3.10, leaves the name bound to a non-callable object.

    Args:
        col_list: Names of the columns to index and then one-hot encode.

    Returns:
        A OneHotEncoder whose input columns are the output columns of the
        indexers and whose output columns are those names suffixed with
        ``_encoded``. The encoder is only configured here; it is not fitted
        or applied.
    """
    # NOTE(review): OpheliaFeature is not imported in any visible cell — confirm
    # it is in scope before running this.
    indexers = OpheliaFeature.multi_string_indexer(multi_col=col_list)
    indexed_cols = [indexer.getOutputCol() for indexer in indexers]
    return OneHotEncoder(
        inputCols=indexed_cols,
        outputCols=["{0}_encoded".format(name) for name in indexed_cols],
    )

def indexer_encoded(index_list: list) -> list:
    """Return the ``_encoded`` column names for the indexers built from ``index_list``.

    The ``@staticmethod`` decorator was dropped: this function lives at module
    level in the notebook, where the decorator serves no purpose and, on Python
    versions before 3.10, leaves the name bound to a non-callable object.

    Args:
        index_list: Names of the columns handed to the multi string indexer.

    Returns:
        One name per indexer, in indexer order: the indexer's output column
        followed by the ``_encoded`` suffix. Empty when no indexers are built.
    """
    # NOTE(review): OpheliaFeature is not imported in any visible cell — confirm
    # it is in scope before running this.
    multi_indexers = OpheliaFeature.multi_string_indexer(index_list)
    return [indexer.getOutputCol() + "_encoded" for indexer in multi_indexers]

In [None]:
# indexer_encoded requires the list of columns to index; pass the categorical
# columns declared earlier in the notebook (multi_col).
indexer_encoded(multi_col)

# Now we can inspect the clusters to verify that the cluster split was defined correctly

In [None]:


# NOTE(review): `dataset` is not defined in any visible cell — it must be assigned
# (with the feature column KMeans expects) before this cell can run.

# Fit a two-cluster KMeans model with a fixed seed.
kmeans = KMeans(k=2, seed=1)
model = kmeans.fit(dataset)

# Assign every row of the dataset to a cluster.
predictions = model.transform(dataset)

# Score the clustering with the evaluator's default settings.
evaluator = ClusteringEvaluator()
silhouette = evaluator.evaluate(predictions)
print("Silhouette with squared euclidean distance = " + str(silhouette))

# Print the fitted cluster centers, one per line.
centers = model.clusterCenters()
print("Cluster Centers: ")
for cluster_center in centers:
    print(cluster_center)