In [None]:
import visualize
from com.ophelia.OpheliaVendata import OpheliaVendata

In [None]:
# Build the project-local Ophelia wrapper object.
# NOTE(review): the string argument is presumably the Spark application name — confirm.
ophelia = OpheliaVendata('Risk Recommend Analysis')

In [None]:
from pyspark.sql import SparkSession, Window
from pyspark.sql.functions import col, max as spark_max, row_number, avg, count
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler, StringIndexer, OneHotEncoder
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator

In [None]:
import OpSpark

In [None]:
# Session handle exposed by the Ophelia wrapper; the cells below use it as `spark.read.parquet(...)`.
spark = ophelia.opSession

In [None]:
# Load the training portfolios. Parquet files carry their own schema, so no
# schema-inference option is passed (same call shape as the RiskClassifier
# read further down in this notebook).
portfolio_train_df = spark.read.parquet("data/ophelia/out/model/TrainPortfolio")

In [None]:
# First look at the training portfolios: dimensions, a few untruncated rows,
# the schema, and summary statistics of the `sharpe` column.
# NOTE(review): `.shape` is not part of the stock PySpark DataFrame API;
# presumably it is added by the OpSpark import — confirm.
print(portfolio_train_df.shape)
portfolio_train_df.show(n=5, truncate=False)
portfolio_train_df.printSchema()
portfolio_train_df.describe("sharpe").show(n=5, truncate=False)

In [None]:
# Keep, for every risk bucket, only the rows whose `sharpe` equals that
# bucket's maximum `sharpe`.
# The bucket key on the aggregated side is renamed so the join condition can
# match on BOTH the bucket and the value; matching on the value alone would
# also keep a row whose `sharpe` happens to equal a different bucket's maximum.
max_sharpe_by_bucket = (
    portfolio_train_df
    .groupBy('risk_bucket')
    .agg(spark_max('sharpe').alias('max_sharpe'))
    .withColumnRenamed('risk_bucket', 'max_sharpe_bucket')
)
# left_semi: only columns of the left (row-level) frame are returned.
filter_max_sharpe = portfolio_train_df.join(
    max_sharpe_by_bucket,
    on=[col('risk_bucket') == col('max_sharpe_bucket'),
        col('sharpe') == col('max_sharpe')],
    how='left_semi').orderBy('risk_bucket')
filter_max_sharpe.show(10, False)

In [None]:
# Load the risk-classifier output, then show its dimensions, a few rows,
# the row count per (risk_label, risk_label_id) pair, and the schema.
risk_class_df = spark.read.parquet("data/ophelia/out/model/RiskClassifier")
print(risk_class_df.shape)
risk_class_df.show(n=5, truncate=False)
(
    risk_class_df
    .groupBy('risk_label', 'risk_label_id')
    .count()
    .orderBy('risk_label_id')
    .show(n=5, truncate=False)
)
risk_class_df.printSchema()

In [None]:
# Date columns removed from each input before the join.
risk_class_col_prune = ["information_date", "model_date"]
portfolio_train_col_prune = ["information_date", "model_date"]
# Customers are matched to portfolios by risk label id == risk bucket.
join_over_col = [col("risk_label_id") == col("risk_bucket")]
# Currently not applied (see the disabled drop below): later cells still
# group by both `risk_bucket` and `risk_label_id`.
join_col_prune = ["risk_bucket", "risk_label_id"]

portfolio_base_table = (
    risk_class_df
    .drop(*risk_class_col_prune)
    .join(
        filter_max_sharpe.drop(*portfolio_train_col_prune),
        on=join_over_col,
        how="inner",
    )
    # .drop(*join_col_prune)
)
portfolio_base_table.show(n=5, truncate=False)

In [None]:
# Inspect the joined base table: dimensions and schema first.
print(portfolio_base_table.shape)
portfolio_base_table.printSchema()

# Row count and average sharpe / stdev / ret per bucket, label and portfolio,
# ordered by the average sharpe.
(
    portfolio_base_table
    .groupBy('risk_bucket', 'risk_label_id', 'risk_label', 'portfolio_id')
    .agg(count('risk_label'), avg('sharpe'), avg('stdev'), avg('ret'))
    .orderBy('avg(sharpe)')
    .show()
)

# Number of rows per portfolio, then the first 50 rows ordered by customer.
portfolio_base_table.groupBy("portfolio_id").count().show()
portfolio_base_table.orderBy('customer_id').show(n=50, truncate=False)

In [None]:
# Pack the three numeric columns into a single `features` vector column,
# the input column name used by the clustering step that follows.
vec_assembled = VectorAssembler(
    inputCols=['ret', 'stdev', 'sharpe'],
    outputCol='features',
).transform(portfolio_base_table)
vec_assembled.show(n=5, truncate=False)

In [None]:
# `KMeans` must be the pyspark.ml.clustering class imported at the top of the
# notebook; a later cell rebinds the same name to scikit-learn's KMeans, so
# this cell only works when the notebook is run top to bottom.
kmeans = KMeans(k=5, seed=1)
model_kmeans = kmeans.fit(vec_assembled)
transformed = model_kmeans.transform(vec_assembled)

# Same summary (row counts plus average sharpe / stdev / ret, ordered by the
# average sharpe) at three increasingly fine grouping levels.
cluster_summary = [count('risk_label'), count('prediction'),
                   avg('sharpe'), avg('stdev'), avg('ret')]
for group_cols in (
    ["prediction"],
    ["prediction", 'risk_label'],
    ["prediction", 'risk_label', 'risk_label_id', 'risk_bucket', 'portfolio_id'],
):
    transformed.groupBy(*group_cols).agg(*cluster_summary).orderBy('avg(sharpe)').show()

- cambiar el index del benchmark
- hacer back-testing con datos actuales
- simular portafolios con constraint min 10% por fondo
- revisar que porcentaje de equity tiene cada portafolio
- pendiente definir constraint de equity por portafolio perfil
- nuevo cuestionario de perfilamiento
- pendiente definir benchmark

# Now we can inspect the clusters to verify that the cluster splitting was defined correctly.

In [None]:
# Score the clustered frame with a default-configured ClusteringEvaluator;
# the message printed below labels the score as the silhouette with squared
# euclidean distance.
evaluator = ClusteringEvaluator()

silhouette = evaluator.evaluate(transformed)
print("Silhouette with squared euclidean distance = " + str(silhouette))

# Print the centre of every cluster of the fitted model, one per line.
centers = model_kmeans.clusterCenters()
print("Cluster Centers: ")
for center in centers:
    print(center)

In [None]:
# One subset of `transformed` per predicted cluster id 0..4.
# NOTE(review): only cluster_zero is converted with toNumpyArray() (not a
# stock PySpark method; presumably provided by the OpSpark import); the other
# four remain filtered DataFrames — confirm this asymmetry is intended.
# NOTE(review): `prediction` is compared against string literals, presumably
# relying on an implicit cast — confirm.
cluster_zero = transformed.where(col('prediction') == '0').toNumpyArray()
cluster_one = transformed.where(col('prediction') == '1')
cluster_two = transformed.where(col('prediction') == '2')
cluster_three = transformed.where(col('prediction') == '3')
cluster_four = transformed.where(col('prediction') == '4')

In [None]:
# Trace description for a 3-D marker scatter of the `cluster_zero` data
# (presumably a Plotly-style trace spec — confirm which plotting library consumes it).
# x / y / z are the first three columns of the matrix form of cluster_zero.
# NOTE(review): assumes the object returned by toNumpyArray() exposes
# `as_matrix()` — TODO confirm; the matrix is also rebuilt three times here.
# NOTE(review): the trace is named "Cluster 1" although it holds prediction 0.
scatter1 = dict(
    mode = "markers",
    name = "Cluster 1",
    type = "scatter3d",    
    x = cluster_zero.as_matrix()[:,0], y = cluster_zero.as_matrix()[:,1], z = cluster_zero.as_matrix()[:,2],
    marker = dict( size=2, color='green')
)

In [None]:
import numpy as np
from numpy.linalg import norm


class Kmeans:
    '''Plain NumPy implementation of the k-means (Lloyd) clustering algorithm.

    Parameters
    ----------
    n_clusters : int
        Number of clusters (and centroids) to fit.
    max_iter : int, default 100
        Upper bound on the number of assign/update iterations in ``fit``.
    random_state : int, default 123
        Seed of the private random generator used to pick the initial
        centroids, so repeated fits on the same data are reproducible.

    Attributes (set by ``fit``)
    ---------------------------
    centroids : ndarray of shape (n_clusters, n_features)
    labels : ndarray of shape (n_samples,), cluster index of each row
    error : float, sum of squared distances of rows to their centroid
    '''

    def __init__(self, n_clusters, max_iter=100, random_state=123):
        self.n_clusters = n_clusters
        self.max_iter = max_iter
        self.random_state = random_state

    def initializ_centroids(self, X):
        '''Return ``n_clusters`` distinct rows of ``X`` as starting centroids.'''
        # Draw the permutation from a generator seeded with `random_state`;
        # the global NumPy generator is neither read nor modified.
        rng = np.random.RandomState(self.random_state)
        random_idx = rng.permutation(X.shape[0])
        centroids = X[random_idx[:self.n_clusters]]
        return centroids

    def compute_centroids(self, X, labels):
        '''Return the mean of the rows assigned to each cluster.

        A cluster with no assigned rows keeps its previous centroid when one
        is available on the instance (as is the case inside ``fit``);
        otherwise its centroid is NaN.
        '''
        previous = getattr(self, 'centroids', None)
        centroids = np.zeros((self.n_clusters, X.shape[1]))
        for k in range(self.n_clusters):
            members = X[labels == k, :]
            if members.shape[0] > 0:
                centroids[k, :] = np.mean(members, axis=0)
            elif previous is not None:
                # Empty cluster: averaging zero rows would yield NaN and
                # poison every later distance computation.
                centroids[k, :] = previous[k, :]
            else:
                centroids[k, :] = np.nan
        return centroids

    def compute_distance(self, X, centroids):
        '''Return an (n_samples, n_clusters) matrix of squared euclidean distances.'''
        distance = np.zeros((X.shape[0], self.n_clusters))
        for k in range(self.n_clusters):
            row_norm = norm(X - centroids[k, :], axis=1)
            distance[:, k] = np.square(row_norm)
        return distance

    def find_closest_cluster(self, distance):
        '''Return, for each row of ``distance``, the index of its smallest entry.'''
        return np.argmin(distance, axis=1)

    def compute_sse(self, X, labels, centroids):
        '''Return the sum of squared distances from each row to its assigned centroid.'''
        distance = np.zeros(X.shape[0])
        for k in range(self.n_clusters):
            distance[labels == k] = norm(X[labels == k] - centroids[k], axis=1)
        return np.sum(np.square(distance))

    def fit(self, X):
        '''Fit centroids to ``X``; sets ``centroids``, ``labels`` and ``error``.

        Iterates assignment and centroid update until the centroids stop
        changing or ``max_iter`` iterations have run.
        '''
        self.centroids = self.initializ_centroids(X)
        for i in range(self.max_iter):
            old_centroids = self.centroids
            distance = self.compute_distance(X, old_centroids)
            self.labels = self.find_closest_cluster(distance)
            self.centroids = self.compute_centroids(X, self.labels)
            if np.all(old_centroids == self.centroids):
                break
        self.error = self.compute_sse(X, self.labels, self.centroids)

    def predict(self, X):
        '''Return the index of the closest fitted centroid for each row of ``X``.'''
        # Use the centroids stored by fit(); `old_centroids` is local to fit().
        distance = self.compute_distance(X, self.centroids)
        return self.find_closest_cluster(distance)

In [None]:
# Collect the three clustering features to pandas; the local k-means and
# scikit-learn cells below read this frame as `df`.
df = portfolio_train_df.select('ret', 'stdev', 'sharpe').toPandas()

In [None]:
import matplotlib.pyplot as plt
from matplotlib.image import imread
import pandas as pd
# Import the generators from the public `sklearn.datasets` namespace; the
# `sklearn.datasets.samples_generator` module path is deprecated and no longer
# importable in newer scikit-learn releases.
from sklearn.datasets import make_blobs, make_circles, make_moons
from sklearn.cluster import KMeans, SpectralClustering
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_samples, silhouette_score

%matplotlib inline
plt.style.use('fivethirtyeight')
from warnings import filterwarnings
filterwarnings('ignore')

In [None]:
# NOTE(review): `X_std` is first assigned in a later cell (the StandardScaler
# output); on a fresh kernel this display cell runs before that assignment.
X_std

In [None]:
# Display `df` converted to a NumPy array.
np.array(df)

In [None]:
# Standardize the data
X_std = StandardScaler().fit_transform(df)

# Run local implementation of kmeans
km = Kmeans(n_clusters=2, max_iter=100)
km.fit(X_std)
centroids = km.centroids

# Plot the clustered data on the first two standardized columns, using the
# explicit Axes interface. The axis labels name what is actually plotted.
fig, ax = plt.subplots(figsize=(20, 20))
ax.scatter(X_std[km.labels == 0, 0], X_std[km.labels == 0, 1],
           c='green', label='cluster 1')
ax.scatter(X_std[km.labels == 1, 0], X_std[km.labels == 1, 1],
           c='blue', label='cluster 2')
ax.scatter(centroids[:, 0], centroids[:, 1], marker='*', s=300,
           c='r', label='centroid')
ax.legend()
ax.set_xlim([-5, 5])
ax.set_ylim([-5, 5])
ax.set_xlabel('Standardized feature (column 0)')
ax.set_ylabel('Standardized feature (column 1)')
ax.set_title('Visualization of clustered data', fontweight='bold')
ax.set_aspect('equal');

In [None]:
# Elbow curve: within-cluster sum of squares (inertia) for k = 1..9.
# `KMeans` here is scikit-learn's class (imported in the cell above), and
# `km` is rebound from the local Kmeans instance to the last fitted model.
sse = []
list_k = list(range(1, 10))

for k in list_k:
    # Fixed seed so the curve is identical on every run.
    km = KMeans(n_clusters=k, random_state=123)
    km.fit(X_std)
    sse.append(km.inertia_)

# Plot sse against k
plt.figure(figsize=(6, 6))
plt.plot(list_k, sse, '-o')
plt.xlabel(r'Number of clusters *k*')
plt.ylabel('Sum of squared distance');

In [None]:
# plt.scatter needs separate x and y sequences: plot the first two
# coordinates of every centroid of the local k-means fit.
plt.scatter(centroids[:, 0], centroids[:, 1]);