In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from matplotlib import style
from sklearn import preprocessing
from sklearn.decomposition import PCA
from functools import reduce
from pyspark.sql.window import Window 
from pyspark.sql import SparkSession, DataFrame
from pyspark.sql.functions import monotonically_increasing_id, col, lit, isnull, when, udf, row_number,\
                                  avg as spark_avg, stddev as spark_stddev, sqrt, abs as spark_abs
# Apply matplotlib's 'ggplot' stylesheet to every figure drawn in this notebook.
style.use('ggplot')

In [None]:
# Raise pandas' display limits so wide/long frames are rendered without truncation.
for _display_option in ('display.max_columns', 'display.max_rows', 'display.width'):
    pd.set_option(_display_option, 10000000)

------------------------------------------------------------------------------------------------------------------------------------------------------------------

# Defining Spark Session for pseudo-distributed computing:

In [None]:
# Build (or reuse) the Spark session for this notebook and keep a handle on its SparkContext.
session_builder = SparkSession.builder.appName('Sharpe&Sortino_ratio')
spark = session_builder.getOrCreate()
sc = spark.sparkContext
# Last expression: display the SparkContext as the cell output.
sc

# Reading persisted Portfolio Yields dataframe:

In [None]:
# Location of the persisted portfolio-yield parquet dataset (relative path).
portfolio_yield_window_path = 'data/master/ophelia/data/OpheliaData/portfolio_yield_window/'
portfolio_yield_df = spark.read.parquet(portfolio_yield_window_path)
# Bring only five rows to the driver for a quick visual check.
portfolio_yield_df.limit(5).toPandas()

In [None]:
def unionAll_df(*dfs):
    """Stack every DataFrame in ``dfs`` into a single one.

    The frames are folded left to right with ``DataFrame.unionAll``; at least
    one frame must be supplied.
    """
    combine = DataFrame.unionAll
    return reduce(combine, dfs)

In [None]:
# Columns from position 5 up to (not including) the last one are turned into rows;
# presumably these are the per-fund yield columns — TODO confirm against the parquet schema.
fund_columns = portfolio_yield_df.columns[5:-1]

# One two-column frame (fund_name, fund_yield) per selected column.
dataframes = []
for fund in fund_columns:
    dataframes.append(
        portfolio_yield_df.select(lit(fund).alias('fund_name'),
                                  col(fund).alias('fund_yield'))
    )

# Stack the per-fund frames into one long frame and mark it for caching.
portfolio_yield_T = unionAll_df(*dataframes).cache()

In [None]:
# Preview the long-format (fund_name, fund_yield) frame.
portfolio_yield_T.show()

# Writing Portfolio's Yield Transpose dataframe.

In [None]:
writing_path_mod3 = 'data/master/ophelia/data/OpheliaData/portfolio_yield_transpose/'

print('\nWriting parquets ...')
portfolio_yield_T.repartition(1).write.mode('overwrite').parquet(writing_path_mod3)

%time
print('\nSUCCESS \nPARQUET DATA SAVED!')
print('\nNew root path tabla data:', writing_path_mod3)
spark.catalog.clearCache()

# Reading persisted Portfolio Yields Transpose.

In [None]:
# NOTE(review): this absolute path is not the location stored in `writing_path_mod3`, where the
# transposed frame was just written — confirm both refer to the same dataset.
portfolio_yield_T_path = '/data/core/fince/data/portfolioOptimization/portfolio_yield_transpose/'
portfolio_yield_T_df = spark.read.parquet(portfolio_yield_T_path)
# Number of columns in the reloaded frame (shown as the cell output).
len(portfolio_yield_T_df.columns)

In [None]:
# Yields strictly below this value are treated as downside observations.
TRESHOLD = 0.0
# Fund used as a spot-check example in this and later cells.
CASE = "BMERGOB"

# Keep only the downside (negative) yields of every fund.
negative_fund_yield = portfolio_yield_T_df.where(col("fund_yield") < TRESHOLD)

# Spot check: downside rows of the example fund and their summary statistics.
case_negative_yield = negative_fund_yield.where(col("fund_name") == CASE)
case_negative_yield.show(5)

summary_stats = ["min", "max", "mean", "stddev"]
case_negative_yield.describe("fund_yield")\
                   .where(col("summary").isin(summary_stats))\
                   .show()
# NOTE(review): the figures printed below are hard-coded text, not computed from the frame above.
print("after filtering negative yields we've got following parameters:")
print("{stddev:8.547E-5 , min: -1.872, max: -4.646}")

In [None]:
# Per-fund downside statistics computed on the negative yields only.
downside_mean = spark_abs(spark_avg(col('fund_yield')))
downside_stddev = spark_stddev(col('fund_yield'))

mean_yield_df = negative_fund_yield.groupBy("fund_name").agg(
    # Absolute value of the average negative yield.
    downside_mean.alias("downside_mean_yield"),
    # A null standard deviation is replaced by 0.
    when(isnull(downside_stddev), 0).otherwise(downside_stddev).alias("downside_stddev_yield"),
)

print("mean yield df:")
mean_yield_df.show(5)

# Sortino ratio:

## **The Formula for the Sortino Ratio Is:**
## Sortino Ratio = $\frac{ R_p - r_f }{ \sigma_d }$ 
## **Where:**
### *R_p = Actual or expected portfolio return*
### *r_f = Risk-free rate*
### *sigma_d = Standard deviation of the downside*

In [None]:
# Risk-free rate r_f; also read by the Sharpe-ratio cell further down.
risk_free_rate = 0
# sortino_ratio = (downside_mean_yield - r_f) / downside_stddev_yield, then na.fill(0) replaces
# nulls in the numeric columns with 0.
# NOTE(review): the numerator is the absolute mean of the *negative* yields only, not the overall
# portfolio return R_p of the formula stated in the markdown above — confirm this is intended.
sortino_df = mean_yield_df.select("*", 
                                  ((col("downside_mean_yield") - lit(risk_free_rate)) / col("downside_stddev_yield")).alias("sortino_ratio"))\
                          .na.fill(0)
# Ascending sort: these are the five funds with the LOWEST sortino_ratio.
sortino_df.orderBy(col("sortino_ratio")).show(5)
# Spot check on the example fund.
sortino_df.where(col("fund_name") == CASE).show(100)

# Sharpe ratio:

In [None]:
# Sharpe ratio per fund: (mean yield - risk-free rate) / standard deviation of all yields.
sharpe_ratio_expr = (col("mean_yield") - lit(risk_free_rate)) / col("stddev_yield")

sharpe_df = (
    portfolio_yield_T_df
    .groupBy("fund_name")
    .agg(spark_avg('fund_yield').alias("mean_yield"),
         spark_stddev('fund_yield').alias("stddev_yield"))
    .select("*", sharpe_ratio_expr.alias("sharpe_ratio"))
)

# Five funds with the highest Sharpe ratio, followed by the example fund.
sharpe_df.orderBy(col("sharpe_ratio").desc()).show(5)
sharpe_df.where(col("fund_name") == CASE).show(100)

# Joined Both Ratios:

In [None]:
# Window without partitionBy: the whole frame is numbered as one group, ordered by fund name.
w = Window.orderBy("fund_name") 
# Left join keeps the funds present in sortino_df (those with at least one negative yield) and adds
# their Sharpe columns; `id` is a 1-based row number following the fund-name order.
joined_ratios_df = sortino_df.join(sharpe_df, on="fund_name", how="left").select("*", row_number().over(w).alias("id"))
joined_ratios_df.printSchema()
joined_ratios_df.show(5)

# PCA analysis

In [None]:
# Same five-row preview of the joined ratios, repeated at the start of the PCA section.
joined_ratios_df.show(5)

In [None]:
# PCA input columns: every column except the first and the last one
# (presumably `fund_name` and `id`, given how joined_ratios_df is built — verify).
f = joined_ratios_df.columns[1:-1]

def pca_sklearn(spark_df, col_features, scree_plot=False):
    """Run a scikit-learn PCA on selected columns of a Spark DataFrame.

    The selected columns are collected to the driver as a pandas frame,
    transposed, standardised with ``preprocessing.scale`` and projected onto
    all principal components.

    Parameters
    ----------
    spark_df : pyspark.sql.DataFrame
        Frame that holds the feature columns.
    col_features : list of str
        Names of the columns fed into the PCA.
    scree_plot : bool, default False
        When true, draw a bar chart with the percentage of the total variance
        explained by each principal component.

    Returns
    -------
    numpy.ndarray
        The scaled data projected onto the principal components.
    """
    data = spark_df.select(col_features).toPandas()
    # NOTE(review): because of the transpose, each entry of `col_features` becomes one PCA
    # sample and each original row becomes a variable — confirm this orientation is intended.
    scaled_data = preprocessing.scale(data.T)
    pca = PCA()
    pca.fit(scaled_data)
    pca_data = pca.transform(scaled_data)
    if scree_plot:
        # Use the *ratio* so the bars really are percentages of the total variance;
        # `explained_variance_` holds absolute variances, which do not sum to 100.
        per_var = np.round(pca.explained_variance_ratio_ * 100, decimals=1)
        positions = range(1, len(per_var) + 1)
        labels = ["PC" + str(x) for x in positions]
        plt.bar(x=positions, height=per_var, tick_label=labels)
        plt.ylabel("Percentage of Explained Variance")
        plt.xlabel("Principal Component")
        plt.show()
    return pca_data

In [None]:
# Draw the scree plot; the returned projection is only displayed as the cell output,
# it is not assigned to any variable here.
pca_sklearn(spark_df=joined_ratios_df, col_features=f, scree_plot=True)

In [None]:
def numpy_spark(numpy_array, labels_col):
    """Convert a NumPy array into a Spark DataFrame.

    Each element along the first axis is turned into a plain Python list and
    becomes one row; ``labels_col`` supplies the column names.
    """
    rows_rdd = sc.parallelize(numpy_array).map(lambda row: row.tolist())
    return rows_rdd.toDF(labels_col)

def numpy_pandas(numpy_array, labels_col):
    """Convert a 2-D NumPy array into a pandas DataFrame.

    Parameters
    ----------
    numpy_array : numpy.ndarray
        Array whose rows become the rows of the frame.
    labels_col : list of str
        Column names, one per array column.

    Returns
    -------
    pandas.DataFrame
        Frame with a default integer index and columns named ``labels_col``.
    """
    # Build the frame locally: shipping the array through an RDD and a Spark DataFrame only to
    # collect it back to the driver adds a cluster round trip without changing the result.
    # copy=True keeps the frame independent from the caller's array.
    return pd.DataFrame(numpy_array, columns=labels_col, copy=True)

In [None]:
# Recompute the projection in this cell: the earlier pca_sklearn(...) call only displays its
# result and `labels` is local to that function, so neither name exists on a fresh kernel.
pca_data = pca_sklearn(spark_df=joined_ratios_df, col_features=f)
# One label per principal component: PC1, PC2, ...
labels = ["PC" + str(i) for i in range(1, pca_data.shape[1] + 1)]
pca_df = numpy_pandas(numpy_array=pca_data, labels_col=labels)

In [None]:
# Percentage of the total variance carried by each principal component, derived from the
# projected data itself so the cell does not depend on a `per_var` variable that is never
# defined at notebook level (it only exists inside pca_sklearn).
pc_variance = pca_df.var(axis=0)
per_var = np.round((pc_variance / pc_variance.sum() * 100).values, decimals=1)

plt.scatter(pca_df.PC1, pca_df.PC2)
plt.title("My PCA Graph")
plt.xlabel("PC1 - {0}%".format(per_var[0]))
plt.ylabel("PC2 - {0}%".format(per_var[1]))
# Label every point with its row index in pca_df.
for sample in pca_df.index:
    plt.annotate(sample, (pca_df.PC1.loc[sample], pca_df.PC2.loc[sample]))
plt.show()

### Centiles grouping test

In [None]:
def column_collection(df, col_collect):
    """Collect one column of a DataFrame to the driver as a sorted Python list.

    Parameters
    ----------
    df : pyspark.sql.DataFrame
        Frame to read from.
    col_collect : str
        Name of the column to collect.

    Returns
    -------
    list
        The column's values in ascending order (empty list for an empty frame).
    """
    # Iterate over the collected rows directly instead of indexing through range(len(...)).
    return sorted(row[0] for row in df.select(col_collect).collect())

In [None]:
# Sorted Python list with the `stddev_yield` value of every row of joined_ratios_df.
volatile_yield_list = column_collection(df=joined_ratios_df, col_collect="stddev_yield")

In [None]:
import numpy as np
# 50th percentile (median) of the collected yield standard deviations.
a = np.array(volatile_yield_list)
# NOTE: `p` is reassigned further down as the number of features.
p = np.percentile(a, 50)
p

In [None]:
# Scratch expression (evaluates to a tuple, no side effects); presumably candidate percentile
# cut points for the grouping test — TODO confirm or remove.
33, 66, 100, 

In [None]:
# Scratch arithmetic (evaluates to 99, no side effects) — TODO confirm purpose or remove.
33+33+33

# K-Means model for Clustering Sharpe and Sortino Ratios

In [None]:
from pyspark.ml.clustering import KMeans
from pyspark.ml.linalg import Vector, VectorUDT, Vectors
from pyspark.mllib.linalg.distributed import IndexedRowMatrix, IndexedRow
from pyspark.ml.feature import VectorAssembler, StandardScaler

### But first! PCA dimension reduction analysis

### After the PCA dimension-reduction analysis, 93% of the variance is explained by the Sortino and Sharpe ratios (we don't need more)

In [None]:
# The two ratio columns used as model inputs.
features_list = ["sortino_ratio", "sharpe_ratio"]
# Packs the two columns into a single vector column named "features".
vector_assembler = VectorAssembler(inputCols=features_list, outputCol="features")

In [None]:
# Add the "features" vector column to the joined ratios and preview five rows.
vector_assembler_df = vector_assembler.transform(joined_ratios_df)
vector_assembler_df.limit(5).toPandas()

In [None]:
# n = number of rows (funds) in joined_ratios_df; read by the SVD cell below.
n = joined_ratios_df.count()
print("n:", n)
# p = number of features; this overwrites the percentile value stored in `p` earlier.
p = len(features_list)
print("p:", p)

In [None]:
# UDF that rebuilds each feature vector as a dense vector.
udf_change = udf(lambda x: Vectors.dense(x), VectorUDT())
# Replace the "features" column with its dense version and preview five rows.
vector_dense_df = vector_assembler_df.withColumn("features", udf_change("features"))
vector_dense_df.limit(5).toPandas()

In [None]:
# Scale each feature to unit standard deviation; withMean=False means the data is NOT centered.
standard_scaler = StandardScaler(inputCol="features", outputCol="scaled_features", withStd=True, withMean=False)

In [None]:
# Fit the scaler on the "features" column only.
scaler_model = standard_scaler.fit(vector_dense_df.select("features"))

In [None]:
# Append the "scaled_features" column to the full frame and preview five rows.
scaled_feature_df = scaler_model.transform(vector_dense_df)
scaled_feature_df.limit(5).toPandas()

In [None]:
# List every column name except the first one.
scaled_feature_df.columns[1:]

In [None]:
# Keep only the row id and the scaled feature vector.
picking_columns = scaled_feature_df.select("id", "scaled_features")
# Distributed row matrix: each row is indexed by `id` and holds the scaled vector as a plain list.
feature_df = IndexedRowMatrix(picking_columns.rdd.map(lambda x: IndexedRow(x[0], x[1].tolist())))

In [None]:
# SVD of the scaled feature matrix keeping p singular values; the second argument (True) asks
# for the U factor to be computed as well.
SVD = feature_df.computeSVD(p, True)
U = SVD.U
S = SVD.s.toArray()
# Squared singular values divided by (n - 1).
# NOTE(review): the scaler ran with withMean=False, so these equal covariance eigenvalues only
# if the features are already centered — confirm.
eigen_vals = S**2/(n-1)
# Sort in descending order and accumulate each value's share of the total.
eigvals = np.flipud(np.sort(eigen_vals))
cumsum = eigvals.cumsum()
total_variance_explained = cumsum/eigvals.sum()
# Smallest number of components whose cumulative share exceeds 0.95.
K = np.argmax(total_variance_explained>0.95)+1
V = SVD.V
# For every row keep its index and the first K entries of U scaled by the first K singular values.
U = U.rows.map(lambda x: (x.index, x.vector[0:K]*S[0:K]))
# Collect to the driver, order by row index and stack the vectors into a NumPy array.
princ_comps = np.array(list(map(lambda x:x[1], sorted(U.collect(), key = lambda x:x[0]))))

In [None]:
# Cumulative share of variance per component, as computed in the SVD cell.
total_variance_explained

### K-Means with selected feature

In [None]:
# Fixed seed so the centroid initialisation — and therefore the cluster ids inspected in the
# cells below — is the same on every run instead of changing from one session to the next.
# NOTE(review): the model is trained on the raw "features" column, not on "scaled_features"
# produced by the StandardScaler — confirm that this is intended.
kmeans_5 = KMeans(featuresCol="features", k=5, seed=42)

In [None]:
# Fit the 5-cluster model, then keep each fund's id, name, both ratios and its assigned
# cluster (the "prediction" column).
model_kmeans = kmeans_5.fit(scaled_feature_df)
final_fund_class = model_kmeans.transform(scaled_feature_df).select("id", "fund_name", "sortino_ratio", "sharpe_ratio", "prediction")

In [None]:
# Number of funds assigned to each cluster.
final_fund_class.groupBy("prediction").count().show()

In [None]:
# Print the funds assigned to cluster 3 (up to 100000 rows).
# NOTE(review): cluster ids come from the fitted model, so "3" may denote a different group
# after a refit — confirm which group is meant.
final_fund_class.where(col("prediction") == 3).show(100000)

In [None]:
# Print the funds ordered by cluster id (up to 10000 rows).
final_fund_class.orderBy(col("prediction")).show(10000)

In [None]:
# Bring the classified funds to the driver and plot one ratio against the other.
final_fund_pd = final_fund_class.toPandas()
plt.scatter(final_fund_pd.sortino_ratio, final_fund_pd.sharpe_ratio, alpha=0.5)
# Title and axis labels so the figure can be read on its own.
plt.title("Sortino vs. Sharpe ratio per fund")
plt.xlabel("sortino_ratio")
plt.ylabel("sharpe_ratio")
plt.show()

In [None]:
# Elbow scan: fit K-Means for k = 2..8 and print the within-set sum of squared errors of each fit.
for k in range(2, 9):
    # Same fixed seed for every k so the costs are reproducible and comparable across runs.
    kmean = KMeans(featuresCol="features", k=k, seed=42)
    model = kmean.fit(scaled_feature_df)
    # NOTE(review): computeCost is deprecated since Spark 2.4 and removed in Spark 3.0; on newer
    # versions use model.summary.trainingCost or ClusteringEvaluator instead.
    squared_error = model.computeCost(scaled_feature_df)
    print("with k={}".format(k))
    print("within set sum of squared errors = ", str(squared_error))
    print("---"*30)

# Se queda con 5 grupos!

-------------------------------------------------------------------------------------------------------------------------------------------------------------------