In [None]:
import numpy as np
import pandas as pd

from pyspark.mllib.linalg.distributed import IndexedRowMatrix, IndexedRow, RowMatrix
from pyspark.ml.feature import StandardScaler, PCA
from pyspark.ml.linalg import Vectors, VectorUDT
from pyspark.sql.functions import udf, col
from pyspark.sql import SparkSession

In [None]:
# Raise pandas' display limits so wide/long frames are rendered without truncation.
_display_limit = int(1e7)
for _option_name in ('display.max_columns', 'display.max_rows', 'display.width'):
    pd.set_option(_option_name, _display_limit)

------------------------------------------------------------------------------------------------------------------------------------------------------------------

# Defining Spark Session for pseudo-distributed computing:

In [None]:
# Start (or reuse) the Spark session for this notebook and expose its SparkContext.
session_builder = SparkSession.builder.appName('Single_Value_Decomposition_Portfolio')
spark = session_builder.getOrCreate()
sc = spark.sparkContext
sc  # last expression of the cell, so the context is displayed

# Reading persisted Portfolio Yields dataframe:

In [None]:
# Parquet location of the persisted portfolio-yield window, and the frame read from it.
portfolio_yield_window_path = '/data/core/fince/data/portfolioOptimization/portfolio_yield_window/'
parquet_reader = spark.read
portfolio_yield_df = parquet_reader.parquet(portfolio_yield_window_path)

In [None]:
# Preview five rows of the Spark frame as a pandas DataFrame.
portfolio_yield_preview = portfolio_yield_df.limit(5)
portfolio_yield_preview.toPandas()

In [None]:
# Keep every column except the trailing five
# (presumably those are not fund-return series — TODO confirm against the schema),
# then pull the selected values to the driver as a NumPy matrix.
all_columns = portfolio_yield_df.columns
field_array = all_columns[:-5]
collected_rows = portfolio_yield_df.select(*field_array).collect()
monthly_return = np.array(collected_rows)
print('test with p:', len(field_array), 'funds')

In [None]:
# Print the whole collected matrix (one row per collected record, one column per selected field).
print('monthly_return matrix:\n', monthly_return)

-------------------------------------------------------------------------------------------------------------------------------------------------------------------

### El cálculo se realiza utilizando la descomposición en valores singulares (Singular Value Decomposition, SVD). La SVD de cualquier matriz $A$ de dimensión $m \times n$ se calcula como:

$$A = U \Sigma V^T$$

### Donde $U$ es una matriz ortogonal $m \times m$ cuyas columnas son los vectores propios (eigenvectores) de $AA^T$, $V$ es una matriz ortogonal $n \times n$ cuyas columnas son los eigenvectores de $A^T A$, y $\Sigma$ es una matriz diagonal $m \times n$ cuyos valores son cero excepto a lo largo de la diagonal.

### Al aplicar PCA, tenemos que centrar nuestros datos, es decir, tenemos que restar la media de la columna. Luego, según la naturaleza de nuestros datos, es posible que necesitemos estandarizar nuestros datos (hacer que cada característica tenga una varianza unitaria y una media cero). Si las columnas están en diferentes escalas, como el año, la temperatura, la concentración de dióxido de carbono, por ejemplo, tenemos que estandarizar los datos. Si los datos están en la misma unidad, por otro lado, la estandarización puede provocar la pérdida de información importante. En el primer caso, cuando las columnas están en la misma unidad y en una escala similar, usamos la matriz de covarianza para SVD pero cuando las unidades son diferentes ya que estandarizamos los datos, usamos la matriz de correlación.

### Los componentes principales (PC) son el producto matricial de los datos originales y la matriz $V$, que es igual al producto de las matrices $U$ y $\Sigma$.

# Singular Value Decomposition analysis.

### As a first step we need two input parameters: ___n___, the total number of rows in the dataframe, and ___p___, the total number of columns (_features_). The data matrix therefore has dimensions _(n, p)_.

In [None]:
# Distribute the in-memory matrix as an RDD of (row_values, row_index) pairs.
monthly_return_rows = monthly_return.tolist()
monthly_return_rdd = sc.parallelize(monthly_return_rows).zipWithIndex()

# Model dimensions:
#      n -> total number of rows
#      p -> total number of features (length of one row's values)
n = monthly_return_rdd.count()
first_row_values, _first_row_index = monthly_return_rdd.take(1)[0]
p = len(first_row_values)

### We want to make sure that every vector $\vec{V_i}$ of length p is a _dense vector_, i.e. a fully populated vector without any null values. For that, we create a UDF (User Defined Function) that converts each row of values into a dense vector.

In [None]:
def _to_dense_vector(values):
    """Wrap a sequence of numbers in a Spark ML dense vector."""
    return Vectors.dense(values)


# Column-level UDF returning a VectorUDT value built by the helper above.
udf_dense_vector = udf(_to_dense_vector, VectorUDT())

### Now we will overwrite the _features_ column with dense vectors.

In [None]:
# Build a two-column frame (features, id) from the indexed RDD, then replace the
# raw "features" values with dense vectors.
indexed_rows_df = spark.createDataFrame(monthly_return_rdd).toDF("features", "id")
monthly_return_df = indexed_rows_df.withColumn("features", udf_dense_vector("features"))
monthly_return_df.show(5)

In [None]:
# Fetch the row(s) whose id is 0 to inspect one converted feature vector.
is_first_row = col("id") == 0
monthly_return_df.where(is_first_row).collect()

### Let's standardize these dense vectors of length __p__ with the _StandardScaler_ method, i.e. each feature is re-scaled using its mean and standard deviation.

In [None]:
# Center each feature (withMean) and scale it to unit standard deviation (withStd),
# writing the result to a new "scaled_features" column.
stdScaler = StandardScaler(
    inputCol="features",
    outputCol="scaled_features",
    withStd=True,
    withMean=True,
)
model = stdScaler.fit(monthly_return_df)
monthly_return_std_df = model.transform(monthly_return_df)
monthly_return_std_df.show(5)

### In order to compute the SVD we have to transform the Spark dataframe into a matrix object with indexed rows built from the scaled features; for that, we will use the _IndexedRowMatrix_ class.

In [None]:
# Build the distributed matrix from the standardized vectors, keyed by row id.
# Columns are addressed by name on purpose: positionally the frame is
# (features, id, scaled_features), so x[0] / x[1] would hand IndexedRow the raw
# feature vector as the index and call .tolist() on the integer id.
# The vector is converted to a plain list of floats so IndexedRow can build
# its own vector from it.
monthly_return_irm = IndexedRowMatrix(
    monthly_return_std_df.select("id", "scaled_features").rdd.map(
        lambda row: IndexedRow(row["id"], row["scaled_features"].toArray().tolist())
    )
)

### Now let's compute the singular value decomposition of the IndexedRowMatrix. The given row matrix $A$ of dimension __$(m \times n)$__ is decomposed into
### _$$U s V^{T}$$ where:_
* $U$: $(m \times k)$ __*left singular vectors*, an IndexedRowMatrix whose columns are the eigenvectors of $A A'$__
* $s$: __DenseVector holding the *singular values* (square roots of the eigenvalues) in descending order.__
* $V$: $(n \times k)$ __*right singular vectors*, a Matrix whose columns are the eigenvectors of $A' A$__

### This _computeSVD_ interface receives two main arguments:
* $k$: the number of leading singular values to keep, an integer with $0 < k \le n$
* _computeU_: boolean flag, whether or not to compute $U$. If set to __True__, then $U$ is computed as $A  V  s^{-1}$

In [None]:
# Decompose the indexed matrix keeping k = p singular values, and ask Spark to
# compute the left singular vectors as well.
SVD = monthly_return_irm.computeSVD(p, computeU=True)
S = SVD.s.toArray()   # singular values as a NumPy array
U = SVD.U             # left singular vectors

In [None]:
# Eigenvalues derived from the singular values: s_i^2 / (n - 1).
eigen_vals = np.square(S) / (n - 1)
# Sort in descending order, then accumulate to get the cumulative share of the total.
eigvals = np.sort(eigen_vals)[::-1]
cumsum = np.cumsum(eigvals)
total_variance_explained = cumsum / np.sum(eigvals)

In [None]:
# Smallest number of components whose cumulative explained variance exceeds 95%.
# Cast to a plain int so K can be used directly as a slice bound and estimator parameter.
K = int(np.argmax(total_variance_explained > 0.95)) + 1
V = SVD.V
# Per-row scores: the first K entries of each row of U scaled by the first K
# singular values, kept as (row_index, scores) pairs.
# Read from SVD.U rather than from U itself so this cell can be re-run:
# after the first run U is an RDD and no longer has a `.rows` attribute.
U = SVD.U.rows.map(lambda x: (x.index, x.vector[0:K]*S[0:K]))

In [None]:
# Collect the (row_index, scores) pairs, order them by row index and stack the scores.
indexed_scores = sorted(U.collect(), key=lambda pair: pair[0])
princ_comps = np.array([scores for _, scores in indexed_scores])

In [None]:
# Fit Spark ML's PCA with K components on the scaler's output column.
# Note: `model` is rebound here; it previously held the fitted StandardScaler model.
pca = PCA(inputCol=stdScaler.getOutputCol(), outputCol="pcaFeatures", k=K)
model = pca.fit(monthly_return_std_df)
transformed_feature = model.transform(monthly_return_std_df)
# Explained variance per component, as a percentage rounded to 4 decimals.
explained_variance_pct = 100.00 * model.explainedVariance.toArray()
np.round(explained_variance_pct, 4)

In [None]:
# Table of the fitted model's principal-component matrix (times 100, rounded to
# 4 decimals): one row per input field, one column per component.
pc_labels = ['PC_' + str(i) for i in range(1, K + 1)]
pcs = np.round(100.00 * model.pc.toArray(), 4)
df_pc = pd.DataFrame(pcs, index=field_array, columns=pc_labels)
df_pc.sum()

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()

plt.scatter(princ_comps[:, 1], princ_comps[:, 0], alpha=0.7)
plt.axis('equal');

In [None]:
# Cumulative share of variance explained, plotted against the (descending) eigenvalues.
fig, ax = plt.subplots()
ax.scatter(eigvals, total_variance_explained)
ax.set_xlabel('Eigenvalue')
ax.set_ylabel('Cumulative share of variance explained')
ax.set_title('Cumulative explained variance');

In [None]:
from pyspark.ml.clustering import KMeans

# Five-cluster k-means with a fixed seed.
# NOTE(review): the model is fitted on the "features" column (the unscaled
# vectors), not on "scaled_features" — confirm this is intended.
kmeans = KMeans(seed=1, k=5)
features_only_df = monthly_return_std_df.select("features")
model_kmeans = kmeans.fit(features_only_df)
transformed = model_kmeans.transform(monthly_return_std_df)
# Cluster sizes, largest first.
cluster_sizes = transformed.groupBy("prediction").count()
cluster_sizes.orderBy(col("count").desc()).show()