In [None]:
import numpy as np
import pandas as pd

from pyspark.sql.functions import col
from pyspark.sql import SparkSession

In [None]:
# Raise the pandas display limits (columns, rows, line width) to a very large
# value so that frames are printed without truncation.
display_limit = int(1e7)
for option_name in ('display.max_columns', 'display.max_rows', 'display.width'):
    pd.set_option(option_name, display_limit)

------------------------------------------------------------------------------------------------------------------------------------------------------------------

# Defining OpheliaSpark Session for pseudo-distributed computing:

In [None]:
# Build (or reuse) the Spark session for this notebook and expose its SparkContext.
session_builder = SparkSession.builder.appName('Single_Value_Decomposition_Portfolio')
spark = session_builder.getOrCreate()
sc = spark.sparkContext
# Last expression of the cell: display the SparkContext.
sc

# Reading persisted Portfolio Yields dataframe:

In [None]:
# Location of the persisted parquet dataset, relative to the working directory.
portfolio_yield_window_path = 'data/staging/benchmark/close_day_price'
parquet_reader = spark.read
portfolio_yield_df = parquet_reader.parquet(portfolio_yield_window_path)

In [None]:
# Preview the first 5 rows without truncating cell contents, then print the column schema.
portfolio_yield_df.show(5, False)
portfolio_yield_df.printSchema()

-------------------------------------------------------------------------------------------------------------------------------------------------------------------

### The calculation is performed using Singular Value Decomposition (SVD). The SVD of any $m \times n$ array is calculated as follows:

$$A = U \Sigma V^{T}$$

### Where $U$ is an $m \times m$ orthogonal matrix whose columns are the eigenvectors of $AA^{T}$, $V$ is an $n \times n$ orthogonal matrix whose columns are the eigenvectors of $A^{T}A$, and $\Sigma$ is an $m \times n$ diagonal matrix whose entries are zero except along the diagonal.

### When applying PCA we have to center our data and, depending on its nature, we may also need to standardize it (make each feature have a mean of 0 and a variance of 1). If the columns are on different scales, such as the year, the temperature and the concentration of carbon dioxide, we have to standardize the data. If the data is in the same unit, on the other hand, standardization can lead to the loss of important information. When the columns are in the same unit and on a similar scale, we use the covariance matrix for SVD; when the units are different, since we standardize the data, we use the correlation matrix.

### The principal components (PC) are the matrix product of the original data and the matrix $V$, which is equal to the product of the matrices $U$ and $\Sigma$.

# Singular Value Decomposition analysis.

### As a first step we take two input parameters: ___n___, the total number of rows in the dataframe, and ___d___, the total number of columns, called _features_. This gives us a matrix with _(n, d)_ dimensions.

### What we want to confirm is that every vector $\vec{V_i}$ of length d is a _dense vector_. That is, we want full vectors without any null values.

In [None]:
from pyspark.ml.feature import VectorAssembler, StandardScaler, PCA
from pyspark.sql.functions import monotonically_increasing_id

In [None]:
# Feature columns: every column except the first one and the last two.
d_features = portfolio_yield_df.columns[1:-2]

# Pack the feature columns into a single vector column and attach a row id.
assembler = VectorAssembler(inputCols=d_features, outputCol='features')
vec_df = assembler.transform(portfolio_yield_df).select(
    'features', monotonically_increasing_id().alias('id')
)

# n = number of samples (rows), d = number of features (columns).
n = vec_df.count()
d = len(d_features)
print('test with d =', d, 'features and n =', n, 'samples')
vec_df.show(5, False)

### Let's standardize these dense vectors of length __d__ with the _Standard Scaler_ method, i.e. the mean and standard deviation are used for this standardization (re-scaled feature vectors).

In [None]:
# Centre each feature on its mean and scale it by its standard deviation.
scaler = StandardScaler(
    inputCol="features", outputCol="scaled_features", withMean=True, withStd=True
)

# Fit the scaler on the assembled vectors, then apply it to the same frame.
scaler_model = scaler.fit(vec_df)
vec_scale_df = scaler_model.transform(vec_df)
vec_scale_df.show(5, False)

### In order to compute the SVD we have to transform the Spark dataframe into a matrix object with indexed rows built from the scaled features; for that, we will use the _IndexedRowMatrix_ class.

In [None]:
from pyspark.mllib.util import MLUtils
from pyspark.mllib.linalg.distributed import IndexedRowMatrix, IndexedRow

# Convert the ML vector column to the MLlib vector type and drop the unscaled features.
df = MLUtils.convertVectorColumnsFromML(vec_scale_df, "scaled_features").drop('features')

# Each record becomes an IndexedRow(id, scaled vector); together they form the matrix.
indexed_rows = df.select('scaled_features', 'id').rdd.map(
    lambda row: IndexedRow(row[1], row[0])
)
monthly_return_irm = IndexedRowMatrix(indexed_rows)
monthly_return_irm.numCols()

In [None]:
# Plain dict of example parameters (note: distinct from the `SVD` result object).
svd = dict(d=10, n=50, U='matrix_u')

In [None]:
# Nested dict: keeps a reference to `svd` under 'params' next to a component list and k.
pc = dict(params=svd, pc=list(range(1, 6)), k=5)

In [None]:
# Look up the 'd' entry of the `svd` dict through the reference stored under pc['params'].
pc['params']['d']

### Now let's compute the singular value decomposition of the IndexedRowMatrix. The given row matrix $A$ of dimension __$(m \times n)$__ is decomposed into
### _$$U s V^{T}$$ where:_
* $U$: $(m \times k)$ __*left singular vectors*, an IndexedRowMatrix whose columns are the eigenvectors of $A A^{T}$__
* $s$: __DenseVector of the *singular values* (square roots of the eigenvalues) in descending order.__
* $V$: $(n \times k)$ __*right singular vectors*, a Matrix whose columns are the eigenvectors of $A^{T} A$__

### This _computeSVD_ interface receives two main arguments:
* $k$, the number of leading singular values to keep, an integer with $0 < k \le n$
* $U$, with _computeU_ boolean __True__, whether or not to compute $U$. If set to be __True__, then $U$ is computed by $A V s^{-1}$

In [None]:
# Number of leading singular values to keep. computeSVD requires 0 < k <= number
# of columns, so the previously hard-coded 10 is capped by the actual column count;
# this keeps the cell working when the dataset has fewer than 10 features.
k_singular = min(10, monthly_return_irm.numCols())

# Decompose the scaled data matrix; computeU=True also materialises U.
SVD = monthly_return_irm.computeSVD(k=k_singular, computeU=True)
print("SVD:", SVD)

# Left singular vectors (distributed matrix). The printed value is the result of
# computePrincipalComponents on U converted to a RowMatrix.
U = SVD.U
print("\nU matrix:", U.toRowMatrix().computePrincipalComponents(k_singular))

# Singular values as a NumPy array.
S = SVD.s.toArray()
print("\nS matrix:", S)

In [None]:
# Peek at the first 3 rows of U after converting it to a RowMatrix.
U.toRowMatrix().rows.take(3)

In [None]:
# Peek at the first 3 rows of U as returned by computeSVD (no RowMatrix conversion).
U.rows.take(3)

In [None]:
# Compute the eigenvalues here so this cell also runs on a fresh kernel: in
# top-to-bottom order `eigen_vals` is not assigned until a later cell.
# lambda_i = s_i^2 / (n - 1), from the singular values S and the sample count n.
eigen_vals = S**2 / (n - 1)
eigen_vals

In [None]:
# Eigenvalues in descending order. They are derived directly from S and n so the
# cell does not depend on `eigen_vals`, which is only assigned in a later cell
# when the notebook is run top to bottom.
np.flipud(np.sort(S**2 / (n - 1)))

In [None]:
# Small 3x3 float diagonal matrix used as a toy input for the next cell.
A = np.diag(np.array([1.0, 2.0, 3.0]))
A

In [None]:
# Sort each row of A in ascending order, then reverse the order of the rows.
np.sort(A)[::-1]

In [None]:
# Eigenvalues derived from the singular values: lambda_i = s_i^2 / (n - 1).
eigen_vals = S**2 / (n-1)
print("eigen values:", eigen_vals)

# Same values in descending order. The label used to read "eigen vecs", but these
# are eigenvalues, not eigenvectors.
eigvals = np.flipud(np.sort(eigen_vals))
print("\nsorted eigen values:", eigvals)

# Running total of the sorted eigenvalues.
cumsum = eigvals.cumsum()
print("\ncumsum:", cumsum)

# Cumulative share of variance captured by the first 1, 2, ... components.
# NOTE(review): the denominator only sums the singular values that computeSVD
# returned; if fewer than all d were requested, this ratio is relative to the
# retained variance rather than the full variance of the data - confirm intent.
total_variance_explained = cumsum / eigvals.sum()
print("\ntotal variance explained:", total_variance_explained)

In [None]:
# Right singular vectors from the decomposition. Assigned here so the cell also
# runs on a fresh kernel: in top-to-bottom order `V` is otherwise only set in the
# following cell.
V = SVD.V
V

In [None]:
# Position of the first cumulative share above 0.95, converted to a component count.
above_threshold = total_variance_explained > 0.95
K = np.argmax(above_threshold) + 1
print("total K's find:", K)

V = SVD.V

# For every row of U keep the first K entries and scale them by the matching
# singular values; the row index is carried along so the order can be restored.
U_ = U.rows.map(lambda row: (row.index, row.vector[0:K] * S[0:K]))
print("\nmatrix U:")
U_.take(5)

In [None]:
# Bring the (index, vector) pairs to the driver, order them by index and stack
# the vectors into a single 2-D NumPy array.
rows_by_index = sorted(U_.collect(), key=lambda pair: pair[0])
princ_comps = np.array([pair[1] for pair in rows_by_index])
princ_comps

In [None]:
# Fit Spark ML's PCA with K components on the scaled features and project the data.
pca = PCA(inputCol=scaler.getOutputCol(), outputCol="pcaFeatures", k=K)
model = pca.fit(vec_scale_df)
transformed_feature = model.transform(vec_scale_df)

# Explained variance per component, expressed as a percentage with 4 decimals.
explained_pct = np.round(100.00 * model.explainedVariance.toArray(), 4)
print("total explained variance by PC:", explained_pct)
transformed_feature.show(5, False)

In [None]:
# Number of feature columns selected for the analysis (the same value is stored in `d`).
len(d_features)

In [None]:
# Row at index 1 of the fitted PCA model's `pc` matrix, after conversion to a NumPy array.
model.pc.toArray()[1]

In [None]:
# PCA `pc` matrix scaled by 100 and rounded to 4 decimals.
pcs = np.round(100.00 * model.pc.toArray(), 4)

# One row per feature, one column per component (PC_1 .. PC_K).
pc_labels = ['PC_{}'.format(i) for i in range(1, K + 1)]
df_pc = pd.DataFrame(pcs, index=d_features, columns=pc_labels)
print("PC's sum:\n", df_pc.sum())
df_pc

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()

# Scatter the first component against each of the remaining K-1 components,
# all on the same axes.
first_component = princ_comps[:, 0]
for k in range(1, K):
    plt.scatter(first_component, princ_comps[:, k], alpha=0.05)
plt.axis('equal')
plt.show()

In [None]:
from pyspark.ml.clustering import KMeans

# Cluster the PCA-projected rows; the number of clusters reuses K and the seed is fixed.
kmeans = KMeans(featuresCol='pcaFeatures', k=K, seed=12345)
model_kmeans = kmeans.fit(transformed_feature)
transformed = model_kmeans.transform(transformed_feature)

# Cluster sizes, largest first, followed by a preview of the labelled rows.
cluster_sizes = transformed.groupBy("prediction").count()
cluster_sizes.orderBy(col("count").desc()).show()
transformed.show(5)