In [24]:
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
import sklearn.preprocessing as preprocessing
from IPython.display import display
np.set_printoptions(precision=3, suppress=True)
pd.set_option('display.precision', 3)
sns.set_style('whitegrid')
sns.set_context('poster')


FILES = [('ripple_price', 'rip'), ('bitcoin_price', 'btc'),
         ('ethereum_price', 'eth'), ('litecoin_price', 'ltc'),
         ('monero_price', 'mon'), ('nem_price', 'nem'),
         ('dash_price', 'dash')]

In [20]:
def load_returns_matrix (tdelta=pd.Timedelta(days=30), center=True, 
                         scale=True):
    """Returns cryptocurrency rolling returns in three formats.
    
    Returns:
        dfout (DataFrame): Returns without centering/scaling.
        dfout_adj (DataFrame): Returns with centering/scaling (depending 
            on input vals for center and scale).
        xout (np.matrix): Returns matrix with centering/scaling (depending 
            on args), i.e., without index or column names.
    """
    dfs = []
    for file, name in FILES:
        path = 'cryptocurrencypricehistory//{}.csv'.format(file)
        df = pd.read_csv(path, usecols=['Date', 'Close'])
        df['Date'] = pd.to_datetime(df['Date'])
        df = df[['Date', 'Close']]
        df.set_index('Date', drop=True, inplace=True)
        df.rename(columns={'Close':name}, inplace=True)
        dfs.append(df)
    dfout = pd.concat(dfs, axis=1, join='inner')
    dfout = dfout.pct_change(periods=1, freq=tdelta)
    dfout.dropna(axis=0, how='any', inplace=True)
    xout = preprocessing.scale(dfout, axis=0, with_mean=center, with_std=scale)
    dfout_adj = pd.DataFrame(xout, columns=dfout.columns, index=dfout.index)
    return dfout, dfout_adj, np.matrix(xout)


def create_proportion_of_variation_df (eigvals):
    total_var = sum(eigvals)
    cols = ['component', 'eigenvalue', 'proportion', 'cumulative']
    df = pd.DataFrame(columns=cols)
    cum = 0
    for i, e in enumerate(eigvals):
        cum += e
        row = {'component':i + 1,
               'eigenvalue':e,
               'proportion':e/total_var,
               'cumulative':cum/total_var}
        df = df.append(row, ignore_index=True)
    df['component'] = df['component'].astype(int)
    df.set_index('component', drop=True, inplace=True)
    return df


def create_eigvec_df (eigvecs):
    """Create DataFrame of eigenvectors."""
    idx = [i + 1 for i in range(len(eigvecs))]
    df = pd.DataFrame(index=idx)
    for i, v in enumerate(eigvecs):
        df['V{}'.format(i + 1)] = v
    return df

In [21]:
# Init data and obtain covariance matrix.

df_returns_unadj, df_returns, X = load_returns_matrix(center=True, scale=True)
n, p = X.shape
C = np.cov(X, rowvar=False)  # covariance matrix
assert n == len(df_returns.index)
print('Number of returns in data: {}'.format(n))
df_returns.head()

Number of returns in data: 794


Unnamed: 0_level_0,rip,btc,eth,ltc,mon,nem,dash
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2015-09-06,-0.291,-1.261,-1.125,-0.946,-0.783,-0.776,-0.828
2015-09-07,-0.314,-0.998,0.299,-0.803,-0.717,-0.739,-0.814
2015-09-08,-0.345,-0.998,0.427,-0.825,-0.684,-0.693,-0.852
2015-09-09,-0.353,-1.077,0.357,-0.915,-0.68,-0.772,-0.782
2015-09-10,-0.343,-1.157,-0.378,-0.979,-0.619,-0.766,-0.845


In [26]:
# Basic features of the unstandardized price return data.

print('Price return statistics without centering/scaling:')
print('Mean returns:')
display(df_returns_unadj.mean(axis=0))
print('S.D. of returns:')
display(df_returns_unadj.std(axis=0))

Price return statistics without centering/scaling:
Mean returns:


rip     0.395
btc     0.150
eth     0.410
ltc     0.173
mon     0.364
nem     0.585
dash    0.277
dtype: float64

S.D. of returns:


rip     1.468
btc     0.232
eth     0.836
ltc     0.475
mon     0.873
nem     1.190
dash    0.620
dtype: float64

In [27]:
# Get SVD breakdown, cast into matrices.
U, s, V = np.linalg.svd(X)
U = np.matrix(U)  # n x n matrix
S = np.zeros((n, p)) 
S[:p, :p] = np.diag(s)
S = np.matrix(S)  # n x p matrix
V = np.matrix(V).T  # p x p matrix
eigvecs = [np.ravel(V[:,i]) for i in range(p)]

# Get eigenvalues through the singular values.
eigvals = [(s_**2)/(n - 1) for s_ in s]

In [5]:
# Reconstruct data matrix X and covariance matrix C using SVD properties.
# Check equality by looking at the norm of their difference.
Xreconstr = U*S*V.T
print('Norm[Xreconstr - X] = {:.2f}\n'.format(np.linalg.norm(Xreconstr - X)))

Creconstr = V*((S.T*S)/(n - 1))*V.T
print('Norm[Creconstr - C] = {:.2f}\n'.format(np.linalg.norm(Creconstr - C)))

# Get principal components (n-by-p matrix).
XV = X*V

Norm[Xreconstr - X] = 0.00

Norm[Creconstr - C] = 0.00



**Inner product should be 0 between all eigenvectors and principal components since they are orthogonal. Test this out for the first two eigenvectors and PCs.**

In [6]:
pc1, pc2 = np.ravel(XV[:, 0]), np.ravel(XV[:, 1])
v1, v2 = np.ravel(V[:, 0]), np.ravel(V[:, 1])
print('Inner product between 1st and 2nd eigenvectors: {:.2f}\n'.format(
      np.inner(v1, v2)))
print('Inner product between 1st and 2nd PCs: {:.2f}\n'.format(
      np.inner(pc1, pc2)))

Inner product between 1st and 2nd eigenvectors: 0.00

Inner product between 1st and 2nd PCs: 0.00



In [7]:
# Create tables for proportion of variation and eigenvectors.
df_variation_svd = create_proportion_of_variation_df(eigvals)
df_eigvec_svd = create_eigvec_df(eigvecs)

In [8]:
print('Eigenvectors obtained from SVD')
df_eigvec_svd

Eigenvectors obtained from SVD


Unnamed: 0,V1,V2,V3,V4,V5,V6,V7
1,0.517,-0.237,0.105,-0.286,0.015,-0.013,-0.764
2,0.334,-0.15,-0.305,0.867,0.095,-0.064,-0.091
3,0.358,0.508,0.204,0.007,-0.228,-0.712,0.118
4,0.455,-0.309,0.065,-0.226,0.611,-0.087,0.511
5,0.137,0.327,-0.88,-0.311,0.056,0.027,-0.012
6,0.487,-0.063,0.037,-0.044,-0.64,0.472,0.351
7,0.174,0.676,0.274,0.13,0.39,0.508,-0.103


In [9]:
# Print eigenvectors obtained using sklearn.

pca = PCA().fit(X)
sk_comp = pca.components_

print('Eigenvectors obtained from sklearn PCA.components_')
sk_comp.T

Eigenvectors obtained from sklearn PCA.components_


array([[ 0.517, -0.237, -0.105, -0.286, -0.015,  0.013,  0.764],
       [ 0.334, -0.15 ,  0.305,  0.867, -0.095,  0.064,  0.091],
       [ 0.358,  0.508, -0.204,  0.007,  0.228,  0.712, -0.118],
       [ 0.455, -0.309, -0.065, -0.226, -0.611,  0.087, -0.511],
       [ 0.137,  0.327,  0.88 , -0.311, -0.056, -0.027,  0.012],
       [ 0.487, -0.063, -0.037, -0.044,  0.64 , -0.472, -0.351],
       [ 0.174,  0.676, -0.274,  0.13 , -0.39 , -0.508,  0.103]])

In [10]:
print('Variation Explained using Eigenvalues from SVD')
df_variation_svd

Variation Explained using Eigenvalues from SVD


Unnamed: 0_level_0,eigenvalue,proportion,cumulative
component,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,2.657,0.379,0.379
2,1.376,0.196,0.575
3,0.941,0.134,0.71
4,0.769,0.11,0.819
5,0.564,0.08,0.9
6,0.464,0.066,0.966
7,0.239,0.034,1.0


In [11]:
print('Variation Explained using sklearn')
idx = [i + 1 for i in range(p)]
data = {'component': idx,
       'eigenvalue': pca.explained_variance_,
       'proportion': pca.explained_variance_ratio_}
df_variation_sk = pd.DataFrame(data)
df_variation_sk.set_index('component', drop=True, inplace=True)
df_variation_sk

Variation Explained using sklearn


Unnamed: 0_level_0,eigenvalue,proportion
component,Unnamed: 1_level_1,Unnamed: 2_level_1
1,2.657,0.379
2,1.376,0.196
3,0.941,0.134
4,0.769,0.11
5,0.564,0.08
6,0.464,0.066
7,0.239,0.034


In [12]:
# Get principal components using sklearn.
XReduced_sk = pca.transform(X)
XReduced_sk.shape

(794, 7)

### Comparing sklearn's PCA and SVD

The principal components retrieved using `sklearn` and SVD should be the same.

The following cell shows this is true for the first and second principal component.

In [13]:
x1_reduced_sk = np.ravel(XReduced_sk[:,0])
x2_reduced_sk = np.ravel(XReduced_sk[:,1])

x1_reduced_diff = np.linalg.norm(x1_reduced_sk - pc1)
x2_reduced_diff = np.linalg.norm(x2_reduced_sk - pc2)

print("Norm of difference between sklearn's principal component and " \
     "component created using SVD:")
print('PC1: {:.2f}'.format(x1_reduced_diff))
print('PC2: {:.2f}'.format(x2_reduced_diff))

Norm of difference between sklearn's principal component and component created using SVD:
PC1: 0.00
PC2: 0.00


### Loadings

In [14]:
# Calc loadings and put into DataFrame.
loadings = eigvecs * np.sqrt(np.abs(eigvals))
idx = pd.Series([i + 1 for i in range(p)], name='component')

df_loadings = pd.DataFrame(loadings, columns=df_returns.columns, index=idx)
df_loadings

Unnamed: 0_level_0,rip,btc,eth,ltc,mon,nem,dash
component,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,0.844,0.391,0.347,0.399,0.103,0.331,0.085
2,-0.386,-0.176,0.492,-0.271,0.245,-0.043,0.33
3,0.171,-0.358,0.198,0.057,-0.66,0.025,0.134
4,-0.466,1.017,0.007,-0.198,-0.233,-0.03,0.064
5,0.024,0.111,-0.221,0.536,0.042,-0.436,0.191
6,-0.021,-0.075,-0.691,-0.076,0.02,0.321,0.248
7,-1.245,-0.106,0.114,0.448,-0.009,0.239,-0.051


In [29]:
C.shape

(7, 7)

In [31]:
df_returns.corr()

Unnamed: 0,rip,btc,eth,ltc,mon,nem,dash
rip,1.0,0.305,0.325,0.694,0.066,0.63,0.037
btc,0.305,1.0,0.164,0.322,0.102,0.349,0.031
eth,0.325,0.164,1.0,0.192,0.171,0.361,0.47
ltc,0.694,0.322,0.192,1.0,0.044,0.428,0.018
mon,0.066,0.102,0.171,0.044,1.0,0.114,0.128
nem,0.63,0.349,0.361,0.428,0.114,1.0,0.133
dash,0.037,0.031,0.47,0.018,0.128,0.133,1.0
