# Principal Component Analysis (PCA)

Principal component analysis (PCA) is a method that used to make change and bring out strong patterns in a dataset. On the other hand, it is used for to make data easy to explore and visualize. 

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings("ignore")

# fix_yahoo_finance is used to fetch data 
import fix_yahoo_finance as yf
yf.pdr_override()

In [2]:
# input
symbol = 'AMD'
start = '2014-01-01'
end = '2018-08-27'

# Read data 
dataset = yf.download(symbol,start,end)

# View Columns
dataset.head()

[*********************100%***********************]  1 of 1 downloaded


Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2014-01-02,3.85,3.98,3.84,3.95,3.95,20548400
2014-01-03,3.98,4.0,3.88,4.0,4.0,22887200
2014-01-06,4.01,4.18,3.99,4.13,4.13,42398300
2014-01-07,4.19,4.25,4.11,4.18,4.18,42932100
2014-01-08,4.23,4.26,4.14,4.18,4.18,30678700


In [3]:
dataset['Increase_Decrease'] = np.where(dataset['Volume'].shift(-1) > dataset['Volume'],1,0)
dataset['Buy_Sell_on_Open'] = np.where(dataset['Open'].shift(-1) > dataset['Open'],1,0)
dataset['Buy_Sell'] = np.where(dataset['Adj Close'].shift(-1) > dataset['Adj Close'],1,0)
dataset['Returns'] = dataset['Adj Close'].pct_change()
dataset = dataset.dropna()

In [4]:
from numpy import array
from numpy import mean
from numpy import cov
from numpy.linalg import eig

In [5]:
# Define A
A = array([dataset['Adj Close'], dataset['High'],dataset['Low']])
# calculate the mean of each column
M = mean(A.T, axis=1)
print(M)
# center columns by subtracting column means
C = A - M
print(C)
# calculate covariance matrix of centered matrix
V = cov(C.T)
print(V)
# eigendecomposition of covariance matrix
values, vectors = eig(V)
print(vectors)
print(values)
# project data
P = vectors.T.dot(C.T)
print(P.T)

[ 3.96        4.1         4.18       ... 21.91666667 23.55
 25.72999933]
[[ 0.04        0.03        0.         ...  0.37333433  0.43
  -0.46999933]
 [ 0.04        0.08        0.07       ...  0.40333333  0.45
   1.56999967]
 [-0.08       -0.11       -0.07       ... -0.77666767 -0.88
  -1.10000033]]
[[0.0048     0.0066     0.0042     ... 0.04660006 0.0528     0.06600002]
 [0.0066     0.0097     0.00665    ... 0.06445007 0.07285    0.11625001]
 [0.0042     0.00665    0.0049     ... 0.04130003 0.04655    0.09345   ]
 ...
 [0.04660006 0.06445007 0.04130003 ... 0.45263448 0.51275065 0.6560505 ]
 [0.0528     0.07285    0.04655    ... 0.51275065 0.5809     0.73620022]
 [0.06600002 0.11625001 0.09345    ... 0.6560505  0.73620022 1.94789953]]
[[ 8.82471564e-03+0.00000000e+00j -1.36601491e-02+0.00000000e+00j
   1.01029061e-02+5.65166152e-03j ... -8.18394615e-16-2.67467478e-16j
  -8.18394615e-16+2.67467478e-16j  0.00000000e+00+0.00000000e+00j]
 [ 1.39491779e-02+0.00000000e+00j -1.01355763e-02+0.00

In [6]:
from sklearn.decomposition import PCA

# create the PCA instance
pca = PCA(2)
# fit on data
pca.fit(A)
# access values and vectors
print(pca.components_)
print(pca.explained_variance_)
# transform data
B = pca.transform(A)
print(B)

[[-8.82471564e-03 -1.39491779e-02 -1.02628976e-02 ... -8.67624714e-02
  -9.77979496e-02 -1.95399750e-01]
 [ 1.36601491e-02  1.01355763e-02 -1.53349784e-04 ...  1.27429348e-01
   1.46802789e-01 -1.64975573e-01]]
[46.52034639  6.30869174]
[[-0.04333568  2.90023005]
 [-6.79881146 -1.46393554]
 [ 6.84214714 -1.4362945 ]]
