### Standard Dimension Reduction
PCA applied to an n × m data matrix (n samples, m features) where m < n

In [135]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler, StandardScaler, MaxAbsScaler
import pandas as pd
import numpy as np
from bokeh.io import output_notebook
from bokeh.plotting import figure, show

# Configure Bokeh so that show() renders figures inline in the notebook.
output_notebook()

In [7]:
# Four 2-D sample points; both coordinates increase from row to row.
X1 = np.array([
    [1, 2],
    [2, 2.5],
    [3, 3],
    [4, 5],
])

# Scatter column 0 (x) against column 1 (y).
# NOTE(review): plot_width/plot_height are the legacy Bokeh argument names;
# newer Bokeh releases are believed to expect width/height -- confirm before upgrading.
p = figure(plot_width=300, plot_height=200)
p.circle(x=X1[:, 0], y=X1[:, 1])
show(p)

In [35]:
# Fit a PCA with default settings on X1, then project X1 onto the fitted
# principal axes; the projection is the cell's displayed value.
pca1 = PCA().fit(X1)
pca1.transform(X1)

array([[-1.85344857, -0.28346678],
       [-0.79632853,  0.08053491],
       [ 0.2607915 ,  0.4445366 ],
       [ 2.38898559, -0.24160473]])

In [34]:
# Share of the total variance attributed to each principal component of X1.
print(pca1.explained_variance_ratio_)

[ 0.96634847  0.03365153]


###  Add an extra dimension with low variance

In [118]:
# Attach a third column whose values vary only slightly (0.8 to 0.95)
# to the right of X1's two columns.
X2 = np.column_stack((X1, [0.9, 0.8, 0.95, 0.85]))
X2

array([[ 1.  ,  2.  ,  0.9 ],
       [ 2.  ,  2.5 ,  0.8 ],
       [ 3.  ,  3.  ,  0.95],
       [ 4.  ,  5.  ,  0.85]])

In [119]:
# Refit PCA on the 3-column data, then print the projected points followed by
# the per-component share of variance.
pca2 = PCA().fit(X2)
print(pca2.transform(X2))
print(pca2.explained_variance_ratio_)

[[ -1.85350764e+00  -2.81030393e-01  -4.22058598e-02]
 [ -7.96121930e-01   7.42043476e-02   8.32681687e-02]
 [  2.60582939e-01   4.49118931e-01  -4.04810767e-02]
 [  2.38904663e+00  -2.42292886e-01  -5.81232143e-04]]
[ 0.96517137  0.03381353  0.0010151 ]


###  Break the PCA result by putting columns on different scales
so that the low-variance column is scaled up (×100 here) while the higher-variance columns are left unchanged, i.e. scaled down in relative terms

In [120]:
# Build a low-variance third column and multiply it by 100 so that it sits on
# a much larger numeric scale than X1's two columns, then stack it onto X1.
X3 = np.column_stack((X1, np.array([0.99, 0.98, 0.95, 0.97]) * 100))
X3

array([[  1. ,   2. ,  99. ],
       [  2. ,   2.5,  98. ],
       [  3. ,   3. ,  95. ],
       [  4. ,   5. ,  97. ]])

In [121]:
# Columns 0 and 1 of X3 are X1's columns unchanged (only column 2 was rescaled),
# so this scatter shows the same points as the first figure.
p = figure(plot_width=300, plot_height=200)
p.circle(x=X3[:, 0], y=X3[:, 1])
show(p)

In [122]:
# Fit PCA on the data whose third column was scaled by 100, then print the
# projection and the per-component share of variance.
pca3 = PCA().fit(X3)
print(pca3.transform(X3))
print("explained variance ratio")
print(pca3.explained_variance_ratio_)

[[ 2.5617238  -0.07412383 -0.1010041 ]
 [ 1.08609743  0.0170419   0.15240391]
 [-1.7319079   1.52571404 -0.02857768]
 [-1.91591333 -1.46863211 -0.02282212]]
explained variance ratio
[ 0.76104371  0.23712044  0.00183584]


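###  Repair the PCA result by normalizing
MinMaxScaler and StandardScaler don't work here because they stretch the last, low-variance column to the same range (or unit variance) as the other columns, which inflates its variance.
MaxAbsScaler works because it only divides each column by its maximum absolute value, so the maximum becomes 1 and the column's small relative spread is preserved.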

In [138]:
# Alternatives that can be swapped in here: MinMaxScaler() or StandardScaler().
# MaxAbsScaler divides each column by its largest absolute value, so every
# column's maximum becomes 1.
scaler = MaxAbsScaler().fit(X3)
X4 = scaler.transform(X3)
X4

array([[ 0.25      ,  0.4       ,  1.        ],
       [ 0.5       ,  0.5       ,  0.98989899],
       [ 0.75      ,  0.6       ,  0.95959596],
       [ 1.        ,  1.        ,  0.97979798]])

In [139]:
# Fit PCA on the MaxAbs-scaled data, then print the projection and the
# per-component share of variance.
pca4 = PCA().fit(X4)
print(pca4.transform(X4))
print("explained variance ratio")
print(pca4.explained_variance_ratio_)

[[ -4.33581202e-01  -5.96535556e-02  -3.35602398e-03]
 [ -1.75860985e-01   1.83361790e-02   6.64006433e-03]
 [  8.23486738e-02   9.98733634e-02  -3.24594145e-03]
 [  5.27093513e-01  -5.85559868e-02  -3.80988977e-05]]
explained variance ratio
[  9.66664645e-01   3.32088591e-02   1.26496168e-04]
