### 协方差散度矩阵

In [2]:
import numpy as np
A = np.random.randint(0,10,size = (3,3))
# 协方差
cov = np.cov(A,rowvar=True)
cov

array([[ 2.33333333, -6.83333333, -5.        ],
       [-6.83333333, 24.33333333, 17.        ],
       [-5.        , 17.        , 12.        ]])

In [5]:
A

array([[4, 1, 2],
       [0, 8, 9],
       [0, 6, 6]])

In [6]:
np.cov(A[0],A[0])

array([[2.33333333, 2.33333333],
       [2.33333333, 2.33333333]])

In [10]:
((A[0] - A[0].mean())**2).sum()/(3-1)

2.3333333333333335

In [4]:
# 散度矩阵
B = (A - A.mean(axis = 1).reshape(-1,1))
scatter = B.dot(B.T)
display(scatter/(3 - 1))

array([[ 2.33333333, -6.83333333, -5.        ],
       [-6.83333333, 24.33333333, 17.        ],
       [-5.        , 17.        , 12.        ]])

### 特征值和特征向量

In [13]:
A = np.random.randint(0,10,size = (3,3))
A

array([[8, 2, 8],
       [3, 2, 5],
       [9, 8, 3]])

In [14]:
w,v = np.linalg.eig(A)
display(w,v)

array([16.75565381,  1.67151088, -5.4271647 ])

array([[-0.67330165, -0.6512756 , -0.43197073],
       [-0.35639922,  0.67516   , -0.37706241],
       [-0.64779972,  0.34640881,  0.81928336]])

<font size = 5>$Av = \lambda v$</font>

In [15]:
A.dot(v[:,0])

array([-11.28160943,  -5.971702  , -10.85430783])

In [16]:
w[0] * v[:,0]

array([-11.28160943,  -5.971702  , -10.85430783])

In [20]:
v.dot(np.diag(w)).dot(np.linalg.inv(v))

array([[8., 2., 8.],
       [3., 2., 5.],
       [9., 8., 3.]])

In [21]:
A

array([[8, 2, 8],
       [3, 2, 5],
       [9, 8, 3]])

### PCA降维

In [22]:
from sklearn import datasets
from sklearn.decomposition import PCA

In [23]:
X,y = datasets.load_iris(return_X_y=True)
display(X[:5])

array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2]])

In [44]:
pca = PCA(n_components=0.95,whiten=True) # 筛选特征重要性99%特征
X_pca = pca.fit_transform(X)
display(X_pca[:5])

array([[-1.30533786,  0.64836932],
       [-1.31993521, -0.35930856],
       [-1.40496732, -0.29424412],
       [-1.33510889, -0.64613986],
       [-1.32702321,  0.6633044 ]])

### 自己写代码实现PCA降维【特征分解】

In [50]:
# 1、去中心化
B = X - X.mean(axis = 0)

# 2、协方差
V = np.cov(B,rowvar=False,bias = True) # 以列来进行计算的，列是特征

# 3、特征值和特征向量
w,v = np.linalg.eig(V) # 特征值，从大到小排列
# display(w,v)

# 符号翻转，绝对值最大的，如果是负数，才翻转
max_abs_cols = np.argmax(np.abs(v),axis = 0)
signs = np.sign(v[max_abs_cols,[0,1,2,3]]) # 检索
v *= signs # 根据条件进行翻转

# 4、特征的筛选
cond = (w/w.sum()).cumsum() >= 0.95
index = cond.argmax()
v_ = v[:,:index + 1] # 特征向量筛选

# 5、举证运算
pca_result = B.dot(v_)
pca_result = (pca_result - pca_result.mean(axis =0))/pca_result.std(axis=0,ddof = 1)
pca_result[:5]

array([[-1.30533786,  0.64836932],
       [-1.31993521, -0.35930856],
       [-1.40496732, -0.29424412],
       [-1.33510889, -0.64613986],
       [-1.32702321,  0.6633044 ]])

In [48]:
max_abs_cols = np.argmax(np.abs(v),axis = 0)
v[max_abs_cols,[0,1,2,3]]

array([ 0.85667061, -0.73016143,  0.59791083,  0.75365743])

In [49]:
max_abs_cols = np.argmax(np.abs(v),axis = 0)
signs = np.sign(v[max_abs_cols,[0,1,2,3]]) # 检索
signs

array([ 1., -1.,  1.,  1.])

In [46]:
v

array([[ 0.36138659, -0.65658877, -0.58202985,  0.31548719],
       [-0.08452251, -0.73016143,  0.59791083, -0.3197231 ],
       [ 0.85667061,  0.17337266,  0.07623608, -0.47983899],
       [ 0.3582892 ,  0.07548102,  0.54583143,  0.75365743]])

In [51]:
w/w.sum() # 计算每个特征值的权重，百分比

array([0.92461872, 0.05306648, 0.01710261, 0.00521218])

In [52]:
(w/w.sum()).cumsum() # 累加和

array([0.92461872, 0.97768521, 0.99478782, 1.        ])

In [54]:
(w/w.sum()).cumsum() >= 0.95

array([False,  True,  True,  True])

In [55]:
cond.argmax() # 当第一个为True时，索引就会返回~

1

In [35]:
cond = (w/w.sum()).cumsum() >= 0.99 # 第一个为True的位置，刚好满足条件
display(cond)
index = cond.argmax()

array([False, False,  True,  True])

2

In [33]:
(w/w.sum()).cumsum()

array([0.92461872, 0.97768521, 0.99478782, 1.        ])

### SVD奇异值分解

In [56]:
A = np.random.randint(0,10,size = (5,3))
A

array([[9, 6, 8],
       [3, 4, 6],
       [7, 0, 9],
       [9, 5, 1],
       [5, 7, 9]])

In [None]:
# 满秩矩阵，可以求逆矩阵

In [57]:
# np.linalg.eig(A)

LinAlgError: Last 2 dimensions of the array must be square

In [58]:
U,S,Vt = np.linalg.svd(A,full_matrices=False)
display(U,S,Vt)

array([[-0.56910866,  0.13144845,  0.00599043],
       [-0.31746267, -0.21803493, -0.30393608],
       [-0.43488654, -0.42249048,  0.75560588],
       [-0.35852428,  0.83620844,  0.14020996],
       [-0.50760893, -0.23966501, -0.56301679]])

array([23.58326429,  7.07114342,  5.27527971])

array([[-0.63109732, -0.42531799, -0.64870701],
       [ 0.55140286,  0.34222726, -0.76081232],
       [ 0.54559239, -0.83784552,  0.01854285]])

In [60]:
# 奇异值分解，是约等于
U.dot(np.diag(S)).dot(Vt)

array([[9., 6., 8.],
       [3., 4., 6.],
       [7., 0., 9.],
       [9., 5., 1.],
       [5., 7., 9.]])

In [61]:
A

array([[9, 6, 8],
       [3, 4, 6],
       [7, 0, 9],
       [9, 5, 1],
       [5, 7, 9]])

### SVD奇异值分解-PCA降维

In [76]:
from sklearn import datasets
from sklearn.decomposition import PCA
X,y = datasets.load_iris(return_X_y=True)
pca = PCA(n_components=0.90,whiten=True) # 筛选特征重要性99%特征
X_pca = pca.fit_transform(X)
display(X_pca[:5])

array([[-1.30533786],
       [-1.31993521],
       [-1.40496732],
       [-1.33510889],
       [-1.32702321]])

#### SVD分解

In [75]:
n_components = 2
# 1、去中心化
B = X - X.mean(axis = 0)

# 2、奇异值分解
U,S,Vt = np.linalg.svd(B,full_matrices=False)

# 3、符号翻转
# 符号翻转，绝对值最大的，如果是负数，才翻转
max_abs_cols = np.argmax(np.abs(U),axis = 0)
signs = np.sign(U[max_abs_cols,[0,1,2,3]]) # 检索
U *= signs # 根据条件进行翻转

# 4、降维特征筛选
# U = U[:,:n_components]
cond = (S/S.sum()).cumsum() > 0.90
index = cond.argmax()
U = U[:,:index + 1]

# 归一化
U = (U - U.mean(axis = 0))/(U.std(axis = 0,ddof = 1))
U[:5]

array([[-1.30533786,  0.64836932, -0.09981716],
       [-1.31993521, -0.35930856, -0.75257299],
       [-1.40496732, -0.29424412,  0.0640073 ],
       [-1.33510889, -0.64613986,  0.11284924],
       [-1.32702321,  0.6633044 ,  0.32210314]])

In [71]:
X_pca[:5]

array([[-1.30533786,  0.64836932],
       [-1.31993521, -0.35930856],
       [-1.40496732, -0.29424412],
       [-1.33510889, -0.64613986],
       [-1.32702321,  0.6633044 ]])

In [74]:
S # 奇异值，如果根据比例进行筛选，那么得到结论和PCA，筛选的特征，就会不完全一样【接受】

array([25.09996044,  6.01314738,  3.41368064,  1.88452351])

In [77]:
A

array([[9, 6, 8],
       [3, 4, 6],
       [7, 0, 9],
       [9, 5, 1],
       [5, 7, 9]])

In [78]:
A.shape

(5, 3)