# 潜在语义分析
## 单词向量空间与话题向量空间

In [1]:
import numpy as np

X = np.array([
    [0,0,1,1,0,0,0,0,0],
    [0,0,0,0,0,0,1,0,1],
    [0,1,0,0,0,0,0,1,0],
    [0,0,0,0,0,0,1,0,1],
    [1,0,0,0,0,1,0,0,0],
    [1,1,1,1,1,1,1,1,1],
    [1,0,1,0,0,0,0,0,0],
    [0,0,0,0,0,0,1,0,1],
    [0,0,0,0,0,2,0,0,1],
    [1,0,1,0,0,0,0,1,0],
    [0,0,0,1,1,0,0,0,0]
]) # 权值表示单词出现在文中的次数
   # 一行表示，一个单词在不同文本出现的次数
   # 单词-文本矩阵
X.shape

(11, 9)

## 矩阵奇异值分解

In [2]:
U,S,V = np.linalg.svd(X,full_matrices=False)
V = V.T
S = np.diag(S)
U.shape,S.shape,V.shape

((11, 9), (9, 9), (9, 9))

In [3]:
U.dot(S).dot(V.T).round(2)

array([[ 0.,  0.,  1.,  1.,  0., -0.,  0.,  0., -0.],
       [-0., -0., -0., -0., -0., -0.,  1., -0.,  1.],
       [-0.,  1., -0.,  0.,  0., -0., -0.,  1., -0.],
       [-0., -0., -0., -0., -0.,  0.,  1., -0.,  1.],
       [ 1., -0.,  0., -0., -0.,  1., -0., -0., -0.],
       [ 1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.],
       [ 1.,  0.,  1.,  0.,  0., -0., -0.,  0., -0.],
       [-0., -0., -0., -0., -0.,  0.,  1., -0.,  1.],
       [-0., -0., -0., -0., -0.,  2., -0., -0.,  1.],
       [ 1.,  0.,  1.,  0.,  0., -0., -0.,  1., -0.],
       [ 0.,  0.,  0.,  1.,  1., -0.,  0.,  0.,  0.]])

In [4]:
k = 3
U_k = U[:,:k]
V_k = V[:k]
S_k = S[:k,:k]

In [5]:
U_k.shape,S_k.shape,V_k.shape

((11, 3), (3, 3), (3, 9))

In [6]:
U_k.round(2),S_k.round(2),V_k.round(2) # U_k话题空间，（S_k*V_k.T）为文本在话题空间的表示

(array([[-0.16,  0.26, -0.13],
        [-0.2 , -0.36, -0.3 ],
        [-0.14,  0.16, -0.09],
        [-0.2 , -0.36, -0.3 ],
        [-0.2 ,  0.04,  0.46],
        [-0.76,  0.13, -0.11],
        [-0.18,  0.32,  0.04],
        [-0.2 , -0.36, -0.3 ],
        [-0.33, -0.43,  0.67],
        [-0.26,  0.44, -0.  ],
        [-0.13,  0.12, -0.14]]),
 array([[3.88, 0.  , 0.  ],
        [0.  , 2.54, 0.  ],
        [0.  , 0.  , 2.13]]),
 array([[-0.36,  0.37,  0.19,  0.43, -0.21,  0.59, -0.14, -0.15,  0.3 ],
        [-0.23,  0.11, -0.09, -0.09,  0.6 , -0.06, -0.68,  0.27,  0.13],
        [-0.35,  0.46, -0.09,  0.18, -0.4 , -0.53, -0.02,  0.37, -0.24]]))

In [7]:
U_k.dot(S_k.dot(V_k)).round(2)

array([[ 0.17, -0.27, -0.15, -0.37,  0.63, -0.26, -0.35,  0.17, -0.03],
       [ 0.72, -0.69,  0.  , -0.36, -0.14, -0.06,  0.75, -0.37, -0.2 ],
       [ 0.16, -0.23, -0.12, -0.3 ,  0.43, -0.23, -0.2 ,  0.12, -0.06],
       [ 0.72, -0.69,  0.  , -0.36, -0.14, -0.06,  0.75, -0.37, -0.2 ],
       [-0.09,  0.17, -0.24, -0.16, -0.17, -0.98,  0.01,  0.51, -0.45],
       [ 1.06, -1.15, -0.55, -1.32,  0.9 , -1.62,  0.18,  0.44, -0.77],
       [ 0.03, -0.12, -0.22, -0.36,  0.61, -0.52, -0.46,  0.36, -0.13],
       [ 0.72, -0.69,  0.  , -0.36, -0.14, -0.06,  0.75, -0.37, -0.2 ],
       [ 0.2 ,  0.06, -0.26, -0.18, -0.96, -1.43,  0.87,  0.43, -0.86],
       [ 0.11, -0.24, -0.29, -0.53,  0.88, -0.66, -0.61,  0.45, -0.15],
       [ 0.21, -0.28, -0.09, -0.29,  0.4 , -0.15, -0.13,  0.05, -0.04]])

## 非负矩阵分解算法

In [10]:
np.set_printoptions(suppress=True, threshold=np.inf,precision=3)

def cost(X,W,H):
    return np.linalg.norm(X-W.dot(H),2)**0.5

X = np.array([
[0, 0, 0, 1, 0],
[0, 0, 0, 0, 1],
[0, 0, 0, 0, 1],
[1, 0, 1, 0, 0],
[1, 0, 0, 0, 0],
[0, 1, 0, 0, 0],
[1, 0, 1, 1, 0],
[0, 1, 1, 0, 0],
[0, 0, 1, 1, 1],
[0, 1, 1, 0, 0]
])

m,n = X.shape
k = 2
W = np.ones((m,k))
H = np.ones((k,n))

epsilon = 1e-5

# current_cost = cost(X,W,H)
# old_cost = 0


U,S,V = np.linalg.svd(X)
W[:,0] = U[:,0]
for j in range(1,k):
    C = U[:,j].reshape(-1,1).dot(V[:,j].T.reshape(1,-1))
    C[C<0] = 0
    u,s,v = np.linalg.svd(C)
    W[:,j] = u[:,0]
W[W<0] = 0
W = W/np.sum(W,axis=0)
print(W)

Q,R = np.linalg.qr(W)
H = np.linalg.inv(R).dot(Q.T).dot(X)
H[H<0] = 0
# H = H/np.sum(H,axis=0)
print(H)


for _ in range(10000):
#     Ha = W.T.dot(X)
#     Hb = W.T.dot(W).dot(H)+epsilon
#     for l in range(k):
#         for j in range(n):
#             H[l,j] = H[l,j]*Ha[l,j] / Hb[l,j]
            
    H = H*(W.T.dot(X)/(W.T.dot(W).dot(H)+epsilon))
#     Wa = X.dot(H.T)
#     Wb = W.dot(H).dot(H.T)+epsilon
#     for i in range(m):
#         for l in range(k):
#             W[i,l] = W[i,l]*Wa[i,l] / Wb[i,l]
    W = W*(X.dot(H.T)/(W.dot(H).dot(H.T)+epsilon))
#     old_cost = current_cost
#     current_cost = cost(X,W,H)
    # W的列向量进行归一化
    W = W/np.sum(W,axis=0)
    H = H/np.sum(H,axis=0)
W,H

[[0.053 0.1  ]
 [0.029 0.9  ]
 [0.029 0.   ]
 [0.147 0.   ]
 [0.049 0.   ]
 [0.038 0.   ]
 [0.2   0.   ]
 [0.136 0.   ]
 [0.181 0.   ]
 [0.136 0.   ]]
[[2.849 2.241 5.77  3.102 1.477]
 [0.    0.    0.    0.001 1.04 ]]


(array([[0.055, 0.   ],
        [0.   , 1.   ],
        [0.026, 0.   ],
        [0.153, 0.   ],
        [0.051, 0.   ],
        [0.04 , 0.   ],
        [0.208, 0.   ],
        [0.142, 0.   ],
        [0.183, 0.   ],
        [0.142, 0.   ]]),
 array([[1.   , 1.   , 1.   , 1.   , 0.584],
        [0.   , 0.   , 0.   , 0.   , 0.416]]))

In [11]:
cost(X,W,H)

1.4571822329177835