<a href="https://colab.research.google.com/github/KRiver28/TIL/blob/master/4_9_PMI_LSA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import TruncatedSVD

In [2]:
docs = ['성진과 창욱은 야구장에 갔다',
        '성진과 태균은 도서관에 갔다',
        '성진과 창욱은 공부를 좋아한다']

In [3]:
# 동시발생 빈도 행렬
count_model = CountVectorizer(ngram_range=(1,1))
x = count_model.fit_transform(docs)

word2idx = count_model.vocabulary_
idx2word = {v:k for k, v in word2idx.items()}

xc = x.T * x  # this is co-occurrence matrix in sparse csr format
xc.setdiag(0) # sometimes you want to fill same word cooccurence to 0
xc = xc.toarray()
print(xc)
idx2word

[[0 0 1 2 1 0 1 1]
 [0 0 0 1 0 1 1 0]
 [1 0 0 1 0 0 0 1]
 [2 1 1 0 1 1 2 1]
 [1 0 0 1 0 0 1 0]
 [0 1 0 1 0 0 1 0]
 [1 1 0 2 1 1 0 0]
 [1 0 1 1 0 0 0 0]]


{0: '갔다',
 1: '공부를',
 2: '도서관에',
 3: '성진과',
 4: '야구장에',
 5: '좋아한다',
 6: '창욱은',
 7: '태균은'}

In [4]:
# 동시발생 결합확률 행렬
xp = xc / xc.sum()
xp

array([[0.        , 0.        , 0.02777778, 0.05555556, 0.02777778,
        0.        , 0.02777778, 0.02777778],
       [0.        , 0.        , 0.        , 0.02777778, 0.        ,
        0.02777778, 0.02777778, 0.        ],
       [0.02777778, 0.        , 0.        , 0.02777778, 0.        ,
        0.        , 0.        , 0.02777778],
       [0.05555556, 0.02777778, 0.02777778, 0.        , 0.02777778,
        0.02777778, 0.05555556, 0.02777778],
       [0.02777778, 0.        , 0.        , 0.02777778, 0.        ,
        0.        , 0.02777778, 0.        ],
       [0.        , 0.02777778, 0.        , 0.02777778, 0.        ,
        0.        , 0.02777778, 0.        ],
       [0.02777778, 0.02777778, 0.        , 0.05555556, 0.02777778,
        0.02777778, 0.        , 0.        ],
       [0.02777778, 0.        , 0.02777778, 0.02777778, 0.        ,
        0.        , 0.        , 0.        ]])

In [5]:
# PMI 행렬
# pmi = log[p(X_ij) / {p(X_i*) * p(X_*j)}]
def calc_pmi(cm):
    sum_col = cm.sum(axis=0)
    sum_row = cm.sum(axis=1)
    sum_tot = sum_col.sum()

    j_prob = np.outer(sum_row, sum_col) / sum_tot
    cm = cm / j_prob

    return np.log(cm + 1e-8)

pmi = calc_pmi(xp)
pmi

array([[-1.84206807e+01, -1.84206807e+01,  6.93147186e-01,
         2.87682080e-01,  6.93147186e-01, -1.84206807e+01,
         9.99999967e-09,  6.93147186e-01],
       [-1.84206807e+01, -1.84206807e+01, -1.84206807e+01,
         2.87682080e-01, -1.84206807e+01,  1.38629436e+00,
         6.93147186e-01, -1.84206807e+01],
       [ 6.93147186e-01, -1.84206807e+01, -1.84206807e+01,
         2.87682080e-01, -1.84206807e+01, -1.84206807e+01,
        -1.84206807e+01,  1.38629436e+00],
       [ 2.87682080e-01,  2.87682080e-01,  2.87682080e-01,
        -1.84206807e+01,  2.87682080e-01,  2.87682080e-01,
         2.87682080e-01,  2.87682080e-01],
       [ 6.93147186e-01, -1.84206807e+01, -1.84206807e+01,
         2.87682080e-01, -1.84206807e+01, -1.84206807e+01,
         6.93147186e-01, -1.84206807e+01],
       [-1.84206807e+01,  1.38629436e+00, -1.84206807e+01,
         2.87682080e-01, -1.84206807e+01, -1.84206807e+01,
         6.93147186e-01, -1.84206807e+01],
       [ 9.99999967e-09,  6.931471

In [6]:
ppmi = pmi.copy()
ppmi[ppmi < 0] = 0.0
ppmi

array([[0.00000000e+00, 0.00000000e+00, 6.93147186e-01, 2.87682080e-01,
        6.93147186e-01, 0.00000000e+00, 9.99999967e-09, 6.93147186e-01],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 2.87682080e-01,
        0.00000000e+00, 1.38629436e+00, 6.93147186e-01, 0.00000000e+00],
       [6.93147186e-01, 0.00000000e+00, 0.00000000e+00, 2.87682080e-01,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.38629436e+00],
       [2.87682080e-01, 2.87682080e-01, 2.87682080e-01, 0.00000000e+00,
        2.87682080e-01, 2.87682080e-01, 2.87682080e-01, 2.87682080e-01],
       [6.93147186e-01, 0.00000000e+00, 0.00000000e+00, 2.87682080e-01,
        0.00000000e+00, 0.00000000e+00, 6.93147186e-01, 0.00000000e+00],
       [0.00000000e+00, 1.38629436e+00, 0.00000000e+00, 2.87682080e-01,
        0.00000000e+00, 0.00000000e+00, 6.93147186e-01, 0.00000000e+00],
       [9.99999967e-09, 6.93147186e-01, 0.00000000e+00, 2.87682080e-01,
        6.93147186e-01, 6.93147186e-01, 0.00000000e+00, 0.

In [7]:
# PMI matrix를 SVD로 분해한다.
# C = U.S.VT
# sklearn을 이용한 SVD 예시

# 특이값 (S)이 큰 4개를 주 성분으로 C의 차원을 축소한다.
svd = TruncatedSVD(n_components=4)
D = svd.fit_transform(pmi)

U = D / svd.singular_values_
S = np.diag(svd.singular_values_)
VT = svd.components_

In [8]:
print("\nU, S, VT :")
print(np.round(U, 2), '\n')
print(np.round(S, 2), '\n')
print(np.round(VT, 2), '\n')

print("C를 4개 차원으로 축소 : truncated (U * S)")
print(np.round(D, 2))

idx2word


U, S, VT :
[[ 0.22 -0.33 -0.63  0.  ]
 [ 0.41  0.44 -0.23  0.6 ]
 [ 0.41 -0.44  0.23 -0.37]
 [-0.01  0.   -0.    0.  ]
 [ 0.46 -0.   -0.    0.  ]
 [ 0.41  0.44 -0.23 -0.6 ]
 [ 0.22  0.33  0.63 -0.  ]
 [ 0.41 -0.44  0.23  0.37]] 

[[83.83  0.    0.    0.  ]
 [ 0.   33.8   0.    0.  ]
 [ 0.    0.   32.41  0.  ]
 [ 0.    0.    0.   19.81]] 

[[-0.22 -0.41 -0.41  0.01 -0.46 -0.41 -0.22 -0.41]
 [-0.33  0.44 -0.44 -0.    0.    0.44  0.33 -0.44]
 [ 0.63  0.23 -0.23  0.    0.    0.23 -0.63 -0.23]
 [ 0.   -0.6   0.37 -0.   -0.    0.6   0.   -0.37]] 

C를 4개 차원으로 축소 : truncated (U * S)
[[ 18.45 -10.99 -20.35   0.  ]
 [ 34.79  15.01  -7.45  11.96]
 [ 34.79 -15.01   7.45  -7.28]
 [ -0.94   0.    -0.     0.  ]
 [ 38.8   -0.    -0.     0.  ]
 [ 34.79  15.01  -7.45 -11.96]
 [ 18.45  10.99  20.35  -0.  ]
 [ 34.79 -15.01   7.45   7.28]]


{0: '갔다',
 1: '공부를',
 2: '도서관에',
 3: '성진과',
 4: '야구장에',
 5: '좋아한다',
 6: '창욱은',
 7: '태균은'}