### 통계적 가중치 기반 연관어 분석
- 통계적으로 가중치를 구한 후 두 단어 간의 유사도를 단어간의 연관도로 적용하는 방법
    1. 단어마다 가중치를 할당해야 함(출현빈도, tf-idf 등으로 계산)
    2. 단어간의 유사도 계산(cosine similarity 등의 방법)

In [1]:
#긍정리뷰 100개 로드
import glob
pos_review = (glob.glob('c:/data/imdb/train/pos/*.txt'))[0:100]
lines_pos = []
for i in pos_review:
    try:
        f = open(i, 'r')
        temp = f.readlines()[0]
        lines_pos.append(temp)
        f.close()
    except:
        continue
len(lines_pos)

100

In [2]:
#TF-IDF가중치 할당
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
tokenizer = RegexpTokenizer('[\w]+') # 알파벳, 숫자, -
stop_words = stopwords.words('english')
vec = TfidfVectorizer(stop_words=stop_words)
vector_lines_pos = vec.fit_transform(lines_pos)
A = vector_lines_pos.toarray()
print(A.shape)
print(A)

(100, 4001)
[[0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.06538462 0.         ... 0.         0.         0.        ]
 [0.         0.23078109 0.         ... 0.         0.         0.        ]]


In [3]:
#단어간의 유사도를 구하기 위해 행렬 전치
A = A.T
print(A.shape)
print(A)

(4001, 100)
[[0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.06538462 0.23078109]
 [0.         0.         0.         ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]]


In [5]:
#코사인 유사도 계산
from scipy import sparse
from sklearn.metrics.pairwise import cosine_similarity
# 희소행렬로 변환
A_sparse = sparse.csr_matrix(A)
similarities_sparse = cosine_similarity(A_sparse, dense_output=False)
# todok(): 행렬을 딕셔너리 형태로 변환
list(similarities_sparse.todok().items())[35000:35010]

[((1469, 108), 0.37803585968894865),
 ((1470, 108), 0.2189685434746738),
 ((1476, 108), 0.06407477897013734),
 ((1477, 108), 0.185189577514238),
 ((1480, 108), 0.20111036876169444),
 ((1489, 108), 0.06995711757772019),
 ((1496, 108), 0.10714874067068783),
 ((1503, 108), 0.30487333830091773),
 ((1504, 108), 0.30487333830091773),
 ((1512, 108), 0.30487333830091773)]

In [6]:
vec.get_feature_names_out([1469])

array(['06', '10', '100', ..., 'zhu', 'zone', 'zooms'], dtype=object)

In [9]:
#결과값을 데이터프레임으로 출력
import pandas as pd
import numpy as np
df = pd.DataFrame(list(similarities_sparse.todok().items()), columns=['words', 'weight'])
df2 = df.sort_values('weight', ascending=False)
df2 = df2.reset_index(drop=True)
# 자신끼리 연결된 항목을 제외
df3 = df2[np.round(df2['weight']) < 1]
df3 = df3.reset_index(drop=True)
df3.head(10)

Unnamed: 0,words,weight
0,"(3971, 3372)",0.499961
1,"(3372, 3971)",0.499961
2,"(1192, 2554)",0.499958
3,"(2554, 1192)",0.499958
4,"(2468, 1321)",0.499957
5,"(2468, 710)",0.499957
6,"(710, 2468)",0.499957
7,"(1321, 2468)",0.499957
8,"(2146, 889)",0.499909
9,"(889, 2146)",0.499909


In [11]:
for i,row in enumerate(df3.iterrows()): # iterrows: 데이터프레임 개별 행 처리
    a = vec.get_feature_names_out()[row[1][0][0]]
    b = vec.get_feature_names_out()[row[1][0][1]]
    print(f'{a}, {b} => {row[1][1]:.2f}')    
    if i > 10:
        break

writers, stop => 0.50
stop, writers => 0.50
essence, past => 0.50
past, essence => 0.50
older, farewell => 0.50
older, concubine => 0.50
concubine, older => 0.50
farewell, older => 0.50
made, deeply => 0.50
deeply, made => 0.50
classes, watch => 0.50
watch, classes => 0.50
