### 잠재의미분석(LSA)
- 특이값 분해(Singular Value Decomposition, SVD)를 이용한 차원 축소 방법 중 하나

In [1]:
# News posts covering 20 different topics
import pandas as pd
from sklearn.datasets import fetch_20newsgroups

# Shuffle with a fixed seed and strip the metadata blocks so only the body
# text of each post remains.  subset='train' is the library default, spelled
# out here for clarity.
dataset = fetch_20newsgroups(
    subset='train',
    shuffle=True,
    random_state=1,
    remove=('headers', 'footers', 'quotes'),
)
documents = dataset.data

# Number of posts loaded
len(documents)

11314

In [2]:
# First news post
documents[0]

"Well i'm not sure about the story nad it did seem biased. What\nI disagree with is your statement that the U.S. Media is out to\nruin Israels reputation. That is rediculous. The U.S. media is\nthe most pro-israeli media in the world. Having lived in Europe\nI realize that incidences such as the one described in the\nletter have occured. The U.S. media as a whole seem to try to\nignore them. The U.S. is subsidizing Israels existance and the\nEuropeans are not (at least not to the same degree). So I think\nthat might be a reason they report more clearly on the\natrocities.\n\tWhat is a shame is that in Austria, daily reports of\nthe inhuman acts commited by Israeli soldiers and the blessing\nreceived from the Government makes some of the Holocaust guilt\ngo away. After all, look how the Jews are treating other races\nwhen they got power. It is unfortunate.\n"

In [3]:
# News categories
dataset.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [4]:
# Preprocessing
news_df = pd.DataFrame({'document': documents})

# Replace every non-alphabetic character with a space.
# regex=True must be explicit: without it pandas emits a FutureWarning on
# older versions and, from pandas 2.0 on, treats the pattern as a literal
# string, which would silently leave the text unchanged.
news_df['clean_doc'] = news_df['document'].str.replace(
    r'[^a-zA-Z]', ' ', regex=True)

# Drop words that are three characters long or shorter
news_df['clean_doc'] = news_df['clean_doc'].apply(
    lambda x: ' '.join([w for w in x.split() if len(w) > 3]))

# Convert to lowercase
news_df['clean_doc'] = news_df['clean_doc'].str.lower()

news_df['clean_doc'][0]

  news_df['clean_doc'] = news_df['document'].str.replace('[^a-zA-Z]', ' ')


'well sure about story seem biased what disagree with your statement that media ruin israels reputation that rediculous media most israeli media world having lived europe realize that incidences such described letter have occured media whole seem ignore them subsidizing israels existance europeans least same degree think that might reason they report more clearly atrocities what shame that austria daily reports inhuman acts commited israeli soldiers blessing received from government makes some holocaust guilt away after look jews treating other races when they power unfortunate'

In [5]:
# Stop-word handling
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
# Build the lookup set once: membership tests against the list would scan
# every stop word for every token.  `stop_words` itself stays a list.
stop_word_set = frozenset(stop_words)

# Tokenize on whitespace
tokenized_doc = news_df['clean_doc'].str.split()

# Remove stop words
tokenized_doc = tokenized_doc.apply(
    lambda x: [item for item in x if item not in stop_word_set])

tokenized_doc[0]

['well',
 'sure',
 'story',
 'seem',
 'biased',
 'disagree',
 'statement',
 'media',
 'ruin',
 'israels',
 'reputation',
 'rediculous',
 'media',
 'israeli',
 'media',
 'world',
 'lived',
 'europe',
 'realize',
 'incidences',
 'described',
 'letter',
 'occured',
 'media',
 'whole',
 'seem',
 'ignore',
 'subsidizing',
 'israels',
 'existance',
 'europeans',
 'least',
 'degree',
 'think',
 'might',
 'reason',
 'report',
 'clearly',
 'atrocities',
 'shame',
 'austria',
 'daily',
 'reports',
 'inhuman',
 'acts',
 'commited',
 'israeli',
 'soldiers',
 'blessing',
 'received',
 'government',
 'makes',
 'holocaust',
 'guilt',
 'away',
 'look',
 'jews',
 'treating',
 'races',
 'power',
 'unfortunate']

In [6]:
# Detokenize: join each document's tokens back into one space-separated string
# (iterating the Series directly avoids an index lookup per row).
detokenized_doc = [' '.join(tokens) for tokens in tokenized_doc]
news_df['clean_doc'] = detokenized_doc
news_df['clean_doc'][0]

'well sure story seem biased disagree statement media ruin israels reputation rediculous media israeli media world lived europe realize incidences described letter occured media whole seem ignore subsidizing israels existance europeans least degree think might reason report clearly atrocities shame austria daily reports inhuman acts commited israeli soldiers blessing received government makes holocaust guilt away look jews treating races power unfortunate'

In [7]:
# TF-IDF matrix
from sklearn.feature_extraction.text import TfidfVectorizer

# Restrict the vocabulary to the top 1000 terms
vectorizer = TfidfVectorizer(
    max_features=1000,
    stop_words='english',
)
X = vectorizer.fit_transform(news_df['clean_doc'])

# Size of the TF-IDF matrix
X.shape


(11314, 1000)

In [8]:
# Singular value decomposition (SVD)
from sklearn.decomposition import TruncatedSVD

# Project the TF-IDF term columns onto 20 components (topics); the number of
# documents (rows) is not reduced.
# random_state pins the randomized solver so the extracted topics are
# reproducible from run to run.
svd_model = TruncatedSVD(n_components=20, random_state=1)
svd_model.fit(X)
len(svd_model.components_)

20

In [9]:
# Number of topics x number of terms
import numpy as np
svd_model.components_.shape

(20, 1000)

In [10]:
# Display the fitted component matrix
svd_model.components_

array([[ 0.01469448,  0.05019038,  0.02132608, ...,  0.0786596 ,
         0.01432354,  0.01788787],
       [-0.00534551,  0.0165534 , -0.01643801, ..., -0.06356598,
        -0.01063255, -0.01906471],
       [ 0.0018262 , -0.00369262, -0.01801967, ...,  0.05878623,
         0.0262691 ,  0.022265  ],
       ...,
       [-0.00573767, -0.00045085,  0.0083785 , ...,  0.00471099,
         0.0094785 , -0.00282111],
       [ 0.00500334, -0.00901498, -0.004393  , ...,  0.0793181 ,
         0.00287461,  0.00493287],
       [-0.00132898, -0.01115991, -0.00464496, ..., -0.00388068,
         0.01768014,  0.00057727]])

In [11]:
# Print the list of topics extracted from the 20-newsgroups data
terms = vectorizer.get_feature_names_out() # Vocabulary (the 1000 terms)

def get_topics(components, feature_names, n=5):
    """Print the ``n`` highest-weighted terms of every topic.

    Args:
        components: 2-D NumPy array of term weights, one row per topic.
        feature_names: Sequence mapping a column index to its term.
        n: Number of top terms to show per topic.

    Prints one line per topic in the form
    ``Topic <k>: [(term, weight), ...]`` with weights rounded to five
    decimals, ordered from the largest weight down.  Returns ``None``.
    """
    for idx, topic in enumerate(components):
        # argsort is ascending, so the reversed tail slice yields the indices
        # of the n largest weights in descending order.
        top_indices = topic.argsort()[:-n - 1:-1]
        # float() turns the NumPy scalar into a plain Python float; NumPy >= 2
        # would otherwise print each weight as ``np.float64(...)``.
        top_terms = [(feature_names[i], float(topic[i].round(5)))
                     for i in top_indices]
        print(f'Topic {idx + 1}:', top_terms)

# Print the highest-weighted terms of every SVD component
get_topics(svd_model.components_, terms)

Topic 1: [('like', 0.21386), ('know', 0.20046), ('people', 0.19293), ('think', 0.17805), ('good', 0.15128)]
Topic 2: [('thanks', 0.3288), ('windows', 0.29081), ('card', 0.1808), ('drive', 0.17451), ('mail', 0.15121)]
Topic 3: [('game', 0.36919), ('team', 0.32548), ('year', 0.28179), ('games', 0.25408), ('season', 0.18453)]
Topic 4: [('drive', 0.53192), ('scsi', 0.20426), ('hard', 0.15591), ('disk', 0.15513), ('drives', 0.13795)]
Topic 5: [('windows', 0.39942), ('file', 0.24993), ('window', 0.18905), ('files', 0.16168), ('program', 0.13752)]
Topic 6: [('mail', 0.16164), ('chip', 0.16034), ('government', 0.15572), ('space', 0.15023), ('information', 0.13649)]
Topic 7: [('like', 0.66747), ('bike', 0.13601), ('chip', 0.11478), ('know', 0.10956), ('sounds', 0.10196)]
Topic 8: [('card', 0.45326), ('video', 0.21996), ('sale', 0.21344), ('monitor', 0.15775), ('offer', 0.14634)]
Topic 9: [('know', 0.45241), ('card', 0.33527), ('chip', 0.1884), ('government', 0.14678), ('video', 0.14436)]
Topic 