## 토픽 모델링-LDA

In [1]:
import numpy as np
import pandas as pd

In [2]:
from sklearn.datasets import fetch_20newsgroups
news = fetch_20newsgroups(subset='all', random_state=2021,
    remove=('header','footers','quotes'))

In [3]:
df = pd.DataFrame({'article':news.data})
df.shape

(18846, 1)

In [4]:
# 특수문자 제거
df['article'] = df.article.str.replace('[^A-Za-z]', ' ')

  df['article'] = df.article.str.replace('[^A-Za-z]', ' ')


In [5]:
# 소문자로 변환하고 길이가 3 이하인 단어 제거
df['article'] = df.article.apply(lambda x: ' '.join(w.lower() for w in x.split() if len(w) > 3))

In [6]:
df.article[0][:100]

'from dagibbs quantum david gibbs subject countersteering sans hands organization software systems li'

## NLTK를 통해서 단어 토큰화

In [7]:
import nltk

In [8]:
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

In [9]:
tokenized_doc = df.article.apply(lambda x: [w for w in x.split() if w not in stop_words])

In [10]:
tokenized_doc[:5]

0    [dagibbs, quantum, david, gibbs, subject, coun...
1    [kehoe, netcom, thomas, david, kehoe, subject,...
2    [rexlex, fnal, fnal, subject, assurance, hell,...
3    [scss, mark, riordan, subject, list, large, in...
4    [adam, stratus, mark, adam, subject, space, fo...
Name: article, dtype: object

## 정수 인코딩과 단어 접합 만들기 - gensim

In [10]:
!pip install gensim

Collecting gensim
  Downloading gensim-4.1.0-cp38-cp38-win_amd64.whl (24.0 MB)
Collecting smart-open>=1.8.1
  Downloading smart_open-5.2.1-py3-none-any.whl (58 kB)
Collecting Cython==0.29.23
  Downloading Cython-0.29.23-cp38-cp38-win_amd64.whl (1.7 MB)
Installing collected packages: smart-open, Cython, gensim
  Attempting uninstall: Cython
    Found existing installation: Cython 0.29.24
    Uninstalling Cython-0.29.24:
      Successfully uninstalled Cython-0.29.24
Successfully installed Cython-0.29.23 gensim-4.1.0 smart-open-5.2.1


In [11]:
from gensim import corpora
dictionary = corpora.Dictionary(tokenized_doc)

In [15]:
corpus = [dictionary.doc2bow(text) for text in tokenized_doc]
print(corpus[0])

[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 2), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 2), (16, 1), (17, 1), (18, 2), (19, 1), (20, 1), (21, 1), (22, 2), (23, 1), (24, 1), (25, 1), (26, 1), (27, 1), (28, 1), (29, 1), (30, 1), (31, 1), (32, 1), (33, 1), (34, 1), (35, 1), (36, 1), (37, 1), (38, 1), (39, 1), (40, 1), (41, 1), (42, 1), (43, 1), (44, 1)]


In [16]:
dictionary[0], dictionary[1], dictionary[2], dictionary[3]

('answer', 'basically', 'bicycle', 'bike')

## LDA 모델 훈련시키기

In [18]:
from gensim.models.ldamodel import LdaModel
NUM_TOPICS = 20

In [20]:
ldamodel = LdaModel(
    corpus,num_topics=NUM_TOPICS,
    id2word=dictionary,passes=20
)
topics = ldamodel.print_topics(num_topics=4)
for topic in topics:
    print(topic)

(1, '0.019*"georgia" + 0.018*"buffalo" + 0.016*"gatech" + 0.016*"ncsu" + 0.012*"prism" + 0.010*"mask" + 0.010*"sharks" + 0.009*"books" + 0.009*"doug" + 0.009*"chinese"')
(8, '0.025*"space" + 0.015*"nasa" + 0.006*"power" + 0.005*"data" + 0.005*"system" + 0.005*"earth" + 0.005*"mission" + 0.005*"shuttle" + 0.004*"launch" + 0.004*"moon"')
(4, '0.026*"would" + 0.018*"think" + 0.015*"know" + 0.015*"like" + 0.013*"people" + 0.013*"lines" + 0.013*"subject" + 0.012*"organization" + 0.010*"could" + 0.010*"good"')
(10, '0.021*"scsi" + 0.020*"myers" + 0.015*"arizona" + 0.013*"ground" + 0.012*"wire" + 0.012*"weaver" + 0.011*"rushdie" + 0.011*"harris" + 0.007*"randy" + 0.007*"nmsu"')


## 훈련 결과 시각화

In [24]:
!pip install pyLDAvis==2.1.2 > /dev/null

������ ��θ� ã�� �� �����ϴ�.


In [28]:
import pyLDAvis.gensim
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary)
pyLDAvis.display(vis)

ValidationError: 
 * Not all rows (distributions) in topic_term_dists sum to 1.