### 잠재디리클레할당(LDA)
- 주제별 단어수 분포를 바탕으로 주어진 문서에서 발견된 단어수 분포를 분석하여 해당 문서의 주제들을 예측하는 기법
- 교환성 가정: 단어들의 순서는 상관하지 않고 단어들의 유무만이 중요하다는 가정

In [1]:
#전처리 함수
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re
# 영어 단어의 어근만 추출
stm = PorterStemmer()

# 영어 단어의 불용어 집합
stopwords = set(stopwords.words('english'))

# 특수문자 제거를 위한 정규식
pattern = re.compile('[a-zA-Z][-_a-zA-Z0-9.]*')

def tokenize(sentence):
    def stem(w):
        try:
            return stm.stem(w)
        except:
            return w
    return [stem(w) for w in word_tokenize(sentence.lower()) 
            if w not in stopwords and pattern.match(w)]

In [2]:
#LDA모델 생성
import tomotopy as tp
# 토픽의 개수: 20개, 5회 미만 등장한 단어들: 제거
model = tp.LDAModel(k=20, min_cf=5)
# 파일에서 한 줄씩 읽어 model에 추가
for i, line in enumerate(open('c:/data/text/news2.txt', encoding='utf-8')):
    model.add_doc(tokenize(line))
# model의 num_words, num_vocabs 확인
model.train(0) # train(0): 0회 학습
print('Total docs:', len(model.docs))
print('Total words:', model.num_words)
print(list(model.vocabs)[:50])

Total docs: 23
Total words: 87
['prigozhin', 'putin', 'shell', 'militari', 'russia', 'system', 'bakhmut', 'said', 'one', 'polit', 'russian', 'critic', 'day', 'boss', 'wagner', 'grandfath', 'victori', 'recent', 'seem', 'may', 'public', 'continu', 'line', 'gener', 'war', 'flank', 'warehous', 'save', 'whether', 'privat', 'compani', 'social-media', 'telegram', 'among', 'battlefield', 'week', 'brigad', 'around', 'eastern', 'troop', 'ukrainian', 'live', 'today', 'almost', 'fight', 'suppos', 'countri', 'comment', 'fighter', 'leadership']


In [3]:
#200회 학습
model.train(200)
for i in range(model.k):
    # 20개의 토픽별 상위 단어 10개 추출
    res = model.get_topic_words(i, top_n=10)
    print(f'Topic #{i}', end='\t')
    print(', '.join(w for w, p in res))

Topic #0	militari, prigozhin, putin, shell, russia, system, bakhmut, said, one, polit
Topic #1	prigozhin, putin, shell, militari, russia, system, bakhmut, said, one, polit
Topic #2	polit, prigozhin, putin, shell, militari, russia, system, bakhmut, said, one
Topic #3	prigozhin, putin, shell, militari, russia, system, bakhmut, said, one, polit
Topic #4	prigozhin, putin, shell, militari, russia, system, bakhmut, said, one, polit
Topic #5	russia, militari, prigozhin, putin, shell, system, bakhmut, said, one, polit
Topic #6	prigozhin, putin, shell, militari, russia, system, bakhmut, said, one, polit
Topic #7	prigozhin, putin, shell, militari, russia, system, bakhmut, said, one, polit
Topic #8	prigozhin, bakhmut, putin, shell, militari, russia, system, said, one, polit
Topic #9	prigozhin, putin, shell, militari, russia, system, bakhmut, said, one, polit
Topic #10	shell, said, prigozhin, putin, militari, russia, system, bakhmut, one, polit
Topic #11	prigozhin, putin, shell, militari, russia, 

In [4]:
#제인 오스틴의 엠마 문서 로드
import nltk
emma_raw = nltk.corpus.gutenberg.raw('austen-emma.txt')
tokenize(emma_raw)[:100]

['emma',
 'jane',
 'austen',
 'volum',
 'chapter',
 'emma',
 'woodhous',
 'handsom',
 'clever',
 'rich',
 'comfort',
 'home',
 'happi',
 'disposit',
 'seem',
 'unit',
 'best',
 'bless',
 'exist',
 'live',
 'nearli',
 'twenty-on',
 'year',
 'world',
 'littl',
 'distress',
 'vex',
 'youngest',
 'two',
 'daughter',
 'affection',
 'indulg',
 'father',
 'consequ',
 'sister',
 'marriag',
 'mistress',
 'hous',
 'earli',
 'period',
 'mother',
 'die',
 'long',
 'ago',
 'indistinct',
 'remembr',
 'caress',
 'place',
 'suppli',
 'excel',
 'woman',
 'gover',
 'fallen',
 'littl',
 'short',
 'mother',
 'affect',
 'sixteen',
 'year',
 'miss',
 'taylor',
 'mr.',
 'woodhous',
 'famili',
 'less',
 'gover',
 'friend',
 'fond',
 'daughter',
 'particularli',
 'emma',
 'intimaci',
 'sister',
 'even',
 'miss',
 'taylor',
 'ceas',
 'hold',
 'nomin',
 'offic',
 'gover',
 'mild',
 'temper',
 'hardli',
 'allow',
 'impos',
 'restraint',
 'shadow',
 'author',
 'long',
 'pass',
 'away',
 'live',
 'togeth',
 'friend

In [5]:
#LDA모델
# 모델생성
model = tp.LDAModel(k=5, min_cf=5)
# 전처리 후 모형에 추가
model.add_doc(tokenize(emma_raw))
# 0회 학습
model.train(0)
print('Total docs:', len(model.docs))
print('Total words:', model.num_words)
print(model.vocabs)

Total docs: 1
Total words: 67220
['mr.', 'emma', 'could', 'would', 'mrs.', 'miss', 'must', 'harriet', 'much', 'said', 'think', 'thing', 'one', 'weston', 'everi', 'elton', 'say', 'know', 'knightley', 'well', 'littl', 'never', 'might', 'good', 'woodhous', 'time', 'jane', 'quit', 'look', 'like', 'come', 'see', 'thought', 'great', 'feel', 'friend', 'noth', 'go', 'dear', 'alway', 'man', 'fairfax', 'even', 'churchil', 'soon', 'give', 'may', 'make', 'wish', 'shall', 'without', 'hope', 'seem', 'day', 'want', 'frank', 'first', 'sure', 'father', 'made', 'happi', 'inde', 'bodi', 'ever', 'oh', 'young', 'talk', 'two', 'though', 'better', 'way', 'love', 'hartfield', 'upon', 'long', 'walk', 'speak', 'realli', 'take', 'believ', 'rather', 'letter', 'bate', 'us', 'word', 'done', 'mani', 'marri', 'away', 'mean', 'poor', 'appear', 'hear', 'home', 'howev', 'mind', 'moment', 'woman', 'last', 'manner', 'enough', 'suppos', 'ye', 'highburi', 'pleasur', 'doubt', 'comfort', 'heard', 'came', 'place', 'ladi', 'pre

In [6]:
#100회 학습
model.train(100)
for i in range(model.k):
    # 5개의 토픽별 상위 단어 2개 추출
    res = model.get_topic_words(i, top_n=2)
    print(res)
    print(f'Topic #{i}', end='\t')
    print(', '.join(w for w, p in res))

[('much', 0.02730192430317402), ('emma', 0.02602687105536461)]
Topic #0	much, emma
[('could', 0.029140541329979897), ('one', 0.02802216447889805)]
Topic #1	could, one
[('mrs.', 0.04703768342733383), ('would', 0.04161332920193672)]
Topic #2	mrs., would
[('think', 0.030721284449100494), ('emma', 0.030056335031986237)]
Topic #3	think, emma
[('mr.', 0.03733336552977562), ('come', 0.028377624228596687)]
Topic #4	mr., come
