In [12]:
import numpy as np
from sklearn.datasets import fetch_20newsgroups

newsgroups_train = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'))
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS

vectorizer = CountVectorizer(lowercase=True, stop_words=ENGLISH_STOP_WORDS,
                             analyzer='word', binary=True, max_df=0.7, min_df=3)
vectorizer.fit(newsgroups_train.data)

CountVectorizer(analyzer='word', binary=True, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=0.7, max_features=None, min_df=3,
                ngram_range=(1, 1), preprocessor=None,
                stop_words=frozenset({'a', 'about', 'above', 'across', 'after',
                                      'afterwards', 'again', 'against', 'all',
                                      'almost', 'alone', 'along', 'already',
                                      'also', 'although', 'always', 'am',
                                      'among', 'amongst', 'amoungst', 'amount',
                                      'an', 'and', 'another', 'any', 'anyhow',
                                      'anyone', 'anything', 'anyway',
                                      'anywhere', ...}),
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [13]:
len(vectorizer.vocabulary_)

26747

In [24]:
X_train = vectorizer.fit_transform(newsgroups_train.data)
X_train.shape

(11314, 26747)

In [34]:

def LDA(X_train, Number_Of_Topics, alpha, beta, n_iter=10):
    n_kw = np.zeros((Number_Of_Topics, X_train.shape[1]))
    n_dk = np.zeros((X_train.shape[0], Number_Of_Topics))
    n_k = np.zeros(Number_Of_Topics)
    docs, words = X_train.nonzero()
    z = np.random.choice(Number_Of_Topics, len(docs))
    
    for doc, word, cur_z in zip(docs, words, z):
        n_dk[doc, cur_z] += 1
        n_kw[cur_z, word] += 1
        n_k[cur_z] += 1
    
    for cur_iter in range(n_iter):
        for i in range(len(docs)):
            cur_word = words[i]
            cur_doc = docs[i]
            cur_topic = z[i]
            
            n_dk[cur_doc, cur_topic] -= 1
            n_kw[cur_topic, cur_word] -= 1
            n_k[cur_topic] -= 1
            
            p = (n_dk[cur_doc, :] + alpha) * (n_kw[:, cur_word] + beta[cur_word]) / (n_k + beta.sum())
            z[i] = np.random.choice(np.arange(Number_Of_Topics), p=p / p.sum())
            
            n_dk[cur_doc, z[i]] += 1
            n_kw[z[i], cur_word] += 1
            n_k[z[i]] += 1
    
    return z, n_kw, n_dk, n_k

In [35]:
Number_Of_Topics=20
z, n_kw, n_dk, n_k = LDA(X_train, Number_Of_Topics, 1 * np.ones(Number_Of_Topics),1 * np.ones(X_train.shape[1]), 30)

на таком словаре работало очень долго, так что пришлось запустить всего 30 итераций

In [36]:
top_words = np.argsort(n_kw, axis=1)[:, :-11:-1]

for topic in range(20):
    doc = np.zeros((1, X_train.shape[1]))
    for word in top_words[topic]:
        doc[0, word] = 1
    print('Topic {}:\t{}'.format(topic, '\t'.join(vectorizer.inverse_transform(doc)[0])))

Topic 0:	does	just	know	like	need	problem	thanks	use	using	windows
Topic 1:	called	edu	hear	posted	recall	saw	send	steve	times	wondering
Topic 2:	interested	missing	negative	noticed	past	school	sort	tank	thank	wouldn
Topic 3:	children	government	israel	israeli	jewish	jews	killed	said	war	world
Topic 4:	cheers	instead	pair	posting	problem	putting	test	theory	wanted	word
Topic 5:	14	end	ma	mi	mn	mq	mr	mt	mw	pl
Topic 6:	banks	edu	geb	gordon	intellect	pitt	shameful	skepticism	soon	surrender
Topic 7:	air	bike	car	cars	engine	ground	high	light	miles	road
Topic 8:	game	games	hockey	league	play	players	season	team	win	year
Topic 9:	205	al	com	dave	intergraph	internet	look	office	uucp	uunet
Topic 10:	bike	come	comments	edu	great	michael	obvious	posting	probably	various
Topic 11:	don	good	just	know	like	make	people	really	think	time
Topic 12:	article	edu	frank	good	guy	hand	huh	middle	small	thought
Topic 13:	1993	available	data	edu	following	general	information	program	space	university
Topic 14:

темы очень плохо выявляются, хотя какие-то выделить можно, например 
15 вера
17 политика
18 транспорт
5 что-то связано с windows
7 машины
соотнести конкретно с темами изначальными можно с трудом
поэтому уменьшим словарь и запустим большее число итераций


In [38]:
vectorizer = CountVectorizer(lowercase=True, stop_words=ENGLISH_STOP_WORDS,
                             analyzer='word', binary=True, max_df=0.04, min_df=11)
vectorizer.fit(newsgroups_train.data)

CountVectorizer(analyzer='word', binary=True, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=0.04, max_features=None, min_df=11,
                ngram_range=(1, 1), preprocessor=None,
                stop_words=frozenset({'a', 'about', 'above', 'across', 'after',
                                      'afterwards', 'again', 'against', 'all',
                                      'almost', 'alone', 'along', 'already',
                                      'also', 'although', 'always', 'am',
                                      'among', 'amongst', 'amoungst', 'amount',
                                      'an', 'and', 'another', 'any', 'anyhow',
                                      'anyone', 'anything', 'anyway',
                                      'anywhere', ...}),
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [39]:
len(vectorizer.vocabulary_)

9544

Отлично, слов втрое меньше

In [40]:
X_train = vectorizer.fit_transform(newsgroups_train.data)
X_train.shape

(11314, 9544)

In [41]:
Number_Of_Topics=20
z, n_kw, n_dk, n_k = LDA(X_train, Number_Of_Topics, 1 * np.ones(Number_Of_Topics),1 * np.ones(X_train.shape[1]), 60)
top_words = np.argsort(n_kw, axis=1)[:, :-11:-1]

for topic in range(20):
    doc = np.zeros((1, X_train.shape[1]))
    for word in top_words[topic]:
        doc[0, word] = 1
    print('Topic {}:\t{}'.format(topic, '\t'.join(vectorizer.inverse_transform(doc)[0])))

Topic 0:	100	50	asking	condition	offer	original	price	sale	sell	shipping
Topic 1:	application	code	file	files	ftp	graphics	running	server	version	window
Topic 2:	board	card	computer	disk	mac	memory	monitor	pc	speed	video
Topic 3:	certain	deal	exactly	known	mind	nice	posting	similar	stuff	tried
Topic 4:	advance	anybody	appreciate	appreciated	hi	info	net	reply	thank	wondering
Topic 5:	article	hear	interesting	news	reading	recall	sounds	stuff	university	wonder
Topic 6:	1993	center	date	earth	nasa	research	science	space	systems	university
Topic 7:	11	12	13	14	16	17	18	23	24	25
Topic 8:	bible	christ	christian	christians	church	faith	jesus	john	love	man
Topic 9:	banks	cause	disease	food	gordon	pitt	skepticism	soon	surrender	usually
Topic 10:	chip	clipper	encryption	key	keys	message	phone	public	secure	security
Topic 11:	american	business	care	change	clinton	house	money	pay	president	states
Topic 12:	control	crime	gun	guns	law	laws	police	rights	self	weapons
Topic 13:	current	difference	heat	

все намного лучше
здесь темы выявяются гораздо, а некоторые очень хорошо соотносятся с изначальными: 14 хоккей, 8 вера и так далее