In [8]:
import numpy as np
from sklearn.datasets import fetch_20newsgroups
# Load the 'train' subset of the 20 Newsgroups corpus, asking the loader to
# remove headers, footers and quoted replies from each post.
newsgroups_train = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'))
# Last expression of the cell: display the list of newsgroup (category) names.
newsgroups_train.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [9]:
# ENGLISH_STOP_WORDS is imported from `sklearn.feature_extraction.text`: the
# separate `sklearn.feature_extraction.stop_words` module path was deprecated
# in scikit-learn 0.22 and later removed, while this import works on both old
# and new releases and yields the same frozenset.
from sklearn.feature_extraction.text import CountVectorizer, ENGLISH_STOP_WORDS

vectorizer = CountVectorizer(
    lowercase=True,
    stop_words=ENGLISH_STOP_WORDS,
    analyzer='word',
    binary=True,   # record presence/absence of a word, not its count
    min_df=10,     # ignore words that occur in fewer than 10 documents
    max_df=.04,    # ignore words that occur in more than 4% of documents
)
# Fit here so the cell displays the configured estimator as its output.
vectorizer.fit(newsgroups_train.data)

CountVectorizer(analyzer='word', binary=True, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.04, max_features=None, min_df=10,
        ngram_range=(1, 1), preprocessor=None,
        stop_words=frozenset({'is', 'via', 'there', 'although', 'where', 'towards', 'five', 'most', 'about', 're', 'whoever', 'themselves', 'never', 'hereafter', 'this', 'last', 'call', 'indeed', 'much', 'few', 'them', 'yourself', 'full', 'thereupon', 'thru', 'herein', 'see', 'former', 'thin', 'already', 'f...ehow', 'now', 'his', 'has', 'how', 'if', 'either', 'inc', 'nothing', 'will', 'too', 'him', 'would'}),
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [10]:
# Learn the vocabulary from the training texts and build the document-term
# matrix in a single call.
X_train = vectorizer.fit_transform(newsgroups_train.data)
# Display the row for document 1 (the recorded output shows a 1x10297 sparse
# row with 37 stored elements).
X_train[1]

<1x10297 sparse matrix of type '<class 'numpy.int64'>'
	with 37 stored elements in Compressed Sparse Row format>

In [15]:
from tqdm import tqdm
def lda(n, X, niter, a, b, seed=None):
    """Fit a Latent Dirichlet Allocation model by collapsed Gibbs sampling.

    Every non-zero entry of ``X`` is treated as exactly one token: the stored
    value itself is ignored, only its (document, word) position is used.

    Parameters
    ----------
    n : int
        Number of topics.
    X : matrix of shape (n_docs, n_words)
        Document-term matrix; anything exposing ``.shape`` and ``.nonzero()``
        (e.g. a SciPy sparse matrix or a 2-D NumPy array).
    niter : int
        Number of full sweeps over all tokens.
    a : float or array of shape (n,)
        Dirichlet prior on the per-document topic distribution.
    b : float or array of shape (n_words,)
        Dirichlet prior on the per-topic word distribution.
    seed : int, optional
        Seed for a private ``numpy.random.RandomState``. When omitted, the
        global ``numpy.random`` state is used, so results can still be
        controlled with ``np.random.seed``.

    Returns
    -------
    nkw : ndarray of shape (n, n_words)
        Number of tokens of each word assigned to each topic.
    ndk : ndarray of shape (n_docs, n)
        Number of tokens in each document assigned to each topic.
    nk : ndarray of shape (n,)
        Total number of tokens assigned to each topic.
    tags : ndarray of shape (n_tokens,)
        Final topic assignment of every token, in ``X.nonzero()`` order.
    """
    rng = np.random if seed is None else np.random.RandomState(seed)
    n_docs, n_words = X.shape

    # Accept scalar priors as well as full vectors.
    a = np.broadcast_to(np.asarray(a, dtype=float), (n,))
    b = np.broadcast_to(np.asarray(b, dtype=float), (n_words,))
    # Loop-invariant: summing b is O(n_words) and would otherwise be repeated
    # for every token on every sweep.
    b_total = b.sum()

    nkw = np.zeros((n, n_words))
    ndk = np.zeros((n_docs, n))
    nk = np.zeros(n)

    docs, words = X.nonzero()
    # Start from a uniformly random topic assignment for every token.
    tags = rng.choice(n, len(docs))
    for w, d, t in zip(words, docs, tags):
        nkw[t, w] += 1
        ndk[d, t] += 1
        nk[t] += 1

    for _ in tqdm(range(niter)):
        for j in range(len(docs)):
            d, w, t = docs[j], words[j], tags[j]

            # Take the current token out of the counts ...
            nkw[t, w] -= 1
            ndk[d, t] -= 1
            nk[t] -= 1

            # ... sample its topic from the conditional given all other tokens ...
            p = (ndk[d, :] + a) * (nkw[:, w] + b[w]) / (nk + b_total)
            t = rng.choice(n, p=p / p.sum())
            tags[j] = t

            # ... and put it back under the newly drawn topic.
            nkw[t, w] += 1
            ndk[d, t] += 1
            nk[t] += 1
    return nkw, ndk, nk, tags

In [16]:
n = 20  # number of topics (the corpus has 20 newsgroups)
# lda() samples from NumPy's global random state; fix the seed so that this
# long-running cell gives the same topics on every Restart & Run All.
np.random.seed(0)
# 50 Gibbs sweeps with symmetric priors equal to 1 for both topics and words.
nkw, ndk, nk, tags = lda(n, X_train, 50, np.ones(n), np.ones(X_train.shape[1]))

100%|██████████| 50/50 [40:27<00:00, 48.43s/it]


In [18]:
# For every topic (row of nkw) take the column indices of its 10 largest
# counts: argsort is ascending, so the reversed tail slice picks the top ones.
word = np.argsort(nkw)[:,:-11:-1]
# Iterate over the actual number of topics instead of a hard-coded 20.
for k in range(nkw.shape[0]):
    # One-row indicator vector over the vocabulary with the topic's top words
    # switched on; inverse_transform maps it back to the word strings.
    mask = np.zeros((1, X_train.shape[1]))
    mask[0, word[k]] = 1
    # NOTE: the recorded output lists each topic's words alphabetically,
    # i.e. not in order of their weight in the topic.
    print('Topic {}:\t{}'.format(k, '\t'.join(vectorizer.inverse_transform(mask)[0])))

Topic 0:	anybody	article	cheers	curious	deleted	knows	posting	recall	sound	stuff
Topic 1:	11	12	13	14	16	17	18	19	24	93
Topic 2:	chicago	game	games	hockey	play	rangers	st	team	toronto	win
Topic 3:	bible	christ	christian	christians	claim	jesus	man	religion	saying	word
Topic 4:	armenian	armenians	children	history	killed	men	source	today	turkish	war
Topic 5:	came	days	happened	home	left	saw	started	told	took	went
Topic 6:	banks	cause	disease	effect	gordon	medical	normal	pitt	soon	surrender
Topic 7:	14	24	50	ah	hp	ma	mi	mr	ms	tm
Topic 8:	care	change	feel	guess	hand	happen	haven	money	oh	sort
Topic 9:	earth	large	low	nasa	project	research	science	small	space	systems
Topic 10:	agree	anti	arab	arabs	israel	israeli	jewish	jews	peace	war
Topic 11:	application	code	display	file	files	ftp	running	server	version	window
Topic 12:	address	advance	appreciate	current	fax	hi	internet	simple	thank	university
Topic 13:	american	clinton	control	gun	house	law	national	public	rights	states
Topic 14:	bike	bu

Получили темы:
2. Хоккей
3. Религия
4. Преступления и их статистика
7. Сокращения из Windows
8. Разговорная речь
9. Космос
10. История/Страны
15. Баскетбол
16. Компьютер
17. Криптография

Все эти темы присутствуют в исходном датасете. При увеличении числа итераций и расширении словаря получаем более точные результаты.
(Проблема в том, что это занимает очень много времени.)