In [13]:
import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS

In [14]:
# Load the training split with headers, footers and quoted replies removed,
# so that only the message bodies are vectorised.
newsgroups_train = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'))

# Binary bag-of-words: each (document, word) entry records presence only.
# min_df=10 drops words seen in fewer than 10 documents; max_df=.04 drops
# words seen in more than 4% of the documents.
vectorizer = CountVectorizer(lowercase=True, stop_words=ENGLISH_STOP_WORDS, analyzer='word',
                             binary=True, min_df=10, max_df=.04)
X_train = vectorizer.fit_transform(newsgroups_train.data)

# Display the dimensions instead of X_train.toarray(): densifying the sparse
# matrix would allocate roughly 0.9 GB of int64 just to print zeros.
# In the recorded run the matrix held 480590 non-zero (document, word) entries.
X_train.shape

(11314, 10299)

In [15]:
# Fix the global NumPy RNG so the random initial assignment (and every later
# draw made through np.random) is reproducible after Restart & Run All.
SEED = 0
np.random.seed(SEED)

topics = 20

# Count tables used by the sampler:
#   n_k[k]     - number of entries currently assigned to topic k
#   n_dk[d, k] - number of entries of document d assigned to topic k
#   n_kw[k, w] - number of times word w is assigned to topic k
n_k = np.zeros(topics)
n_dk = np.zeros((X_train.shape[0], topics))
n_kw = np.zeros((topics, X_train.shape[1]))

# One (document, word) pair per non-zero entry of the document-term matrix,
# each given a uniformly random initial topic.
doc, word = X_train.nonzero()
z = np.random.choice(topics, len(doc))

# np.add.at accumulates correctly when the same index occurs more than once,
# which a plain fancy-indexed "+= 1" would not.
np.add.at(n_dk, (doc, z), 1)
np.add.at(n_kw, (z, word), 1)
np.add.at(n_k, z, 1)

In [16]:
def lda(n_dk, n_kw, n_k, z, doc, word, topics, alpha, beta, it):
    """Run `it` sweeps of collapsed Gibbs sampling for LDA over all entries.

    For every entry j the current topic assignment is removed from the count
    tables, a new topic is drawn with probability proportional to
    (n_dk[d, :] + alpha) * (n_kw[:, w] + beta[w]) / (n_k + beta.sum()),
    and the counts are updated with the new assignment.

    All count arrays and `z` are modified IN PLACE; the returned objects are
    the same arrays that were passed in.

    Parameters
    ----------
    n_dk : ndarray, indexed as [document, topic]
        Per-document topic counts.
    n_kw : ndarray, indexed as [topic, word]
        Per-topic word counts.
    n_k : ndarray, indexed as [topic]
        Total count per topic.
    z : ndarray of int
        Current topic of each entry; z[j] belongs to (doc[j], word[j]).
    doc, word : sequences of int of equal length
        Document index and word index of each entry.
    topics : int
        Number of topics to sample from.
    alpha : scalar or ndarray broadcastable to length `topics`
        Added to the document-topic counts.
    beta : ndarray, indexed as [word]
        Added to the topic-word counts; its sum is added to n_k.
    it : int
        Number of full sweeps over all entries.

    Returns
    -------
    tuple
        (z, n_kw, n_dk, n_k) -- note the order differs from the parameters.
    """
    # Loop-invariant values, computed once instead of once per entry.
    beta_total = beta.sum()
    topic_ids = np.arange(topics)

    for _ in range(it):
        for j in range(len(doc)):
            cur_word = word[j]
            cur_doc = doc[j]
            cur_topic = z[j]

            # Take the entry out of the counts so that the conditional below
            # is computed without its own current assignment.
            n_dk[cur_doc, cur_topic] -= 1
            n_kw[cur_topic, cur_word] -= 1
            n_k[cur_topic] -= 1

            p = (n_dk[cur_doc, :] + alpha) * (n_kw[:, cur_word] + beta[cur_word]) / (n_k + beta_total)
            new_topic = np.random.choice(topic_ids, p=p / p.sum())
            z[j] = new_topic

            # Put the entry back under its newly sampled topic.
            n_dk[cur_doc, new_topic] += 1
            n_kw[new_topic, cur_word] += 1
            n_k[new_topic] += 1
    return z, n_kw, n_dk, n_k

In [17]:
# Run 100 sampling sweeps with symmetric priors alpha = 2 and beta = 2.
# The prior sizes follow `topics` and the vocabulary size instead of a
# repeated literal 20, so changing `topics` cannot cause a shape mismatch.
z, n_kw, n_dk, n_k = lda(
    n_dk, n_kw, n_k, z, doc, word,
    topics,
    2 * np.ones(topics),            # alpha: one value per topic
    2 * np.ones(X_train.shape[1]),  # beta: one value per vocabulary word
    100,                            # number of sweeps
)

In [18]:
# Column indices of the 10 highest-count words for every topic (one row per topic).
answer = np.argsort(n_kw, axis=1)[:, -10:]

# Iterate over the rows of `answer` rather than a hard-coded range(20), so the
# loop always matches the number of topics actually present in n_kw.
for top_ids in answer:
    # One-row indicator vector with ones at the selected word columns;
    # inverse_transform maps the non-zero columns back to vocabulary terms.
    # As the recorded output shows, the terms come back in vocabulary order,
    # not ranked by count.
    matrix = np.zeros((1, X_train.shape[1]))
    matrix[0, top_ids] = 1
    print('\t'.join(vectorizer.inverse_transform(matrix)[0]))

banks	cadre	chastity	geb	gordon	pitt	shameful	skepticism	soon	surrender
car	couple	money	net	nice	oh	says	stuff	thank	wasn
appreciated	bike	car	cars	looks	small	stuff	week	wonder	worth
anybody	check	comes	effect	goes	left	oh	simple	single	wouldn
code	cost	couldn	couple	deal	head	hear	nice	reply	type
came	children	israel	israeli	jews	killed	land	took	war	women
1993	april	center	general	low	national	research	science	space	university
car	certainly	couple	exactly	home	knows	nice	posting	price	radio
algorithm	chip	clipper	encryption	key	keys	phone	public	secure	security
cause	change	experience	feel	instead	love	ok	wasn	wondering	wouldn
car	current	difference	guess	mike	money	sorry	sounds	stuff	thank
card	computer	disk	dos	memory	pc	price	sale	speed	video
11	12	14	17	24	game	games	play	season	team
advance	anybody	buy	car	couple	guy	mentioned	phone	rest	service
aren	difference	haven	hear	left	months	open	running	sorry	wanted
answer	anybody	bike	cost	feel	sorry	type	unless	usually	value
100	an

Мы вывели топ-10 слов для каждой из 20 заданных категорий (тем), найденных алгоритмом. Заметим, что большинству групп слов можно дать интерпретацию.
К некоторым категориям нельзя однозначно подобрать теги, как и к некоторым тегам нельзя однозначно подобрать категории.