In [1]:
import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction import _stop_words
from sklearn.feature_extraction.text import CountVectorizer
import nltk
from nltk.stem import WordNetLemmatizer 
from tqdm import tqdm

In [2]:
# Load the 20 Newsgroups training split with headers, footers and quoted
# replies stripped, so only the message bodies are vectorized.
newsgroups_train = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'))
nameOfTag = newsgroups_train.target_names
n_features = 4700   # vocabulary cap handed to CountVectorizer(max_features=...)
n_components = 20
n_top_words = 10


# binary=True stores presence/absence of a term per document instead of counts.
# stop_words='english' selects scikit-learn's built-in English list through the
# public API; passing the frozenset from the private `_stop_words` module is
# rejected by parameter validation in recent scikit-learn releases (only
# 'english', a list, or None are accepted).
vectorizer = CountVectorizer(
    lowercase=True,
    stop_words='english',
    analyzer='word',
    binary=True,
    max_df=0.95,   # drop terms appearing in more than 95% of documents
    min_df=2,      # drop terms appearing in fewer than 2 documents
    max_features=n_features,
)

# Dense (n_documents, n_vocabulary) array of 0/1 term indicators.
X_train = vectorizer.fit_transform(newsgroups_train.data).toarray()

In [3]:
def _customLDA(n_d_k, n_k_w, n_k, _z, _document, _word, _alpha, _beta, _topic,  max_iter=10, rng=None):
    """Run collapsed Gibbs sampling sweeps for LDA over every token assignment.

    Each sweep visits every (document, word) token, removes its current topic
    from the count tables, samples a new topic from the conditional
    ``(n_d_k[d] + alpha) * (n_k_w[:, w] + beta[w]) / (n_k + sum(beta))``,
    and adds the token back under the sampled topic.

    Parameters
    ----------
    n_d_k : ndarray, indexed [document, topic]
        Document-topic counts. Updated in place.
    n_k_w : ndarray, indexed [topic, word]
        Topic-word counts. Updated in place.
    n_k : ndarray, indexed [topic]
        Total tokens per topic. Updated in place.
    _z : ndarray of int
        Current topic of each token. Updated in place.
    _document, _word : sequences of int
        Document index and word index of each token (same length as ``_z``).
    _alpha : array broadcastable against the topic axis
        Document-topic prior added to ``n_d_k[d, :]``.
    _beta : ndarray, indexed [word]
        Topic-word prior; must support ``.sum()``.
    _topic : int
        Number of topics.
    max_iter : int, default 10
        Number of full sweeps over all tokens.
    rng : object with a ``choice(a, p=...)`` method, optional
        Random source, e.g. ``np.random.default_rng(seed)``. When ``None``
        (default) NumPy's global ``np.random`` state is used, exactly as before.

    Returns
    -------
    tuple
        ``(n_d_k, n_k_w, n_k, _z)`` -- the same objects that were passed in,
        after in-place updates.
    """
    sampler = np.random if rng is None else rng
    # Loop invariants: neither the prior nor the topic id range changes while
    # sampling, so compute them once instead of once per token.
    topic_ids = np.arange(_topic)
    beta_total = _beta.sum()

    for _ in tqdm(range(max_iter)):
        for j in range(len(_document)):
            cur_word = _word[j]
            cur_document = _document[j]
            cur_topic = _z[j]

            # Take the token out of the counts so it does not influence its
            # own conditional distribution.
            n_d_k[cur_document, cur_topic] -= 1
            n_k_w[cur_topic, cur_word] -= 1
            n_k[cur_topic] -= 1

            # Unnormalised conditional over topics for this token.
            p = (n_d_k[cur_document, :] + _alpha) * (n_k_w[:, cur_word] + _beta[cur_word]) / (n_k + beta_total)
            new_topic = sampler.choice(topic_ids, p=p / p.sum())
            _z[j] = new_topic

            # Put the token back under its freshly sampled topic.
            n_d_k[cur_document, new_topic] += 1
            n_k_w[new_topic, cur_word] += 1
            n_k[new_topic] += 1
    return n_d_k, n_k_w, n_k, _z
# Seed NumPy's global RNG so the random initial assignment (and any sampling
# that draws from the global state afterwards) is reproducible on re-run.
np.random.seed(0)

# Reuse the topic count configured alongside the vectorizer instead of
# repeating the literal.
topic = n_components
n_docs, n_words = X_train.shape

n_d_k = np.zeros((n_docs, topic))    # document-topic counts
n_k_w = np.zeros((topic, n_words))   # topic-word counts
n_k = np.zeros(topic)                # tokens per topic

# Every nonzero cell of X_train is treated as one token.
document, word = X_train.nonzero()
# Uniformly random initial topic for each token.
z = np.random.choice(topic, len(document))

# np.add.at accumulates correctly when an index pair occurs more than once
# (plain fancy-index `+=` would count repeated pairs only once).
np.add.at(n_d_k, (document, z), 1)
np.add.at(n_k_w, (z, word), 1)
np.add.at(n_k, z, 1)

In [4]:
# Symmetric all-ones priors sized from `topic` and the vocabulary width, so
# the call stays shape-consistent if the topic count is changed.
alpha = np.ones(topic)
beta = np.ones(X_train.shape[1])
# NOTE: _customLDA updates the count arrays and z in place, so re-running this
# cell continues sampling from the current state rather than restarting.
n_d_k, n_k_w,  n_k, z = _customLDA(n_d_k, n_k_w, n_k, z, document, word, alpha, beta, topic, max_iter=30)

100%|██████████████████████████████████████████| 30/30 [24:44<00:00, 49.47s/it]


In [5]:
# Column indices of the n_top_words largest counts in each topic's row.
result = np.argsort(n_k_w, axis=1)[:, -n_top_words:]

# One indicator row per topic marking its top words; a single batched
# inverse_transform call then maps every row back to vocabulary terms
# (terms come back in vocabulary-index order, not in rank order).
matrix = np.zeros(n_k_w.shape)
matrix[np.arange(result.shape[0])[:, None], result] = 1
for i, words in enumerate(vectorizer.inverse_transform(matrix)):
    print('Tag {} \t{}'.format(i + 1, '\t'.join(words)))

Tag 1 	don	just	know	like	make	people	really	say	things	think
Tag 2 	code	end	number	posting	problem	questions	time	try	use	way
Tag 3 	bike	car	does	good	just	like	need	power	want	work
Tag 4 	course	doesn	don	like	people	point	probably	problem	say	ve
Tag 5 	condition	edu	interested	mail	new	offer	price	sale	sell	send
Tag 6 	article	day	days	later	like	number	old	probably	times	year
Tag 7 	did	didn	going	just	old	right	said	time	ve	way
Tag 8 	doesn	don	going	just	know	make	thing	think	time	ve
Tag 9 	country	government	israel	law	people	rights	state	states	war	world
Tag 10 	cost	development	general	low	major	nasa	national	research	science	space
Tag 11 	does	don	good	just	know	like	little	make	new	want
Tag 12 	chip	clipper	encryption	government	key	keys	public	security	use	used
Tag 13 	let	make	people	question	really	right	think	time	trying	way
Tag 14 	address	com	edu	email	information	know	looking	mail	post	thanks
Tag 15 	best	better	game	games	good	play	team	think	win	year
Tag 16 	10	11