In [6]:
import numpy as np
from tqdm import tqdm

def LDA(words, docs, words_num, texts_num, niter = 50):
    
    topics_num = 20
    n_dk = np.zeros((texts_num, topics_num))
    n_kw = np.zeros((topics_num, words_num))
    n_k = np.zeros(topics_num)
    N = len(docs)
    alpha = np.ones(topics_num)
    beta = np.ones(words_num)
    
    z = [np.random.choice(topics_num) for i in range(N)]
    
    for d, w, zed in zip(docs, words, z):
        n_dk[d, zed] += 1
        n_kw[zed, w] += 1
        n_k[zed] += 1
        
    for it in tqdm(range(niter)):
        for i in range(N):
            word = words[i]
            topic = z[i]
            d = docs[i]
            
            n_dk[d, topic] -= 1
            n_kw[topic, word] -= 1
            n_k[topic] -= 1
            p = (n_dk[d, :] + alpha) * (n_kw[:, word] + beta[word]) / (n_k + beta.sum())
            topic = np.random.choice(np.arange(topics_num), p = p / p.sum())
            z[i] = topic
            
            n_dk[d, z[i]] += 1
            n_kw[z[i], word] += 1
            n_k[z[i]] += 1
    return z, n_dk, n_kw, n_k

In [2]:
from sklearn.datasets import fetch_20newsgroups

newsgroups_train = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'))

In [3]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS

vectorizer = CountVectorizer(lowercase=True, stop_words=ENGLISH_STOP_WORDS,
                             analyzer='word', binary=True, min_df=10, max_df=0.04) #сократили словарь
vectorizer.fit(newsgroups_train.data)

CountVectorizer(analyzer='word', binary=True, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.04, max_features=None, min_df=10,
        ngram_range=(1, 1), preprocessor=None,
        stop_words=frozenset({'move', 'first', 'nine', 'con', 'in', 'go', 'well', 'eleven', 'too', 'a', 'own', 'along', 'we', 'herein', 'yourself', 'thus', 'since', 'can', 'full', 'may', 'two', 'where', 'other', 'now', 'for', 'seeming', 'least', 'eight', 'their', 'which', 'eg', 'name', 'alone', 'show', 'is'... 'please', 'serious', 'was', 'elsewhere', 'wherever', 'no', 'or', 'they', 'your', 'whatever', 'if'}),
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [5]:
X_train = vectorizer.fit_transform(newsgroups_train.data)
X = X_train.toarray()
docs, words = X.nonzero()
shape = X.shape
texts_num = shape[0]
words_num = shape[1]
print(texts_num, words_num)

11314 10299


In [7]:
X_train = vectorizer.fit_transform(newsgroups_train.data)
X = X_train.toarray()
docs, words = X.nonzero()
shape = X.shape
texts_num = shape[0]
words_num = shape[1]

z, n_dk, n_kw, n_k = LDA(words, docs, words_num, texts_num)

100%|███████████████████████████████████████████████████████████████████████████████| 50/50 [5:23:30<00:00, 120.16s/it]


In [11]:
words_sorted = np.argsort(n_kw, axis = 1)

print(words_sorted)
topics_num = 20
voc={}
for item in vectorizer.vocabulary_:
    voc[vectorizer.vocabulary_[item]] = item

for i in range(topics_num):
    print("Topic =", i, ": ")
    for t in range(10):
        print(voc[words_sorted[i, words_num - 1 - t]])
    print("\n")

[[    0  5758  5756 ...  1539  2093  5269]
 [ 5149  5156  5155 ...  1930  1547  1905]
 [ 8880  4543  8516 ...  9587  9979  4668]
 ...
 [10298  4269  4268 ...  1198  6322  8718]
 [ 5149  5787  5786 ...  9407  2867 10052]
 [10298  7448  7449 ... 10091  3969  3966]]
Topic = 0 : 
jesus
christian
bible
christians
christ
religion
word
claim
agree
faith


Topic = 1 : 
car
bike
cars
engine
speed
road
miles
buy
ride
turn


Topic = 2 : 
history
war
turkish
source
armenians
armenian
men
argic
million
women


Topic = 3 : 
deleted
stuff
internet
knows
reading
al
dave
home
uucp
anybody


Topic = 4 : 
sorry
anybody
cheers
longer
hey
oh
love
article
sort
nice


Topic = 5 : 
oh
love
wonder
book
wondering
thank
doubt
define
couldn
came


Topic = 6 : 
card
pc
memory
video
disk
computer
mac
dos
advance
monitor


Topic = 7 : 
11
24
14
18
12
13
17
16
40
23


Topic = 8 : 
gun
rights
law
israel
soon
laws
crime
country
weapons
guns


Topic = 9 : 
sorry
anybody
stay
looks
info
stuff
oh
correct
ok
unless


Topic

По подборке слов можно сделать выводы о темах текстов:

0 - religion.christian
1 - autos
2 - politics.mideast
6 - computers
8 - guns
10 - space
11 - crypt
12 - politics
13 - sports.hockey
14 - for sale
16 - medicine
17 - space
19 - ms-windows(?)