In [1]:
import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
import time

newsgroups_train = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'))
num_docs = 2000
texts = newsgroups_train.data[:num_docs]

vectorizer = CountVectorizer(
    max_features=2000, 
    stop_words='english',
    min_df=5,
    max_df=0.7
)
X = vectorizer.fit_transform(texts)
vocab = vectorizer.get_feature_names_out()
V = len(vocab)
print(f"Словарь: {V} слов")

Словарь: 2000 слов


In [2]:
M = X.shape[0]

rows, cols = X.nonzero()
counts = X.data.astype(int)

all_words = []
all_doc_ids = []
for i in range(len(rows)):
    all_words.extend([cols[i]] * counts[i])
    all_doc_ids.extend([rows[i]] * counts[i])

all_words = np.array(all_words, dtype=np.int32)
all_doc_ids = np.array(all_doc_ids, dtype=np.int32)
W = len(all_words)
print(f"Всего слов: {W}")

K = 20
alpha = 0.1
beta = 0.01
n_iter = 50
beta_sum = beta * V

n_dk = np.zeros((M, K), dtype=np.int32)
n_kw = np.zeros((K, V), dtype=np.int32)
n_k = np.zeros(K, dtype=np.int32)

z = np.random.randint(0, K, size=W, dtype=np.int32)
for i in range(W):
    doc_id = all_doc_ids[i]
    word_id = all_words[i]
    topic = z[i]
    n_dk[doc_id, topic] += 1
    n_kw[topic, word_id] += 1
    n_k[topic] += 1

print(f"Начало Gibbs Sampling ({n_iter} итераций)...")
start_time = time.time()

alpha_vec = np.full(K, alpha, dtype=np.float32)
beta_vec = beta
for iteration in range(n_iter):
    order = np.random.permutation(W)
    
    for idx in order:
        doc_id = all_doc_ids[idx]
        word_id = all_words[idx]
        old_topic = z[idx]
        n_dk[doc_id, old_topic] -= 1
        n_kw[old_topic, word_id] -= 1
        n_k[old_topic] -= 1
        p = (n_dk[doc_id, :] + alpha_vec) * (n_kw[:, word_id] + beta_vec) / (n_k + beta_sum)
        p_sum = p.sum()
        if p_sum > 0:
            p = p / p_sum
            new_topic = np.random.choice(K, p=p)
        else:
            new_topic = np.random.randint(0, K)
        
        z[idx] = new_topic
        n_dk[doc_id, new_topic] += 1
        n_kw[new_topic, word_id] += 1
        n_k[new_topic] += 1
    
    if (iteration + 1) % 10 == 0:
        print(f"  Итерация {iteration + 1}/{n_iter}")

end_time = time.time()
print(f"Время выполнения: {end_time - start_time:.2f} секунд")
phi = (n_kw + beta) / (n_k[:, np.newaxis] + beta_sum)
for k in range(K):
    top_indices = np.argsort(phi[k])[-10:][::-1]
    top_words = [vocab[i] for i in top_indices]
    print(f"Тема {k:2d}: {', '.join(top_words)}")

Всего слов: 107496
Начало Gibbs Sampling (50 итераций)...
  Итерация 10/50
  Итерация 20/50
  Итерация 30/50
  Итерация 40/50
  Итерация 50/50
Время выполнения: 220.59 секунд
Тема  0: 10, type, contact, comments, health, net, control, disease, medical, research
Тема  1: god, jesus, does, think, know, believe, good, things, christian, say
Тема  2: max, 145, pl, 1t, wm, 7u, 5u, 04, wt, mq
Тема  3: people, argument, true, truth, non, public, example, war, general, rights
Тема  4: key, use, motif, data, server, clipper, chip, encryption, image, does
Тема  5: good, just, like, think, know, ve, really, don, got, going
Тема  6: law, like, don, people, gun, point, fact, killed, sure, right
Тема  7: year, time, just, don, way, like, make, say, think, good
Тема  8: edu, com, os, cs, comp, windows, ca, mark, mail, colorado
Тема  9: games, game, team, play, 12, 10, season, period, 13, power
Тема 10: 00, 17, mv, 14, hz, lk, 24, 27, dos, 15
Тема 11: israel, jewish, used, son, father, conclusion, spi

1 soc.religion.christian
3 talk.politics.misc
4 comp.os
11 soc.religion.christian
13 comp.os.ms-windows.misc
16 talk.politics.misc