In [1]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from random import seed, randrange
import numpy as np

# Fix the state of the stdlib `random` generator up front.
seed(0)

# Number of topics (clusters) to fit.
K = 50

# Load the 20 Newsgroups posts, asking the loader to strip headers,
# footers and quoted replies from each one.
stripped_parts = ('headers', 'footers', 'quotes')
newsgroups = fetch_20newsgroups(remove=stripped_parts)

# Number of documents in the corpus.
D = len(newsgroups.data)

In [2]:
# Display the first raw document as a sanity check of the loaded text.
newsgroups.data[0]

'I was wondering if anyone out there could enlighten me on this car I saw\nthe other day. It was a 2-door sports car, looked to be from the late 60s/\nearly 70s. It was called a Bricklin. The doors were really small. In addition,\nthe front bumper was separate from the rest of the body. This is \nall I know. If anyone can tellme a model name, engine specs, years\nof production, where this car is made, history, or whatever info you\nhave on this funky looking car, please e-mail.'

In [3]:
# Bag-of-words counts: English stop words removed, terms must occur in at
# least two documents, and a token is a run of two or more ASCII letters.
vectorizer = CountVectorizer(
    stop_words='english',
    min_df=2,
    token_pattern=r'[A-Za-z][A-Za-z]+',
)

# Fit the vocabulary and count terms, then densify so that later cells can
# index T[doc] as a plain 1-D count vector.
doc_term_counts = vectorizer.fit_transform(newsgroups.data)
T = doc_term_counts.toarray()

In [4]:
# Shape of the dense count matrix: (documents, vocabulary terms).
T.shape

(11314, 33815)

In [5]:
# Vocabulary size = number of columns of the document-term matrix.
W = T.shape[-1]

problem_summary = f'topics: {K}, vocab size: {W}, docs: {D}'
print(problem_summary)

topics: 50, vocab size: 33815, docs: 11314


In [6]:
# Vocabulary terms of the fitted vectorizer; indexed further down with
# column indices of mu to turn top-word indices back into strings.
word_names = vectorizer.get_feature_names_out()

In [7]:
# Initial hard assignment: document i starts in topic (i mod K), i.e. the
# documents are dealt out round-robin over the K topics.
pi = np.arange(D) % K

In [8]:
# Per-topic word distributions, one row per topic and one column per
# vocabulary term; filled in by the training loop below.
mu = np.zeros((K, W))

In [9]:
# Mini-batch hard-assignment clustering of documents into K topics.
#
# Each iteration takes one contiguous slice of the corpus and
#   1. re-estimates every topic's word distribution mu[topic] from the
#      documents of that slice currently assigned to the topic, with
#      add-one smoothing so that every probability is strictly positive;
#   2. reassigns every document of the slice to the topic that maximises
#      sum_w T[doc, w] * log(mu[topic, w]).
# The loop stops as soon as a slice is processed without any document
# changing topic, or after MAX_ITERS iterations.

NUM_BATCHES = 10   # the corpus is visited in this many contiguous slices
MAX_ITERS = 1000   # safety cap: the per-slice stopping test is noisy and
                   # is not guaranteed to ever trigger

iter_no = 1
current_docset_ind = 0

while iter_no <= MAX_ITERS:
    # Advance to the next slice, wrapping around at the end of the corpus.
    # Integer arithmetic keeps the slice boundaries free of float rounding.
    current_docset_ind = (current_docset_ind + 1) % NUM_BATCHES
    batch_start = current_docset_ind * D // NUM_BATCHES
    batch_stop = (current_docset_ind + 1) * D // NUM_BATCHES
    docset = range(batch_start, batch_stop)
    T_batch = T[batch_start:batch_stop]     # basic slice: a view, not a copy
    pi_batch = pi[batch_start:batch_stop]

    # Re-estimate the topics from this slice only. The "+ 1" per word and
    # "+ W" in the denominator are the add-one smoothing; they also make the
    # denominator at least W, so no empty-topic special case is needed.
    for topic in range(K):
        topic_counts = T_batch[pi_batch == topic].sum(axis=0)
        mu[topic] = (topic_counts + 1) / (topic_counts.sum() + W)

    # mu is strictly positive thanks to the smoothing, so the log is finite.
    log_mu = np.log(mu)
    old_pi = np.copy(pi)

    # Score every (document, topic) pair of the slice in one matrix product
    # and pick the best topic per document. argmax returns the first maximum,
    # matching a strict ">" scan over topics in increasing order.
    doc_topic_scores = T_batch @ log_mu.T
    pi[batch_start:batch_stop] = np.argmax(doc_topic_scores, axis=1)

    num_docs_changed = np.count_nonzero(pi != old_pi)
    print(f'iter {iter_no}, num docs w changed topics {num_docs_changed}')

    if num_docs_changed == 0:
        print('done.')
        break

    iter_no += 1
else:
    # Only reached when the cap is hit without a change-free slice.
    print(f'stopped after {MAX_ITERS} iterations without convergence.')

iter 1, num docs w changed topics 213
iter 2, num docs w changed topics 172
iter 3, num docs w changed topics 271
iter 4, num docs w changed topics 189
iter 5, num docs w changed topics 218
iter 6, num docs w changed topics 235
iter 7, num docs w changed topics 156
iter 8, num docs w changed topics 216
iter 9, num docs w changed topics 135
iter 10, num docs w changed topics 118
iter 11, num docs w changed topics 67
iter 12, num docs w changed topics 62
iter 13, num docs w changed topics 158
iter 14, num docs w changed topics 102
iter 15, num docs w changed topics 141
iter 16, num docs w changed topics 102
iter 17, num docs w changed topics 89
iter 18, num docs w changed topics 131
iter 19, num docs w changed topics 36
iter 20, num docs w changed topics 61
iter 21, num docs w changed topics 94
iter 22, num docs w changed topics 64
iter 23, num docs w changed topics 215
iter 24, num docs w changed topics 140
iter 25, num docs w changed topics 171
iter 26, num docs w changed topics 105
it

In [10]:
# Display the topic index assigned to each document after training.
pi

array([13, 13, 13, ..., 36, 36, 13])

In [11]:
# Cluster sizes: how many documents ended up in each topic, printed one
# count per line in topic order.
topic_sizes = [int(np.sum(pi == topic_id)) for topic_id in range(K)]
for size in topic_sizes:
    print(size)

1513
36
18
21
39
20
29
20
30
466
34
30
20
982
1459
21
82
991
33
964
914
13
38
27
29
30
26
24
28
21
31
26
36
28
40
127
965
124
23
31
906
22
563
25
24
35
290
21
23
16


In [12]:
# Final, unsmoothed re-estimate of every topic's word distribution from the
# whole corpus: mu[topic, w] is the share of word w among all word
# occurrences in the documents assigned to that topic.
for topic in range(K):
    # Sum the count vectors of the documents in this topic in one step.
    word_counts_in_topic = T[pi == topic].sum(axis=0)
    total_words_in_topic = word_counts_in_topic.sum()
    if total_words_in_topic > 0:
        word_dist_in_topic = word_counts_in_topic / total_words_in_topic
    else:
        # A topic with no documents (or only empty ones) would otherwise
        # divide 0 by 0 and fill the row with NaN; fall back to a uniform
        # distribution so the row stays a valid probability vector.
        word_dist_in_topic = np.full(W, 1.0 / W)
    mu[topic] = word_dist_in_topic

In [13]:
# Print the ten highest-probability words of every topic, least probable
# of the ten first and most probable last.
for topic in range(K):
    # Stable ascending argsort done in NumPy instead of a Python-level
    # sort over the whole vocabulary; ties keep vocabulary order.
    top_ten_indices = np.argsort(mu[topic], kind='stable')[-10:]
    top_ten_for_this_topic = word_names[top_ten_indices]
    print(f'topic {topic}: {top_ten_for_this_topic}')

topic 0: ['time' 'use' 'does' 'god' 'think' 'know' 'people' 'like' 'just' 'don']
topic 1: ['server' 'usr' 'mit' 'subject' 'pub' 'attack' 'file' 'use' 'available'
 'edu']
topic 2: ['chicago' 'leads' 'san' 'york' 'new' 'idle' 'guide' 'borland' 'lost'
 'won']
topic 3: ['history' 'ra' 'professor' 'church' 'turkish' 'god' 'jesus' 'greek'
 'university' 'bible']
topic 4: ['eof' 'available' 'oname' 'printf' 'ftp' 'op' 'entry' 'program' 'file'
 'output']
topic 5: ['space' 'lens' 'library' 'professor' 'armenian' 'st' 'university'
 'libxmu' 'xmu' 'lib']
topic 6: ['tm' 'di' 'wm' 'ei' 'bxn' 'bhj' 'giz' 'pl' 'max' 'ax']
topic 7: ['dr' 'crypto' 'rates' 'recipient' 'conference' 'program' 'university'
 'barbara' 'santa' 'kk']
topic 8: ['chip' 'law' 'dead' 'police' 'serial' 'bullock' 'number' 'people' 'maria'
 'said']
topic 9: ['com' 'available' 'dos' 'know' 'does' 'windows' 'edu' 'just' 'like' 'use']
topic 10: ['pm' 'committee' 'mail' 'people' 'anonymity' 'anon' 'service' 'posting'
 'edu' 'anonymous']
